# Workshop 3 (Pataranan)

## Import libraries

In [44]:
from nltk.corpus import stopwords
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
import nltk
# Download the stopwords resource
nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bampatra/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Clean data

In [45]:
# Read the raw spam CSV into a DataFrame, decoding the file as latin1
file_path = 'resource/spam.csv'
data = pd.read_csv(filepath_or_buffer=file_path, encoding='latin1')

# Compiled once: the same pattern is applied to every row of the dataset.
_NON_LETTER_RE = re.compile(r'[^a-z\s]')

# Filled on first use by _english_stop_words(); None means "not loaded yet".
_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it on first use only."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def clean_text(text):
    """Normalize one message for Bag-of-Words processing.

    The text is lower-cased, every character other than a-z and whitespace
    is removed, runs of whitespace are collapsed, and English stop words
    (from the NLTK ``stopwords`` corpus) are dropped.

    Args:
        text: The raw message string, or a missing value (NaN/None).

    Returns:
        The remaining words joined by single spaces; "" when the input is
        missing or nothing is left after cleaning.
    """
    if pd.isna(text):  # Missing value: nothing to clean
        return ""

    letters_only = _NON_LETTER_RE.sub('', text.lower())

    # split() with no arguments already discards leading/trailing whitespace
    # and treats any run of whitespace as a single separator.
    words = letters_only.split()
    if not words:
        # Nothing left to filter, so the stop-word list is not needed.
        return ""

    # The stop-word set is built once and reused, instead of being re-read
    # from the corpus for every message.
    stop_words = _english_stop_words()
    return ' '.join(word for word in words if word not in stop_words)

# Clean every message. Missing messages are replaced with '' *before* the
# string cast: astype(str) on a missing value can yield the literal text
# 'nan', which would pass through clean_text and show up as a bogus word.
data['cleaned_text'] = data['v2'].fillna('').astype(str).apply(clean_text)

# Drop unnecessary columns
data = data.drop(columns=['v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])

# Save the cleaned data
output_file_path = 'cleaned_spam_dataset.csv'
data.to_csv(output_file_path, index=False)

### Calculate the word count of each message and add it as a column named “length”

In [46]:
# Reload the cleaned dataset. Only empty fields are parsed as missing: with
# pandas' default NA markers, a message whose cleaned text is exactly a word
# such as "nan" or "null" would be read back as NaN and then blanked below.
cleaned_data = pd.read_csv(output_file_path, keep_default_na=False, na_values=[''])

# Ensure 'cleaned_text' column contains strings and handle missing values
cleaned_data['cleaned_text'] = cleaned_data['cleaned_text'].fillna('').astype(str)
cleaned_data['v1'] = cleaned_data['v1'].fillna('unknown')

# Count the whitespace-separated words in each 'cleaned_text' value and
# store the count in a new column 'length'
cleaned_data['length'] = cleaned_data['cleaned_text'].apply(lambda x: len(x.split()))

# Save the updated dataset
length_file_path = 'cleaned_spam_dataset_with_length.csv'
cleaned_data.to_csv(length_file_path, index=False)

### Use LabelEncoder to convert the class target to integer labels

In [47]:
# Fit a LabelEncoder on the class column 'v1', then store the encoded
# values in a new 'label' column
label_encoder = LabelEncoder()
label_encoder.fit(cleaned_data['v1'])
cleaned_data['label'] = label_encoder.transform(cleaned_data['v1'])

# Write the dataset, now including the 'label' column, to its own CSV
label_file_path = 'cleaned_spam_dataset_with_labels.csv'
cleaned_data.to_csv(path_or_buf=label_file_path, index=False)

### Use CountVectorizer to build a Bag-of-Words (BoW) representation

In [48]:
# Fit a CountVectorizer (default settings) on the cleaned messages and
# transform them into a sparse count matrix in one step
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(cleaned_data['cleaned_text'])

# Vocabulary terms reported by the fitted vectorizer, used as column names
feature_names = vectorizer.get_feature_names_out()

# Expand the sparse matrix into a dense DataFrame so it is easy to inspect
bow_df = pd.DataFrame(data=bow_matrix.toarray(), columns=feature_names)

# Write the Bag-of-Words table to a CSV file
bow_file_path = 'bag_of_words_representation.csv'
bow_df.to_csv(path_or_buf=bow_file_path, index=False)

### List the top 5 and bottom 5 transformed samples to show the results

In [49]:
# Take the first five and the last five rows of the Bag-of-Words table
# (selected by row position)
top_5 = bow_df.iloc[:5]
bottom_5 = bow_df.iloc[-5:]

# Write both slices, each under its own heading, to a text file
top_bottom_file_path = 'top_bottom_samples.txt'
with open(top_bottom_file_path, 'w') as file:
    file.writelines((
        "Top 5 Samples:\n",
        top_5.to_string(),
        "\n\nBottom 5 Samples:\n",
        bottom_5.to_string(),
    ))

In [50]:
# Display the paths of the output files written by the cells above
(top_bottom_file_path, bow_file_path, length_file_path, label_file_path)

('top_bottom_samples.txt',
 'bag_of_words_representation.csv',
 'cleaned_spam_dataset_with_length.csv',
 'cleaned_spam_dataset_with_labels.csv')