# Lab 3

## Step 1 - Load Data

In [3]:
import numpy as np
import pandas as pd

# The CSV is not UTF-8 encoded, so the encoding has to be given explicitly
df = pd.read_csv(
    'spam.csv',
    encoding='latin-1',
)

# Preview the first rows of the raw data
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


## Step 2 - Preprocessing

### Step 2a - Drop Columns

In [4]:
# Drop every column after the first two, i.e. the trailing 'Unnamed: ...' columns.
# The column labels are passed explicitly via `columns=` instead of handing a
# DataFrame slice to drop(), which only works because iterating a frame yields its labels.
df = df.drop(columns=df.columns[2:])

# Check the data
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Step 2b - Inspect Data

In [5]:
# Give the two remaining columns descriptive names
column_names = {'v1': 'Labels', 'v2': 'Message'}
df = df.rename(columns=column_names)

# Number of instances per class
class_counts = df['Labels'].value_counts()
print(class_counts)
print('\n')

# Completeness of the data (non-null counts and dtypes).
# Note: df.info() prints its report itself and returns None, so wrapping it
# in print() also emits a trailing "None" line.
print(df.info())
print('\n')

# Descriptive statistics of both columns
summary = df.describe()
print(summary)

Labels
ham     4825
spam     747
Name: count, dtype: int64


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Labels   5572 non-null   object
 1   Message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None


       Labels                 Message
count    5572                    5572
unique      2                    5169
top       ham  Sorry, I'll call later
freq     4825                      30


### Step 2c - Label Encoding

In [6]:
# Mapping from the original string labels to integer classes
new_labels = {
    'spam': 1,
    'ham': 0
}

# Encode labels.
# Values without an entry in new_labels are passed through unchanged, so
# re-running this cell on already-encoded data is a no-op. A plain
# .map(new_labels) would silently turn such values into NaN instead.
df['Labels'] = df['Labels'].map(lambda label: new_labels.get(label, label))

# Fail loudly if anything other than the known classes is left in the column
unexpected = set(df['Labels'].unique()) - set(new_labels.values())
assert not unexpected, f'Unexpected label values: {unexpected}'

# Check the data
df.head()

Unnamed: 0,Labels,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


### Step 2d - Separate Features and Labels

In [7]:
# Features are the raw message texts; targets are the encoded class labels
X, y = df['Message'].values, df['Labels'].values

## Step 3 - Feature Extraction

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words feature extractor (token counts per message)
bow = CountVectorizer()

# Hold out 20% of the messages for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=50,
)

# Learn the vocabulary from the training messages only and convert them to count vectors
X_train = bow.fit_transform(X_train)

# The test messages are only transformed, never used for fitting (see also experiment 3).
# Fitting is what learns the vocabulary: if the vectorizer were fit on the test messages,
# information about the test set would leak into feature extraction, and the columns would
# no longer line up with the features the model is trained on.
# transform() maps the test messages onto the vocabulary learned from X_train, so the test
# data stays unseen during fitting; words outside that vocabulary are ignored.
X_test = bow.transform(X_test)

In [9]:
# Size of the vocabulary learned by the vectorizer
vocabulary_size = len(bow.get_feature_names_out())
print(vocabulary_size)

# Shape of the vectorized training matrix
print(f'Data dimensions: {X_train.shape}')

7727
Data dimensions: (4457, 7727)


## Step 4 - Model Training and Evaluation

In [10]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Create the Multinomial Naive Bayes classifier and train it on the count features
mnb = MultinomialNB()
mnb.fit(X_train, y_train)

# Predictions for both splits
y_pred_train = mnb.predict(X_train)
y_pred_test = mnb.predict(X_test)

# Accuracy on each split; comparing the two shows how well the model generalizes.
# NOTE(review): the class counts printed earlier are imbalanced (4825 ham vs 747 spam),
# so accuracy alone may look optimistic - consider precision/recall as well.
acc_train = accuracy_score(y_train, y_pred_train)
acc_test = accuracy_score(y_test, y_pred_test)

# Report the evaluation results
print(f'Training data accuracy: {acc_train}')
print(f'Test data accuracy: {acc_test}')

Training data accuracy: 0.9946152120260264
Test data accuracy: 0.9775784753363229
