# SMS Classifier

In [35]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


# Download stopwords from nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

##### Reading Dataset

In [27]:
# Load the CSV file into a DataFrame
file_path = './spam.csv'
spam_data = pd.read_csv(file_path, encoding='latin-1')

# Display the first few rows of the DataFrame
spam_data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


##### Data Cleaning

In [29]:
# Dropping the unnecessary columns 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'
spam_data_cleaned = spam_data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)

# Renaming columns for clarity
spam_data_cleaned.rename(columns={'v1': 'label', 'v2': 'message'}, inplace=True)

# Check for any missing values
missing_values = spam_data_cleaned.isnull().sum()


print(missing_values)
spam_data_cleaned.head()


label      0
message    0
dtype: int64


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


##### Adjusted Function for text processing without stopwords removal

In [32]:
def preprocess_text_adjusted(message):
    # Convert to lower case
    message = message.lower()
    # Remove punctuation
    message = re.sub(r'[^\w\s]', '', message)
    # Stemming
    stemmer = PorterStemmer()
    words = message.split()
    words_stemmed = [stemmer.stem(word) for word in words]
    return ' '.join(words_stemmed)

# Apply adjusted text processing to each message
spam_data_cleaned['processed_message'] = spam_data_cleaned['message'].apply(preprocess_text_adjusted)

spam_data_cleaned.head()


Unnamed: 0,label,message,processed_message
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazi avail onli in bugi...
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri in 2 a wkli comp to win fa cup fina...
3,ham,U dun say so early hor... U c already then say...,u dun say so earli hor u c alreadi then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goe to usf he live around ...


Feature Extraction using Bag of Words

In [36]:
# Creating a Bag of Words model
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(spam_data_cleaned['processed_message'])

# Extracting the target variable
y = spam_data_cleaned['label']

# Displaying the shape of the features
X.shape, y.shape


((5572, 8154), (5572,))

In [38]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Building the Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train, y_train)

# Making predictions on the test set
y_pred = model.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)


Accuracy

In [39]:
accuracy

0.9784688995215312

Classification

In [41]:
print(classification_rep)

              precision    recall  f1-score   support

         ham       0.99      0.99      0.99      1453
        spam       0.92      0.91      0.92       219

    accuracy                           0.98      1672
   macro avg       0.95      0.95      0.95      1672
weighted avg       0.98      0.98      0.98      1672



### Classification Report:

 - For 'ham' messages, the model has a precision of 99% and a recall of `99%`.
 - For 'spam' messages, the model has a precision of 92% and a recall of `91%`.
 - The F1-score, which is a balance between precision and recall, is `99% for 'ham' and 92% for 'spam'`.