In [2]:
# Import the third-party libraries used in this notebook (NumPy, pandas, scikit-learn)

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,classification_report

In [3]:
# Load the SMS dataset. The file is decoded as UTF-8 first; if it contains
# bytes that are not valid UTF-8, it is read again as Latin-1 instead.
csv_path = 'spam.csv'
try:
    data = pd.read_csv(csv_path, encoding='utf-8')
except UnicodeDecodeError:
    data = pd.read_csv(csv_path, encoding='latin1')


In [4]:
# Display the loaded DataFrame to get a first look at its columns and rows.
# NOTE(review): this renders the whole frame (pandas abbreviates the middle
# rows in the output); data.head() would give a shorter preview.
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [5]:
# Summarize the DataFrame: non-null count and dtype of every column,
# plus the index range and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [6]:
# Keep only the two columns needed to build the model: v1 and v2.
# The remaining 'Unnamed' columns are discarded.
relevant_columns = ['v1', 'v2']
data = data[relevant_columns]

In [7]:
# Give the columns descriptive names: the first becomes 'label', the second 'text'
data.columns = ['label', 'text']

In [8]:
# Separate the features (message text) from the target (label), then hold
# out 20% of the rows for testing and keep the other 80% for training.
# The fixed random_state makes the split reproducible across runs.
X, Y = data['text'], data['label']
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [13]:
# Create a TfidfVectorizer to convert the message text into a numerical
# TF-IDF representation. stop_words='english' removes common English words.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

In [15]:
# Fit the vectorizer on the training messages only and, in the same step,
# transform those messages into their TF-IDF representation
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

In [16]:
# Transform the test messages with the already-fitted vectorizer (no refitting)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [17]:
# Use a Multinomial Naive Bayes classifier, as it is suitable for text classification tasks
clf = MultinomialNB()

In [18]:
# Train the classifier on the TF-IDF training features and their labels
clf.fit(X_train_tfidf, Y_train)

In [20]:
# Predict a label for every message in the test set
predictions = clf.predict(X_test_tfidf)

In [21]:
# Evaluate the model by comparing the predictions with the true test labels:
# overall accuracy plus a per-class classification report
accuracy = accuracy_score(y_true=Y_test, y_pred=predictions)
report = classification_report(y_true=Y_test, y_pred=predictions)

In [23]:
# Print the accuracy, then the classification report on the following lines
print(f"Accuracy:{accuracy}", f"Classification report:\n{report}", sep="\n")

Accuracy:0.9668161434977578
Classification report:
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115



# Conclusion

- The model has an overall accuracy of 96.68%, indicating it correctly predicts whether an SMS is spam or legitimate in the large majority of cases.
- For legitimate (ham) messages, the model performs exceptionally well, with 96% precision and 100% recall.
- For spam messages, the model has perfect precision (100%) but lower recall (75%), meaning it misses about one in four spam messages (they are classified as ham).
- The weighted average F1-score is 96%, demonstrating good overall performance.