# Task4 - SPAM SMS

In [1]:
"""
Build an AI model that can classify SMS messages as spam or legitimate. Use techniques like TF-IDF or word embeddings with 
classifiers like Naive Bayes, Logistic Regression, or Support Vector Machines to identify spam messages
"""

'\nBuild an AI model that can classify SMS messages as spam or legitimate. Use techniques like TF-IDF or word embeddings with \nclassifiers like Naive Bayes, Logistic Regression, or Support Vector Machines to identify spam messages\n'

# 1. Load, Read or Extract Data

In [2]:
import pandas as pd  # pandas supplies the DataFrame used throughout the notebook

# Read the ISO-8859-1 encoded CSV and give its columns readable names in a
# single chained expression, so no in-place mutation is required.
data = pd.read_csv("spam.csv", encoding="ISO-8859-1").rename(
    columns={
        "v1": "Status",
        "v2": "Text1",
        "Unnamed: 2": "Text2",
        "Unnamed: 3": "Text3",
        "Unnamed: 4": "Text4",
    }
)
data  # last expression of the cell: render the frame


Unnamed: 0,Status,Text1,Text2,Text3,Text4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


# 2. EDA

In [3]:
# Count the missing values in every column.
data.isna().sum()

Status       0
Text1        0
Text2     5522
Text3     5560
Text4     5566
dtype: int64

In [4]:
# Build a single "Text" column: every missing cell is replaced by " " and the
# four text columns are concatenated left to right with a " " separator.
data['Text'] = data['Text1'].fillna(" ").astype(str).str.cat(
    [data[name].fillna(" ").astype(str) for name in ('Text2', 'Text3', 'Text4')],
    sep=" ",
)
# The individual pieces are redundant once "Text" exists, so drop them.
data = data.drop(['Text1', 'Text2', 'Text3', 'Text4'], axis=1)
data  # last expression of the cell: render the reshaped frame

Unnamed: 0,Status,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
# Re-check for missing values; after the merge no nulls remain.
data.isna().sum()

Status    0
Text      0
dtype: int64

In [6]:
# Print a structural summary of the frame: index range, column names,
# non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Status  5572 non-null   object
 1   Text    5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [7]:
# Descriptive statistics per column; for these object columns that is
# count, number of unique values, most frequent value (top) and its frequency.
data.describe()

Unnamed: 0,Status,Text
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


# 3. Feature Extraction

In [8]:
from sklearn.preprocessing import LabelEncoder  # maps class labels to integer codes

# Fit the encoder on the "Status" labels, then overwrite the column with the
# resulting integer codes (in the displayed frame ham appears as 0, spam as 1).
le = LabelEncoder()
data["Status"] = le.fit(data["Status"]).transform(data["Status"])
data  # last expression of the cell: render the frame with encoded labels

Unnamed: 0,Status,Text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [9]:
# x: the message text (independent variable); y: the encoded label (dependent variable).
x, y = data['Text'], data['Status']

# 4. Model Building - Train/Test Split and TF-IDF Vectorizer

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
# NOTE(review): the split is not stratified. Given the ham/spam imbalance,
# consider stratify=y -- doing so would change the metrics reported below.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# One shape per line, in the order x_train, y_train, x_test, y_test.
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape, sep="\n")

(4457,)
(4457,)
(1115,)
(1115,)


In [11]:
# TF-IDF turns each message into a weighted term vector.
from sklearn.feature_extraction.text import TfidfVectorizer
# Keep at most 5000 features and drop English stop words (e.g. "the", "is", "in").
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
# Fit on the training text only and transform it in the same call.
X_train_tfidf = vectorizer.fit_transform(x_train)
# The test text is only transformed, using the already-fitted vectorizer.
X_test_tfidf = vectorizer.transform(x_test)

# 5. Model building and evaluation - MultinomialNB

In [12]:
from sklearn.naive_bayes import MultinomialNB

# Create and train a Multinomial Naive Bayes classifier on the TF-IDF training
# matrix; fit() returns the estimator itself, so both steps are chained.
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)
# Predicted labels for the held-out test matrix.
y_pred_nb = nb_model.predict(X_test_tfidf)

In [13]:
from sklearn.metrics import accuracy_score, classification_report

# Header followed by the per-class precision / recall / f1 report for Naive Bayes.
print(
    "Naive Bayes Classifier:",
    classification_report(y_test, y_pred_nb),
    sep="\n",
)
# Overall fraction of correctly classified test messages.
print("Accuracy:", accuracy_score(y_test, y_pred_nb))

Naive Bayes Classifier:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       965
           1       1.00      0.80      0.89       150

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.94      1115
weighted avg       0.97      0.97      0.97      1115

Accuracy: 0.9730941704035875


# 6. Model building and evaluation - LogisticRegression

In [14]:
from sklearn.linear_model import LogisticRegression

# Create and train a Logistic Regression classifier (default settings) on the
# TF-IDF training matrix; fit() returns the estimator, so the calls are chained.
lr_model = LogisticRegression().fit(X_train_tfidf, y_train)
# Predicted labels for the held-out test matrix.
y_pred_lr = lr_model.predict(X_test_tfidf)

In [15]:
# Same metric helpers as the Naive Bayes evaluation cell (imported again here).
from sklearn.metrics import accuracy_score, classification_report

# Header followed by the per-class precision / recall / f1 report for Logistic Regression.
print(
    "\nLogistic Regression Classifier:",
    classification_report(y_test, y_pred_lr),
    sep="\n",
)
# Overall fraction of correctly classified test messages.
print("Accuracy:", accuracy_score(y_test, y_pred_lr))


Logistic Regression Classifier:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98       965
           1       0.97      0.69      0.81       150

    accuracy                           0.96      1115
   macro avg       0.96      0.85      0.89      1115
weighted avg       0.96      0.96      0.95      1115

Accuracy: 0.9560538116591928


# 7. Model Building and evaluation - SVC

In [16]:
from sklearn.svm import SVC

# Create and train a support vector classifier (default settings) on the
# TF-IDF training matrix; fit() returns the estimator, so the calls are chained.
svm_model = SVC().fit(X_train_tfidf, y_train)
# Predicted labels for the held-out test matrix.
y_pred_svm = svm_model.predict(X_test_tfidf)

In [17]:
# Same metric helpers as the earlier evaluation cells (imported again here).
from sklearn.metrics import accuracy_score, classification_report

# Header followed by the per-class precision / recall / f1 report for the SVC model.
print(
    "\nSupport Vector Machine Classifier:",
    classification_report(y_test, y_pred_svm),
    sep="\n",
)
# Overall fraction of correctly classified test messages.
print("Accuracy:", accuracy_score(y_test, y_pred_svm))


Support Vector Machine Classifier:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       965
           1       0.99      0.83      0.91       150

    accuracy                           0.98      1115
   macro avg       0.98      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115

Accuracy: 0.9766816143497757


# 8. Model Testing

In [18]:
# Hand-written sample messages used as a quick sanity check of the three models.
new_messages = [
    "Congratulations! You've won a free vacation to the Bahamas. Call now to claim your prize!",
    "Hey, are we still on for the meeting tomorrow?",
    "You've been selected for a $1000 shopping spree! Click here to claim your gift card!",
    "Can We meet tomorrow ?"
]

# Reuse the already-fitted vectorizer so the new text is mapped onto the same
# feature space the models were trained on.
new_messages_tfidf = vectorizer.transform(new_messages)

# One prediction array per trained model.
predictions_nb = nb_model.predict(new_messages_tfidf)
predictions_lr = lr_model.predict(new_messages_tfidf)
predictions_svm = svm_model.predict(new_messages_tfidf)

# Translate the numeric class codes back to readable labels (0 = ham, 1 = spam).
label_map = {0: 'ham', 1: 'spam'}

# Walk the messages and the three prediction arrays in lockstep and print a
# small report per message, terminated by a separator line.
for message, nb_pred, lr_pred, svm_pred in zip(
    new_messages, predictions_nb, predictions_lr, predictions_svm
):
    print(f"Message: {message}")
    print(f"Status :\tNaive Bayes Prediction         : {label_map[nb_pred]}")
    print(f"\t\tLogistic Regression Prediction : {label_map[lr_pred]}")
    print(f"\t\tSVM Prediction                 : {label_map[svm_pred]}")
    print("=" * 100)


Message: Congratulations! You've won a free vacation to the Bahamas. Call now to claim your prize!
Status :	Naive Bayes Prediction         : spam
		Logistic Regression Prediction : spam
		SVM Prediction                 : spam
Message: Hey, are we still on for the meeting tomorrow?
Status :	Naive Bayes Prediction         : ham
		Logistic Regression Prediction : ham
		SVM Prediction                 : ham
Message: You've been selected for a $1000 shopping spree! Click here to claim your gift card!
Status :	Naive Bayes Prediction         : spam
		Logistic Regression Prediction : spam
		SVM Prediction                 : spam
Message: Can We meet tomorrow ?
Status :	Naive Bayes Prediction         : ham
		Logistic Regression Prediction : ham
		SVM Prediction                 : ham
