
Importing the Dependencies

In [2]:
import numpy as np

In [3]:
import pandas as pd

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

The TfidfVectorizer is a powerful tool for converting text data into numerical features.

In [6]:
from sklearn.linear_model import LogisticRegression

In [7]:
from sklearn.ensemble import RandomForestClassifier

In [8]:
from sklearn.tree import DecisionTreeClassifier

In [9]:
from sklearn.svm import SVC

In [10]:
from sklearn.metrics import accuracy_score

In [11]:
from sklearn.metrics import classification_report

Data Collection and Pre-Processing

In [12]:
#loading the data from csv file to a pandas dataframe
raw_mail_data = pd.read_csv('mail_data.csv')

In [13]:
print(raw_mail_data)

     Category                                            Message
0         ham  Go until jurong point, crazy.. Available only ...
1         ham                      Ok lar... Joking wif u oni...
2        spam  Free entry in 2 a wkly comp to win FA Cup fina...
3         ham  U dun say so early hor... U c already then say...
4         ham  Nah I don't think he goes to usf, he lives aro...
...       ...                                                ...
5570      ham  The guy did some bitching but I acted like i'd...
5571      ham                         Rofl. Its true to its name
5572     spam  Click on this link and you will get one millio...
5573     spam   Share this message with 5 more people to get ...
5574     spam  Please provide your ATM pin number.We will upd...

[5575 rows x 2 columns]


In [14]:
#replace the null values with a null string
mail_data = raw_mail_data.where((pd.notnull(raw_mail_data)),'')

In [15]:
#printing the first 5 rows of the dataframe
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [16]:
#checking the number of rows and columns in the dataframe
mail_data.shape

(5575, 2)

Label Encoding

In [17]:
#label spam mail as 0; and ham mail as 1
mail_data.loc[mail_data['Category'] == 'spam','Category',] = 0
mail_data.loc[mail_data['Category'] == 'ham','Category',] = 1

spam = 0
ham = 1

In [18]:
mail_data.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [19]:
#seperating the data as texts and label
X=mail_data["Message"]
Y=mail_data["Category"]

In [20]:
print(X)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
5572    Click on this link and you will get one millio...
5573     Share this message with 5 more people to get ...
5574    Please provide your ATM pin number.We will upd...
Name: Message, Length: 5575, dtype: object


In [21]:
print(Y)

0       1
1       1
2       0
3       1
4       1
       ..
5570    1
5571    1
5572    0
5573    0
5574    0
Name: Category, Length: 5575, dtype: object


Splitting the data into training data and test data

In [22]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=1)

In [23]:
print(X.shape)
print(X_train.shape)
print(X_test.shape)

(5575,)
(4460,)
(1115,)


Feature Extraction

In [24]:
#transform the text data to feature vectors that can be used as input to the logistic regression 

feature_extraction = TfidfVectorizer(min_df=1,stop_words='english',lowercase=True)

X_train_features=feature_extraction.fit_transform(X_train)
X_test_features=feature_extraction.transform(X_test)

#convert Y_train and Y_test values as integers

Y_train=Y_train.astype('int')
Y_test=Y_test.astype('int')

In [25]:
print(X_train_features)

  (0, 5422)	0.3073977761356483
  (0, 3026)	0.2955751918715437
  (0, 7426)	0.3126393634460087
  (0, 4576)	0.2372678669170855
  (0, 6668)	0.37108836063375006
  (0, 3650)	0.4485450741761855
  (0, 6974)	0.35215133450885144
  (0, 4495)	0.4485450741761855
  (1, 3506)	0.24162050557475942
  (1, 3546)	0.2636153024524558
  (1, 4808)	0.2368941231858314
  (1, 6977)	0.27372957939191966
  (1, 6713)	0.17368117955406204
  (1, 4118)	0.26075840212227497
  (1, 2129)	0.16317060601750763
  (1, 401)	0.3201433484135936
  (1, 4044)	0.2636153024524558
  (1, 3418)	0.2470084001252953
  (1, 7366)	0.18989178528413198
  (1, 6958)	0.28747958138889934
  (1, 1856)	0.20339226852771974
  (1, 2690)	0.27372957939191966
  (1, 6425)	0.26670098988034485
  (1, 1149)	0.2777912813009463
  (2, 7405)	0.32048505178341835
  :	:
  (4457, 1667)	0.3562149127625101
  (4457, 5440)	0.3562149127625101
  (4457, 2387)	0.3562149127625101
  (4457, 1505)	0.3562149127625101
  (4457, 1365)	0.3562149127625101
  (4458, 6713)	0.1636793265632707
  (

Training Model

Logistic Regression

In [26]:
lg_model=LogisticRegression()

In [27]:
#training the Logistic Regression model with the training data
lg_model.fit(X_train_features,Y_train)

Evaluating the trained model

In [28]:
#prediction on training data
prediction_on_training_data=lg_model.predict(X_train_features)
accuracy_on_training_data=accuracy_score(Y_train,prediction_on_training_data)

In [29]:
print('Accuracy on training data:',accuracy_on_training_data)

Accuracy on training data: 0.9663677130044843


In [30]:
#prediction on test data
prediction_on_test_data=lg_model.predict(X_test_features)
accuracy_on_test_data=accuracy_score(Y_test,prediction_on_test_data)

In [31]:
print('Accuracy on test data:',accuracy_on_test_data)
#the reason for checking accuracy on both training and test data is to prevent any conditions that lead to overfitting

Accuracy on test data: 0.9650224215246637


In [32]:
print("\nClassification Report:\n",classification_report(Y_test,prediction_on_test_data))


Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.73      0.84       139
           1       0.96      1.00      0.98       976

    accuracy                           0.97      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.97      0.97      0.96      1115



Building a Predictive System

In [33]:
input_mail=["I will not attend the class, today. Please give proxy for me in the class."]
#convert text to feature vectors
input_data_features = feature_extraction.transform(input_mail)

#making predictions
prediction = lg_model.predict(input_data_features)
print(prediction)

[1]


In [34]:
if prediction[0]==1:
    print('Ham mail')
else:
    print('Spam mail')

Ham mail


Random Forest

In [35]:
rf_model=RandomForestClassifier(n_estimators=100)

In [36]:
#training the random forest classifier with training data
rf_model.fit(X_train_features,Y_train)

Evaluating the trained model

In [37]:
#prediction on training data
prediction_on_training_data=rf_model.predict(X_train_features)
accuracy_on_training_data=accuracy_score(Y_train,prediction_on_training_data)

In [38]:
print('Accuracy on training data:',accuracy_on_training_data)

Accuracy on training data: 1.0


In [39]:
#prediction on test data
prediction_on_test_data=rf_model.predict(X_test_features)
accuracy_on_test_data=accuracy_score(Y_test,prediction_on_test_data)

In [40]:
print('Accuracy on test data:',accuracy_on_test_data)
#the reason for checking accuracy on both training and test data is to prevent any conditions that lead to overfittingA

Accuracy on test data: 0.9865470852017937


In [41]:
print("\nClassification Report:\n",classification_report(Y_test,prediction_on_test_data))


Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.89      0.94       139
           1       0.98      1.00      0.99       976

    accuracy                           0.99      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg       0.99      0.99      0.99      1115



Building a Predictive System

In [42]:
input_mail=["Click on this link.You will get 10,000 rupees after you share this message with 3 people."]
#convert text to feature vectors
input_data_features = feature_extraction.transform(input_mail)

#making predictions
prediction = rf_model.predict(input_data_features)
print(prediction)

[0]


In [43]:
if prediction[0]==1:
    print('Ham mail')
else:
    print('Spam mail')

Spam mail


Decision Tree

In [44]:
dt_model=DecisionTreeClassifier()

In [45]:
#training the decision tree with training data
dt_model.fit(X_train_features,Y_train)

Evaluating the trained model

In [46]:
#prediction on training data
prediction_on_training_data=dt_model.predict(X_train_features)
accuracy_on_training_data=accuracy_score(Y_train,prediction_on_training_data)

In [47]:
print('Accuracy on training data:',accuracy_on_training_data)

Accuracy on training data: 1.0


In [48]:
#prediction on test data
prediction_on_test_data=dt_model.predict(X_test_features)
accuracy_on_test_data=accuracy_score(Y_test,prediction_on_test_data)

In [49]:
print('Accuracy on test data:',accuracy_on_test_data)
#the reason for checking accuracy on both training and test data is to prevent any conditions that lead to overfitting

Accuracy on test data: 0.9596412556053812


In [50]:
print("\nClassification Report:\n",classification_report(Y_test,prediction_on_test_data))


Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.80      0.83       139
           1       0.97      0.98      0.98       976

    accuracy                           0.96      1115
   macro avg       0.92      0.89      0.90      1115
weighted avg       0.96      0.96      0.96      1115



Building a predictive system

In [51]:
input_mail=["I call you later, don't have network. If urgnt, sms me."]
#convert text to feature vectors
input_data_features = feature_extraction.transform(input_mail)

#making predictions
prediction = dt_model.predict(input_data_features)
print(prediction)

[1]


In [52]:
if prediction[0]==1:
    print('Ham mail')
else:
    print('Spam mail')

Ham mail


Support Vector Machine

In [53]:
svm_model=SVC(kernel='linear')

In [54]:
svm_model.fit(X_train_features,Y_train)

Evaluating the trained model

In [55]:
#prediction on training data
prediction_on_training_data=svm_model.predict(X_train_features)
accuracy_on_training_data=accuracy_score(Y_train,prediction_on_training_data)

In [56]:
print('Accuracy on training data:',accuracy_on_training_data)

Accuracy on training data: 0.9964125560538116


In [57]:
#prediction on test data
prediction_on_test_data=svm_model.predict(X_test_features)
accuracy_on_test_data=accuracy_score(Y_test,prediction_on_test_data)

In [58]:
print('Accuracy on test data:',accuracy_on_test_data)
#the reason for checking accuracy on both training and test data is to prevent any conditions that lead to overfitting

Accuracy on test data: 0.9919282511210762


In [59]:
print("\nClassification Report:\n",classification_report(Y_test,prediction_on_test_data))


Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.94      0.97       139
           1       0.99      1.00      1.00       976

    accuracy                           0.99      1115
   macro avg       0.99      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115



Building a predictive system

In [60]:
input_mail=["I call you later, don't have network. If urgnt, sms me."]
#convert text to feature vectors
input_data_features = feature_extraction.transform(input_mail)

#making predictions
prediction = svm_model.predict(input_data_features)
print(prediction)

[1]


In [61]:
if prediction[0]==1:
    print('Ham mail')
else:
    print('Spam mail')

Ham mail


In [62]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Decision Tree": DecisionTreeClassifier(),
    "SVM": SVC(kernel='linear')
}

test_accuracies = {}

def train_and_evaluate_model(model, model_name):
    model.fit(X_train_features, Y_train)
    train_acc = accuracy_score(Y_train, model.predict(X_train_features))
    test_acc = accuracy_score(Y_test, model.predict(X_test_features))
    print(f"{model_name} - Accuracy on training data: {train_acc}")
    print(f"{model_name} - Accuracy on test data: {test_acc}")
    print(f"\n{model_name} - Classification Report:\n", classification_report(Y_test, model.predict(X_test_features)))
    test_accuracies[model_name] = test_acc
    return model

trained_models = {}
for name, clf in models.items():
    print(f"\nTraining {name}...\n")
    trained_models[name] = train_and_evaluate_model(clf, name)

# Select the best model based on test accuracy
best_model_name = max(test_accuracies, key=test_accuracies.get)
best_model = trained_models[best_model_name]

print(f"\nThe best model is: {best_model_name} with test accuracy: {test_accuracies[best_model_name]:.2f}")

# Function to predict new data using the best model
def predict_with_best_model(input_mail):
    input_data_features = feature_extraction.transform(input_mail)
    prediction = best_model.predict(input_data_features)
    return "Ham mail" if prediction[0] == 1 else "Spam mail"

# Example prediction
input_mail = ["Congratulations! You've won a $1,000 gift card. Click here to claim your prize."]
prediction_result = predict_with_best_model(input_mail)
print(f"\nPrediction for the input mail: {prediction_result}")



Training Logistic Regression...

Logistic Regression - Accuracy on training data: 0.9663677130044843
Logistic Regression - Accuracy on test data: 0.9650224215246637

Logistic Regression - Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.73      0.84       139
           1       0.96      1.00      0.98       976

    accuracy                           0.97      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.97      0.97      0.96      1115


Training Random Forest...

Random Forest - Accuracy on training data: 1.0
Random Forest - Accuracy on test data: 0.9874439461883409

Random Forest - Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.90      0.95       139
           1       0.99      1.00      0.99       976

    accuracy                           0.99      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg  