In [39]:
import pandas as pd
import matplotlib as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [40]:
# Path to the labelled mail corpus, relative to the notebook's working directory.
data_path = "./data/mail_data.csv"

# Read the whole CSV into a DataFrame; later cells use its "Category" and
# "Message" columns.
df = pd.read_csv(filepath_or_buffer=data_path)

# Handy one-liners for a first look at the data (uncomment to run):
#   df.shape                    -> number of rows and columns
#   df.columns                  -> column names
#   df.describe()               -> summary statistics
#   df.info()                   -> general information about the frame
#   df.Category.value_counts()  -> number of rows per label

In [41]:
# Label encoding: 'spam' becomes 0 and 'ham' becomes 1.

# Replace every missing cell with an empty string before touching the labels.
df = df.where(df.notna(), '')

# Overwrite the textual labels in place, one class at a time.
for label_text, label_code in (('spam', 0), ('ham', 1)):
    df.loc[df['Category'] == label_text, 'Category'] = label_code

# Features are the raw message texts; targets are the encoded labels.
X, Y = df["Message"], df["Category"]

In [42]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

# Cast both label series to integers for the estimators and metrics below.
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

# TF-IDF text features: lower-case the text and drop English stop words.
# The vocabulary is fitted on the training messages only and then reused,
# unchanged, to transform the held-out messages.
feature_extraction = TfidfVectorizer(lowercase=True, stop_words='english', min_df=1)
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [43]:
# Logistic regression with scikit-learn's default settings, trained on the
# TF-IDF features of the training split.
model = LogisticRegression()
model.fit(X=X_train_features, y=Y_train)

In [44]:
# Score `model` on the same messages it was fitted on (training accuracy).
prediction_on_training_data = model.predict(X=X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)

In [45]:
# Show the current value of `accuracy_on_training_data` (a fraction between 0 and 1).
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.9685887368184878


In [46]:
# Score `model` on the held-out messages (test accuracy).
prediction_on_test_data = model.predict(X=X_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)

In [47]:
# Show the current value of `accuracy_on_test_data` (a fraction between 0 and 1).
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.9533632286995516


In [48]:
# Decision tree
from sklearn.tree import DecisionTreeClassifier

# Seed the estimator so a fresh "Restart & Run All" rebuilds the same tree and
# prints the same scores. Without a fixed random_state, scikit-learn permutes
# the candidate features randomly at each split, so equally good splits can be
# chosen differently from run to run. The value 2 mirrors the seed used for
# the train/test split.
dt = DecisionTreeClassifier(random_state=2)
dt.fit(X_train_features, Y_train)

# Accuracy on the training split (messages the tree was fitted on).
prediction_on_training_data = dt.predict(X_train_features)
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)
print('Accuracy on training data DT : ', accuracy_on_training_data)

# Accuracy on the held-out test split.
prediction_on_test_data = dt.predict(X_test_features)
accuracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data)
print('Accuracy on test data DT: ', accuracy_on_test_data)

Accuracy on training data DT :  1.0
Accuracy on test data DT:  0.9730941704035875


In [49]:
# k-nearest neighbours with scikit-learn's default settings.
from sklearn.neighbors import KNeighborsClassifier

# fit() returns the estimator itself, so construction and training can be chained.
knn = KNeighborsClassifier().fit(X_train_features, Y_train)

# Predict both splits first, then score them.
prediction_on_training_data = knn.predict(X_train_features)
prediction_on_test_data = knn.predict(X_test_features)
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)
accuracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data)

# Report training accuracy first, then test accuracy.
print('Accuracy on training data KNN : ', accuracy_on_training_data)
print('Accuracy on test data KNN: ', accuracy_on_test_data)

Accuracy on training data KNN :  0.9203500112183083
Accuracy on test data KNN:  0.8986547085201794


In [50]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# Seed the ensemble so a fresh "Restart & Run All" reproduces the same forest
# and the same scores. An unseeded forest draws different bootstrap samples
# and feature subsets on every run. The value 2 mirrors the seed used for the
# train/test split.
forest = RandomForestClassifier(random_state=2)
forest.fit(X_train_features, Y_train)

# Accuracy on the training split (messages the forest was fitted on).
prediction_on_training_data = forest.predict(X_train_features)
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)
print('Accuracy on training data forest : ', accuracy_on_training_data)

# Accuracy on the held-out test split.
prediction_on_test_data = forest.predict(X_test_features)
accuracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data)
print('Accuracy on test data forest: ', accuracy_on_test_data)

Accuracy on training data forest :  1.0
Accuracy on test data forest:  0.9766816143497757


In [51]:
# Support vector machine classifier with scikit-learn's default settings.
from sklearn.svm import SVC

# fit() returns the estimator itself, so construction and training can be chained.
svc = SVC().fit(X_train_features, Y_train)

# Predict both splits first, then score them.
prediction_on_training_data = svc.predict(X_train_features)
prediction_on_test_data = svc.predict(X_test_features)
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)
accuracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data)

# Report training accuracy first, then test accuracy.
print('Accuracy on training data SVC : ', accuracy_on_training_data)
print('Accuracy on test data SVC: ', accuracy_on_test_data)

Accuracy on training data SVC :  0.9984294368409243
Accuracy on test data SVC:  0.9730941704035875


In [52]:
# Multinomial Naive Bayes with scikit-learn's default settings.
from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so construction and training can be chained.
nb = MultinomialNB().fit(X_train_features, Y_train)

# Predict both splits first, then score them.
prediction_on_training_data = nb.predict(X_train_features)
prediction_on_test_data = nb.predict(X_test_features)
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)
accuracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data)

# Report training accuracy first, then test accuracy.
print('Accuracy on training data NB : ', accuracy_on_training_data)
print('Accuracy on test data NB: ', accuracy_on_test_data)

Accuracy on training data NB :  0.9838456360780794
Accuracy on test data NB:  0.9614349775784753


In [53]:
# One free-text message to classify, wrapped in a list because the vectorizer
# is called with a collection of documents.
input_mail = ["I hope this email finds you well. My name is Anisur Rahman, and I am writing from Bangladesh. I received my I-20 on MS in Artificial intelligence in August (APP-00102794).I noticed that the status in my application portal is still showing as 'initial state,' whereas I believe it should now be in the 'delivery document' stage. I am concerned that there might be an issue. I have attached a screenshot of my portal for your reference."]

# Turn the text into TF-IDF features with the already-fitted vectorizer.
input_data_features = feature_extraction.transform(input_mail)

# Classify the message with the Naive Bayes model and show the raw label array.
prediction = nb.predict(input_data_features)
print(prediction)

# During label encoding 'ham' was mapped to 1; any other predicted value is
# reported as spam.
print('Ham mail' if prediction[0] == 1 else 'Spam mail')


[1]
Ham mail
