In [29]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [30]:
# Load the mail dataset from 'mail_data.csv' (relative path, resolved against the working directory)
mail_data = pd.read_csv('mail_data.csv')

In [31]:
# Check whether the Message column contains any missing values:
# build a boolean mask (True = missing) and tally how often each value occurs.
number_of_null_values = mail_data['Message'].isna()
number_of_null_values.value_counts()

False    5572
Name: Message, dtype: int64

In [33]:
# Encode the labels numerically: ham -> 1, spam -> 0.
# The result is assigned back to the column rather than calling
# `mail_data.Category.replace(..., inplace=True)`: an in-place method on a
# column selected from the frame is chained assignment, which pandas
# deprecates and, under copy-on-write, no longer propagates to `mail_data`.
# Values that are already 0/1 are left untouched, so re-running this cell is safe.
mail_data['Category'] = mail_data['Category'].replace({'ham': 1, 'spam': 0})

In [34]:
# Preview the first rows to confirm that Category now holds the numeric labels
mail_data.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [43]:
# Features are the raw message texts; targets are the numeric labels
# (1 = ham, 0 = spam) cast to the platform's default integer dtype.
X = mail_data['Message']
Y = mail_data['Category'].astype(int)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: int32

In [44]:
# Hold out 20% of the messages for testing; random_state pins the shuffle so the split is reproducible
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

In [45]:
# Turn the raw message text into TF-IDF feature vectors the model can consume.
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)
# Fit the vectorizer on the training split only, then apply that same fitted
# vectorizer to the test split (transform only, no refitting on test data).
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [60]:
# Print the TF-IDF matrix of the test split; each entry is shown as (row, column) followed by its weight
print(X_test_features)

  (0, 6619)	0.33077540807715927
  (0, 4752)	0.44421921026428457
  (0, 2494)	0.359541012283057
  (0, 2313)	0.37081499071603014
  (0, 2110)	0.2538341210056606
  (0, 1623)	0.47755798461662824
  (0, 1153)	0.3660464944955722
  (1, 4140)	0.7724156535136
  (1, 3802)	0.40629294786687964
  (1, 3352)	0.4881599110135932
  (2, 3179)	0.3405136304031059
  (2, 3169)	0.9402395798463798
  (3, 6670)	0.4948874540031021
  (3, 6543)	0.5505088255084791
  (3, 2900)	0.6723291165103608
  (4, 7417)	0.4582086641273852
  (4, 6613)	0.6612385994559425
  (4, 5583)	0.3946308162640678
  (4, 1764)	0.443931136059295
  (5, 7144)	0.2525030795568811
  (5, 6017)	0.3435042181615311
  (5, 5522)	0.37192637792006283
  (5, 4761)	0.3253891605505013
  (5, 4161)	0.4423344697815598
  (5, 4048)	0.23654956954038084
  :	:
  (1111, 5132)	0.4888630580390552
  (1111, 5071)	0.3867437918860694
  (1111, 4094)	0.24494882973980492
  (1111, 3138)	0.24402169398619392
  (1111, 3084)	0.24749503861730665
  (1111, 1031)	0.4888630580390552
  (1112, 7

In [51]:
# Fit a logistic-regression classifier (library-default hyperparameters)
# on the TF-IDF features and labels of the training split
model = LogisticRegression()
model.fit(X_train_features, Y_train)

In [56]:
# Accuracy on the training split: how well the fitted model reproduces
# the labels it was trained on.
training_data_prediction = model.predict(X_train_features)
training_data_prediction_accuracy = accuracy_score(
    y_true=Y_train,
    y_pred=training_data_prediction,
)
training_data_prediction_accuracy

0.9683643706529056

In [58]:
# Accuracy on the held-out test split: messages the model did not see
# during fitting.
testing_data_prediction = model.predict(X_test_features)
testing_data_prediction_accuracy = accuracy_score(
    y_true=Y_test,
    y_pred=testing_data_prediction,
)
testing_data_prediction_accuracy

0.9524663677130045

In [69]:
def spam_or_not(input_mail=None, vectorizer=None, classifier=None):
    """Classify a single message as spam or ham and print the verdict.

    Parameters
    ----------
    input_mail : str, optional
        Message text to classify. When omitted, the text is read
        interactively with ``input()``, which is what a bare
        ``spam_or_not()`` call does.
    vectorizer : optional
        Object whose ``transform`` turns a list of texts into feature
        vectors. Defaults to the notebook-level ``feature_extraction``.
    classifier : optional
        Object whose ``predict`` returns one label per feature vector.
        Defaults to the notebook-level ``model``.

    Returns
    -------
    None
        The verdict is printed: "Spam Email" when the predicted label is 0,
        "Ham Email" otherwise.
    """
    if input_mail is None:
        input_mail = input("Enter a mail here: ")

    # Fall back to the objects fitted earlier in the notebook so that the
    # zero-argument call keeps working exactly as before.
    if vectorizer is None:
        vectorizer = feature_extraction
    if classifier is None:
        classifier = model

    # transform() takes a collection of documents, hence the one-element list.
    input_mail_features = vectorizer.transform([input_mail])
    input_mail_prediction = classifier.predict(input_mail_features)

    print("\nThe Entered Mail is : ")
    # Label encoding used when training: 0 = spam, anything else = ham.
    if input_mail_prediction[0] == 0:
        print("Spam Email")
    else:
        print("Ham Email")
    

In [70]:
# Run the interactive check; with no arguments this prompts for the message text
spam_or_not()

Enter a mail here: SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info

The Entered Mail is : 
Spam Email
