In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Load the mail dataset from the CSV file (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='mail_data.csv')

In [3]:
# Bare expression on the last line of the cell: the notebook renders the loaded frame as the cell output.
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
# Keep every non-missing cell as it is and put an empty string wherever a
# value is missing, so the resulting frame contains no NaN entries.
data = df.where(df.notna(), other='')

In [5]:
# Preview the first five rows of the null-free frame.
data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Encode the text labels as integers for the classifier: spam -> 0, ham -> 1.
# The whole column is replaced by a mapped Series instead of writing integers
# into the text column in place with .loc, so the cell does not rely on that
# column accepting mixed str/int values. When every row is 'spam' or 'ham'
# the result is a plain integer column.
# Any value that is not a known label is passed through unchanged, which also
# makes re-running this cell on already-encoded data a no-op.
label_codes = {'spam': 0, 'ham': 1}
data['Category'] = data['Category'].map(lambda label: label_codes.get(label, label))

In [7]:
# Separate the model input from the target:
#   X - the message text, Y - the category label of each message.
X, Y = data['Message'], data['Category']

In [8]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# random_state pins the shuffle so the same split is produced on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [9]:
# Shapes of the full message Series, then of its training and test parts,
# printed one per line.
print(X.shape, X_train.shape, X_test.shape, sep='\n')

(5572,)
(4457,)
(1115,)


In [10]:
# Shapes of the full label Series, then of its training and test parts,
# printed one per line.
print(Y.shape, Y_train.shape, Y_test.shape, sep='\n')

(5572,)
(4457,)
(1115,)


In [11]:
# Turn the message text into TF-IDF feature vectors that logistic regression
# can take as input. English stop words are dropped, text is lower-cased, and
# min_df=1 keeps every term that appears in at least one message.
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

# The vectorizer is fitted on the training messages only; the test messages
# are then transformed with that already-fitted vectorizer.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Cast both label Series to integers.
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

In [12]:
# Logistic-regression classifier created with scikit-learn's default settings (no arguments passed).
model  = LogisticRegression()

In [13]:
# Fit the classifier on the TF-IDF training features and their integer labels.
model.fit(X=X_train_features, y=Y_train)

In [14]:
# Predict on the same features the model was trained on and measure how
# often those predictions match the training labels.
prediction_on_training_data = model.predict(X=X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)

In [15]:
# Report the training-set accuracy followed by the predicted labels,
# one "name value" line each.
for name, value in (
    ('accuracy_on_training_data', accuracy_on_training_data),
    ('prediction_on_training_data', prediction_on_training_data),
):
    print(name, value)

accuracy_on_training_data 0.9670181736594121
prediction_on_training_data [1 1 1 ... 1 1 0]


In [16]:
# Predict on the held-out test features and measure how often those
# predictions match the test labels.
prediction_on_test_data = model.predict(X=X_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)

In [17]:
# Report the test-set accuracy followed by the predicted labels,
# one "name value" line each.
for name, value in (
    ('accuracy_on_test_data', accuracy_on_test_data),
    ('prediction_on_test_data', prediction_on_test_data),
):
    print(name, value)

accuracy_on_test_data 0.9659192825112107
prediction_on_test_data [0 1 1 ... 1 1 1]


In [18]:
# Classify one hand-written message. It goes through the already-fitted
# vectorizer (transform only, no refit) before being passed to the model.
Input_mail = ["hello i am yash"]
Input_mail_features = feature_extraction.transform(Input_mail)

prediction = model.predict(Input_mail_features)
print(prediction)

# Translate the numeric label back to text: 0 is spam, 1 is ham.
if prediction[0] == 0:
    print('Spam mail')
elif prediction[0] == 1:
    print('Ham mail')

[1]
Ham mail
