In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
#loading the dataset
df=pd.read_csv('mail_data.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.shape

(5572, 2)

In [4]:
df.isnull().sum()

Category    0
Message     0
dtype: int64

In [5]:
#Label encoding
df.loc[df['Category']=='spam','Category',]=0
df.loc[df['Category']=='ham','Category',]=1

In [6]:
x=df['Message']
y=df['Category']
print(x)
print(y)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object
0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


In [7]:
#splitting
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=2)
print(x_train.shape,x_test.shape)

(4457,) (1115,)


In [8]:
#Feature Extraction
feature_extract=TfidfVectorizer(min_df=1,stop_words='english')
x_train_feature=feature_extract.fit_transform(x_train)
x_test_feature=feature_extract.transform(x_test)
#convert y_train and y_test as integers
y_train=y_train.astype('int')
y_test=y_test.astype('int')


In [9]:
print(x_train)

3890                    Unlimited texts. Limited minutes.
5553                          Hahaha..use your brain dear
4366    Ujhhhhhhh computer shipped out with address to...
3968    YOU HAVE WON! As a valued Vodafone customer ou...
3771    Love it! The girls at the office may wonder wh...
                              ...                        
3335    That's fine, have him give me a call if he kno...
1099    NO GIFTS!! You trying to get me to throw mysel...
2514    U have won a nokia 6230 plus a free digital ca...
3606                      Jordan got voted out last nite!
2575    Your next amazing xxx PICSFREE1 video will be ...
Name: Message, Length: 4457, dtype: object


In [10]:
print(y_train)

3890    1
5553    1
4366    1
3968    0
3771    1
       ..
3335    1
1099    1
2514    0
3606    1
2575    0
Name: Category, Length: 4457, dtype: int32


In [11]:
print(x_train_feature)

  (0, 6927)	0.48935591439341625
  (0, 6586)	0.44333254982109394
  (0, 3958)	0.6161071828926097
  (0, 4334)	0.42941702167641554
  (1, 3168)	0.5869421390016224
  (1, 6971)	0.4281243465155688
  (1, 1428)	0.5869421390016224
  (1, 2121)	0.35736171430221464
  (2, 6878)	0.35749230587184955
  (2, 1876)	0.28751725124107325
  (2, 5894)	0.35749230587184955
  (2, 806)	0.26730249393705324
  (2, 5695)	0.35749230587184955
  (2, 4884)	0.35749230587184955
  (2, 3852)	0.3408491178137899
  (2, 7353)	0.31988118061968496
  (2, 5115)	0.3408491178137899
  (3, 1876)	0.3080768784015236
  (3, 7297)	0.22192369472149484
  (3, 7000)	0.30072945056088285
  (3, 7065)	0.32795623716393424
  (3, 2060)	0.24915048132454623
  (3, 5005)	0.3169028431039865
  (3, 7248)	0.23571908490908416
  (3, 300)	0.2915969875465198
  :	:
  (4454, 4627)	0.3831814754124698
  (4454, 311)	0.19547195974237946
  (4454, 5068)	0.22284357632450164
  (4454, 1094)	0.24862733340971144
  (4454, 1575)	0.20946314330145205
  (4454, 666)	0.2865366032423894

In [12]:
#Train the model
model=LogisticRegression()
model.fit(x_train_feature,y_train)

In [13]:
predict_train_data=model.predict(x_train_feature)
accuracy=accuracy_score(y_train,predict_train_data)
print(accuracy)

0.9685887368184878


In [14]:
predict_test_data=model.predict(x_test_feature)
accuracy_test=accuracy_score(y_test,predict_test_data)
print(accuracy_test)

0.9533632286995516


In [16]:
#predicting a new value
input_mail=["SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info"
]
input_data=feature_extract.transform(input_mail)
prediction=model.predict(input_data)
print(prediction)
if prediction[0]==0:
    print('Not spam')
else:
    print('Spam')

[0]
Not spam
