In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [3]:
# loading the data from csv file to a pandas Dataframe
df = pd.read_csv('sms_spam.csv')
print(df)

      type                                               text
0      ham  Hope you are having a good week. Just checking in
1      ham                            K..give back my thanks.
2      ham        Am also doing in cbe only. But have to pay.
3     spam  complimentary 4 STAR Ibiza Holiday or £10,000 ...
4     spam  okmail: Dear Dave this is your final notice to...
...    ...                                                ...
5554   ham  You are a great role model. You are giving so ...
5555   ham  Awesome, I remember the last time we got someb...
5556  spam  If you don't, your prize will go to another cu...
5557  spam  SMS. ac JSco: Energy is high, but u may not kn...
5558   ham                    Shall call now dear having food

[5559 rows x 2 columns]


In [4]:
# replace the null values with a null string
mail_data = df.where((pd.notnull(df)),'')

In [5]:
# checking the number of rows and columns in the dataframe
mail_data.shape

(5559, 2)

In [6]:
# label spam mail as 0;  ham mail as 1;

mail_data.loc[mail_data['type'] == 'spam', 'type',] = 0
mail_data.loc[mail_data['type'] == 'ham', 'type',] = 1

In [7]:
# separating the data as texts and type

X = mail_data['text']

Y = mail_data['type']

In [8]:
print(X)

0       Hope you are having a good week. Just checking in
1                                 K..give back my thanks.
2             Am also doing in cbe only. But have to pay.
3       complimentary 4 STAR Ibiza Holiday or £10,000 ...
4       okmail: Dear Dave this is your final notice to...
                              ...                        
5554    You are a great role model. You are giving so ...
5555    Awesome, I remember the last time we got someb...
5556    If you don't, your prize will go to another cu...
5557    SMS. ac JSco: Energy is high, but u may not kn...
5558                      Shall call now dear having food
Name: text, Length: 5559, dtype: object


In [9]:
print(Y)

0       1
1       1
2       1
3       0
4       0
       ..
5554    1
5555    1
5556    0
5557    0
5558    1
Name: type, Length: 5559, dtype: object


In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=3)

In [11]:
print(X.shape)
print(X_train.shape)
print(X_test.shape)

(5559,)
(4447,)
(1112,)


In [12]:
# transform the text data to feature vectors that can be used as input to the Logistic regression 
# (extract words like and, the, are)

feature_extraction = TfidfVectorizer(min_df = 1, stop_words='english', lowercase=True)

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# convert Y_train and Y_test values as integers

Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [13]:
print(X_train)

4707    I cant pick the phone right now. Pls send a me...
4401    Save yourself the stress. If the person has a ...
5045                Check wid corect speling i.e. Sarcasm
3250    Dude just saw a parked car with its sunroof po...
4529    Oh... Lk tt den we take e one tt ends at cine ...
                              ...                        
789     Dear Voucher Holder, 2 claim this weeks offer,...
968     She.s fine. I have had difficulties with her p...
1667    500 New Mobiles from 2004, MUST GO! Txt: NOKIA...
3321    Oh right, ok. I'll make sure that i do loads o...
1688    Well done ENGLAND! Get the official poly ringt...
Name: text, Length: 4447, dtype: object


In [14]:
print(X_train_features)

  (0, 4320)	0.41876685731019514
  (0, 5828)	0.3588355063720001
  (0, 5074)	0.4057231918631618
  (0, 5607)	0.42917851946659635
  (0, 5006)	0.39365513182244927
  (0, 5018)	0.43829388061889635
  (1, 5838)	0.23601934358493576
  (1, 4434)	0.24643388023162596
  (1, 2207)	0.2844207018665432
  (1, 3746)	0.16449859313194146
  (1, 779)	0.5412038976893805
  (1, 2346)	0.3836054479910387
  (1, 4982)	0.25388284481340484
  (1, 6323)	0.3657419467625696
  (1, 5744)	0.31734058848096225
  (1, 5828)	0.1913812193576672
  (2, 5729)	0.48135325881795915
  (2, 6177)	0.5048634265915348
  (2, 1954)	0.5048634265915348
  (2, 7258)	0.37432630555671054
  (2, 1692)	0.34409983750955614
  (3, 6443)	0.41244527413925053
  (3, 5132)	0.3981524695842464
  (3, 6411)	0.43258985073616385
  (3, 1593)	0.2863023518499901
  :	:
  (4445, 4750)	0.15248093717353542
  (4445, 4028)	0.15386651025596382
  (4445, 6717)	0.17406918537504995
  (4445, 2118)	0.16343452723506488
  (4445, 3094)	0.15956099390108566
  (4445, 4745)	0.19111225165694

In [15]:
model = LogisticRegression()

In [16]:
# training the Logistic Regression model with the training data

model.fit(X_train_features, Y_train)

LogisticRegression()

In [47]:
# prediction on training data

prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)

In [48]:
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.9667191364965145


In [50]:
# prediction on test data

prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data)

In [51]:
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.966726618705036


In [52]:
input_mail = ["Hope you are having a good week. Just checking in"]

# convert text to feature vectors
input_data_features = feature_extraction.transform(input_mail)

# making prediction

prediction = model.predict(input_data_features)
print(prediction)


if (prediction[0]==1):
  print('Ham mail')

else:
  print('Spam mail')

[1]
Ham mail


In [53]:
# Generate the confusion matrix
confusion_matrix_result = confusion_matrix(Y_test, prediction_on_test_data)
print("Confusion Matrix:")
print(confusion_matrix_result)

Confusion Matrix:
[[114  34]
 [  3 961]]


In [67]:
from sklearn.tree import DecisionTreeClassifier
model2 = DecisionTreeClassifier(criterion='entropy', random_state=0)
model2.fit(X_train_features, Y_train)

DecisionTreeClassifier(criterion='entropy', random_state=0)

In [68]:
prediction_on_test_data = model2.predict(X_test_features)
accuracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data)
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.9631294964028777


In [69]:
prediction = model2.predict(input_data_features)
print(prediction)


if (prediction[0]==1):
  print('Ham mail')

else:
  print('Spam mail')

[1]
Ham mail


In [70]:
from sklearn.svm import SVC
model4 = SVC(kernel='linear', random_state=0)
model4.fit(X_train_features, Y_train)

SVC(kernel='linear', random_state=0)

In [71]:
prediction_on_test_data = model4.predict(X_test_features)
accuracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data)
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.9838129496402878


In [72]:
prediction = model4.predict(input_data_features)
print(prediction)


if (prediction[0]==1):
  print('Ham mail')

else:
  print('Spam mail')

[1]
Ham mail


In [73]:
# Generate the confusion matrix
confusion_matrix_result = confusion_matrix(Y_test, prediction_on_test_data)
print("Confusion Matrix:")
print(confusion_matrix_result)

Confusion Matrix:
[[133  15]
 [  3 961]]


In [60]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

bayes = MultinomialNB(alpha=0.01)
bayes.fit(X_train_features, Y_train)

MultinomialNB(alpha=0.01)

In [61]:
prediction_on_test_data = bayes.predict(X_test_features)
accuracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data)
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.9829136690647482


In [62]:
prediction = bayes.predict(input_data_features)
print(prediction)


if (prediction[0]==1):
  print('Ham mail')

else:
  print('Spam mail')

[1]
Ham mail


In [63]:
# Generate the confusion matrix
confusion_matrix_result = confusion_matrix(Y_test, prediction_on_test_data)
print("Confusion Matrix:")
print(confusion_matrix_result)

Confusion Matrix:
[[135  13]
 [  6 958]]
