In [29]:
import re

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud

In [7]:
# Read the labelled messages from the project-relative CSV and preview the frame.
DATA_PATH = "data/emailSpam.csv"
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [8]:
# Encode the label as an integer target: 1 where Category is exactly 'spam',
# 0 for every other value. The comparison is done column-wise and the result
# is cast to int64 so the dtype is the same on every platform.
df['spam'] = df['Category'].eq('spam').astype('int64')
df

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will ü b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


In [12]:
# Narrow the frame to the message text and its numeric label, then look at
# five randomly drawn rows (unseeded, so the preview differs between runs).
df = df.loc[:, ['Message', 'spam']]
df.sample(5)

Unnamed: 0,Message,spam
5155,MY NEW YEARS EVE WAS OK. I WENT TO A PARTY WIT...,0
3305,IM GONNAMISSU SO MUCH!!I WOULD SAY IL SEND U A...,0
27,Did you catch the bus ? Are you frying an egg ...,0
1773,Good afternoon sexy buns! How goes the job sea...,0
4676,"Hi babe its Chloe, how r u? I was smashed on s...",1


In [13]:
# (rows, columns) of the reduced frame.
df.shape

(5572, 2)

In [None]:
# remove stop words for word clouds

In [15]:
# stopW = stopwords.words('english')
# df['test_without_sw'] = df['Message'].apply(lambda x: ' '.join([item for item in x.split() if item not in stopW]))
# df.sample()

Unnamed: 0,Message,spam,test_without_sw
3768,"Sir Goodmorning, Once free call me.",0,"Sir Goodmorning, Once free call me."


In [24]:
# Feature column: the raw message text as a pandas Series.
# NOTE: `X` is rebound to the dense TF-IDF matrix further down, so it only
# holds the raw text until the vectorizer cell has run.
X = df["Message"]
# type(X)
X.sample(5)

3368              Hey what are you doing. Y no reply pa..
3731    I guess you could be as good an excuse as any,...
1362    Shuhui say change 2 suntec steamboat? U noe wh...
334     Any chance you might have had with me evaporat...
4782    Yup... Hey then one day on fri we can ask miwa...
Name: Message, dtype: object

In [25]:
# Target column: the integer spam flag (1 = spam, 0 = ham) derived from Category.
y = df["spam"]
y.sample(5)

4452    0
2022    0
3715    0
2604    0
64      0
Name: spam, dtype: int64

In [None]:
# clean text before feeding into model

In [27]:
def clean_text(doc):
    """Reduce a raw message to its alphabetic words of two or more letters.

    Every character that is not an ASCII letter (digits, punctuation,
    accented characters, ...) is treated as a separator. Words consisting of
    a single letter are dropped wherever they occur, including at the very
    start or end of the message and when several of them appear in a row.
    Letter case is left untouched.

    Args:
        doc: The raw message text (a ``str``).

    Returns:
        The remaining words joined by single spaces, with no leading or
        trailing whitespace. An empty string if nothing remains.
    """
    # Replace everything that is not a-z / A-Z with a space so it acts as a
    # word separator.
    letters_only = re.sub('[^a-zA-Z]', ' ', doc)
    # split() without arguments collapses runs of whitespace and ignores
    # leading/trailing whitespace, so no further regex passes are needed.
    # Filtering on token length removes *all* isolated single letters; a
    # whitespace-delimited regex would miss those at the string edges and
    # every second letter in a run such as "a b c d".
    words = [word for word in letters_only.split() if len(word) > 1]
    return ' '.join(words)

In [30]:
# Clean every message before vectorisation.
# The raw text is read straight from df["Message"] instead of through `X`:
# `X` is rebound to the TF-IDF matrix by the vectorizer cell, so iterating
# `X` here would fail if this cell were re-run after that one.
reviews = list(df["Message"])
X_sentences = [clean_text(rev) for rev in reviews]
    

In [31]:
# Build TF-IDF features from the cleaned messages.
#   max_features=2500 -> keep at most the 2500 highest-frequency terms
#   min_df=5          -> a term must occur in at least 5 messages
#   max_df=0.7        -> ...and in no more than 70% of all messages
#   stop_words        -> NLTK's English stop-word list is excluded
vectorizer = TfidfVectorizer(
    max_features=2500,
    min_df=5,
    max_df=0.7,
    stop_words=stopwords.words('english'),
)
# fit_transform returns a sparse matrix; it is densified here and `X` is
# rebound from the raw text Series to this numeric array.
X = vectorizer.fit_transform(X_sentences).toarray()

In [32]:
# Display the dense TF-IDF feature matrix (one row per message).
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [37]:
# Hold out 20% of the samples for evaluation; the fixed random_state makes
# the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [54]:
# Fit a logistic regression with default hyper-parameters on the training split.
spam_detector = LogisticRegression()
spam_detector.fit(X_train, y_train)
# fit() returns the estimator itself, so LR is simply a second name for the
# same fitted model object.
LR = spam_detector  # frozen

In [39]:
# Predict hard class labels (0 = ham, 1 = spam) for the held-out test rows.
y_preds = spam_detector.predict(X_test) 

In [58]:
# y_preds[:10]

In [None]:
# single record

In [45]:
# Show the cleaned text at position 56, used below as a single-record check.
X_sentences[56]
# y[56]

'Congrats year special cinema pass for is yours call now Suprman Matrix StarWars etc all FREE bx ip we pm Dont miss out '

In [46]:
# Vectorise the single cleaned message with the already-fitted vectorizer
# (transform only, no refit) and print the model's predicted label for it.
single_features = vectorizer.transform([X_sentences[56]])
print(spam_detector.predict(single_features))

[1]


In [48]:
# Label stored under index 56, to compare with the prediction above.
# NOTE(review): this is a label-based lookup; it lines up with X_sentences[56]
# only while the frame keeps its default 0..n-1 index — confirm if rows are ever dropped.
y[56]

1

In [59]:
# First ten predicted labels for the test split.
y_preds[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [56]:
# Per-class probabilities for the test split (LR is the fitted model returned
# by fit()); show the first ten rows.
y_preds_prob = LR.predict_proba(X_test)
y_preds_prob[:10]

array([[0.94198537, 0.05801463],
       [0.89600956, 0.10399044],
       [0.96775425, 0.03224575],
       [0.98168964, 0.01831036],
       [0.8800352 , 0.1199648 ],
       [0.96480329, 0.03519671],
       [0.97382081, 0.02617919],
       [0.9806947 , 0.0193053 ],
       [0.98023484, 0.01976516],
       [0.95405435, 0.04594565]])

In [None]:
# Interpretation: column 1 is the probability that the observation belongs to class 0 (ham); column 2 is the probability that it belongs to class 1 (spam).

In [63]:
# Number of learned coefficients in the (single) coefficient row, i.e. one
# weight per TF-IDF feature kept by the vectorizer.
LR.coef_.shape[1]

1624

In [64]:
from sklearn.metrics import log_loss

In [65]:
# Log loss (cross-entropy) of the predicted probabilities against the true test labels.
log_loss(y_test, y_preds_prob)

0.11239266482800558

In [None]:
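# A low log loss means the class probabilities predicted by the logistic regression
# are close to the true labels on the test set, i.e. the model assigns high probability
# to the correct class. It measures probability quality, not accuracy directly.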