# Logistic regression model

Implement a basic logistic regression model to classify text messages as spam.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from scipy import stats as st
import re

In [2]:
# Location of the labelled text-message dataset, relative to the working directory.
SPAM_CSV = 'spam.csv'

# Read the raw data and print a summary of its columns, non-null counts and dtypes.
messages = pd.read_csv(SPAM_CSV)
messages.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [3]:
# Lower-case every message; the 'Message' column is overwritten with the result.
messages['Message'] = messages['Message'].map(str.lower)
messages

Unnamed: 0,Category,Message
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,this is the 2nd time we have tried 2 contact u...
5568,ham,will ü b going to esplanade fr home?
5569,ham,"pity, * was in mood for that. so...any other s..."
5570,ham,the guy did some bitching but i acted like i'd...


In [4]:
# Replace every run of non-word characters and underscores with a single space.
# The pattern is a raw string: inside a normal string literal `\W` is an invalid
# escape sequence, which Python warns about and will eventually reject.
messages['Message'] = messages['Message'].apply(lambda mes: re.sub(r'[\W_]+', ' ', mes))
messages

Unnamed: 0,Category,Message
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor u c already then say
4,ham,nah i don t think he goes to usf he lives arou...
...,...,...
5567,spam,this is the 2nd time we have tried 2 contact u...
5568,ham,will ü b going to esplanade fr home
5569,ham,pity was in mood for that so any other suggest...
5570,ham,the guy did some bitching but i acted like i d...


In [5]:
# Tokenize: turn each message string into a list of whitespace-separated words.
messages['Message'] = messages['Message'].map(str.split)

In [6]:
# Show the frame now that each message is a list of whitespace-separated tokens.
messages

Unnamed: 0,Category,Message
0,ham,"[go, until, jurong, point, crazy, available, o..."
1,ham,"[ok, lar, joking, wif, u, oni]"
2,spam,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,ham,"[u, dun, say, so, early, hor, u, c, already, t..."
4,ham,"[nah, i, don, t, think, he, goes, to, usf, he,..."
...,...,...
5567,spam,"[this, is, the, 2nd, time, we, have, tried, 2,..."
5568,ham,"[will, ü, b, going, to, esplanade, fr, home]"
5569,ham,"[pity, was, in, mood, for, that, so, any, othe..."
5570,ham,"[the, guy, did, some, bitching, but, i, acted,..."


In [11]:
import nltk
from nltk.corpus import stopwords

In [12]:
# Build the set of English stop words used to filter tokens below.
# The NLTK corpora are not bundled with the package, so fetch the stop-word list on
# first use instead of failing with LookupError on a fresh environment.
try:
    stopwords_set = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stopwords_set = set(stopwords.words('english'))

In [13]:
def _without_stopwords(tokens):
    """Return the tokens that are not in ``stopwords_set``, in their original order."""
    return [token for token in tokens if token not in stopwords_set]


messages['Message'] = messages['Message'].apply(_without_stopwords)

In [14]:
# Show the frame after stop-word removal.
messages

Unnamed: 0,Category,Message
0,ham,"[go, jurong, point, crazy, available, bugis, n..."
1,ham,"[ok, lar, joking, wif, u, oni]"
2,spam,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,ham,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"[nah, think, goes, usf, lives, around, though]"
...,...,...
5567,spam,"[2nd, time, tried, 2, contact, u, u, 750, poun..."
5568,ham,"[ü, b, going, esplanade, fr, home]"
5569,ham,"[pity, mood, suggestions]"
5570,ham,"[guy, bitching, acted, like, interested, buyin..."


In [19]:
from nltk.stem import WordNetLemmatizer

In [20]:
wordnet_lemmatizer = WordNetLemmatizer()

# The WordNet corpus is only looked up when lemmatize() is first called. Probe once and
# fetch the corpus if it is missing, so this cell also works on a fresh environment.
# NOTE(review): some NLTK releases additionally require the 'omw-1.4' resource — confirm
# against the installed version.
try:
    wordnet_lemmatizer.lemmatize('probe')
except LookupError:
    nltk.download('wordnet')

# Lemmatize every token with lemmatize()'s default part-of-speech setting, keeping order.
messages['Message'] = messages['Message'].apply(
    lambda tokens: [wordnet_lemmatizer.lemmatize(token) for token in tokens]
)

In [21]:
# Show the frame after lemmatization.
messages

Unnamed: 0,Category,Message
0,ham,"[go, jurong, point, crazy, available, bugis, n..."
1,ham,"[ok, lar, joking, wif, u, oni]"
2,spam,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,ham,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"[nah, think, go, usf, life, around, though]"
...,...,...
5567,spam,"[2nd, time, tried, 2, contact, u, u, 750, poun..."
5568,ham,"[ü, b, going, esplanade, fr, home]"
5569,ham,"[pity, mood, suggestion]"
5570,ham,"[guy, bitching, acted, like, interested, buyin..."


In [22]:
# Glue each token list back into one space-separated string for the vectorizer.
messages['Message'] = messages['Message'].map(' '.join)

In [23]:
# Show the frame with the tokens joined back into space-separated strings.
messages

Unnamed: 0,Category,Message
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor u c already say
4,ham,nah think go usf life around though
...,...,...
5567,spam,2nd time tried 2 contact u u 750 pound prize 2...
5568,ham,ü b going esplanade fr home
5569,ham,pity mood suggestion
5570,ham,guy bitching acted like interested buying some...


In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
# Turn the cleaned messages into TF-IDF features: one row per message, one column per term.
tfidf = TfidfVectorizer()
tfidf_sparse = tfidf.fit_transform(messages.Message)

# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# and fall back to the old name only on releases that predate it.
if hasattr(tfidf, 'get_feature_names_out'):
    names = tfidf.get_feature_names_out()
else:
    names = tfidf.get_feature_names()

# Densify into a DataFrame labelled by term. The explicit index keeps the rows aligned
# with `messages` for the later join and train/test split.
# NOTE: toarray() materialises the full dense matrix, which is memory-hungry for large corpora.
tfidf_matrix = pd.DataFrame(tfidf_sparse.toarray(), columns=names, index=messages.index)

In [26]:
# Spot-check one feature: the TF-IDF weight of the term 'go' in every message.
tfidf_matrix['go']

0       0.154894
1       0.000000
2       0.000000
3       0.000000
4       0.255272
          ...   
5567    0.000000
5568    0.000000
5569    0.000000
5570    0.000000
5571    0.000000
Name: go, Length: 5572, dtype: float64

In [27]:
# Attach the class label to the TF-IDF features (joined on the row index).
# NOTE(review): in the cells visible here `df` is rebound to the raw CSV before it is
# ever read — confirm whether this frame is still needed.
category_column = messages['Category']
df = tfidf_matrix.join(category_column)

In [28]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): the TF-IDF vectorizer was fitted on all rows before this split, so
# vocabulary/IDF statistics from the test rows leak into the training features.
features = tfidf_matrix
targets = messages['Category']
X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.3, random_state=42
)

In [29]:

from sklearn.linear_model import LogisticRegression
# Fit a logistic regression with scikit-learn's default settings on the training split
# (fit() returns the estimator itself), then show its predictions for the held-out rows.
lr = LogisticRegression().fit(X_train, y_train)
lr.predict(X_test)

array(['ham', 'ham', 'ham', ..., 'ham', 'spam', 'ham'], dtype=object)

In [30]:
# Side-by-side table of labels for the held-out rows:
# column 0 = actual category, column 1 = predicted category.
actual_labels = y_test.values
predicted_labels = lr.predict(X_test)
result = pd.DataFrame([actual_labels, predicted_labels]).T
result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1672 entries, 0 to 1671
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       1672 non-null   object
 1   1       1672 non-null   object
dtypes: object(2)
memory usage: 26.2+ KB


In [31]:
# Flag misclassifications: 1 where actual and predicted labels differ, 0 where they agree.
result['dif'] = result[0].ne(result[1]).astype(int)

In [32]:
# Keep only the rows where the prediction disagrees with the actual label.
result.loc[result['dif'].eq(1)]

Unnamed: 0,0,1,dif
17,ham,spam,1
40,spam,ham,1
47,spam,ham,1
74,spam,ham,1
84,spam,ham,1
...,...,...,...
1525,spam,ham,1
1567,spam,ham,1
1569,spam,ham,1
1576,ham,spam,1


In [33]:
from sklearn.metrics import accuracy_score

In [34]:
# Share of held-out messages whose predicted label matches the actual label.
test_predictions = lr.predict(X_test)
accuracy_score(y_true=y_test, y_pred=test_predictions)

0.958732057416268

In [35]:
from sklearn.metrics import confusion_matrix

In [36]:
# Confusion matrix with rows = actual class and columns = predicted class,
# both in the order given by `labels`.
confusion_matrix(
    y_true=y_test,
    y_pred=lr.predict(X_test),
    labels=["ham", "spam"],
)

array([[1445,    3],
       [  66,  158]], dtype=int64)

Rows are the actual classes and columns the predicted classes (both ordered ham, spam): 1445 times the model correctly classified non-spam, 3 times the model incorrectly classified non-spam as spam,
66 times the model incorrectly classified spam as non-spam, and 158 times the model correctly classified spam.

In [37]:
# Accuracy recomputed by hand from the confusion-matrix counts shown above
# (rows = actual class, columns = predicted class).
ham_as_ham, ham_as_spam = 1445, 3
spam_as_ham, spam_as_spam = 66, 158

correct_predictions = ham_as_ham + spam_as_spam
all_predictions = ham_as_ham + ham_as_spam + spam_as_ham + spam_as_spam
Accuracy = correct_predictions / all_predictions
Accuracy

0.958732057416268

In [38]:
# Recall for the ham class: actual ham predicted as ham / all actual ham
# (counts copied from the confusion matrix above).
ham_predicted_ham = 1445
ham_predicted_spam = 3
Recall = ham_predicted_ham / (ham_predicted_ham + ham_predicted_spam)
Recall

0.9979281767955801

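Treating ham (non-spam) as the positive class, recall is 99.8%: the model can be trusted to recognise non-spam and almost never flags a legitimate message as spam.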

In [39]:
# Precision for the ham class: actual ham predicted as ham / everything predicted as ham
# (counts copied from the confusion matrix above).
ham_predicted_ham = 1445
spam_predicted_ham = 66
Precision = ham_predicted_ham / (ham_predicted_ham + spam_predicted_ham)
Precision

0.9563203176704169

Precision for the non-spam class is 95.6%: when the model labels a message as non-spam, it is correct 95.6% of the time.

In [40]:
# Specificity with ham as the positive class: actual spam predicted as spam / all actual spam
# (counts copied from the confusion matrix above).
spam_predicted_spam = 158
spam_predicted_ham = 66
specificity = spam_predicted_spam / (spam_predicted_spam + spam_predicted_ham)
specificity

0.7053571428571429

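However, the ability to detect spam is much lower: only 70.5% of the actual spam messages are caught (specificity with non-spam as the positive class, i.e. recall for the spam class).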

In [41]:
# Predictions for the held-out rows as a named Series with a fresh 0..n-1 index.
Predict_Category = pd.Series(lr.predict(X_test), name="Predict_Category")
Predict_Category

0        ham
1        ham
2        ham
3        ham
4        ham
        ... 
1667     ham
1668    spam
1669     ham
1670    spam
1671     ham
Name: Predict_Category, Length: 1672, dtype: object

In [42]:
import copy

In [43]:
# Work on a copy of the held-out labels so `y_test` itself is left untouched.
dif_df = y_test.copy()

In [44]:
# Move the original row labels into an 'index' column and renumber the rows 0..n-1,
# so they line up positionally with the predictions.
dif_df = dif_df.reset_index(drop=False)
dif_df

Unnamed: 0,index,Category
0,3245,ham
1,944,ham
2,1044,ham
3,2484,ham
4,812,ham
...,...,...
1667,2505,ham
1668,2525,spam
1669,4975,ham
1670,650,spam


In [45]:
# Add the predicted label next to the actual one; both sides share the 0..n-1 index.
dif_df = dif_df.join(Predict_Category, how='left')

In [46]:
# Restore the original row labels as the index so the raw message text can be joined on them.
dif_df = dif_df.set_index(keys='index')

In [47]:
# Re-read the CSV to get the original, unprocessed message text
# (`messages['Message']` was overwritten during cleaning). This rebinds `df`.
df = pd.read_csv(filepath_or_buffer='spam.csv')

In [48]:
# Attach the original message text to each held-out row, matched on the row index.
original_text = df['Message']
dif_df = dif_df.join(original_text)

In [49]:
# Keep only the misclassified messages and renumber them; the original row label
# is preserved in the 'index' column.
is_misclassified = dif_df['Category'].ne(dif_df['Predict_Category'])
dif_df = dif_df.loc[is_misclassified].reset_index()
dif_df

Unnamed: 0,index,Category,Predict_Category,Message
0,2952,ham,spam,Hey now am free you can call me.
1,881,spam,ham,Reminder: You have not downloaded the content ...
2,1961,spam,ham,Guess what! Somebody you know secretly fancies...
3,3864,spam,ham,Oh my god! I've found your number again! I'm s...
4,2575,spam,ham,Your next amazing xxx PICSFREE1 video will be ...
...,...,...,...,...
64,4543,spam,ham,FreeMsg Hi baby wow just got a new cam moby. W...
65,752,spam,ham,You have an important customer service announc...
66,309,spam,ham,TheMob> Check out our newest selection of cont...
67,495,ham,spam,Are you free now?can i call now?
