In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer # Convert the text to numerical values
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

### Data collection and preprocessing

In [21]:
# Read the labelled messages from the CSV file into a DataFrame and show it.
mail_df = pd.read_csv(filepath_or_buffer='../data/mail_data.csv')
mail_df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [22]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
mail_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [23]:
# Count the missing values in each column (`isnull` is an alias of `isna`).
mail_df.isnull().sum()

Category    0
Message     0
dtype: int64

In [24]:
# Replace every missing value with an empty string so that no NaN is passed on
# to the text vectoriser. `mail_data` holds the cleaned frame with the original
# string labels.
mail_data = mail_df.where(pd.notna(mail_df), '')
# The following cells read `mail_df`, not `mail_data`, so the cleaned result was
# previously never used. Rebind `mail_df` to an independent copy of the cleaned
# frame; the copy keeps later in-place edits of `mail_df` out of `mail_data`.
mail_df = mail_data.copy()

In [25]:
# Display the frame to inspect it before the labels are encoded.
mail_df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [26]:
# Encode the 'spam' label as 0 (in place, on the rows whose Category is 'spam').
mail_df.loc[mail_df['Category'].eq('spam'), 'Category'] = 0

In [27]:
# Encode the 'ham' label as 1; every other value in the column is left as is.
mail_df['Category'] = mail_df['Category'].replace({'ham': 1})

In [28]:
mail_df

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,0,This is the 2nd time we have tried 2 contact u...
5568,1,Will ü b going to esplanade fr home?
5569,1,"Pity, * was in mood for that. So...any other s..."
5570,1,The guy did some bitching but I acted like i'd...


In [29]:
# Split the frame into the model input (message text) and the target (label).
X, y = mail_df['Message'], mail_df['Category']

In [30]:
# Hold out 20% of the messages for evaluation. Stratifying on the labels keeps
# the class proportions the same in both partitions, and the fixed seed makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

In [31]:
# TF-IDF vectoriser: lower-cases the text, drops English stop words and keeps
# every term that appears in at least one document (min_df=1).
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

In [32]:
# Learn the vocabulary and IDF weights from the training messages only, then
# reuse that fitted vectoriser on the test messages so that no information from
# the test set influences the features.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [34]:
# Cast both label series to integers before they are handed to the classifier.
y_train, y_test = y_train.astype(int), y_test.astype(int)

In [36]:
# Print the training feature matrix; the output lists each stored entry as
# "(row, column)  value".
print(X_train_features)

  (0, 2392)	0.7071067811865476
  (0, 1713)	0.7071067811865476
  (1, 7400)	0.42224074096154146
  (1, 6539)	0.5043342820364713
  (1, 7160)	0.5436306308668579
  (1, 6959)	0.2835353332900749
  (1, 5241)	0.43753073102856876
  (2, 7421)	0.28292332285709515
  (2, 2982)	0.49798152042392235
  (2, 5801)	0.49798152042392235
  (2, 4721)	0.47479780441738884
  (2, 5832)	0.4455897765170157
  (3, 5831)	1.0
  (4, 3177)	0.4237669213702235
  (4, 2788)	0.5342906086326665
  (4, 4157)	0.5534651020906459
  (4, 1583)	0.3549147356597047
  (4, 4738)	0.32041699870439216
  (5, 7105)	0.3146814949645023
  (5, 1757)	0.5045242316078322
  (5, 6415)	0.3065376794168086
  (5, 5720)	0.4810359172341316
  (5, 2297)	0.3741084487077238
  (5, 7131)	0.2506764475916244
  (5, 4009)	0.34391007706121646
  :	:
  (4454, 7410)	0.2934811475046227
  (4454, 1676)	0.30494304541447176
  (4454, 4756)	0.3405469522864689
  (4454, 5743)	0.32190610116129437
  (4455, 4016)	0.5175039529048374
  (4455, 3887)	0.6142237248000665
  (4455, 6123)	0.595

In [37]:
# Logistic-regression classifier with scikit-learn's default settings.
# The former `n_jobs=10` was dropped: that option only parallelises over classes
# in one-vs-rest mode, so it cannot speed up this two-class (spam/ham) problem,
# and it hard-coded a machine-specific core count.
# NOTE(review): newer scikit-learn releases reportedly deprecate `n_jobs` on
# LogisticRegression -- confirm against the installed version.
model = LogisticRegression()

In [38]:
# Train the classifier on the TF-IDF features of the training messages.
model.fit(X_train_features, y_train)

In [40]:
# Predict the labels of the held-out messages.
X_test_pred = model.predict(X_test_features)
# accuracy_score is documented as accuracy_score(y_true, y_pred). Accuracy is
# symmetric, so the value is the same as before, but the documented order keeps
# the call correct if it is later swapped for an asymmetric metric.
X_test_acc = accuracy_score(y_test, X_test_pred)
X_test_acc

0.9596412556053812