In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

from nltk.tokenize import word_tokenize

%matplotlib inline

In [10]:
# Read the raw SMS dataset from disk, decoding the file as ISO-8859-1.
spam = pd.read_csv(
    'spam.csv',
    encoding='ISO-8859-1',
)

In [11]:
# Preview the first rows of the raw frame to see which columns the CSV produced.
spam.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [15]:
# Give the label and text columns descriptive names: v1 -> class, v2 -> sms.
spam = spam.rename(
    columns={
        'v1': 'class',
        'v2': 'sms',
    }
)

In [16]:
# Confirm the rename: the label and text columns should now be 'class' and 'sms'.
spam.head()

Unnamed: 0,class,sms,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [19]:
# Encode the target as integers for modelling: ham -> 0, spam -> 1.
# The identity entries (0 -> 0, 1 -> 1) keep this cell idempotent: with a plain
# {'ham': 0, 'spam': 1} mapping, running the cell a second time would turn the
# already-encoded column into all NaN, because Series.map returns NaN for every
# value that is not a key of the mapping.
class_codes = {'ham': 0, 'spam': 1, 0: 0, 1: 1}
encoded_class = spam['class'].map(class_codes)

# Fail loudly on missing or unexpected labels instead of silently passing NaN
# targets on to the train/test split and the model.
unmapped = encoded_class.isna()
if unmapped.any():
    bad_values = spam.loc[unmapped, 'class'].unique().tolist()
    raise ValueError(f"Unexpected values in 'class' column: {bad_values}")

spam['class'] = encoded_class

In [20]:
# Confirm the encoding: 'class' should now hold 0 (ham) / 1 (spam) instead of strings.
spam.head()


Unnamed: 0,class,sms,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,0,"Go until jurong point, crazy.. Available only ...",,,
1,0,Ok lar... Joking wif u oni...,,,
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,0,U dun say so early hor... U c already then say...,,,
4,0,"Nah I don't think he goes to usf, he lives aro...",,,


In [21]:
# Separate the model input (message text) from the target (class label).
X, y = spam['sms'], spam['class']

In [25]:
# Fetch the NLTK 'punkt' tokenizer data.
# NOTE(review): presumably needed by word_tokenize (imported at the top) - confirm it is actually used.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/danielcecchin/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [27]:
# Hold out 25% of the messages for testing. Stratifying on y keeps the class
# ratio the same in both splits; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=24,
)

In [28]:
# Class proportions in the test split. The majority-class share is the accuracy
# of always predicting that class, i.e. the baseline any model has to beat.
y_test.value_counts(normalize=True)

0    0.865757
1    0.134243
Name: class, dtype: float64

In [29]:
# Baseline model: token counts ('cvec') feeding a logistic regression ('lr'),
# both left at their default settings.
pipe = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression()),
])

In [35]:
# Estimate the untuned pipeline's performance: mean score over 5-fold
# cross-validation on the training split.
fold_scores = cross_val_score(pipe, X_train, y_train, cv=5)
fold_scores.mean()

0.9801395295533336

In [36]:
# Fit the baseline pipeline on the training split only; the test split stays unseen.
pipe.fit(X_train, y_train)


Pipeline(steps=[('cvec', CountVectorizer()), ('lr', LogisticRegression())])

In [37]:
# Training score: accuracy on the same data the pipeline was fitted on,
# so it is an optimistic estimate.
pipe.score(X_train, y_train)

0.9978463747307968

In [38]:
# Test score: accuracy on the held-out split that was not used for fitting.
pipe.score(X_test, y_test)

0.9777458722182341

In [39]:
# Hyper-parameter grid for the CountVectorizer step ('cvec') of the pipeline:
#   max_features - cap the vocabulary at 2000, 3000, 4000 or 5000 tokens
#   min_df       - keep a token only if it occurs in at least 2 or 3 documents
#   max_df       - drop a token that occurs in more than 90% or 95% of documents
#   ngram_range  - unigrams only, unigrams + bigrams, or unigrams through trigrams
pipe_params = dict(
    cvec__max_features=[2000, 3000, 4000, 5000],
    cvec__min_df=[2, 3],
    cvec__max_df=[0.9, 0.95],
    cvec__ngram_range=[(1, 1), (1, 2), (1, 3)],
)

In [40]:
# Wrap the pipeline in a grid search: every combination in pipe_params is
# scored with 5-fold cross-validation.
gs = GridSearchCV(estimator=pipe, param_grid=pipe_params, cv=5)

In [41]:
# Run the search on the training split only. The grid holds 4 x 2 x 2 x 3 = 48
# combinations, each cross-validated with 5 folds (240 pipeline fits).
gs.fit(X_train,y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('lr', LogisticRegression())]),
             param_grid={'cvec__max_df': [0.9, 0.95],
                         'cvec__max_features': [2000, 3000, 4000, 5000],
                         'cvec__min_df': [2, 3],
                         'cvec__ngram_range': [(1, 1), (1, 2), (1, 3)]})

In [42]:
# Best mean cross-validated score found across the grid.
gs.best_score_

0.9806179984528551

In [43]:
# Parameter combination that achieved the best cross-validated score.
gs.best_params_

{'cvec__max_df': 0.9,
 'cvec__max_features': 3000,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 1)}

In [44]:
# Keep a handle on the pipeline that GridSearchCV refitted with the best parameters.
gs_model = gs.best_estimator_

In [45]:
# Test-split accuracy of the tuned pipeline, for comparison with the untuned
# pipeline's test score.
gs_model.score(X_test, y_test)

0.9784637473079684