In [1]:
import pandas as pd
import numpy as np

# Load the raw data


pd.read_csv('spam.csv') # raises unicode decode error

pd.read_csv('spam.csv', encoding='Latin-1') # works 
    

In [2]:
# when encoding is unknown
import chardet 
with open('spam.csv', 'rb') as f:
    result = chardet.detect(f.read())  # or readline if the file is large

print(result['encoding'])
df_raw = pd.read_csv('spam.csv', encoding=result['encoding'])

Windows-1252


In [3]:
df_raw.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


# Clean the raw data

In [4]:
df = df_raw[['v1', 'v2']].rename(columns={'v1' : 'label', 'v2' : 'sms'})

In [5]:
df.head()

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Explore the clean data

In [6]:
df.describe()

Unnamed: 0,label,sms
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [7]:
df.drop_duplicates(subset='sms', inplace=True)

In [8]:
df.describe()

Unnamed: 0,label,sms
count,5169,5169
unique,2,5169
top,ham,i cant talk to you now.i will call when i can....
freq,4516,1


In [9]:
df.groupby('label').count().reset_index()

Unnamed: 0,label,sms
0,ham,4516
1,spam,653


In [10]:
df['len'] = df['sms'].map(lambda x: len(x))

In [11]:
df.groupby('label').agg({'sms': 'count', 'len' : 'mean'}).reset_index()

Unnamed: 0,label,sms,len
0,ham,4516,70.459256
1,spam,653,137.891271


# Separate training data and test data

In [71]:
"""
    Train-test split: Do not touch the test data until the time of final evaluation.
        Which stage of the pipeline do we reserve the test data ?
            - after obtaining raw data but before cleaning it 
            - after cleaning but before exploring
            - after exploring but before featurizing/training
"""

from sklearn.model_selection import train_test_split

X = df['sms']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [72]:
print("Shape of X is {}".format(X.shape))
print("Shape of X_train is {} and shape of y_train is {}".format(X_train.shape, y_train.shape))
print("Shape of X_test is {} and shape of y_test is {}".format(X_test.shape, y_test.shape))

Shape of X is (5169,)
Shape of X_train is (3876,) and shape of y_train is (3876,)
Shape of X_test is (1293,) and shape of y_test is (1293,)


In [73]:
train_corpus = list(X_train)

# Build a featurizer from training data

In [74]:
"""
    Featurizer: Train the featurizer on train data.
"""
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000)
vectorizer.fit(train_corpus)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=5000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [75]:
print("Number of features = {}".format(len(vectorizer.vocabulary_)))
print("Number of omitted words = {}".format(len(vectorizer.stop_words_)))

Number of features = 5000
Number of omitted words = 2395


In [76]:
X_train_text_features = vectorizer.transform(list(X_train))
print("Shape of X_train_text_features is {}".format(X_train_text_features.shape))

Shape of X_train_text_features is (3876, 5000)


# Adding new features

In [79]:
"""
    Adding a new feature
        Do we need to do the train-test split again ?
        >> Yes, if you do not have the new feature in your previous dataframe.
        Otherwise you have to join the corresponding indices.
        
        Do we need to normalize the new feature being added ?
        >> It depends on the model.
            - perhaps decision tree models would not require normalization
            - linear models would require normalization
"""

df['len'] = df['sms'].map(lambda x: len(x))

X = df[['sms', 'len']]
y = df['label']

X_train, X_test, y_train, y_test =  train_test_split(X, y, random_state=42)

train_corpus = list(X_train['sms'])

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000)
vectorizer.fit(train_corpus)

from scipy import sparse
def get_features(X):
    X_text_features = vectorizer.transform(list(X['sms']))
    X_len_features = sparse.csr_matrix(X['len']).T
    X_features = sparse.hstack([X_text_features, X_len_features])
    return X_features

X_train_features = get_features(X_train)

# Train a model

In [80]:
"""
    Training a classifier: Train a Logistic Regression classifier
"""
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(random_state=42)

model.fit(X_train_features, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [81]:
"""
    Evaluation on training data: This does not mean anything  
        Do not take the evaluate on the same data that you trained on very seriously
"""
from sklearn.metrics import accuracy_score, confusion_matrix

y_train_predicted = model.predict(X_train_features)

print("The fraction of correctly classified samples is {}".format(accuracy_score(y_train, y_train_predicted)))
print("The number of correctly classified samples is {}".format(accuracy_score(y_train, 
                                                                               y_train_predicted, normalize=False)))

pd.DataFrame(confusion_matrix(y_train_predicted, y_train), 
             index={'true ham', 'true spam'}, 
             columns={'pred ham', 'pred spam'})

The fraction of correctly classified samples is 0.9760061919504643
The number of correctly classified samples is 3783


Unnamed: 0,pred ham,pred spam
true ham,3399,83
true spam,10,384


# Cross validation

In [91]:
"""
    Evaluation within training data: k-fold cross validation
        - randomly partition the training data into k parts
        - train on k-1 parts and evaluate on the remaining part
"""

from sklearn.model_selection import cross_val_score

lr_model = LogisticRegression()
cv_scores = cross_val_score(lr_model, X=X_train_features, y=y_train, cv=5, n_jobs=4)
print(cv_scores)


[0.95618557 0.95489691 0.97290323 0.96258065 0.96640827]


'\n# The LogisticRegressionCV classifier has inbuilt cross validation\nfrom sklearn.linear_model import LogisticRegressionCV\ncv_model = LogisticRegressionCV(cv=5, random_state=42)\ncv_model.fit(X_train_features, y_train)\ncv_model.scores_\n'

In [93]:
"""
The LogisticRegressionCV classifier has inbuilt cross validation
"""
"""
from sklearn.linear_model import LogisticRegressionCV
cv_model = LogisticRegressionCV(cv=5, random_state=42)
cv_model.fit(X_train_features, y_train)
cv_model.scores_
"""

'\nfrom sklearn.linear_model import LogisticRegressionCV\ncv_model = LogisticRegressionCV(cv=5, random_state=42)\ncv_model.fit(X_train_features, y_train)\ncv_model.scores_\n'

# Final evaluation on test data

In [83]:
"""
    Evaluation on test data: This score is important
"""
X_test_features = get_features(X_test)
y_test_predicted = model.predict(X_test_features)

print("The fraction of correctly classified samples is {}".format(accuracy_score(y_test, y_test_predicted)))
print("The number of correctly classified samples is {}".format(accuracy_score(y_test, y_test_predicted, normalize=False)))

pd.DataFrame(confusion_matrix(y_test_predicted, y_test), 
             index={'true ham', 'true spam'}, 
             columns={'pred ham', 'pred spam'})

The fraction of correctly classified samples is 0.9737045630317092
The number of correctly classified samples is 1259


Unnamed: 0,pred ham,pred spam
true ham,1106,33
true spam,1,153


# Serializing and saving the model

In [118]:
from sklearn.externals import joblib
from datetime import date

"""
    Versioning: It is often useful to save the model with a version number.
        One way to do it is to attach current date/time to the filename saved
"""
version = date.today().strftime("%Y_%B_%d")
joblib.dump(vectorizer, 'vectorizer_joblib_{}'.format(version))
joblib.dump(model, 'lr_model_joblib_{}'.format(version))

"""
    How to serialize a function ? 
        Would the following work ?
"""
joblib.dump(get_features, 'featurizer_joblib_{}'.format(version))


['featurizer_joblib_2018_December_24']

In [102]:
reloaded_model = joblib.load('lr_model_joblib_2018_December_24')
y_test_predicted = reloaded_model.predict(X_test_features)

print("The fraction of correctly classified samples is {}".format(accuracy_score(y_test, y_test_predicted)))
print("The number of correctly classified samples is {}".format(accuracy_score(y_test, y_test_predicted, normalize=False)))

pd.DataFrame(confusion_matrix(y_test_predicted, y_test), 
             index={'true ham', 'true spam'}, 
             columns={'pred ham', 'pred spam'})

The fraction of correctly classified samples is 0.9737045630317092
The number of correctly classified samples is 1259


Unnamed: 0,pred ham,pred spam
true ham,1106,33
true spam,1,153


In [117]:
reloaded_featurizer = joblib.load('featurizer_joblib_2018_December_24')
reloaded_vectorizer = joblib.load('vectorizer_joblib_2018_December_24')
reloaded_model = joblib.load('lr_model_joblib_2018_December_24')

text = "I am not spam."

X_text = pd.DataFrame({'sms': [text], 'len': [len(text)]})
reloaded_model.predict(reloaded_featurizer(X_text))


array(['ham'], dtype=object)

# Providing flask-API 

# Creating a docker image of the flask-app

# Deploying the docker image to AWS

# References
- How to handle class imbalance?
https://elitedatascience.com/imbalanced-classes

# Training multiple models

https://www.kaggle.com/muzzzdy/sms-spam-detection-with-various-classifiers


In [27]:
"""
    Observing cross validation scores of several models    
"""
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score

from collections import defaultdict

models = defaultdict()

logistic_regression = LogisticRegression()
models['logistic_regression'] = logistic_regression

decision_tree = DecisionTreeClassifier()
models['decision_tree'] = decision_tree

random_forest = RandomForestClassifier()
models['random_forest'] = random_forest


for key, model in models.items():
    cv_score = cross_val_score(model, X_train_features, y_train, cv=5, n_jobs=4).mean()
    print(cv_score)
    models[key] = {'model': model, 'cv_score': cv_score}
    
df_scores = pd.DataFrame.from_dict(models, orient='index', columns=['cv_score'])    

  from numpy.core.umath_tests import inner1d


0.9540774549307429
0.9597558779729296
0.9628476629466645


In [30]:
"""
    Training a decision tree of various depth and printing cross validation scores
"""
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

d_max = 3

cv_scores = []
for d in range(1, d_max+1):
    model = DecisionTreeClassifier(max_depth=d)
    scores = cross_val_score(model, X=X_train_features, y=y_train, cv=5, n_jobs=4)
    cv_scores.append((scores.mean(), scores.std()))
    
for mean, std in cv_scores:
    print("mean = {} std = {}".format(mean, std))

mean = 0.9004073114498556 std = 0.006677812504596322
mean = 0.93188588727938 std = 0.005764552123775337
mean = 0.9393707645124338 std = 0.006049534544937767
