# Spam mail identification

## Load data into Dataframe

In [45]:
import pandas as pd
import numpy as np

# Load data
data = pd.read_csv('spam.csv')

# Create labels
data['target'] = np.where(data['target']=='spam',1,0)

data.head()

Unnamed: 0,text,target
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


## TFIDF Vectorizing & base Logistic Regression model training

In [52]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(data['text'], 
                                                    data['target'], 
                                                   random_state=0)
# Fit vectorizer
vect = TfidfVectorizer().fit(X_train)

# Vectorize text
X_train_vectorized = vect.transform(X_train)

# Initiate model
model = LogisticRegression()

# Fit vectorized text to model
model.fit(X_train_vectorized, y_train)

# Predict test set
predictions = model.predict(vect.transform(X_test))

# Print prediction accuracy
print('Base model accuracy: ' + \
      "%.2f" % (accuracy_score(y_test, predictions)*100)+"%")


Base model accuracy: 96.70%


## Average characters feature extraction

In [16]:
# Function to count characters
def charcount(string):
    lenght = len(string)
    return lenght

# Apply function to text
data['characters'] = data['text'].apply(charcount)

# Print average characters for each label
print("Average Characters in Spam: " + str(data['characters'].loc[data['target'] == 1].mean()))
print("Average Characters in Ham: " + str(data['characters'].loc[data['target'] == 0].mean()))

Average Characters in Spam: 138.8661311914324
Average Characters in Ham: 71.02362694300518


## Average digits feature extraction

In [18]:
# Function to count digits
def digitcount(string):
    digitcount = 0
    for c in string:
        if c.isdigit():
            digitcount=digitcount+1
        else:
            pass
    return digitcount

# Apply function to text
data['digits'] = data['text'].apply(digitcount)

# Print average digits for each label
print("Average Digits in Spam: " + str(data['digits'].loc[data['target'] == 1].mean()))
print("Average Digits in Ham: " + str(data['digits'].loc[data['target'] == 0].mean()))

Average Digits in Spam: 15.759036144578314
Average Digits in Ham: 0.2992746113989637


## Average non-word characters feature extraction 

In [20]:
import re

# Function to count non-word characters
def nwcharcount (string):
    nonchar = re.sub('[\w]+' ,'', string)
    return len(nonchar)

# Apply function to text
data['nwchar'] = data['text'].apply(nwcharcount)

# Print average non-word char. for each label
print("Average non-word Char. in Spam: " + str(data['nwchar'].loc[data['target'] == 1].mean()))
print("Average non-word Char. in Ham: " + str(data['nwchar'].loc[data['target'] == 0].mean()))

Average non-word Char. in Spam: 29.041499330655956
Average non-word Char. in Ham: 17.29181347150259


## Grid search Vectorizer and Model parameters via Pipeline

In [43]:
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline

# Create Pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('lr', LogisticRegression()),
])

# Set parameters to search
parameters = {
    'tfidf__ngram_range': ((1,3), (2,5)),
    'tfidf__min_df': (5, 10, 20, 50),
    'tfidf__analyzer': ('word','char','char_wb'),
    'lr__C': (10,50,100),}

# Perform grid search
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, 
                           verbose=1, scoring = "accuracy", 
                           refit=True, cv=5)

# Print results of grid search
print ("Performing grid search...")
print ("pipeline:", [name for name, _ in pipeline.steps])
grid_search.fit(data.text, data.target) 
print ("Best score: %0.3f" % grid_search.best_score_)
print ("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print ("\t%s: %r" % (param_name, best_parameters[param_name]))  

Performing grid search...
pipeline: ['tfidf', 'lr']
Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   25.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:  6.8min finished


Best score: 0.991
Best parameters set:
	lr__C: 100
	tfidf__analyzer: 'char'
	tfidf__min_df: 20
	tfidf__ngram_range: (1, 3)


## Build final model

In [44]:
import scipy
from scipy import sparse
from scipy.sparse import csr_matrix, hstack
from sklearn import cross_validation

# Initiate vectorizer with optimal parameters
vect = TfidfVectorizer(min_df=50, ngram_range=(1,3), analyzer='char').fit(data.text)

# Vectorize text
text_vectorized = vect.transform(data.text)

# Combine vectorized text and features
X_map = hstack([text_vectorized,\
                csr_matrix(data.characters).T,\
                csr_matrix(data.digits).T,\
                csr_matrix(data.nwchar).T], 'csr')

# Initiate model with optimal parameters
model = LogisticRegression(C=100)

# Fit model
model.fit(X_map, data.target)

# Accuracy score 10-fold cross-validated 
Score = (cross_validation.cross_val_score(model, X_map, data.target, cv=10, scoring = "accuracy").mean())*100

# Print model accuracy
print("Final model accuracy: " + "%.2f" % Score +"%")

Final model accuracy: 99.05%
