In [9]:
# Import required libraries
import pandas as pd
import string
from pprint import pprint
from time import time

In [26]:
# Import the dataset
df = pd.read_csv('spam.csv', encoding='latin-1')
df_spam_collection = df[['v1', 'v2']]
df_spam_collection.rename(columns={'v1':'response', 'v2':'message'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_spam_collection.rename(columns={'v1':'response', 'v2':'message'}, inplace=True)


In [27]:
# View first 5 records with head method
df_spam_collection.head()

Unnamed: 0,response,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [29]:
# Import text processing libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Import SGD Classifier
from sklearn.linear_model import SGDClassifier

# Import for gridsearch
from sklearn.model_selection import GridSearchCV

# Import for pipeline
from sklearn.pipeline import Pipeline

# Define the pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier())
])

In [33]:
# Parameters for gridsearch
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': [1e-4, 1e-3, 1e-2],
    'clf__penalty': ['l2', 'elasticnet'],
    'clf__max_iter': [1000]
}

In [34]:
# Perform the gridsearch with pipeline and parameters
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

print('Performing the grid search now...')
print('Parameters')
pprint(parameters)
t0 = time()
grid_search.fit(df_spam_collection['message'], df_spam_collection['response'])
print('Done in %0.3fs'% (time()-t0))
print()

Performing the grid search now...
Parameters
{'clf__alpha': [0.0001, 0.001, 0.01],
 'clf__max_iter': [1000],
 'clf__penalty': ['l2', 'elasticnet'],
 'tfidf__norm': ('l1', 'l2'),
 'tfidf__use_idf': (True, False),
 'vect__ngram_range': [(1, 1), (1, 2)]}
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Done in 12.599s



In [35]:
# Display best parameters and best score
print('Best score: %0.3f' % grid_search.best_score_)
print('Best parameters set:')
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print('\t%s: %r' % (param_name, best_parameters[param_name]))

Best score: 0.988
Best parameters set:
	clf__alpha: 0.0001
	clf__max_iter: 1000
	clf__penalty: 'l2'
	tfidf__norm: 'l2'
	tfidf__use_idf: True
	vect__ngram_range: (1, 2)
