In [2]:
# grid search

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [4]:
# Text-classification pipeline: TF-IDF features ('vect') feeding a logistic
# regression classifier ('clf').
pipeline = Pipeline([
    # stop_words here is only the starting value; the grid below overrides it.
    ('vect', TfidfVectorizer(stop_words='english')),
    # The solver is pinned to liblinear because the grid searches both the
    # 'l1' and 'l2' penalties. Newer scikit-learn releases default to the
    # lbfgs solver, which rejects penalty='l1' and would make half of the
    # candidate fits fail.
    ('clf', LogisticRegression(solver='liblinear')),
])

# Hyper-parameter grid for the search. Keys use the '<step>__<parameter>'
# convention so each value is routed to the matching pipeline step.
# 3 * 2 * 4 * 2 * 2 * 2 * 2 * 4 = 1536 candidate combinations.
parameters = {
    'vect__max_df': (0.25, 0.5, 0.75),
    'vect__stop_words': ('english', None),
    'vect__max_features': (2500, 5000, 10000, None),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__use_idf': (True, False),
    'vect__norm': ('l1', 'l2'),
    'clf__penalty': ('l1', 'l2'),
    'clf__C': (0.01, 0.1, 1, 10),
}

In [5]:
# Location of the tab-separated data file.
# NOTE(review): this is an absolute, machine-specific path -- point it at your
# local copy of the file before running.
DATA_PATH = '/home/anshul/MMLSL/chapter06/SMSSpamCollection'

# Fixed seed so the train/test split (and therefore every score reported
# further down) is reproducible across kernel restarts.
RANDOM_STATE = 42

# The file has no header row, so columns are addressed by position:
# column 0 is used as the class label and column 1 as the model input
# (presumably the raw message text -- confirm against the data file).
df = pd.read_csv(DATA_PATH, delimiter='\t', header=None)
X = df[1].values
y = df[0].values

# Map the string labels to integer codes; the fitted encoder is kept so the
# codes can be translated back with label_encoder.inverse_transform.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Stratify on the label so both partitions keep the same class proportions
# as the full data set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=RANDOM_STATE, stratify=y
)

In [6]:
# Exhaustive search over `parameters`, scored by accuracy with 3-fold
# cross-validation and using every available core (n_jobs=-1).
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
# Left as the trailing expression so the notebook displays the fitted search.
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 1536 candidates, totalling 4608 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   28.8s
[Parallel(n_jobs=-1)]: Done 876 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1322 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1772 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 2322 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 2972 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 3722 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 4572 tasks      | elapsed:  9.2min
[Parallel(n_jobs=-1)]: Done 4608 out of 4608 | elapsed:  9.3min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'vect__max_df': (0.25, 0.5, 0.75), 'vect__stop_words': ('english', None), 'vect__max_features': (2500, 5000, 10000, None), 'vect__ngram_range': ((1, 1), (1, 2)), 'vect__use_idf': (True, False), 'vect__norm': ('l1', 'l2'), 'clf__penalty': ('l1', 'l2'), 'clf__C': (0.01, 0.1, 1, 10)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=1)

In [10]:
# Show the best cross-validated score found by the search.
best_score_line = 'Best Score: {0:.3f}'.format(grid_search.best_score_)
print(best_score_line)
print('\nBest Parameter Set: ')

# List, in alphabetical order, the winning value of every parameter that was
# part of the search grid (looked up on the refitted best estimator).
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    param_line = '{}: {}'.format(param_name, best_parameters[param_name])
    print(param_line)

Best Score: 0.985

Best Parameter Set: 
clf__C: 10
clf__penalty: l2
vect__max_df: 0.25
vect__max_features: 5000
vect__ngram_range: (1, 2)
vect__norm: l2
vect__stop_words: None
vect__use_idf: True


In [13]:
# Score the fitted grid search on the held-out test split.
predictions = grid_search.predict(X_test)

# Each entry pairs an output template with the metric function that fills it.
metric_reports = (
    ('Accuracy: {0:.3f}', accuracy_score),
    ('Precision: {0:.3f}', precision_score),
    ('Recall: {0:.3f}', recall_score),
    ('F1-Score: {0:.3f}', f1_score),
)
for report_template, metric_fn in metric_reports:
    print(report_template.format(metric_fn(y_test, predictions)))

Accuracy: 0.988
Precision: 0.994
Recall: 0.914
F1-Score: 0.953
