In [2]:
# grid search

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [4]:
pipeline = Pipeline([
    ('vect', TfidfVectorizer(stop_words='english')),
    ('clf', LogisticRegression())
])

parameters = {
    'vect__max_df': (0.25, 0.5, 0.75),
    'vect__stop_words': ('english', None),
    'vect__max_features': (2500, 5000, 10000, None),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__use_idf': (True, False),
    'vect__norm': ('l1', 'l2'),
    'clf__penalty': ('l1', 'l2'),
    'clf__C': (0.01, 0.1, 1, 10),
}

In [5]:
df = pd.read_csv('/home/anshul/MMLSL/chapter06/SMSSpamCollection', delimiter='\t', header=None)
X = df[1].values
y = df[0].values

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [6]:
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy', cv=3)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 1536 candidates, totalling 4608 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   28.8s
[Parallel(n_jobs=-1)]: Done 876 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1322 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1772 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 2322 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 2972 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 3722 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 4572 tasks      | elapsed:  9.2min
[Parallel(n_jobs=-1)]: Done 4608 out of 4608 | elapsed:  9.3min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'vect__max_df': (0.25, 0.5, 0.75), 'vect__stop_words': ('english', None), 'vect__max_features': (2500, 5000, 10000, None), 'vect__ngram_range': ((1, 1), (1, 2)), 'vect__use_idf': (True, False), 'vect__norm': ('l1', 'l2'), 'clf__penalty': ('l1', 'l2'), 'clf__C': (0.01, 0.1, 1, 10)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=1)

In [10]:
print('Best Score: {0:.3f}'.format(grid_search.best_score_))
print('\nBest Parameter Set: ')
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print('{}: {}'.format(param_name, best_parameters[param_name]))

Best Score: 0.985

Best Parameter Set: 
clf__C: 10
clf__penalty: l2
vect__max_df: 0.25
vect__max_features: 5000
vect__ngram_range: (1, 2)
vect__norm: l2
vect__stop_words: None
vect__use_idf: True


In [13]:
predictions = grid_search.predict(X_test)

print('Accuracy: {0:.3f}'.format(accuracy_score(y_test, predictions)))
print('Precision: {0:.3f}'.format(precision_score(y_test, predictions)))
print('Recall: {0:.3f}'.format(recall_score(y_test, predictions)))
print('F1-Score: {0:.3f}'.format(f1_score(y_test, predictions)))

Accuracy: 0.988
Precision: 0.994
Recall: 0.914
F1-Score: 0.953


In [2]:
# one-vs-all classification
import pandas as pd
df = pd.read_csv('/home/anshul/MMLSL/chapter06/train.tsv', delimiter='\t', header=0)
print(df.count())

PhraseId      156060
SentenceId    156060
Phrase        156060
Sentiment     156060
dtype: int64


In [3]:
print(df.head())

   PhraseId  SentenceId                                             Phrase  \
0         1           1  A series of escapades demonstrating the adage ...   
1         2           1  A series of escapades demonstrating the adage ...   
2         3           1                                           A series   
3         4           1                                                  A   
4         5           1                                             series   

   Sentiment  
0          1  
1          2  
2          2  
3          2  
4          2  


In [4]:
print(df['Phrase'].head(10))

0    A series of escapades demonstrating the adage ...
1    A series of escapades demonstrating the adage ...
2                                             A series
3                                                    A
4                                               series
5    of escapades demonstrating the adage that what...
6                                                   of
7    escapades demonstrating the adage that what is...
8                                            escapades
9    demonstrating the adage that what is good for ...
Name: Phrase, dtype: object


In [5]:
# examining the target/response variable - Sentiment
print(df['Sentiment'].describe())

count    156060.000000
mean          2.063578
std           0.893832
min           0.000000
25%           2.000000
50%           2.000000
75%           3.000000
max           4.000000
Name: Sentiment, dtype: float64


In [6]:
print(df['Sentiment'].value_counts())

2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64


In [8]:
# print('{0:.2f}'.format((df['Sentiment'].value_counts() / df['Sentiment'].count()) * 100))
print((df['Sentiment'].value_counts() / df['Sentiment'].count()) * 100)

2    50.994489
3    21.098936
1    17.475971
4     5.899013
0     4.531590
Name: Sentiment, dtype: float64


In [11]:
# training the classifier with scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

df = pd.read_csv('/home/anshul/MMLSL/chapter06/train.tsv', delimiter='\t', header=0)
X = df['Phrase'] 
y = df['Sentiment'].as_matrix()

X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.5)

# grid_search = main(X_train, y_train)
pipeline = Pipeline([
    ('vect', TfidfVectorizer(stop_words='english')),
    ('clf', LogisticRegression())
])

parameters = {
    'vect__max_df': (0.25, 0.5),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__use_idf': (True, False),
    'clf__C': (0.1, 1, 10),
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy')

  # This is added back by InteractiveShellApp.init_path()


In [12]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   56.7s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  2.4min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'vect__max_df': (0.25, 0.5), 'vect__ngram_range': ((1, 1), (1, 2)), 'vect__use_idf': (True, False), 'clf__C': (0.1, 1, 10)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=1)

In [13]:
print('Best Score: {0:.3f}'.format(grid_search.best_score_))
print('\nBest Parameter Set: ')
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print('{}: {}'.format(param_name, best_parameters[param_name]))

Best Score: 0.620

Best Parameter Set: 
clf__C: 10
vect__max_df: 0.25
vect__ngram_range: (1, 2)
vect__use_idf: False


In [14]:
predictions = grid_search.predict(X_test)
print('Accuracy: ', accuracy_score(y_test, predictions))
print('Confusion Matrix:')
print(confusion_matrix(y_test, predictions))
print('Classification Report:')
print(classification_report(y_test, predictions))

Accuracy:  0.635768294245803
Confusion Matrix:
[[ 1133  1759   604    61     8]
 [  922  6021  6170   540    33]
 [  203  3188 32601  3584   162]
 [   21   406  6535  8211  1303]
 [    4    38   521  2359  1643]]
Classification Report:
             precision    recall  f1-score   support

          0       0.50      0.32      0.39      3565
          1       0.53      0.44      0.48     13686
          2       0.70      0.82      0.76     39738
          3       0.56      0.50      0.53     16476
          4       0.52      0.36      0.43      4565

avg / total       0.62      0.64      0.62     78030

