In [1]:
import nltk as nl
import pandas as pd
import re, string
import Vectorization.altgrad as altg
import numpy as np
import sklearn as skl
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from scipy.sparse import coo_matrix, csr_matrix, csc_matrix, hstack
import re, string
%matplotlib inline

In [2]:
## Read dataset
train = pd.read_csv('data/challenge_21_data_2/train_sample_stemmed.csv', index_col=None, engine='c')
# test = pd.read_csv('data/challenge_21_data_2/test_sample_stemmed.csv', index_col=None, engine='c')

## Create a test set
msk = np.random.rand(len(train)) < 0.75
X_train = train[msk]
X_test = train[~msk]

In [3]:
y_train = X_train.Author.factorize()[0]
y_test = X_test.Author.factorize()[0]
train_docs = list(X_train.StemmedSentence.values)
test_docs = list(X_test.StemmedSentence.values)

In [16]:
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
])


params_grid = {}
params_grid['vect__ngram_range'] = [(1, 1), (1, 2), (1, 3)]
params_grid['tfidf__use_idf'] = (True, False)
params_grid['clf__alpha'] = np.logspace(-10, 2, 20)

gs_clf = GridSearchCV(text_clf, params_grid, cv=5, n_jobs=40, verbose=1, scoring='accuracy')
gs_clf = gs_clf.fit(train_docs, y_train)
print("Score on train set %.3f"%gs_clf.score(train_docs, y_train))
print("Score on test set %.3f"%gs_clf.score(test_docs, y_test))

Fitting 5 folds for each of 120 candidates, totalling 600 fits


[Parallel(n_jobs=40)]: Done   1 jobs       | elapsed:    0.7s
[Parallel(n_jobs=40)]: Done  50 jobs       | elapsed:    6.7s
[Parallel(n_jobs=40)]: Done 200 jobs       | elapsed:   27.0s
[Parallel(n_jobs=40)]: Done 450 jobs       | elapsed:  1.0min
[Parallel(n_jobs=40)]: Done 522 out of 600 | elapsed:  1.2min remaining:   10.8s
[Parallel(n_jobs=40)]: Done 600 out of 600 | elapsed:  1.4min finished


Score on train set 0.992
Score on test set 0.784


In [17]:
gs_clf.best_estimator_

Pipeline(steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, charset=None,
        charset_error=None, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), prep...se_idf=True)), ('clf', MultinomialNB(alpha=0.016237767391887176, class_prior=None, fit_prior=True))])

In [14]:
## Feature creation
feat_engine = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2))),
                     ('tfidf', TfidfTransformer(use_idf=False)),
])

X_train_ngrams = feat_engine.fit_transform(train_docs)
X_test_ngrams = feat_engine.transform(test_docs)

params_grid = {}
params_grid['alpha'] = np.logspace(-5, -2, 30)
clf = GridSearchCV(MultinomialNB(), params_grid, cv=5, n_jobs=40, verbose=1, scoring='accuracy')
clf.fit(X_train_ngrams, y_train)
print("Score on train set %.3f"%clf.score(X_train_ngrams, y_train))
print("Score on test set %.3f"%clf.score(X_test_ngrams, y_test))

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=40)]: Done   1 jobs       | elapsed:    0.2s
[Parallel(n_jobs=40)]: Done  50 jobs       | elapsed:    2.1s
[Parallel(n_jobs=40)]: Done  72 out of 150 | elapsed:    3.0s remaining:    3.2s
[Parallel(n_jobs=40)]: Done 150 out of 150 | elapsed:    5.6s finished


Score on train set 0.988
Score on test set 0.788


In [15]:
clf.best_estimator_

MultinomialNB(alpha=0.0078804628156699131, class_prior=None, fit_prior=True)

In [18]:
## Feature creation
feat_engine = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2))),
                     ('tfidf', TfidfTransformer(use_idf=True)),
])

X_train_ngrams = feat_engine.fit_transform(train_docs)
X_test_ngrams = feat_engine.transform(test_docs)

params_grid = {}
params_grid['alpha'] = np.logspace(-5, -2, 30)
clf = GridSearchCV(MultinomialNB(), params_grid, cv=5, n_jobs=40, verbose=1, scoring='accuracy')
clf.fit(X_train_ngrams, y_train)
print("Score on train set %.3f"%clf.score(X_train_ngrams, y_train))
print("Score on test set %.3f"%clf.score(X_test_ngrams, y_test))

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=40)]: Done   1 jobs       | elapsed:    0.2s
[Parallel(n_jobs=40)]: Done  50 jobs       | elapsed:    2.1s
[Parallel(n_jobs=40)]: Done  72 out of 150 | elapsed:    2.8s remaining:    3.0s
[Parallel(n_jobs=40)]: Done 150 out of 150 | elapsed:    5.3s finished


Score on train set 0.993
Score on test set 0.782


In [19]:
clf.best_estimator_

MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)