In [1]:
import atexit
import shutil
import warnings
from functools import partial
from tempfile import mkdtemp

import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.externals.joblib import Memory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVC

from clean import ReviewCleaner

In [2]:
# Scratch directory backing the pipeline's transformer cache, so that fitted
# transformers can be reused instead of being recomputed.
cachedir = mkdtemp()
# mkdtemp() never deletes what it creates: remove the cache when the
# interpreter exits so repeated runs do not pile up stale directories.
# ignore_errors=True keeps shutdown quiet if the directory is already gone
# or a file in it is still locked.
atexit.register(shutil.rmtree, cachedir, ignore_errors=True)

# The warning filter is scoped to the Memory construction only.
# NOTE(review): presumably this hides a deprecation warning about the
# `cachedir` argument -- confirm against the installed joblib version.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    memory = Memory(cachedir=cachedir, verbose=10)

In [3]:
# Fixed seed shared by every stochastic step below, so that a fresh
# "Restart & Run All" reproduces the same scores.
SEED = 42

# Review cleaner (project-local transformer from clean.py).
cleaner = ReviewCleaner()
# TF-IDF vectoriser turning the cleaned text into a term matrix.
vectoriser = TfidfVectorizer()
# Truncated SVD compressing the term matrix to 100 components; seeded because
# its default solver is randomised.
decomposer = TruncatedSVD(n_components=100, random_state=SEED)
# Keep the 10 components scoring highest on mutual information with the
# target. The scorer is seeded through functools.partial rather than a
# notebook-defined wrapper so it stays picklable for parallel grid search.
selector = SelectKBest(partial(mutual_info_classif, random_state=SEED), k=10)
# Support vector classifier with default settings; C is tuned by grid search.
predictor = SVC()

In [5]:
# Read the labelled training set (tab-separated) and split it into the
# review text and a boolean sentiment target.
train = pd.read_csv("../data/raw/labeledTrainData.tsv", sep="\t")
X_train, y_train = train["review"], train["sentiment"].astype(bool)

In [6]:
# Read the test set (tab-separated); only the review text is used from it.
test = pd.read_csv("../data/raw/testData.tsv", sep="\t")
X_test = test.loc[:, "review"]

In [8]:
# Chain the five stages into a single estimator. `memory` is handed to the
# pipeline so that it can cache fitted transformers.
p = Pipeline(
    steps=[
        ("cleaner", cleaner),
        ("vectoriser", vectoriser),
        ("decomposer", decomposer),
        ("selector", selector),
        ("predictor", predictor),
    ],
    memory=memory,
)

In [9]:
# Search grid, keyed as "<step name>__<parameter>":
#   - unigrams only vs. unigrams plus bigrams for the vectoriser,
#   - five values of C for the predictor, from 10**-2 up to 10**2.
params = dict([
    ("vectoriser__ngram_range", [(1, 1), (1, 2)]),
    ("predictor__C", np.power(10., np.arange(-2, 3))),
])

In [10]:
# Grid search over `params` with 2-fold cross validation, using every
# available core (n_jobs=-1).
# return_train_score is set explicitly because its default differs between
# scikit-learn releases; the results table shown later in this notebook
# relies on the *_train_score columns being present.
learner = GridSearchCV(p, params, cv=2, n_jobs=-1, return_train_score=True)

In [13]:
# Fit the grid search on the first 100 training reviews only, to keep the
# learning step fast. All warnings raised while fitting are silenced.
with warnings.catch_warnings():
    warnings.simplefilter(action="ignore")
    learner.fit(X_train.iloc[:100], y_train.iloc[:100])

[Memory]0.0s, 0.0min    : Loading _fit_transform_one from C:\Users\Jan\AppData\Local\Temp\tmpqi1fny7a\joblib\sklearn\pipeline\_fit_transform_one\94f528d72cd78612791234c5d06eff36
___________________________________fit_transform_one cache loaded - 0.0s, 0.0min
________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None), 
0     stuff moment mj start listen music watch odd d...
1     classic war world timothy hines entertaining f...
2     film start manager

In [16]:
# Tabulate the grid-search benchmark as a DataFrame and display it.
# Warnings raised while reading the results are silenced.
with warnings.catch_warnings():
    warnings.simplefilter(action="ignore")
    cv_results = pd.DataFrame(data=learner.cv_results_)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_predictor__C,param_vectoriser__ngram_range,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,mean_train_score,std_train_score
0,0.077291,0.004488,2.083854,0.116115,0.01,"(1, 1)","{'predictor__C': 0.01, 'vectoriser__ngram_rang...",0.54902,0.55102,0.55,0.001,4,0.55102,0.54902,0.55002,0.001
1,0.171042,0.005485,2.092723,0.165722,0.01,"(1, 2)","{'predictor__C': 0.01, 'vectoriser__ngram_rang...",0.54902,0.55102,0.55,0.001,4,0.55102,0.54902,0.55002,0.001
2,0.079287,0.003491,2.096653,0.104976,0.1,"(1, 1)","{'predictor__C': 0.1, 'vectoriser__ngram_range...",0.54902,0.55102,0.55,0.001,4,0.55102,0.54902,0.55002,0.001
3,0.174035,0.003491,2.080418,0.109221,0.1,"(1, 2)","{'predictor__C': 0.1, 'vectoriser__ngram_range...",0.54902,0.55102,0.55,0.001,4,0.55102,0.54902,0.55002,0.001
4,0.13464,0.042885,2.344731,0.042885,1.0,"(1, 1)","{'predictor__C': 1.0, 'vectoriser__ngram_range...",0.54902,0.55102,0.55,0.001,4,0.55102,0.54902,0.55002,0.001
5,0.217915,0.02643,2.374651,0.072805,1.0,"(1, 2)","{'predictor__C': 1.0, 'vectoriser__ngram_range...",0.54902,0.55102,0.55,0.001,4,0.55102,0.54902,0.55002,0.001
6,0.126161,0.008478,2.365675,0.300197,10.0,"(1, 1)","{'predictor__C': 10.0, 'vectoriser__ngram_rang...",0.568627,0.612245,0.59,0.021804,1,0.673469,0.843137,0.758303,0.084834
7,0.186002,0.011469,2.383129,0.01845,10.0,"(1, 2)","{'predictor__C': 10.0, 'vectoriser__ngram_rang...",0.54902,0.612245,0.58,0.031606,2,0.673469,0.843137,0.758303,0.084834
8,0.083776,0.002993,1.541022,0.100731,100.0,"(1, 1)","{'predictor__C': 100.0, 'vectoriser__ngram_ran...",0.54902,0.612245,0.58,0.031606,2,0.857143,0.823529,0.840336,0.016807
9,0.185005,0.010473,1.432814,0.111205,100.0,"(1, 2)","{'predictor__C': 100.0, 'vectoriser__ngram_ran...",0.470588,0.632653,0.55,0.081016,4,0.714286,0.901961,0.808123,0.093838


In [17]:
# Predict the first 100 test reviews only, to keep the prediction step fast.
# Because the whole workflow is a pipeline, the raw review text can be passed
# in directly: all pre-processing happens inside the fitted estimator.
learner.predict(X_test.iloc[:100])

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True,  True,  True, False, False, False, False,
       False, False, False, False,  True, False,  True, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False])