In [7]:
from timeit import default_timer as now
import pandas as pd
from clean_documents import clean_text

In [2]:
movie_reviews = pd.read_csv("../data/raw/labeledTrainData.tsv", sep='\t')

In [3]:
movie_reviews.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [8]:
start_time = now()
movie_reviews["review"].head(250).apply(lambda x: clean_text(x))
elapsed_time = now() - start_time
print("elapsed time: {}".format(elapsed_time))

elapsed time: 5.375799406970125


In [9]:
movie_reviews["reviews_cleaned"] = movie_reviews["review"].apply(lambda x: clean_text(x))

In [10]:
movie_reviews.head()

Unnamed: 0,id,sentiment,review,reviews_cleaned
0,5814_8,1,With all this stuff going down at the moment w...,"([(stuff, n), (moment, n), (mj, n), (start, v)..."
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...","([(classic, a), (war, n), (world, n), (timothy..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,"([(film, n), (start, v), (manager, n), (nichol..."
3,3630_4,0,It must be assumed that those who praised this...,"([(must, None), (assume, v), (praise, v), (fil..."
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,"([(superbly, r), (trashy, a), (wondrously, r),..."


In [20]:
def merge_tokens(tokens):
    return " ".join([t[0] for t in tokens[0]]).strip()

In [21]:
movie_reviews["reviews_joined"] = movie_reviews["reviews_cleaned"].apply(lambda x: merge_tokens(x))

In [22]:
movie_reviews.head()

Unnamed: 0,id,sentiment,review,reviews_cleaned,reviews_joined
0,5814_8,1,With all this stuff going down at the moment w...,"([(stuff, n), (moment, n), (mj, n), (start, v)...",stuff moment mj start listen music watch odd d...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...","([(classic, a), (war, n), (world, n), (timothy...",classic war world timothy hines entertaining f...
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,"([(film, n), (start, v), (manager, n), (nichol...",film start manager nicholas bell give welcome ...
3,3630_4,0,It must be assumed that those who praised this...,"([(must, None), (assume, v), (praise, v), (fil...",must assume praise film great filmed opera eve...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,"([(superbly, r), (trashy, a), (wondrously, r),...",superbly trashy wondrously unpretentious explo...


In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [35]:
ngram_range = (1 , 2)
norm = None
smooth_idf = False

In [36]:
vectoriser = TfidfVectorizer(ngram_range=ngram_range, norm=norm, smooth_idf=smooth_idf)

In [37]:
corpus = vectoriser.fit_transform(movie_reviews["reviews_joined"])

In [38]:
corpus

<25000x1630939 sparse matrix of type '<class 'numpy.float64'>'
	with 4937499 stored elements in Compressed Sparse Row format>

In [39]:
from sklearn.decomposition import TruncatedSVD 

In [40]:
n_components = 100

In [41]:
decomposer = TruncatedSVD(n_components=n_components)

In [42]:
X = decomposer.fit_transform(corpus)

In [44]:
X.shape

(25000, 100)

In [50]:
decomposer.singular_values_

array([2056.60065707,  769.29656787,  755.28672767,  646.27436543,
        582.40302571,  572.245981  ,  570.60494743,  565.24810591,
        538.34307206,  530.86984375,  522.3481192 ,  516.5734089 ,
        514.47026609,  498.1590743 ,  493.47289022,  487.80321615,
        483.60361446,  482.85728634,  478.99239186,  477.12554608,
        473.1729996 ,  464.99302744,  461.10982126,  455.67162774,
        453.61288459,  453.29139898,  449.94314923,  447.05789299,
        445.94545623,  445.57960815,  442.40025821,  440.76131874,
        437.25399781,  435.63680119,  434.49635714,  432.69576422,
        432.23414143,  430.42972171,  429.16700691,  424.16544462,
        422.32106113,  421.04422438,  421.00710319,  419.51148914,
        418.43988486,  417.05865183,  414.37131869,  412.79085883,
        411.62234759,  410.38696957,  409.04637947,  406.95240364,
        404.43606369,  403.84161229,  401.39302232,  399.26464411,
        398.51863124,  397.79251264,  396.46845223,  394.65451

In [58]:
decomposer.components_[0, :].argsort()[-5:]

array([ 229523, 1256380,  824567,  943001,  529997], dtype=int64)

In [62]:
import numpy as np

In [79]:
np.array(vectoriser.get_feature_names())[decomposer.components_[1, :].argsort()[:20]]

array(['movie', 'bad', 'watch', 'like', 'think', 'see', 'bad movie',
       'watch movie', 'good', 'guy', 'see movie', 'plot', 'horror',
       'movie like', 'acting', 'movie ever', 'people', 'waste', 'zombie',
       'ever'], dtype='<U47')

In [46]:
decomposer.components_.shape

(100, 1630939)

In [65]:
y = movie_reviews["sentiment"].astype(bool)

In [67]:
y

0         True
1         True
2        False
3        False
4         True
5         True
6        False
7        False
8        False
9         True
10       False
11        True
12        True
13       False
14       False
15       False
16       False
17       False
18        True
19        True
20        True
21        True
22        True
23       False
24       False
25        True
26       False
27       False
28       False
29       False
         ...  
24970     True
24971     True
24972     True
24973    False
24974     True
24975     True
24976    False
24977     True
24978     True
24979     True
24980     True
24981     True
24982    False
24983    False
24984    False
24985    False
24986     True
24987     True
24988     True
24989     True
24990     True
24991    False
24992    False
24993    False
24994    False
24995    False
24996    False
24997    False
24998    False
24999     True
Name: sentiment, Length: 25000, dtype: bool

In [72]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

In [69]:
k = 10

In [74]:
selector = SelectKBest(mutual_info_classif, k=k)

In [75]:
X_selected = selector.fit_transform(X, y)

In [76]:
X_selected.shape

(25000, 10)

In [77]:
selector.scores_

array([0.00158282, 0.03934382, 0.03070673, 0.03895132, 0.04586751,
       0.00983597, 0.01488179, 0.01193097, 0.01088082, 0.00840758,
       0.01683045, 0.02133985, 0.05812433, 0.0046155 , 0.00724703,
       0.01849373, 0.01866172, 0.00041533, 0.00421381, 0.06223085,
       0.02822222, 0.00993962, 0.00390459, 0.00972193, 0.02970307,
       0.00192243, 0.00518444, 0.02126916, 0.01803394, 0.00502985,
       0.01379013, 0.008665  , 0.00538544, 0.00297313, 0.02136129,
       0.02786858, 0.02081371, 0.00848697, 0.02325845, 0.01309672,
       0.0056454 , 0.01553872, 0.00298788, 0.00486397, 0.00583571,
       0.01080769, 0.00234193, 0.01367626, 0.01551966, 0.0059782 ,
       0.01836166, 0.01990985, 0.01381916, 0.01983934, 0.00475297,
       0.005928  , 0.01188572, 0.01247442, 0.00482416, 0.00929658,
       0.01061125, 0.0091828 , 0.00434968, 0.00421067, 0.01045426,
       0.00514657, 0.0070923 , 0.00926964, 0.01149182, 0.0044816 ,
       0.00735433, 0.00730093, 0.00547162, 0.01189995, 0.00089

In [80]:
from sklearn.svm import SVC

In [81]:
classifier = SVC()

In [82]:
from sklearn.model_selection import GridSearchCV

In [84]:
params = {
    "C": 10.**np.arange(-2, 2)
}

In [90]:
learner = GridSearchCV(classifier, params, cv=5, n_jobs=-1)

In [91]:
learner.fit(X_selected, y)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'C': array([ 0.01,  0.1 ,  1.  , 10.  ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [93]:
pd.DataFrame(learner.cv_results_)



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,29.498992,0.223867,2.968413,0.044256,0.01,{'C': 0.01},0.6986,0.7132,0.7146,0.7174,...,0.7126,0.007307,4,0.71555,0.71235,0.71555,0.7141,0.71675,0.71486,0.00151
1,23.858834,0.898593,2.567462,0.046021,0.1,{'C': 0.1},0.739,0.7456,0.7448,0.757,...,0.74656,0.005834,3,0.7564,0.75515,0.7576,0.7534,0.75905,0.75632,0.001949
2,46.912519,0.3092,2.299872,0.041903,1.0,{'C': 1.0},0.771,0.781,0.7698,0.7836,...,0.776,0.005444,1,0.858,0.85745,0.8599,0.8572,0.8621,0.85893,0.001846
3,79.709571,8.732287,1.751825,0.365318,10.0,{'C': 10.0},0.7538,0.764,0.757,0.7654,...,0.75808,0.005835,2,0.9065,0.906,0.9052,0.9057,0.90725,0.90613,0.000701
