In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
# import libraries
import json
import gzip
import pandas as pd
import numpy as np
from tqdm import tqdm
import time
from collections import defaultdict

In [4]:
#generator to open json.gzip files
# yields single lines
def get_reviews(file):
    '''
    Generator will yield lines of the passed file
    '''
    with gzip.open(file, 'r') as f:
        for l in f:
            yield l
    f.close()

In [6]:
# file names
training = '/content/gdrive/My Drive/nf_capstone/train_set_text_edit.json.gz'
validation = '/content/gdrive/My Drive/nf_capstone/validation_set_text_edit.json.gz'
test = '/content/gdrive/My Drive/nf_capstone/test_set_text_edit.json.gz'


In [7]:
# import spoiler reviews from training set
# features to isolate
features = ['sentence_labels', 'lemmatized']

reviews = get_reviews(training)

feature_dict = defaultdict(list)    

# import only reviews written in english and reviews containing spoilers

for review in tqdm(reviews):
    review_dict = json.loads(review)
    language = review_dict.get('review_language_start')
    spoiler = review_dict.get('has_spoiler')
    
    if language != 'en': 
        continue
    elif language == 'en' and spoiler == 0:
        continue
    else:
        for f in features:
            feature_dict[f].append(review_dict.get(f))
df_spoiler_train = pd.DataFrame.from_dict(feature_dict)

964623it [01:23, 11524.50it/s]


In [7]:
df_spoiler_train.head()

Unnamed: 0,sentence_labels,lemmatized
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[review originally post step fiction, want reb..."
1,"[0, 1, 1, 0]",[interesting volume learn kira power actually ...
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]","[actually surprised enjoy book, completely for..."
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, ...",[ok date comment read book go assume eloisa ja...
4,[1],[good book series partly fantastic element]


In [8]:
# isolate sentences and labels

sentences = []
for review in tqdm(df_spoiler_train['lemmatized']):
    for sentence in review:
        sentences.append(sentence)

labels = []
for review in tqdm(df_spoiler_train['sentence_labels']):
    for label in review:
        labels.append(label)


df_sentences = pd.DataFrame({
    'has_spoiler': labels,
    'sentences': sentences
})

100%|██████████| 58428/58428 [00:00<00:00, 157139.86it/s]
100%|██████████| 58428/58428 [00:00<00:00, 255223.45it/s]


In [9]:
df_sentences.shape

(1385613, 2)

In [10]:
# import libraries
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# for modeling
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, BaggingClassifier 


In [12]:
# perform initial tests on smaller dataset
from sklearn.model_selection import train_test_split

random_state = 42

X = df_sentences['sentences'].iloc[:100000]
y = df_sentences['has_spoiler'].iloc[:100000]

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state = random_state)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=.15, random_state=random_state)

In [13]:
models = {}

models['SGD'] = SGDClassifier()
models['LinearSVC'] = LinearSVC(random_state=random_state) 
models['LogReg'] = LogisticRegression(n_jobs=-1, random_state=random_state)
models['MultinomialNB'] = MultinomialNB(alpha = 1e-6)
models['Ridge'] = RidgeClassifier(random_state=random_state)
models['D_Tree_CLF'] = DecisionTreeClassifier(random_state=random_state)

In [36]:
def test_clfs(models, X_train, y_train):
    classifier = []
    recall = []
    precision = []
    roc_auc = []
    f1_macro = []
    fit_time = []
    accuracy = []
    
    for name, clf in models.items():
        scores = cross_validate(clf, X_train, y_train, scoring = ['recall', 'accuracy', 'precision', 'f1_macro', 'roc_auc'], 
                                 cv = 5)
        classifier.append(clf)
        
        fit_time.append(f"{round(np.mean(scores['fit_time']),3)} +/- {round(np.std(scores['fit_time']),3)}")
        recall.append(f"{round(np.mean(scores['test_recall']),3)} +/- {round(np.std(scores['test_recall']),3)}")
        precision.append(f"{round(np.mean(scores['test_precision']),3)} +/- {round(np.std(scores['test_precision']),3)}")
        roc_auc.append(f"{round(np.mean(scores['test_roc_auc']),3)} +/- {round(np.std(scores['test_roc_auc']),3)}")
        f1_macro.append(f"{round(np.mean(scores['test_f1_macro']),3)} +/- {round(np.std(scores['test_f1_macro']),3)}")
        accuracy.append(f"{round(np.mean(scores['test_accuracy']),3)} +/- {round(np.std(scores['test_accuracy']),3)}")
        
        print(f'Done with {name}')
        
    classification_result = pd.DataFrame({
        'classifier':classifier,
        'fit_time': fit_time,
        'accuracy': accuracy,
        'recall': recall,
        'precision': precision,
        'roc_auc': roc_auc,
        'f1_macro': f1_macro})
    return classification_result

In [37]:
# innitiate vectorizer using standard settings
vectorizer = TfidfVectorizer()

# fit the transformer
tfidf = vectorizer.fit(X_train)

X_train_vec = tfidf.transform(X_train)

test = test_clfs(models, X_train_vec, y_train)
test

Done with SGD
Done with LinearSVC
Done with LogReg
Done with MultinomialNB
Done with Ridge
Done with D_Tree_CLF


Unnamed: 0,classifier,fit_time,accuracy,recall,precision,roc_auc,f1_macro
0,"SGDClassifier(alpha=0.0001, average=False, cla...",0.141 +/- 0.008,0.742 +/- 0.001,0.071 +/- 0.004,0.698 +/- 0.026,0.709 +/- 0.002,0.488 +/- 0.003
1,"LinearSVC(C=1.0, class_weight=None, dual=True,...",0.414 +/- 0.014,0.743 +/- 0.003,0.298 +/- 0.008,0.541 +/- 0.01,0.706 +/- 0.003,0.611 +/- 0.005
2,"LogisticRegression(C=1.0, class_weight=None, d...",1.397 +/- 0.317,0.752 +/- 0.003,0.202 +/- 0.007,0.622 +/- 0.018,0.728 +/- 0.003,0.577 +/- 0.006
3,"MultinomialNB(alpha=1e-06, class_prior=None, f...",0.017 +/- 0.001,0.741 +/- 0.003,0.251 +/- 0.005,0.54 +/- 0.014,0.671 +/- 0.008,0.591 +/- 0.005
4,"RidgeClassifier(alpha=1.0, class_weight=None, ...",0.207 +/- 0.01,0.749 +/- 0.003,0.265 +/- 0.008,0.573 +/- 0.011,0.712 +/- 0.004,0.603 +/- 0.006
5,"DecisionTreeClassifier(ccp_alpha=0.0, class_we...",33.712 +/- 0.662,0.69 +/- 0.003,0.318 +/- 0.006,0.404 +/- 0.007,0.578 +/- 0.003,0.576 +/- 0.004


In [38]:
# innitiate vectorizer using standard settings
vectorizer = CountVectorizer(min_df = 50, stop_words = 'english',ngram_range=(1,2) )

# fit the transformer
cvec = vectorizer.fit(X_train)

X_train_vec = cvec.transform(X_train)

test = test_clfs(models, X_train_vec, y_train)
test

Done with SGD




Done with LinearSVC
Done with LogReg
Done with MultinomialNB
Done with Ridge
Done with D_Tree_CLF


Unnamed: 0,classifier,fit_time,accuracy,recall,precision,roc_auc,f1_macro
0,"SGDClassifier(alpha=0.0001, average=False, cla...",0.155 +/- 0.01,0.743 +/- 0.003,0.122 +/- 0.005,0.619 +/- 0.024,0.677 +/- 0.002,0.525 +/- 0.005
1,"LinearSVC(C=1.0, class_weight=None, dual=True,...",7.279 +/- 0.157,0.745 +/- 0.003,0.182 +/- 0.006,0.587 +/- 0.014,0.698 +/- 0.003,0.561 +/- 0.005
2,"LogisticRegression(C=1.0, class_weight=None, d...",0.935 +/- 0.013,0.745 +/- 0.003,0.195 +/- 0.009,0.579 +/- 0.014,0.7 +/- 0.004,0.568 +/- 0.007
3,"MultinomialNB(alpha=1e-06, class_prior=None, f...",0.015 +/- 0.0,0.735 +/- 0.004,0.322 +/- 0.009,0.511 +/- 0.01,0.706 +/- 0.004,0.612 +/- 0.006
4,"RidgeClassifier(alpha=1.0, class_weight=None, ...",0.131 +/- 0.002,0.745 +/- 0.003,0.169 +/- 0.009,0.593 +/- 0.017,0.697 +/- 0.004,0.555 +/- 0.007
5,"DecisionTreeClassifier(ccp_alpha=0.0, class_we...",7.743 +/- 0.168,0.68 +/- 0.003,0.293 +/- 0.01,0.378 +/- 0.007,0.589 +/- 0.008,0.56 +/- 0.004


In [14]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(sampling_strategy='not majority', k_neighbors=5, random_state = 42, n_jobs = -1)





In [45]:
vectorizer = CountVectorizer(min_df = 50, stop_words = 'english',ngram_range=(1,2) )

cvec = vectorizer.fit(X_train)

X_train_vec = cvec.transform(X_train)

X_res, y_res = sm.fit_resample(X_train_vec, y_train)



In [46]:

test = test_clfs(models, X_res, y_res)
test

Done with SGD




Done with LinearSVC




Done with LogReg
Done with MultinomialNB
Done with Ridge
Done with D_Tree_CLF


Unnamed: 0,classifier,fit_time,accuracy,recall,precision,roc_auc,f1_macro
0,"SGDClassifier(alpha=0.0001, average=False, cla...",0.201 +/- 0.018,0.694 +/- 0.076,0.81 +/- 0.195,0.65 +/- 0.037,0.76 +/- 0.101,0.686 +/- 0.071
1,"LinearSVC(C=1.0, class_weight=None, dual=True,...",12.696 +/- 0.504,0.704 +/- 0.078,0.806 +/- 0.183,0.662 +/- 0.041,0.755 +/- 0.087,0.698 +/- 0.075
2,"LogisticRegression(C=1.0, class_weight=None, d...",1.419 +/- 0.265,0.706 +/- 0.081,0.792 +/- 0.19,0.668 +/- 0.042,0.756 +/- 0.087,0.701 +/- 0.078
3,"MultinomialNB(alpha=1e-06, class_prior=None, f...",0.021 +/- 0.003,0.652 +/- 0.023,0.661 +/- 0.083,0.649 +/- 0.006,0.697 +/- 0.008,0.651 +/- 0.022
4,"RidgeClassifier(alpha=1.0, class_weight=None, ...",0.155 +/- 0.01,0.702 +/- 0.079,0.816 +/- 0.176,0.657 +/- 0.043,0.753 +/- 0.09,0.696 +/- 0.075
5,"DecisionTreeClassifier(ccp_alpha=0.0, class_we...",9.534 +/- 1.214,0.699 +/- 0.083,0.817 +/- 0.172,0.654 +/- 0.048,0.709 +/- 0.072,0.694 +/- 0.079


In [16]:
# for full dataset
X = df_sentences['sentences']
y = df_sentences['has_spoiler']

#for SMOTEing
vectorizer = CountVectorizer(min_df = 50, stop_words = 'english',ngram_range=(1,2) )
cvec = vectorizer.fit(X)
X_vec = cvec.transform(X)
X_res, y_res = sm.fit_resample(X_vec, y)






In [15]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2

In [67]:
from sklearn.calibration import CalibratedClassifierCV

pipeline = Pipeline([
    ('vec', CountVectorizer(min_df = 100, stop_words = 'english', ngram_range = (1,2))),
    #('Smote', SVMSMOTE(sampling_strategy=0.5, k_neighbors=5, random_state = 42, n_jobs = -1)),
    ('KBest', SelectKBest(chi2, 10000)),
    ('MNB', MultinomialNB())], verbose = True
)
pipeline2 = Pipeline([
    
    ('KBest', SelectKBest(chi2, 10000)),
    ('Ridge',RidgeClassifier(random_state=random_state))], verbose = True
)
pipeline3 = Pipeline([
    
    ('KBest', SelectKBest(chi2, 10000)),
    ('SGD',SGDClassifier())], verbose = True
)


In [21]:
#fit classifier/pipelines
MNB_full = pipeline.fit(X,y)
print('Done with MNB pipeline')
Ridge_smote = pipeline2.fit(X_res, y_res)
print('Done with Ridge pipeline')
SGD_SMOTE = pipeline3.fit(X_res, y_res)
print('Done with SGD')

[Pipeline] ............... (step 1 of 3) Processing vec, total=  45.2s
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.4s
[Pipeline] ............... (step 3 of 3) Processing MNB, total=   0.3s
Done with MNB pipeline
[Pipeline] ............. (step 1 of 2) Processing KBest, total=   0.7s
[Pipeline] ............. (step 2 of 2) Processing Ridge, total=   7.2s
Done with Ridge pipeline
[Pipeline] ............. (step 1 of 2) Processing KBest, total=   0.7s
[Pipeline] ............... (step 2 of 2) Processing SGD, total=   6.4s
Done with SGD pipeline


In [74]:
X_vec

<1385613x20808 sparse matrix of type '<class 'numpy.int64'>'
	with 9305768 stored elements in Compressed Sparse Row format>

In [26]:
from sklearn.metrics import classification_report, confusion_matrix

In [27]:
# test multinomial Bayes on trainings data
confusion_matrix(y, prediction)

array([[862963, 154127],
       [212398, 156125]])

In [28]:
print(classification_report(y, prediction))

              precision    recall  f1-score   support

           0       0.80      0.85      0.82   1017090
           1       0.50      0.42      0.46    368523

    accuracy                           0.74   1385613
   macro avg       0.65      0.64      0.64   1385613
weighted avg       0.72      0.74      0.73   1385613



In [29]:
# test ridge classifier on trainings data 
confusion_matrix(y, prediction2)

array([[619095, 397995],
       [127595, 240928]])

In [30]:
print(classification_report(y, prediction2))

              precision    recall  f1-score   support

           0       0.83      0.61      0.70   1017090
           1       0.38      0.65      0.48    368523

    accuracy                           0.62   1385613
   macro avg       0.60      0.63      0.59   1385613
weighted avg       0.71      0.62      0.64   1385613



In [31]:
# test SGD classifier 
confusion_matrix(y, prediction3)

array([[591575, 425515],
       [125224, 243299]])

In [32]:
print(classification_report(y, prediction3))

              precision    recall  f1-score   support

           0       0.83      0.58      0.68   1017090
           1       0.36      0.66      0.47    368523

    accuracy                           0.60   1385613
   macro avg       0.59      0.62      0.58   1385613
weighted avg       0.70      0.60      0.63   1385613



In [33]:
# import spoiler reviews from validation set
# features to isolate
features = ['sentence_labels', 'lemmatized']

reviews = get_reviews(validation)

feature_dict = defaultdict(list)    

# import only reviews written in english and reviews containing spoilers

for review in tqdm(reviews):
    review_dict = json.loads(review)
    language = review_dict.get('review_language_start')
    spoiler = review_dict.get('has_spoiler')
    
    if language != 'en': 
        continue
    elif language == 'en' and spoiler == 0:
        continue
    else:
        for f in features:
            feature_dict[f].append(review_dict.get(f))
df_spoiler_val = pd.DataFrame.from_dict(feature_dict)

275606it [00:24, 11335.91it/s]


In [34]:
sentences_val = []
for review in tqdm(df_spoiler_val['lemmatized']):
    for sentence in review:
        sentences_val.append(sentence)

labels_val = []
for review in tqdm(df_spoiler_val['sentence_labels']):
    for label in review:
        labels_val.append(label)

100%|██████████| 16521/16521 [00:00<00:00, 168599.18it/s]
100%|██████████| 16521/16521 [00:00<00:00, 269866.25it/s]


In [36]:
X_val_vec = cvec.transform(sentences_val)
predict_val = MNB_full.predict(sentences_val)
predict_val2 = Ridge_smote.predict(X_val_vec)
predict_val2 = SGD_SMOTE.predict(X_val_vec)

In [37]:
confusion_matrix(labels_val, predict_val)

array([[241181,  42644],
       [ 61961,  41703]])

In [41]:
print(classification_report(labels_val, predict_val))

              precision    recall  f1-score   support

           0       0.80      0.85      0.82    283825
           1       0.49      0.40      0.44    103664

    accuracy                           0.73    387489
   macro avg       0.65      0.63      0.63    387489
weighted avg       0.72      0.73      0.72    387489



In [42]:
confusion_matrix(labels_val, predict_val2)


array([[163772, 120053],
       [ 35784,  67880]])

In [43]:
print(classification_report(labels_val, predict_val2))

              precision    recall  f1-score   support

           0       0.82      0.58      0.68    283825
           1       0.36      0.65      0.47    103664

    accuracy                           0.60    387489
   macro avg       0.59      0.62      0.57    387489
weighted avg       0.70      0.60      0.62    387489



In [None]:
# test SGD classifier on trainingsdata

In [None]:
# pipiline grid_search 
from sklearn.feature_selection import f_classif, mutual_info_classif

param_grid = {
    'vec__ngram_range':[(1,1),(1,2), (2,2),(1,3), (3,3)],
    #'KBest__score_func':[f_classif, chi2, mutual_info_classif],
    'KBest__k': [1000,5000,10000],
    'MNB__alpha': [1, 1e-3, 1e-6, 1e-9]
  }


In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

grid_MNB = GridSearchCV(pipeline, param_grid = param_grid, cv = 3, scoring = 'recall', verbose = 7)
grid_MNB.fit(X,y)



Fitting 3 folds for each of 60 candidates, totalling 180 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] KBest__k=1000, MNB__alpha=1, vec__ngram_range=(1, 1) ............
[Pipeline] ............... (step 1 of 3) Processing vec, total=   7.8s
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.3s
[Pipeline] ............... (step 3 of 3) Processing MNB, total=   0.2s
[CV]  KBest__k=1000, MNB__alpha=1, vec__ngram_range=(1, 1), score=0.302, total=  12.5s
[CV] KBest__k=1000, MNB__alpha=1, vec__ngram_range=(1, 1) ............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   12.6s remaining:    0.0s


[Pipeline] ............... (step 1 of 3) Processing vec, total=   7.8s
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.3s
[Pipeline] ............... (step 3 of 3) Processing MNB, total=   0.2s
[CV]  KBest__k=1000, MNB__alpha=1, vec__ngram_range=(1, 1), score=0.298, total=  12.6s
[CV] KBest__k=1000, MNB__alpha=1, vec__ngram_range=(1, 1) ............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   25.2s remaining:    0.0s


[Pipeline] ............... (step 1 of 3) Processing vec, total=   8.2s
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.3s
[Pipeline] ............... (step 3 of 3) Processing MNB, total=   0.2s
[CV]  KBest__k=1000, MNB__alpha=1, vec__ngram_range=(1, 1), score=0.300, total=  13.0s
[CV] KBest__k=1000, MNB__alpha=1, vec__ngram_range=(1, 2) ............


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   38.2s remaining:    0.0s


[Pipeline] ............... (step 1 of 3) Processing vec, total=  32.5s
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.3s
[Pipeline] ............... (step 3 of 3) Processing MNB, total=   0.2s
[CV]  KBest__k=1000, MNB__alpha=1, vec__ngram_range=(1, 2), score=0.320, total=  39.9s


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  1.3min remaining:    0.0s


[CV] KBest__k=1000, MNB__alpha=1, vec__ngram_range=(1, 2) ............
[Pipeline] ............... (step 1 of 3) Processing vec, total=  30.3s
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.3s
[Pipeline] ............... (step 3 of 3) Processing MNB, total=   0.2s
[CV]  KBest__k=1000, MNB__alpha=1, vec__ngram_range=(1, 2), score=0.313, total=  37.9s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.9min remaining:    0.0s


[CV] KBest__k=1000, MNB__alpha=1, vec__ngram_range=(1, 2) ............
[Pipeline] ............... (step 1 of 3) Processing vec, total=  29.7s
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.3s
[Pipeline] ............... (step 3 of 3) Processing MNB, total=   0.2s
[CV]  KBest__k=1000, MNB__alpha=1, vec__ngram_range=(1, 2), score=0.317, total=  37.2s


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  2.6min remaining:    0.0s


[CV] KBest__k=1000, MNB__alpha=1, vec__ngram_range=(2, 2) ............
[Pipeline] ............... (step 1 of 3) Processing vec, total=  25.1s
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.2s
[Pipeline] ............... (step 3 of 3) Processing MNB, total=   0.2s
[CV]  KBest__k=1000, MNB__alpha=1, vec__ngram_range=(2, 2), score=0.066, total=  30.6s
[CV] KBest__k=1000, MNB__alpha=1, vec__ngram_range=(2, 2) ............
[Pipeline] ............... (step 1 of 3) Processing vec, total=  25.5s
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.2s
[Pipeline] ............... (step 3 of 3) Processing MNB, total=   0.2s
[CV]  KBest__k=1000, MNB__alpha=1, vec__ngram_range=(2, 2), score=0.066, total=  31.0s
[CV] KBest__k=1000, MNB__alpha=1, vec__ngram_range=(2, 2) ............
[Pipeline] ............... (step 1 of 3) Processing vec, total=  25.1s
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.2s
[Pipeline] ............... (step 3 of 3) Proc

ValueError: k should be >=0, <= n_features = 99; got 1000. Use k='all' to return all features.



[CV] KBest__k=1000, MNB__alpha=1, vec__ngram_range=(3, 3) ............
[Pipeline] ............... (step 1 of 3) Processing vec, total=  34.3s
[CV]  KBest__k=1000, MNB__alpha=1, vec__ngram_range=(3, 3), score=nan, total=  34.5s
[CV] KBest__k=1000, MNB__alpha=1, vec__ngram_range=(3, 3) ............


ValueError: k should be >=0, <= n_features = 103; got 1000. Use k='all' to return all features.



[Pipeline] ............... (step 1 of 3) Processing vec, total=  33.9s
[CV]  KBest__k=1000, MNB__alpha=1, vec__ngram_range=(3, 3), score=nan, total=  34.1s
[CV] KBest__k=1000, MNB__alpha=0.001, vec__ngram_range=(1, 1) ........


ValueError: k should be >=0, <= n_features = 108; got 1000. Use k='all' to return all features.



[Pipeline] ............... (step 1 of 3) Processing vec, total=   7.7s
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.3s
[Pipeline] ............... (step 3 of 3) Processing MNB, total=   0.2s
[CV]  KBest__k=1000, MNB__alpha=0.001, vec__ngram_range=(1, 1), score=0.303, total=  12.5s
[CV] KBest__k=1000, MNB__alpha=0.001, vec__ngram_range=(1, 1) ........
[Pipeline] ............... (step 1 of 3) Processing vec, total=   7.8s
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.3s
[Pipeline] ............... (step 3 of 3) Processing MNB, total=   0.2s
[CV]  KBest__k=1000, MNB__alpha=0.001, vec__ngram_range=(1, 1), score=0.298, total=  12.5s
[CV] KBest__k=1000, MNB__alpha=0.001, vec__ngram_range=(1, 1) ........
[Pipeline] ............... (step 1 of 3) Processing vec, total=   7.8s
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.3s
[Pipeline] ............... (step 3 of 3) Processing MNB, total=   0.2s
[CV]  KBest__k=1000, MNB__alpha=0.001

ValueError: k should be >=0, <= n_features = 99; got 1000. Use k='all' to return all features.



[Pipeline] ............... (step 1 of 3) Processing vec, total=  33.7s
[CV]  KBest__k=1000, MNB__alpha=0.001, vec__ngram_range=(3, 3), score=nan, total=  33.8s


ValueError: k should be >=0, <= n_features = 103; got 1000. Use k='all' to return all features.



[CV] KBest__k=1000, MNB__alpha=0.001, vec__ngram_range=(3, 3) ........
[Pipeline] ............... (step 1 of 3) Processing vec, total=  33.7s
[CV]  KBest__k=1000, MNB__alpha=0.001, vec__ngram_range=(3, 3), score=nan, total=  33.8s


ValueError: k should be >=0, <= n_features = 108; got 1000. Use k='all' to return all features.



[CV] KBest__k=1000, MNB__alpha=1e-06, vec__ngram_range=(1, 1) ........
[Pipeline] ............... (step 1 of 3) Processing vec, total=   7.7s
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.3s
[Pipeline] ............... (step 3 of 3) Processing MNB, total=   0.2s
[CV]  KBest__k=1000, MNB__alpha=1e-06, vec__ngram_range=(1, 1), score=0.303, total=  12.4s
[CV] KBest__k=1000, MNB__alpha=1e-06, vec__ngram_range=(1, 1) ........
[Pipeline] ............... (step 1 of 3) Processing vec, total=   7.8s
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.3s
[Pipeline] ............... (step 3 of 3) Processing MNB, total=   0.2s
[CV]  KBest__k=1000, MNB__alpha=1e-06, vec__ngram_range=(1, 1), score=0.298, total=  12.4s
[CV] KBest__k=1000, MNB__alpha=1e-06, vec__ngram_range=(1, 1) ........
[Pipeline] ............... (step 1 of 3) Processing vec, total=   8.3s
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.3s
[Pipeline] ............... (step 3 of

ValueError: k should be >=0, <= n_features = 99; got 1000. Use k='all' to return all features.



[Pipeline] ............... (step 1 of 3) Processing vec, total=  35.1s
[CV]  KBest__k=1000, MNB__alpha=1e-06, vec__ngram_range=(3, 3), score=nan, total=  35.2s
[CV] KBest__k=1000, MNB__alpha=1e-06, vec__ngram_range=(3, 3) ........


ValueError: k should be >=0, <= n_features = 103; got 1000. Use k='all' to return all features.



[Pipeline] ............... (step 1 of 3) Processing vec, total=  36.2s
[CV]  KBest__k=1000, MNB__alpha=1e-06, vec__ngram_range=(3, 3), score=nan, total=  36.3s


ValueError: k should be >=0, <= n_features = 108; got 1000. Use k='all' to return all features.



[CV] KBest__k=1000, MNB__alpha=1e-09, vec__ngram_range=(1, 1) ........
[Pipeline] ............... (step 1 of 3) Processing vec, total=   8.4s
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.3s
[Pipeline] ............... (step 3 of 3) Processing MNB, total=   0.2s
[CV]  KBest__k=1000, MNB__alpha=1e-09, vec__ngram_range=(1, 1), score=0.303, total=  13.2s
[CV] KBest__k=1000, MNB__alpha=1e-09, vec__ngram_range=(1, 1) ........
[Pipeline] ............... (step 1 of 3) Processing vec, total=   7.8s
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.3s
[Pipeline] ............... (step 3 of 3) Processing MNB, total=   0.2s
[CV]  KBest__k=1000, MNB__alpha=1e-09, vec__ngram_range=(1, 1), score=0.298, total=  12.5s
[CV] KBest__k=1000, MNB__alpha=1e-09, vec__ngram_range=(1, 1) ........
[Pipeline] ............... (step 1 of 3) Processing vec, total=   7.9s
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.3s
[Pipeline] ............... (step 3 of

ValueError: k should be >=0, <= n_features = 99; got 1000. Use k='all' to return all features.



[CV] KBest__k=1000, MNB__alpha=1e-09, vec__ngram_range=(3, 3) ........
[Pipeline] ............... (step 1 of 3) Processing vec, total=  34.0s
[CV]  KBest__k=1000, MNB__alpha=1e-09, vec__ngram_range=(3, 3), score=nan, total=  34.1s


ValueError: k should be >=0, <= n_features = 103; got 1000. Use k='all' to return all features.



[CV] KBest__k=1000, MNB__alpha=1e-09, vec__ngram_range=(3, 3) ........
[Pipeline] ............... (step 1 of 3) Processing vec, total=  34.1s
[CV]  KBest__k=1000, MNB__alpha=1e-09, vec__ngram_range=(3, 3), score=nan, total=  34.2s


ValueError: k should be >=0, <= n_features = 108; got 1000. Use k='all' to return all features.



[CV] KBest__k=5000, MNB__alpha=1, vec__ngram_range=(1, 1) ............
[Pipeline] ............... (step 1 of 3) Processing vec, total=   7.8s
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.3s
[Pipeline] ............... (step 3 of 3) Processing MNB, total=   0.2s
[CV]  KBest__k=5000, MNB__alpha=1, vec__ngram_range=(1, 1), score=0.348, total=  12.5s
[CV] KBest__k=5000, MNB__alpha=1, vec__ngram_range=(1, 1) ............
[Pipeline] ............... (step 1 of 3) Processing vec, total=   7.8s
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.3s
[Pipeline] ............... (step 3 of 3) Processing MNB, total=   0.2s
[CV]  KBest__k=5000, MNB__alpha=1, vec__ngram_range=(1, 1), score=0.342, total=  12.5s
[CV] KBest__k=5000, MNB__alpha=1, vec__ngram_range=(1, 1) ............
[Pipeline] ............... (step 1 of 3) Processing vec, total=   7.8s
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.3s
[Pipeline] ............... (step 3 of 3) Proc

ValueError: k should be >=0, <= n_features = 2330; got 5000. Use k='all' to return all features.



[Pipeline] ............... (step 1 of 3) Processing vec, total=  27.7s
[CV]  KBest__k=5000, MNB__alpha=1, vec__ngram_range=(2, 2), score=nan, total=  27.8s


ValueError: k should be >=0, <= n_features = 2346; got 5000. Use k='all' to return all features.



[CV] KBest__k=5000, MNB__alpha=1, vec__ngram_range=(2, 2) ............
[Pipeline] ............... (step 1 of 3) Processing vec, total=  25.8s
[CV]  KBest__k=5000, MNB__alpha=1, vec__ngram_range=(2, 2), score=nan, total=  25.9s


ValueError: k should be >=0, <= n_features = 2363; got 5000. Use k='all' to return all features.



[CV] KBest__k=5000, MNB__alpha=1, vec__ngram_range=(1, 3) ............
[Pipeline] ............... (step 1 of 3) Processing vec, total= 1.1min
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.3s
[Pipeline] ............... (step 3 of 3) Processing MNB, total=   0.2s
[CV]  KBest__k=5000, MNB__alpha=1, vec__ngram_range=(1, 3), score=0.401, total= 1.2min
[CV] KBest__k=5000, MNB__alpha=1, vec__ngram_range=(1, 3) ............
[Pipeline] ............... (step 1 of 3) Processing vec, total= 1.1min
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.3s
[Pipeline] ............... (step 3 of 3) Processing MNB, total=   0.2s
[CV]  KBest__k=5000, MNB__alpha=1, vec__ngram_range=(1, 3), score=0.394, total= 1.2min
[CV] KBest__k=5000, MNB__alpha=1, vec__ngram_range=(1, 3) ............
[Pipeline] ............... (step 1 of 3) Processing vec, total= 1.1min
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.3s
[Pipeline] ............... (step 3 of 3) Proc

ValueError: k should be >=0, <= n_features = 99; got 5000. Use k='all' to return all features.



[Pipeline] ............... (step 1 of 3) Processing vec, total=  35.9s
[CV]  KBest__k=5000, MNB__alpha=1, vec__ngram_range=(3, 3), score=nan, total=  36.0s
[CV] KBest__k=5000, MNB__alpha=1, vec__ngram_range=(3, 3) ............


ValueError: k should be >=0, <= n_features = 103; got 5000. Use k='all' to return all features.



[Pipeline] ............... (step 1 of 3) Processing vec, total=  35.8s
[CV]  KBest__k=5000, MNB__alpha=1, vec__ngram_range=(3, 3), score=nan, total=  35.9s


ValueError: k should be >=0, <= n_features = 108; got 5000. Use k='all' to return all features.



[CV] KBest__k=5000, MNB__alpha=0.001, vec__ngram_range=(1, 1) ........
[Pipeline] ............... (step 1 of 3) Processing vec, total=   7.8s
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.3s
[Pipeline] ............... (step 3 of 3) Processing MNB, total=   0.2s
[CV]  KBest__k=5000, MNB__alpha=0.001, vec__ngram_range=(1, 1), score=0.349, total=  12.5s
[CV] KBest__k=5000, MNB__alpha=0.001, vec__ngram_range=(1, 1) ........
[Pipeline] ............... (step 1 of 3) Processing vec, total=   7.8s
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.3s
[Pipeline] ............... (step 3 of 3) Processing MNB, total=   0.2s
[CV]  KBest__k=5000, MNB__alpha=0.001, vec__ngram_range=(1, 1), score=0.342, total=  12.6s
[CV] KBest__k=5000, MNB__alpha=0.001, vec__ngram_range=(1, 1) ........
[Pipeline] ............... (step 1 of 3) Processing vec, total=   7.8s
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.3s
[Pipeline] ............... (step 3 of

ValueError: k should be >=0, <= n_features = 2330; got 5000. Use k='all' to return all features.



[CV] KBest__k=5000, MNB__alpha=0.001, vec__ngram_range=(2, 2) ........
[Pipeline] ............... (step 1 of 3) Processing vec, total=  25.4s
[CV]  KBest__k=5000, MNB__alpha=0.001, vec__ngram_range=(2, 2), score=nan, total=  25.5s


ValueError: k should be >=0, <= n_features = 2346; got 5000. Use k='all' to return all features.



[CV] KBest__k=5000, MNB__alpha=0.001, vec__ngram_range=(2, 2) ........
[Pipeline] ............... (step 1 of 3) Processing vec, total=  25.6s
[CV]  KBest__k=5000, MNB__alpha=0.001, vec__ngram_range=(2, 2), score=nan, total=  25.7s
[CV] KBest__k=5000, MNB__alpha=0.001, vec__ngram_range=(1, 3) ........


ValueError: k should be >=0, <= n_features = 2363; got 5000. Use k='all' to return all features.



[Pipeline] ............... (step 1 of 3) Processing vec, total= 1.1min
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.3s
[Pipeline] ............... (step 3 of 3) Processing MNB, total=   0.2s
[CV]  KBest__k=5000, MNB__alpha=0.001, vec__ngram_range=(1, 3), score=0.402, total= 1.3min
[CV] KBest__k=5000, MNB__alpha=0.001, vec__ngram_range=(1, 3) ........
[Pipeline] ............... (step 1 of 3) Processing vec, total= 1.0min
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.3s
[Pipeline] ............... (step 3 of 3) Processing MNB, total=   0.2s
[CV]  KBest__k=5000, MNB__alpha=0.001, vec__ngram_range=(1, 3), score=0.395, total= 1.2min
[CV] KBest__k=5000, MNB__alpha=0.001, vec__ngram_range=(1, 3) ........
[Pipeline] ............... (step 1 of 3) Processing vec, total= 1.0min
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.3s
[Pipeline] ............... (step 3 of 3) Processing MNB, total=   0.2s
[CV]  KBest__k=5000, MNB__alpha=0.001

ValueError: k should be >=0, <= n_features = 99; got 5000. Use k='all' to return all features.



[CV] KBest__k=5000, MNB__alpha=0.001, vec__ngram_range=(3, 3) ........
[Pipeline] ............... (step 1 of 3) Processing vec, total=  34.4s
[CV]  KBest__k=5000, MNB__alpha=0.001, vec__ngram_range=(3, 3), score=nan, total=  34.5s
[CV] KBest__k=5000, MNB__alpha=0.001, vec__ngram_range=(3, 3) ........


ValueError: k should be >=0, <= n_features = 103; got 5000. Use k='all' to return all features.



[Pipeline] ............... (step 1 of 3) Processing vec, total=  33.8s
[CV]  KBest__k=5000, MNB__alpha=0.001, vec__ngram_range=(3, 3), score=nan, total=  33.9s


ValueError: k should be >=0, <= n_features = 108; got 5000. Use k='all' to return all features.



[CV] KBest__k=5000, MNB__alpha=1e-06, vec__ngram_range=(1, 1) ........
[Pipeline] ............... (step 1 of 3) Processing vec, total=   7.7s
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.3s
[Pipeline] ............... (step 3 of 3) Processing MNB, total=   0.2s
[CV]  KBest__k=5000, MNB__alpha=1e-06, vec__ngram_range=(1, 1), score=0.349, total=  12.4s
[CV] KBest__k=5000, MNB__alpha=1e-06, vec__ngram_range=(1, 1) ........
[Pipeline] ............... (step 1 of 3) Processing vec, total=   7.7s
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.3s
[Pipeline] ............... (step 3 of 3) Processing MNB, total=   0.2s
[CV]  KBest__k=5000, MNB__alpha=1e-06, vec__ngram_range=(1, 1), score=0.342, total=  12.4s
[CV] KBest__k=5000, MNB__alpha=1e-06, vec__ngram_range=(1, 1) ........
[Pipeline] ............... (step 1 of 3) Processing vec, total=   7.8s
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.3s
[Pipeline] ............... (step 3 of

ValueError: k should be >=0, <= n_features = 2330; got 5000. Use k='all' to return all features.



[Pipeline] ............... (step 1 of 3) Processing vec, total=  25.1s
[CV]  KBest__k=5000, MNB__alpha=1e-06, vec__ngram_range=(2, 2), score=nan, total=  25.2s
[CV] KBest__k=5000, MNB__alpha=1e-06, vec__ngram_range=(2, 2) ........


ValueError: k should be >=0, <= n_features = 2346; got 5000. Use k='all' to return all features.



[Pipeline] ............... (step 1 of 3) Processing vec, total=  25.5s
[CV]  KBest__k=5000, MNB__alpha=1e-06, vec__ngram_range=(2, 2), score=nan, total=  25.6s


ValueError: k should be >=0, <= n_features = 2363; got 5000. Use k='all' to return all features.



[CV] KBest__k=5000, MNB__alpha=1e-06, vec__ngram_range=(1, 3) ........
[Pipeline] ............... (step 1 of 3) Processing vec, total= 1.0min
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.3s
[Pipeline] ............... (step 3 of 3) Processing MNB, total=   0.2s
[CV]  KBest__k=5000, MNB__alpha=1e-06, vec__ngram_range=(1, 3), score=0.402, total= 1.2min
[CV] KBest__k=5000, MNB__alpha=1e-06, vec__ngram_range=(1, 3) ........
[Pipeline] ............... (step 1 of 3) Processing vec, total= 1.1min
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.3s
[Pipeline] ............... (step 3 of 3) Processing MNB, total=   0.2s
[CV]  KBest__k=5000, MNB__alpha=1e-06, vec__ngram_range=(1, 3), score=0.395, total= 1.3min
[CV] KBest__k=5000, MNB__alpha=1e-06, vec__ngram_range=(1, 3) ........
[Pipeline] ............... (step 1 of 3) Processing vec, total= 1.0min
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.3s
[Pipeline] ............... (step 3 of

ValueError: k should be >=0, <= n_features = 99; got 5000. Use k='all' to return all features.



[CV] KBest__k=5000, MNB__alpha=1e-06, vec__ngram_range=(3, 3) ........
[Pipeline] ............... (step 1 of 3) Processing vec, total=  33.7s
[CV]  KBest__k=5000, MNB__alpha=1e-06, vec__ngram_range=(3, 3), score=nan, total=  33.8s


ValueError: k should be >=0, <= n_features = 103; got 5000. Use k='all' to return all features.



[CV] KBest__k=5000, MNB__alpha=1e-06, vec__ngram_range=(3, 3) ........
[Pipeline] ............... (step 1 of 3) Processing vec, total=  34.4s
[CV]  KBest__k=5000, MNB__alpha=1e-06, vec__ngram_range=(3, 3), score=nan, total=  34.6s


ValueError: k should be >=0, <= n_features = 108; got 5000. Use k='all' to return all features.



[CV] KBest__k=5000, MNB__alpha=1e-09, vec__ngram_range=(1, 1) ........
[Pipeline] ............... (step 1 of 3) Processing vec, total=   7.7s
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.3s
[Pipeline] ............... (step 3 of 3) Processing MNB, total=   0.2s
[CV]  KBest__k=5000, MNB__alpha=1e-09, vec__ngram_range=(1, 1), score=0.349, total=  12.3s
[CV] KBest__k=5000, MNB__alpha=1e-09, vec__ngram_range=(1, 1) ........
[Pipeline] ............... (step 1 of 3) Processing vec, total=   7.8s
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.3s
[Pipeline] ............... (step 3 of 3) Processing MNB, total=   0.2s
[CV]  KBest__k=5000, MNB__alpha=1e-09, vec__ngram_range=(1, 1), score=0.342, total=  12.4s
[CV] KBest__k=5000, MNB__alpha=1e-09, vec__ngram_range=(1, 1) ........
[Pipeline] ............... (step 1 of 3) Processing vec, total=   7.9s
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.3s
[Pipeline] ............... (step 3 of

ValueError: k should be >=0, <= n_features = 2330; got 5000. Use k='all' to return all features.



[CV] KBest__k=5000, MNB__alpha=1e-09, vec__ngram_range=(2, 2) ........
[Pipeline] ............... (step 1 of 3) Processing vec, total=  25.5s
[CV]  KBest__k=5000, MNB__alpha=1e-09, vec__ngram_range=(2, 2), score=nan, total=  25.6s


ValueError: k should be >=0, <= n_features = 2346; got 5000. Use k='all' to return all features.



[CV] KBest__k=5000, MNB__alpha=1e-09, vec__ngram_range=(2, 2) ........
[Pipeline] ............... (step 1 of 3) Processing vec, total=  25.5s
[CV]  KBest__k=5000, MNB__alpha=1e-09, vec__ngram_range=(2, 2), score=nan, total=  25.6s


ValueError: k should be >=0, <= n_features = 2363; got 5000. Use k='all' to return all features.



[CV] KBest__k=5000, MNB__alpha=1e-09, vec__ngram_range=(1, 3) ........
[Pipeline] ............... (step 1 of 3) Processing vec, total= 1.0min
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.3s
[Pipeline] ............... (step 3 of 3) Processing MNB, total=   0.2s
[CV]  KBest__k=5000, MNB__alpha=1e-09, vec__ngram_range=(1, 3), score=0.402, total= 1.2min
[CV] KBest__k=5000, MNB__alpha=1e-09, vec__ngram_range=(1, 3) ........
[Pipeline] ............... (step 1 of 3) Processing vec, total= 1.1min
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.6s
[Pipeline] ............... (step 3 of 3) Processing MNB, total=   0.5s
[CV]  KBest__k=5000, MNB__alpha=1e-09, vec__ngram_range=(1, 3), score=0.395, total= 1.2min
[CV] KBest__k=5000, MNB__alpha=1e-09, vec__ngram_range=(1, 3) ........
[Pipeline] ............... (step 1 of 3) Processing vec, total= 1.1min
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.3s
[Pipeline] ............... (step 3 of

ValueError: k should be >=0, <= n_features = 99; got 5000. Use k='all' to return all features.



[CV] KBest__k=5000, MNB__alpha=1e-09, vec__ngram_range=(3, 3) ........
[Pipeline] ............... (step 1 of 3) Processing vec, total=  34.9s
[CV]  KBest__k=5000, MNB__alpha=1e-09, vec__ngram_range=(3, 3), score=nan, total=  35.0s


ValueError: k should be >=0, <= n_features = 103; got 5000. Use k='all' to return all features.



[CV] KBest__k=5000, MNB__alpha=1e-09, vec__ngram_range=(3, 3) ........
[Pipeline] ............... (step 1 of 3) Processing vec, total=  35.1s
[CV]  KBest__k=5000, MNB__alpha=1e-09, vec__ngram_range=(3, 3), score=nan, total=  35.2s


ValueError: k should be >=0, <= n_features = 108; got 5000. Use k='all' to return all features.



[CV] KBest__k=10000, MNB__alpha=1, vec__ngram_range=(1, 1) ...........
[Pipeline] ............... (step 1 of 3) Processing vec, total=   7.8s
[CV]  KBest__k=10000, MNB__alpha=1, vec__ngram_range=(1, 1), score=nan, total=   7.9s
[CV] KBest__k=10000, MNB__alpha=1, vec__ngram_range=(1, 1) ...........


ValueError: k should be >=0, <= n_features = 5236; got 10000. Use k='all' to return all features.



[Pipeline] ............... (step 1 of 3) Processing vec, total=   7.8s
[CV]  KBest__k=10000, MNB__alpha=1, vec__ngram_range=(1, 1), score=nan, total=   7.9s
[CV] KBest__k=10000, MNB__alpha=1, vec__ngram_range=(1, 1) ...........


ValueError: k should be >=0, <= n_features = 5241; got 10000. Use k='all' to return all features.



[Pipeline] ............... (step 1 of 3) Processing vec, total=   7.8s
[CV]  KBest__k=10000, MNB__alpha=1, vec__ngram_range=(1, 1), score=nan, total=   7.9s
[CV] KBest__k=10000, MNB__alpha=1, vec__ngram_range=(1, 2) ...........


ValueError: k should be >=0, <= n_features = 5239; got 10000. Use k='all' to return all features.



[Pipeline] ............... (step 1 of 3) Processing vec, total=  30.1s
[CV]  KBest__k=10000, MNB__alpha=1, vec__ngram_range=(1, 2), score=nan, total=  30.2s


ValueError: k should be >=0, <= n_features = 7566; got 10000. Use k='all' to return all features.



[CV] KBest__k=10000, MNB__alpha=1, vec__ngram_range=(1, 2) ...........
[Pipeline] ............... (step 1 of 3) Processing vec, total=  30.8s
[CV]  KBest__k=10000, MNB__alpha=1, vec__ngram_range=(1, 2), score=nan, total=  30.9s


ValueError: k should be >=0, <= n_features = 7587; got 10000. Use k='all' to return all features.



[CV] KBest__k=10000, MNB__alpha=1, vec__ngram_range=(1, 2) ...........
[Pipeline] ............... (step 1 of 3) Processing vec, total=  30.7s
[CV]  KBest__k=10000, MNB__alpha=1, vec__ngram_range=(1, 2), score=nan, total=  30.8s


ValueError: k should be >=0, <= n_features = 7602; got 10000. Use k='all' to return all features.



[CV] KBest__k=10000, MNB__alpha=1, vec__ngram_range=(2, 2) ...........
[Pipeline] ............... (step 1 of 3) Processing vec, total=  25.8s
[CV]  KBest__k=10000, MNB__alpha=1, vec__ngram_range=(2, 2), score=nan, total=  25.9s


ValueError: k should be >=0, <= n_features = 2330; got 10000. Use k='all' to return all features.



[CV] KBest__k=10000, MNB__alpha=1, vec__ngram_range=(2, 2) ...........
[Pipeline] ............... (step 1 of 3) Processing vec, total=  26.1s
[CV]  KBest__k=10000, MNB__alpha=1, vec__ngram_range=(2, 2), score=nan, total=  26.2s


ValueError: k should be >=0, <= n_features = 2346; got 10000. Use k='all' to return all features.



[CV] KBest__k=10000, MNB__alpha=1, vec__ngram_range=(2, 2) ...........
[Pipeline] ............... (step 1 of 3) Processing vec, total=  25.9s
[CV]  KBest__k=10000, MNB__alpha=1, vec__ngram_range=(2, 2), score=nan, total=  26.0s


ValueError: k should be >=0, <= n_features = 2363; got 10000. Use k='all' to return all features.



[CV] KBest__k=10000, MNB__alpha=1, vec__ngram_range=(1, 3) ...........
[Pipeline] ............... (step 1 of 3) Processing vec, total= 1.0min
[CV]  KBest__k=10000, MNB__alpha=1, vec__ngram_range=(1, 3), score=nan, total= 1.0min


ValueError: k should be >=0, <= n_features = 7665; got 10000. Use k='all' to return all features.



[CV] KBest__k=10000, MNB__alpha=1, vec__ngram_range=(1, 3) ...........
[Pipeline] ............... (step 1 of 3) Processing vec, total= 1.0min
[CV]  KBest__k=10000, MNB__alpha=1, vec__ngram_range=(1, 3), score=nan, total= 1.0min


ValueError: k should be >=0, <= n_features = 7690; got 10000. Use k='all' to return all features.



[CV] KBest__k=10000, MNB__alpha=1, vec__ngram_range=(1, 3) ...........
[Pipeline] ............... (step 1 of 3) Processing vec, total= 1.0min
[CV]  KBest__k=10000, MNB__alpha=1, vec__ngram_range=(1, 3), score=nan, total= 1.0min


ValueError: k should be >=0, <= n_features = 7710; got 10000. Use k='all' to return all features.



[CV] KBest__k=10000, MNB__alpha=1, vec__ngram_range=(3, 3) ...........
[Pipeline] ............... (step 1 of 3) Processing vec, total=  38.2s
[CV]  KBest__k=10000, MNB__alpha=1, vec__ngram_range=(3, 3), score=nan, total=  38.3s


ValueError: k should be >=0, <= n_features = 99; got 10000. Use k='all' to return all features.



[CV] KBest__k=10000, MNB__alpha=1, vec__ngram_range=(3, 3) ...........
[Pipeline] ............... (step 1 of 3) Processing vec, total=  35.1s
[CV]  KBest__k=10000, MNB__alpha=1, vec__ngram_range=(3, 3), score=nan, total=  35.2s


ValueError: k should be >=0, <= n_features = 103; got 10000. Use k='all' to return all features.



[CV] KBest__k=10000, MNB__alpha=1, vec__ngram_range=(3, 3) ...........
[Pipeline] ............... (step 1 of 3) Processing vec, total=  34.3s
[CV]  KBest__k=10000, MNB__alpha=1, vec__ngram_range=(3, 3), score=nan, total=  34.5s


ValueError: k should be >=0, <= n_features = 108; got 10000. Use k='all' to return all features.



[CV] KBest__k=10000, MNB__alpha=0.001, vec__ngram_range=(1, 1) .......
[Pipeline] ............... (step 1 of 3) Processing vec, total=   7.7s
[CV]  KBest__k=10000, MNB__alpha=0.001, vec__ngram_range=(1, 1), score=nan, total=   7.9s
[CV] KBest__k=10000, MNB__alpha=0.001, vec__ngram_range=(1, 1) .......


ValueError: k should be >=0, <= n_features = 5236; got 10000. Use k='all' to return all features.



[Pipeline] ............... (step 1 of 3) Processing vec, total=   8.1s
[CV]  KBest__k=10000, MNB__alpha=0.001, vec__ngram_range=(1, 1), score=nan, total=   8.2s
[CV] KBest__k=10000, MNB__alpha=0.001, vec__ngram_range=(1, 1) .......


ValueError: k should be >=0, <= n_features = 5241; got 10000. Use k='all' to return all features.



[Pipeline] ............... (step 1 of 3) Processing vec, total=   7.7s
[CV]  KBest__k=10000, MNB__alpha=0.001, vec__ngram_range=(1, 1), score=nan, total=   7.8s
[CV] KBest__k=10000, MNB__alpha=0.001, vec__ngram_range=(1, 2) .......


ValueError: k should be >=0, <= n_features = 5239; got 10000. Use k='all' to return all features.



[Pipeline] ............... (step 1 of 3) Processing vec, total=  29.8s
[CV]  KBest__k=10000, MNB__alpha=0.001, vec__ngram_range=(1, 2), score=nan, total=  29.9s
[CV] KBest__k=10000, MNB__alpha=0.001, vec__ngram_range=(1, 2) .......


ValueError: k should be >=0, <= n_features = 7566; got 10000. Use k='all' to return all features.



[Pipeline] ............... (step 1 of 3) Processing vec, total=  29.9s
[CV]  KBest__k=10000, MNB__alpha=0.001, vec__ngram_range=(1, 2), score=nan, total=  30.0s


ValueError: k should be >=0, <= n_features = 7587; got 10000. Use k='all' to return all features.



[CV] KBest__k=10000, MNB__alpha=0.001, vec__ngram_range=(1, 2) .......
[Pipeline] ............... (step 1 of 3) Processing vec, total=  29.5s
[CV]  KBest__k=10000, MNB__alpha=0.001, vec__ngram_range=(1, 2), score=nan, total=  29.6s


ValueError: k should be >=0, <= n_features = 7602; got 10000. Use k='all' to return all features.



[CV] KBest__k=10000, MNB__alpha=0.001, vec__ngram_range=(2, 2) .......
[Pipeline] ............... (step 1 of 3) Processing vec, total=  25.3s
[CV]  KBest__k=10000, MNB__alpha=0.001, vec__ngram_range=(2, 2), score=nan, total=  25.4s
[CV] KBest__k=10000, MNB__alpha=0.001, vec__ngram_range=(2, 2) .......


ValueError: k should be >=0, <= n_features = 2330; got 10000. Use k='all' to return all features.



[Pipeline] ............... (step 1 of 3) Processing vec, total=  25.6s
[CV]  KBest__k=10000, MNB__alpha=0.001, vec__ngram_range=(2, 2), score=nan, total=  25.7s
[CV] KBest__k=10000, MNB__alpha=0.001, vec__ngram_range=(2, 2) .......


ValueError: k should be >=0, <= n_features = 2346; got 10000. Use k='all' to return all features.



[Pipeline] ............... (step 1 of 3) Processing vec, total=  25.6s
[CV]  KBest__k=10000, MNB__alpha=0.001, vec__ngram_range=(2, 2), score=nan, total=  25.7s


ValueError: k should be >=0, <= n_features = 2363; got 10000. Use k='all' to return all features.



[CV] KBest__k=10000, MNB__alpha=0.001, vec__ngram_range=(1, 3) .......
[Pipeline] ............... (step 1 of 3) Processing vec, total= 1.1min
[CV]  KBest__k=10000, MNB__alpha=0.001, vec__ngram_range=(1, 3), score=nan, total= 1.1min


ValueError: k should be >=0, <= n_features = 7665; got 10000. Use k='all' to return all features.



[CV] KBest__k=10000, MNB__alpha=0.001, vec__ngram_range=(1, 3) .......
[Pipeline] ............... (step 1 of 3) Processing vec, total= 1.0min
[CV]  KBest__k=10000, MNB__alpha=0.001, vec__ngram_range=(1, 3), score=nan, total= 1.0min
[CV] KBest__k=10000, MNB__alpha=0.001, vec__ngram_range=(1, 3) .......


ValueError: k should be >=0, <= n_features = 7690; got 10000. Use k='all' to return all features.



[Pipeline] ............... (step 1 of 3) Processing vec, total= 1.0min
[CV]  KBest__k=10000, MNB__alpha=0.001, vec__ngram_range=(1, 3), score=nan, total= 1.0min
[CV] KBest__k=10000, MNB__alpha=0.001, vec__ngram_range=(3, 3) .......


ValueError: k should be >=0, <= n_features = 7710; got 10000. Use k='all' to return all features.



[Pipeline] ............... (step 1 of 3) Processing vec, total=  34.2s
[CV]  KBest__k=10000, MNB__alpha=0.001, vec__ngram_range=(3, 3), score=nan, total=  34.3s


ValueError: k should be >=0, <= n_features = 99; got 10000. Use k='all' to return all features.



[CV] KBest__k=10000, MNB__alpha=0.001, vec__ngram_range=(3, 3) .......
[Pipeline] ............... (step 1 of 3) Processing vec, total=  34.3s
[CV]  KBest__k=10000, MNB__alpha=0.001, vec__ngram_range=(3, 3), score=nan, total=  34.4s


ValueError: k should be >=0, <= n_features = 103; got 10000. Use k='all' to return all features.



[CV] KBest__k=10000, MNB__alpha=0.001, vec__ngram_range=(3, 3) .......
[Pipeline] ............... (step 1 of 3) Processing vec, total=  34.0s
[CV]  KBest__k=10000, MNB__alpha=0.001, vec__ngram_range=(3, 3), score=nan, total=  34.2s
[CV] KBest__k=10000, MNB__alpha=1e-06, vec__ngram_range=(1, 1) .......


ValueError: k should be >=0, <= n_features = 108; got 10000. Use k='all' to return all features.



[Pipeline] ............... (step 1 of 3) Processing vec, total=   7.8s
[CV]  KBest__k=10000, MNB__alpha=1e-06, vec__ngram_range=(1, 1), score=nan, total=   7.9s
[CV] KBest__k=10000, MNB__alpha=1e-06, vec__ngram_range=(1, 1) .......


ValueError: k should be >=0, <= n_features = 5236; got 10000. Use k='all' to return all features.



[Pipeline] ............... (step 1 of 3) Processing vec, total=   7.7s
[CV]  KBest__k=10000, MNB__alpha=1e-06, vec__ngram_range=(1, 1), score=nan, total=   7.8s
[CV] KBest__k=10000, MNB__alpha=1e-06, vec__ngram_range=(1, 1) .......


ValueError: k should be >=0, <= n_features = 5241; got 10000. Use k='all' to return all features.



[Pipeline] ............... (step 1 of 3) Processing vec, total=   7.7s
[CV]  KBest__k=10000, MNB__alpha=1e-06, vec__ngram_range=(1, 1), score=nan, total=   7.8s
[CV] KBest__k=10000, MNB__alpha=1e-06, vec__ngram_range=(1, 2) .......


ValueError: k should be >=0, <= n_features = 5239; got 10000. Use k='all' to return all features.



[Pipeline] ............... (step 1 of 3) Processing vec, total=  33.2s
[CV]  KBest__k=10000, MNB__alpha=1e-06, vec__ngram_range=(1, 2), score=nan, total=  33.3s


ValueError: k should be >=0, <= n_features = 7566; got 10000. Use k='all' to return all features.



[CV] KBest__k=10000, MNB__alpha=1e-06, vec__ngram_range=(1, 2) .......
[Pipeline] ............... (step 1 of 3) Processing vec, total=  29.6s
[CV]  KBest__k=10000, MNB__alpha=1e-06, vec__ngram_range=(1, 2), score=nan, total=  29.7s
[CV] KBest__k=10000, MNB__alpha=1e-06, vec__ngram_range=(1, 2) .......


ValueError: k should be >=0, <= n_features = 7587; got 10000. Use k='all' to return all features.



[Pipeline] ............... (step 1 of 3) Processing vec, total=  29.8s
[CV]  KBest__k=10000, MNB__alpha=1e-06, vec__ngram_range=(1, 2), score=nan, total=  29.9s


ValueError: k should be >=0, <= n_features = 7602; got 10000. Use k='all' to return all features.



[CV] KBest__k=10000, MNB__alpha=1e-06, vec__ngram_range=(2, 2) .......
[Pipeline] ............... (step 1 of 3) Processing vec, total=  25.2s
[CV]  KBest__k=10000, MNB__alpha=1e-06, vec__ngram_range=(2, 2), score=nan, total=  25.4s


ValueError: k should be >=0, <= n_features = 2330; got 10000. Use k='all' to return all features.



[CV] KBest__k=10000, MNB__alpha=1e-06, vec__ngram_range=(2, 2) .......
[Pipeline] ............... (step 1 of 3) Processing vec, total=  25.3s
[CV]  KBest__k=10000, MNB__alpha=1e-06, vec__ngram_range=(2, 2), score=nan, total=  25.5s


ValueError: k should be >=0, <= n_features = 2346; got 10000. Use k='all' to return all features.



[CV] KBest__k=10000, MNB__alpha=1e-06, vec__ngram_range=(2, 2) .......
[Pipeline] ............... (step 1 of 3) Processing vec, total=  25.2s
[CV]  KBest__k=10000, MNB__alpha=1e-06, vec__ngram_range=(2, 2), score=nan, total=  25.4s
[CV] KBest__k=10000, MNB__alpha=1e-06, vec__ngram_range=(1, 3) .......


ValueError: k should be >=0, <= n_features = 2363; got 10000. Use k='all' to return all features.



[Pipeline] ............... (step 1 of 3) Processing vec, total= 1.0min
[CV]  KBest__k=10000, MNB__alpha=1e-06, vec__ngram_range=(1, 3), score=nan, total= 1.0min


ValueError: k should be >=0, <= n_features = 7665; got 10000. Use k='all' to return all features.



[CV] KBest__k=10000, MNB__alpha=1e-06, vec__ngram_range=(1, 3) .......
[Pipeline] ............... (step 1 of 3) Processing vec, total= 1.0min
[CV]  KBest__k=10000, MNB__alpha=1e-06, vec__ngram_range=(1, 3), score=nan, total= 1.0min


ValueError: k should be >=0, <= n_features = 7690; got 10000. Use k='all' to return all features.



[CV] KBest__k=10000, MNB__alpha=1e-06, vec__ngram_range=(1, 3) .......
[Pipeline] ............... (step 1 of 3) Processing vec, total= 1.0min
[CV]  KBest__k=10000, MNB__alpha=1e-06, vec__ngram_range=(1, 3), score=nan, total= 1.0min


ValueError: k should be >=0, <= n_features = 7710; got 10000. Use k='all' to return all features.



[CV] KBest__k=10000, MNB__alpha=1e-06, vec__ngram_range=(3, 3) .......
[Pipeline] ............... (step 1 of 3) Processing vec, total=  34.2s
[CV]  KBest__k=10000, MNB__alpha=1e-06, vec__ngram_range=(3, 3), score=nan, total=  34.4s
[CV] KBest__k=10000, MNB__alpha=1e-06, vec__ngram_range=(3, 3) .......


ValueError: k should be >=0, <= n_features = 99; got 10000. Use k='all' to return all features.



[Pipeline] ............... (step 1 of 3) Processing vec, total=  34.4s
[CV]  KBest__k=10000, MNB__alpha=1e-06, vec__ngram_range=(3, 3), score=nan, total=  34.5s


ValueError: k should be >=0, <= n_features = 103; got 10000. Use k='all' to return all features.



[CV] KBest__k=10000, MNB__alpha=1e-06, vec__ngram_range=(3, 3) .......
[Pipeline] ............... (step 1 of 3) Processing vec, total=  34.2s
[CV]  KBest__k=10000, MNB__alpha=1e-06, vec__ngram_range=(3, 3), score=nan, total=  34.3s
[CV] KBest__k=10000, MNB__alpha=1e-09, vec__ngram_range=(1, 1) .......


ValueError: k should be >=0, <= n_features = 108; got 10000. Use k='all' to return all features.



[Pipeline] ............... (step 1 of 3) Processing vec, total=   7.8s
[CV]  KBest__k=10000, MNB__alpha=1e-09, vec__ngram_range=(1, 1), score=nan, total=   7.9s
[CV] KBest__k=10000, MNB__alpha=1e-09, vec__ngram_range=(1, 1) .......


ValueError: k should be >=0, <= n_features = 5236; got 10000. Use k='all' to return all features.



[Pipeline] ............... (step 1 of 3) Processing vec, total=   7.8s
[CV]  KBest__k=10000, MNB__alpha=1e-09, vec__ngram_range=(1, 1), score=nan, total=   7.9s
[CV] KBest__k=10000, MNB__alpha=1e-09, vec__ngram_range=(1, 1) .......


ValueError: k should be >=0, <= n_features = 5241; got 10000. Use k='all' to return all features.



[Pipeline] ............... (step 1 of 3) Processing vec, total=   7.8s
[CV]  KBest__k=10000, MNB__alpha=1e-09, vec__ngram_range=(1, 1), score=nan, total=   7.9s
[CV] KBest__k=10000, MNB__alpha=1e-09, vec__ngram_range=(1, 2) .......


ValueError: k should be >=0, <= n_features = 5239; got 10000. Use k='all' to return all features.



[Pipeline] ............... (step 1 of 3) Processing vec, total=  30.0s
[CV]  KBest__k=10000, MNB__alpha=1e-09, vec__ngram_range=(1, 2), score=nan, total=  30.1s


ValueError: k should be >=0, <= n_features = 7566; got 10000. Use k='all' to return all features.



[CV] KBest__k=10000, MNB__alpha=1e-09, vec__ngram_range=(1, 2) .......
[Pipeline] ............... (step 1 of 3) Processing vec, total=  29.6s
[CV]  KBest__k=10000, MNB__alpha=1e-09, vec__ngram_range=(1, 2), score=nan, total=  29.7s


ValueError: k should be >=0, <= n_features = 7587; got 10000. Use k='all' to return all features.



[CV] KBest__k=10000, MNB__alpha=1e-09, vec__ngram_range=(1, 2) .......
[Pipeline] ............... (step 1 of 3) Processing vec, total=  29.5s
[CV]  KBest__k=10000, MNB__alpha=1e-09, vec__ngram_range=(1, 2), score=nan, total=  29.6s


ValueError: k should be >=0, <= n_features = 7602; got 10000. Use k='all' to return all features.



[CV] KBest__k=10000, MNB__alpha=1e-09, vec__ngram_range=(2, 2) .......
[Pipeline] ............... (step 1 of 3) Processing vec, total=  25.2s
[CV]  KBest__k=10000, MNB__alpha=1e-09, vec__ngram_range=(2, 2), score=nan, total=  25.3s


ValueError: k should be >=0, <= n_features = 2330; got 10000. Use k='all' to return all features.



[CV] KBest__k=10000, MNB__alpha=1e-09, vec__ngram_range=(2, 2) .......
[Pipeline] ............... (step 1 of 3) Processing vec, total=  25.6s
[CV]  KBest__k=10000, MNB__alpha=1e-09, vec__ngram_range=(2, 2), score=nan, total=  25.7s


ValueError: k should be >=0, <= n_features = 2346; got 10000. Use k='all' to return all features.



[CV] KBest__k=10000, MNB__alpha=1e-09, vec__ngram_range=(2, 2) .......
[Pipeline] ............... (step 1 of 3) Processing vec, total=  29.2s
[CV]  KBest__k=10000, MNB__alpha=1e-09, vec__ngram_range=(2, 2), score=nan, total=  29.3s


ValueError: k should be >=0, <= n_features = 2363; got 10000. Use k='all' to return all features.



[CV] KBest__k=10000, MNB__alpha=1e-09, vec__ngram_range=(1, 3) .......
[Pipeline] ............... (step 1 of 3) Processing vec, total= 1.0min
[CV]  KBest__k=10000, MNB__alpha=1e-09, vec__ngram_range=(1, 3), score=nan, total= 1.0min


ValueError: k should be >=0, <= n_features = 7665; got 10000. Use k='all' to return all features.



[CV] KBest__k=10000, MNB__alpha=1e-09, vec__ngram_range=(1, 3) .......
[Pipeline] ............... (step 1 of 3) Processing vec, total= 1.0min
[CV]  KBest__k=10000, MNB__alpha=1e-09, vec__ngram_range=(1, 3), score=nan, total= 1.0min


ValueError: k should be >=0, <= n_features = 7690; got 10000. Use k='all' to return all features.



[CV] KBest__k=10000, MNB__alpha=1e-09, vec__ngram_range=(1, 3) .......
[Pipeline] ............... (step 1 of 3) Processing vec, total= 1.0min
[CV]  KBest__k=10000, MNB__alpha=1e-09, vec__ngram_range=(1, 3), score=nan, total= 1.0min


ValueError: k should be >=0, <= n_features = 7710; got 10000. Use k='all' to return all features.



[CV] KBest__k=10000, MNB__alpha=1e-09, vec__ngram_range=(3, 3) .......
[Pipeline] ............... (step 1 of 3) Processing vec, total=  34.5s
[CV]  KBest__k=10000, MNB__alpha=1e-09, vec__ngram_range=(3, 3), score=nan, total=  34.6s
[CV] KBest__k=10000, MNB__alpha=1e-09, vec__ngram_range=(3, 3) .......


ValueError: k should be >=0, <= n_features = 99; got 10000. Use k='all' to return all features.



[Pipeline] ............... (step 1 of 3) Processing vec, total=  34.5s
[CV]  KBest__k=10000, MNB__alpha=1e-09, vec__ngram_range=(3, 3), score=nan, total=  34.6s


ValueError: k should be >=0, <= n_features = 103; got 10000. Use k='all' to return all features.



[CV] KBest__k=10000, MNB__alpha=1e-09, vec__ngram_range=(3, 3) .......
[Pipeline] ............... (step 1 of 3) Processing vec, total=  34.6s
[CV]  KBest__k=10000, MNB__alpha=1e-09, vec__ngram_range=(3, 3), score=nan, total=  34.7s


ValueError: k should be >=0, <= n_features = 108; got 10000. Use k='all' to return all features.

[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed: 107.8min finished


[Pipeline] ............... (step 1 of 3) Processing vec, total= 1.6min
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.5s
[Pipeline] ............... (step 3 of 3) Processing MNB, total=   0.3s


GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=100,
                                                        ngram_range=(1, 2),
                                                        pre

In [None]:
print('Best score:\n{:.2f}'.format(grid_MNB.best_score_))
print('Best parameters:\n{}'.format(grid_MNB.best_params_))

Best score:
0.40
Best parameters:
{'KBest__k': 5000, 'MNB__alpha': 0.001, 'vec__ngram_range': (1, 3)}


In [None]:
best_model = grid_MNB.best_estimator_

In [53]:
pipeline = Pipeline([
    ('vec', CountVectorizer(min_df = 100, stop_words = 'english', ngram_range = (1,3))),
    ('KBest', SelectKBest(chi2, k = 5000)),
    ('MNB', MultinomialNB(alpha = 0.001))], verbose = True
)

In [54]:
X = df_sentences['sentences']
y = df_sentences['has_spoiler']

mnb_model = pipeline.fit(X,y)

[Pipeline] ............... (step 1 of 3) Processing vec, total= 1.5min
[Pipeline] ............. (step 2 of 3) Processing KBest, total=   0.5s
[Pipeline] ............... (step 3 of 3) Processing MNB, total=   0.3s


the recall and hyper parameters of the best grid search model are very similar to the randomly chosen parameters. 

In [None]:
## Also add a classifier for whole reviews and not only sentences
## this work was manly pushed by Julia Schaefer



In [44]:
# load train data
features = ['has_spoiler', 'sentence_labels', 'lemmatized', 'best_genre']

reviews = get_reviews(training)

feature_dict = defaultdict(list)    

# import only reviews written in english and reviews containing spoilers

for review in tqdm(reviews):
    review_dict = json.loads(review)
    language = review_dict.get('review_language_start')

    
    if language != 'en': 
        continue
    else:
        for f in features:
            feature_dict[f].append(review_dict.get(f))
df_train_reviews = pd.DataFrame.from_dict(feature_dict)

964623it [01:42, 9455.22it/s] 


In [45]:
# down sample data to account for imbalance data

df_safe = df_train_reviews[df_train_reviews['has_spoiler']== False]
df_spoiler = df_train_reviews[df_train_reviews['has_spoiler']== True]

In [46]:
# number of samples in minority class
min_size = df_spoiler.shape[0]
df_safe = df_safe.sample(min_size, random_state = 42)


In [48]:
# concatenate and shuffle both dataframe
df_train_reviews_sample = pd.concat([df_spoiler,df_safe ])
# check sampling
np.unique(df_train_reviews_sample['has_spoiler'], return_counts = True)

(array([False,  True]), array([58428, 58428]))

In [58]:
#Pipeline
pipe_sgd = Pipeline([('tfidf', TfidfVectorizer(stop_words = 'english', ngram_range = (1,2), min_df = 100)),
                 ('sgd', SGDClassifier(random_state = 42, penalty = 'l2', shuffle = True, n_jobs = -1, max_iter = 1000, 
                                       loss = 'hinge', class_weight = {0: 0.4, 1: .6}, alpha = .0001)),
                     ('cal',CalibratedClassifierCV())¶], verbose = True)

In [59]:
# fit sgd model
x_review = df_train_reviews_sample['lemmatized'].apply(lambda x: ' '.join(x))
SGD_reviews = pipe_sgd.fit(x_review,df_train_reviews_sample['has_spoiler'])

[Pipeline] ............. (step 1 of 2) Processing tfidf, total= 1.1min
[Pipeline] ............... (step 2 of 2) Processing sgd, total=   0.9s


## Test single classifier on whole reviews


In [60]:
# load train data
features = ['has_spoiler', 'sentence_labels', 'lemmatized', 'best_genre']

reviews = get_reviews(training)

feature_dict = defaultdict(list)    

# import only reviews written in english and reviews containing spoilers

for review in tqdm(reviews):
    review_dict = json.loads(review)
    language = review_dict.get('review_language_start')

    
    if language != 'en': 
        continue
    else:
        for f in features:
            feature_dict[f].append(review_dict.get(f))
df_train = pd.DataFrame.from_dict(feature_dict)

964623it [01:38, 9820.31it/s] 


In [35]:
# test predictions on whole training set 
# go through every review and get probability for every sentence
# then get max probability

In [36]:
df_train.shape

(893695, 4)

In [83]:
# predict spoiler for every sentence in review
# get the max probability

#proba_mnb_sentence = []
#proba_SGD_SMOTE = []
#proba_Ridge_smote = []
#proba_SGD_review = []

spoiler_predicted = []
for review in df_train['lemmatized']:
  proba_mnb_sentence.append(np.max(mnb_model.predict_proba(review).T[1]))
  
  #vectorize sentences
  vectors = cvec.transform(review)
  proba_SGD_SMOTE.append(np.max(SGD_SMOTE.decision_function(vectors)))
  proba_Ridge_smote.append(np.max(Ridge_smote.decision_function(vectors)))

#for prediction of review probability

X_train = df_train['lemmatized'].apply(lambda x: ' '.join(x))
proba_SGD_review = SGD_reviews.decision_function(X_train)

In [91]:
df_probabilities = pd.DataFrame({
    'mnb_sentence_max': proba_mnb_sentence,
    'SGD_sentence_max' : proba_SGD_SMOTE,
    'Ridge_sentence_max' : proba_Ridge_smote,
    'SGD_review' : proba_SGD_review,
    'has_spoiler' : df_train['has_spoiler']
})


In [92]:
df_probabilities.to_json('/content/gdrive/My Drive/nf_capstone/training_clfs_for_stacking.json.gz', lines = True, orient = 'records')

In [94]:
## repeat for validation and test set
features = ['has_spoiler', 'sentence_labels', 'lemmatized', 'best_genre']

reviews = get_reviews(validation)

feature_dict = defaultdict(list)    

# import only reviews written in english and reviews containing spoilers

for review in reviews:
    review_dict = json.loads(review)
    language = review_dict.get('review_language_start')

    
    if language != 'en': 
        continue
    else:
        for f in features:
            feature_dict[f].append(review_dict.get(f))
df_val = pd.DataFrame.from_dict(feature_dict)

# predict spoiler for every sentence in review
# get the max probability

proba_mnb_sentence = []
proba_SGD_SMOTE = []
proba_Ridge_smote = []
proba_SGD_review = []

spoiler_predicted = []

for review in df_val['lemmatized']:
  proba_mnb_sentence.append(np.max(mnb_model.predict_proba(review).T[1]))
  
#vectorize sentences
  vectors = cvec.transform(review)
  proba_SGD_SMOTE.append(np.max(SGD_SMOTE.decision_function(vectors)))
  proba_Ridge_smote.append(np.max(Ridge_smote.decision_function(vectors)))

#for prediction of review probability

X_val = df_val['lemmatized'].apply(lambda x: ' '.join(x))
proba_SGD_review = SGD_reviews.decision_function(X_val)

# save to dataframe
df_probabilities = pd.DataFrame({
    'mnb_sentence_max': proba_mnb_sentence,
    'SGD_sentence_max' : proba_SGD_SMOTE,
    'Ridge_sentence_max' : proba_Ridge_smote,
    'SGD_review' : proba_SGD_review,
    'has_spoiler' : df_val['has_spoiler']
})

df_probabilities.to_json('/content/gdrive/My Drive/nf_capstone/validation_clfs_for_stacking.json.gz', lines = True, orient = 'records')

In [100]:
## repeat for validation and test set
features = ['has_spoiler', 'sentence_labels', 'lemmatized', 'best_genre']

reviews = get_reviews(test)

feature_dict = defaultdict(list)    

# import only reviews written in english and reviews containing spoilers

for review in reviews:
    review_dict = json.loads(review)
    language = review_dict.get('review_language_start')

    
    if language != 'en': 
        continue
    else:
        for f in features:
            feature_dict[f].append(review_dict.get(f))
df_test = pd.DataFrame.from_dict(feature_dict)

# predict spoiler for every sentence in review
# get the max probability

proba_mnb_sentence = []
proba_SGD_SMOTE = []
proba_Ridge_smote = []
proba_SGD_review = []

spoiler_predicted = []

for review in df_test['lemmatized']:
  proba_mnb_sentence.append(np.max(mnb_model.predict_proba(review).T[1]))
  
#vectorize sentences
  vectors = cvec.transform(review)
  proba_SGD_SMOTE.append(np.max(SGD_SMOTE.decision_function(vectors)))
  proba_Ridge_smote.append(np.max(Ridge_smote.decision_function(vectors)))

#for prediction of review probability

X_test = df_test['lemmatized'].apply(lambda x: ' '.join(x))
proba_SGD_review = SGD_reviews.decision_function(X_test)

# save to dataframe
df_probabilities = pd.DataFrame({
    'mnb_sentence_max': proba_mnb_sentence,
    'SGD_sentence_max' : proba_SGD_SMOTE,
    'Ridge_sentence_max' : proba_Ridge_smote,
    'SGD_review' : proba_SGD_review,
    'has_spoiler' : df_test['has_spoiler']
})

df_probabilities.to_json('/content/gdrive/My Drive/nf_capstone/test_clfs_for_stacking.json.gz', lines = True, orient = 'records')

NameError: ignored

In [97]:
df_train.head()

Unnamed: 0,has_spoiler,sentence_labels,lemmatized,best_genre
0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[read review blog, , definitely well book, ins...",young-adult
1,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[write comment realize probably end long quali...,fiction
2,False,"[0, 0, 0, 0]","[charlie turn young sister get marry, decide w...",romance
3,False,"[0, 0, 0, 0]",[like get implausible storyline read long time...,romance
4,True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[review originally post step fiction, want reb...",young-adult


In [None]:
prob_spoiler = []
for prob in probas:
    spoiler_prob = []
    for safe, spoiler in prob:
        spoiler_prob.append(spoiler)
    prob_spoiler.append(spoiler_prob)

In [None]:
#df_train['sentence_spoiler_prob'] = prob_spoiler
df_train['max_proba_spoiler'] = df_train['sentence_spoiler_prob'].apply(lambda x: np.max(x))

In [None]:
def prediction_spoiler(proba, threshold):
    if proba > threshold:
        return True
    else:
        return False

In [None]:
df_train.head()

Unnamed: 0,has_spoiler,sentence_labels,lemmatized,best_genre
0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[read review blog, , definitely well book, ins...",young-adult
1,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[write comment realize probably end long quali...,fiction
2,False,"[0, 0, 0, 0]","[charlie turn young sister get marry, decide w...",romance
3,False,"[0, 0, 0, 0]",[like get implausible storyline read long time...,romance
4,True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[review originally post step fiction, want reb...",young-adult


In [None]:
#precision recall curve
precision, recall, threshold = precision_recall_curve(df_val['has_spoiler'],df_val['max_probability'] )

figure = px.line(x = recall, y = precision)
figure.show()

In [None]:
df_train['prediction'] =  df_train['max_proba_spoiler'].apply(lambda x: prediction_spoiler(x, 0.75))

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

confusion_matrix(df_train['has_spoiler'],df_train['prediction'])

array([[598646, 236621],
       [ 16590,  41838]])

In [None]:
print(classification_report(df_train['has_spoiler'],df_train['prediction']))

              precision    recall  f1-score   support

       False       0.97      0.72      0.83    835267
        True       0.15      0.72      0.25     58428

    accuracy                           0.72    893695
   macro avg       0.56      0.72      0.54    893695
weighted avg       0.92      0.72      0.79    893695



In [None]:
df_train['percent_spoiler'] = df_train['sentence_prediction'].apply(lambda x : np.mean(x))

In [None]:
import pickle

pickle.dump(mnb_model, open('mnb_model', 'wb'))

#to load model
#mnb_model = pickle.load(open('mnb_model', 'rb'))

In [None]:
df_train.to_json('train_mnb.json.gz', orient = 'records', lines = True, compression = 'gzip'  )

In [None]:
# reimport prediction data

data = []    

# import only reviews written in english and reviews containing spoilers
mnb_data = get_reviews('train_mnb.json.gz')

for review in tqdm(mnb_data):
    review_dict = json.loads(review)
    data.append(review_dict)
        
df_mnb = pd.DataFrame(data)

893695it [02:10, 6838.33it/s] 


In [None]:
# isolate features
df_mnb2 = df_mnb[['max_proba_spoiler', 'percent_spoiler', 'sentence_prediction','sentence_probability',\
                  'sentence_spoiler_prob', 'prediction']]


In [None]:
# put together with text data
df_train2 = pd.concat([df_train, df_mnb2], axis = 1)

In [None]:
# calculate length of review (number of sentences)
df_train2['review_length'] = df_train2['lemmatized'].apply(lambda x: len(x))

In [None]:
df_train2.head()

Unnamed: 0,has_spoiler,sentence_labels,lemmatized,best_genre,max_proba_spoiler,percent_spoiler,sentence_prediction,sentence_probability,sentence_spoiler_prob,prediction,review_length
0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[read review blog, , definitely well book, ins...",young-adult,0.898108,0.208333,"[0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, ...","[[0.9999697567, 3.02433e-05], [0.734036127, 0....","[3.02433e-05, 0.265963873, 0.0225331695, 0.402...",True,24
1,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[write comment realize probably end long quali...,fiction,0.922497,0.138889,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, ...","[[0.9950245448, 0.0049754552], [0.5567434137, ...","[0.0049754552, 0.4432565863, 0.2970407731, 0.1...",True,36
2,False,"[0, 0, 0, 0]","[charlie turn young sister get marry, decide w...",romance,0.815052,0.25,"[1, 0, 0, 0]","[[0.1849484974, 0.8150515026], [0.9826921784, ...","[0.8150515026, 0.0173078216, 0.2000061878, 0.0...",True,4
3,False,"[0, 0, 0, 0]",[like get implausible storyline read long time...,romance,0.175878,0.0,"[0, 0, 0, 0]","[[0.997293274, 0.002706726], [0.9998326138, 0....","[0.002706726, 0.0001673862, 0.0783459275, 0.17...",False,4
4,True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[review originally post step fiction, want reb...",young-adult,0.789537,0.27027,"[0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, ...","[[1.0, 3.040437365e-18], [0.698857291, 0.30114...","[3.040437365e-18, 0.301142709, 0.1415903485, 0...",True,37


In [None]:
X_meta = df_train2[['best_genre','max_proba_spoiler', 'percent_spoiler', 'review_length']]
y_meta = df_train2['has_spoiler']

In [None]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(X_meta['best_genre'])
X_meta['best_genre'] = le.transform(X_meta['best_genre'])




In [None]:
X_meta.head()

Unnamed: 0,best_genre,max_proba_spoiler,percent_spoiler,review_length
0,9,0.898108,0.208333,24
1,3,0.922497,0.138889,36
2,8,0.815052,0.25,4
3,8,0.175878,0.0,4
4,9,0.789537,0.27027,37


In [18]:
from sklearn.ensemble import GradientBoostingClassifier, BaggingClassifier, RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier 
import xgboost as xgb
from sklearn.linear_model import Perceptron, PassiveAggressiveClassifier

random_state = 42

models_meta = {}

models_meta['SGD'] = SGDClassifier()
models_meta['LinearSVC'] = LinearSVC(random_state=random_state) 
models_meta['LogReg'] = LogisticRegression(n_jobs=-1, random_state=random_state)
models_meta['MultinomialNB'] = MultinomialNB(alpha = 1e-6)
models_meta['Ridge'] = RidgeClassifier(random_state=random_state)
models_meta['D_Tree_CLF'] = DecisionTreeClassifier(random_state=random_state)
models_meta['Random_forrest']= RandomForestClassifier()
models_meta['GradientBoostingClassifier'] = GradientBoostingClassifier()

models_meta['Bagging_LR'] = BaggingClassifier(base_estimator =LogisticRegression(), n_estimators = 20 )
models_meta['Bagging_Tree'] = BaggingClassifier(base_estimator =DecisionTreeClassifier(max_depth = 20), n_estimators = 10 )
models_meta['XGBoost'] = xgb.XGBClassifier()
models_meta['Adaboost_Tree'] = AdaBoostClassifier()

In [19]:
test_clfs(models_meta, X_meta, y_meta)

NameError: ignored