In [73]:
import fasttext
import time
import json
import gzip
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import defaultdict

In [12]:
# import vectors
model_vec = fasttext.load_model('emb_model.vec')

In [75]:
#generator to open json.gzip files
# yields single lines
def get_reviews(file):
    '''
    Generator will yield lines of the passed file
    '''
    with gzip.open(file, 'r') as f:
        for l in f:
            yield l
    f.close()
# fetch features
def fetch_features(file, features):
    '''
    Provide a list of features you want to extract in a single run.
    Returns a dictionary.
    file: json.gzip file you want to scan
    features: features to extract from data
    only reviews written in english will be collected
    '''
    
    feature_dict = defaultdict(list)    
    reviews = get_reviews(file)
    
    for review in tqdm(reviews):
        review_dict = json.loads(review)
        language = review_dict.get('review_language_start')

        if language != 'en': 
            continue
        
        else:
            for f in features:
                feature_dict[f].append(review_dict.get(f))
        
        
    return pd.DataFrame.from_dict(feature_dict)


In [76]:
training = 'train_set_text_edit.json.gz'
validation = 'validation_set_text_edit.json.gz'
test = 'test_set_text_edit.json.gz'

In [77]:
features = ['has_spoiler','sentence_labels', 'sentence_text_spellchecked', 'lemmatized', 'best_genre']
df_train = fetch_features(training, features)

964623it [07:48, 2060.50it/s]


In [33]:
df_train.head()

Unnamed: 0,has_spoiler,sentence_text_spellchecked,lemmatized,best_genre
0,False,"[read this review on my blog, , definitely bet...","[read review blog, , definitely well book, ins...",young-adult
1,False,[i was writing a comment that i realized would...,[write comment realize probably end long quali...,fiction
2,False,[charlie is turning and her younger sister is ...,"[charlie turn young sister get marry, decide w...",romance
3,False,[is more like it even though this has got to b...,[like get implausible storyline read long time...,romance
4,True,[review originally posted at step into fiction...,"[review originally post step fiction, want reb...",young-adult


In [8]:
df_train['sentence_text_spellchecked'] = df_train['sentence_text_spellchecked'].apply(lambda x: ' '.join(x))

In [15]:
df_train.head()

Unnamed: 0,has_spoiler,sentence_text_spellchecked
0,False,read this review on my blog definitely better...
1,False,i was writing a comment that i realized would ...
2,False,charlie is turning and her younger sister is g...
3,False,is more like it even though this has got to be...
4,True,review originally posted at step into fiction ...


In [22]:
from sklearn.model_selection import train_test_split
random_state = 42

#X=df_train['sentence_text_spellchecked']
#y = df_train['has_spoiler']
#
#X_train, X_val, y_train, y_val = train_test_split(X,y,random_state = random_state, stratify= y)
#X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=.15, random_state=random_state, stratify= y_train)
#

## Text Vectorization

In [79]:
# dimension of word vectors
print(model_vec.get_dimension())

100


In [49]:
# et sentence vectors for reviews
list_of_sent_vec = []
for sentence in tqdm(X_train):
    vec = model_vec.get_sentence_vector(sentence)
    list_of_sent_vec.append(vec)

    
    

100%|██████████| 569730/569730 [10:05<00:00, 940.44it/s] 


In [50]:
list_col = tuple(range(100))
df_vec = pd.DataFrame(data =list_of_sent_vec, columns = list_col )

In [90]:
df_vec.shape

(569730, 100)

In [41]:
from sklearn import metrics
from sklearn.model_selection import cross_validate

from sklearn.ensemble import GradientBoostingClassifier, BaggingClassifier, RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier 
#import xgboost as xgb
from sklearn.linear_model import Perceptron, PassiveAggressiveClassifier

In [72]:
models2 = dict()
models2['SGD'] = SGDClassifier(n_jobs = -1)
models2['LinearSVC'] = LinearSVC(random_state=random_state, max_iter = 5000) 
models2['LogReg'] = LogisticRegression(n_jobs=-1, random_state=random_state)
models2['KNN'] = KNeighborsClassifier()
models2['Ridge'] = RidgeClassifier(random_state=random_state)
models2['D_Tree_CLF'] = DecisionTreeClassifier(max_depth = 50, random_state=random_state)
models2['Perceptron'] = Perceptron(n_jobs = -1)
models2['PA_CLF'] = PassiveAggressiveClassifier(n_jobs = -1)
models2['ExtraTree'] = ExtraTreesClassifier(max_depth = 50)
models2['RandomForrest'] = RandomForestClassifier()
models2['Bagging_LR'] = BaggingClassifier(base_estimator =LogisticRegression(), n_estimators = 20 )
models2['Bagging_Tree'] = BaggingClassifier(base_estimator =DecisionTreeClassifier(max_depth = 20), n_estimators = 10 )
models2['Adaboost_LR'] = AdaBoostClassifier(base_estimator =LogisticRegression())
models2['Adaboost_Tree'] = AdaBoostClassifier()

In [45]:
def test_clfs(models, X_train, y_train):
    classifier = []
    recall = []
    precision = []
    roc_auc = []
    f1_macro = []
    fit_time = []
    accuracy = []
    
    for name, clf in (models.items()):
        scores = cross_validate(clf, X_train, y_train, scoring = ['recall', 'accuracy', 'precision', 'f1_macro', 'roc_auc'], 
                                 cv = 5)
        classifier.append(clf)
        
        fit_time.append(f"{round(np.mean(scores['fit_time']),3)} +/- {round(np.std(scores['fit_time']),3)}")
        recall.append(f"{round(np.mean(scores['test_recall']),3)} +/- {round(np.std(scores['test_recall']),3)}")
        precision.append(f"{round(np.mean(scores['test_precision']),3)} +/- {round(np.std(scores['test_precision']),3)}")
        roc_auc.append(f"{round(np.mean(scores['test_roc_auc']),3)} +/- {round(np.std(scores['test_roc_auc']),3)}")
        f1_macro.append(f"{round(np.mean(scores['test_f1_macro']),3)} +/- {round(np.std(scores['test_f1_macro']),3)}")
        accuracy.append(f"{round(np.mean(scores['test_accuracy']),3)} +/- {round(np.std(scores['test_accuracy']),3)}")
        
        print(f'Done with {name}')
        
    classification_result = pd.DataFrame({
        'classifier':classifier,
        'fit_time': fit_time,
        'accuracy': accuracy,
        'recall': recall,
        'precision': precision,
        'roc_auc': roc_auc,
        'f1_macro': f1_macro})
    return classification_result

In [73]:
test_clfs(models2, df_vec, y_train)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  7%|▋         | 1/14 [00:12<02:45, 12.72s/it]

Done with SGD


 14%|█▍        | 2/14 [01:31<06:30, 32.55s/it]

Done with LinearSVC


 21%|██▏       | 3/14 [23:16<1:15:55, 414.16s/it]

Done with LogReg


  _warn_prf(average, modifier, msg_start, len(result))
 29%|██▊       | 4/14 [23:28<48:55, 293.56s/it]  

Done with Ridge


 36%|███▌      | 5/14 [34:09<59:41, 397.90s/it]

Done with D_Tree_CLF


 43%|████▎     | 6/14 [34:19<37:31, 281.42s/it]

Done with Perceptron


 50%|█████     | 7/14 [34:29<23:21, 200.20s/it]

Done with PA_CLF


 57%|█████▋    | 8/14 [50:31<42:51, 428.54s/it]

Done with ExtraTree


 64%|██████▍   | 9/14 [1:53:24<1:59:19, 1431.81s/it]

Done with RandomForrest


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Done with Bagging_LR


 79%|███████▊  | 11/14 [3:09:06<1:38:48, 1976.12s/it]

Done with Bagging_Tree


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
 86%|████████▌ | 12/14 [3:59:28<1:16:19, 2289.78s/it]

Done with XGBoost


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
 93%|█████████▎| 13/14 [4:05:57<28:39, 1719.61s/it]  

Done with Adaboost_LR


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 14/14 [4:43:55<00:00, 1216.81s/it]

Done with Adaboost_Tree





Unnamed: 0,classifier,fit_time,accuracy,recall,precision,roc_auc,f1_macro
0,SGDClassifier(n_jobs=-1),2.279 +/- 0.607,0.935 +/- 0.0,0.0 +/- 0.0,0.0 +/- 0.0,0.761 +/- 0.002,0.483 +/- 0.0
1,"LinearSVC(max_iter=5000, random_state=42)",15.515 +/- 0.49,0.934 +/- 0.0,0.0 +/- 0.0,0.0 +/- 0.0,0.768 +/- 0.001,0.483 +/- 0.0
2,"LogisticRegression(n_jobs=-1, random_state=42)",260.413 +/- 19.064,0.934 +/- 0.0,0.0 +/- 0.0,0.023 +/- 0.011,0.762 +/- 0.001,0.483 +/- 0.0
3,RidgeClassifier(random_state=42),2.134 +/- 0.319,0.935 +/- 0.0,0.0 +/- 0.0,0.0 +/- 0.0,0.759 +/- 0.001,0.483 +/- 0.0
4,"DecisionTreeClassifier(max_depth=50, random_st...",127.962 +/- 6.832,0.874 +/- 0.001,0.166 +/- 0.003,0.131 +/- 0.002,0.544 +/- 0.001,0.539 +/- 0.001
5,Perceptron(n_jobs=-1),1.691 +/- 0.075,0.798 +/- 0.134,0.378 +/- 0.327,0.131 +/- 0.049,0.76 +/- 0.002,0.515 +/- 0.036
6,PassiveAggressiveClassifier(n_jobs=-1),1.912 +/- 0.078,0.854 +/- 0.16,0.175 +/- 0.349,0.039 +/- 0.041,0.763 +/- 0.001,0.473 +/- 0.02
7,ExtraTreesClassifier(max_depth=50),180.813 +/- 0.68,0.935 +/- 0.0,0.004 +/- 0.001,0.992 +/- 0.016,0.798 +/- 0.002,0.487 +/- 0.001
8,RandomForestClassifier(),745.914 +/- 5.133,0.935 +/- 0.0,0.004 +/- 0.0,0.924 +/- 0.007,0.772 +/- 0.001,0.487 +/- 0.0
9,BaggingClassifier(base_estimator=LogisticRegre...,193.574 +/- 2.8,0.934 +/- 0.0,0.0 +/- 0.0,0.02 +/- 0.009,0.762 +/- 0.001,0.483 +/- 0.0


In [78]:
# try classification of spoiler sentences vs safe sentences from spoiler reviews

# isolate spoiler reviews
df_spoiler = df_train[df_train['has_spoiler']==True]

#isolate labels sentences from spoiler reviews
sentences = []

for review in tqdm(df_spoiler2['sentence_text_spellchecked']):
    for sentence in review:
        sentences.append(sentence)

labels = []
for review in tqdm(df_spoiler2['sentence_labels']):
    for label in review:
        labels.append(label)


df_sentences = pd.DataFrame({
    'has_spoiler': labels,
    'sentences': sentences
})

# get sentence vectors from fasttext model
list_of_sent_vec = []
for sentence in tqdm(df_sentences['sentences']):
    vec = model_vec.get_sentence_vector(sentence)
    list_of_sent_vec.append(vec)
# write fasttext vector representation of sentence to dataframe    
list_col = tuple(range(100))
df_vec2 = pd.DataFrame(data =list_of_sent_vec2, columns = list_col )
# test different classifiers on the dataset
test_clfs(models2, df_vec2, df_sentences2['has_spoiler'])

100%|██████████| 1385613/1385613 [02:31<00:00, 9148.46it/s] 
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  7%|▋         | 1/14 [01:15<16:15, 75.00s/it]

Done with SGD


 14%|█▍        | 2/14 [06:38<29:55, 149.62s/it]

Done with LinearSVC


 14%|█▍        | 2/14 [32:01<3:12:06, 960.56s/it]


KeyboardInterrupt: 

## using fasttext pretrained models

In [74]:
# download model
import fasttext.util
#fasttext.util.download_model('en', if_exists='ignore');  # English
ft = fasttext.load_model('cc.en.300.bin')

In [78]:
df_train['lemmatized'] = df_train['lemmatized'].apply(lambda x: ' '.join(x))

In [79]:
df_train.head()

Unnamed: 0,has_spoiler,sentence_labels,sentence_text_spellchecked,lemmatized,best_genre
0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[read this review on my blog, , definitely bet...",read review blog definitely well book insurge...,young-adult
1,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[i was writing a comment that i realized would...,write comment realize probably end long qualif...,fiction
2,False,"[0, 0, 0, 0]",[charlie is turning and her younger sister is ...,charlie turn young sister get marry decide wri...,romance
3,False,"[0, 0, 0, 0]",[is more like it even though this has got to b...,like get implausible storyline read long time ...,romance
4,True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[review originally posted at step into fiction...,review originally post step fiction want reboo...,young-adult


In [81]:
# as the pretrained embedding vectors have a dimension of 300 we will set up the dataframe as follows
# first test with a genre specific datase
df_crime = df_train[df_train['best_genre']=='mystery, thriller, crime']
df_crime['lemmatized'] = df_crime['lemmatized'].apply(lambda x: ' '.join(x))
#df_train_vec = pd.DataFrame(columns = range(300))
#for review in tqdm(df_crime['lemmatized']):
#    vec = ft.get_sentence_vector(review)
#    df_train_vec = df_train_vec.append(pd.Series(vec), ignore_index=True )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_crime['lemmatized'] = df_crime['lemmatized'].apply(lambda x: ' '.join(x))


In [38]:
print(df_crime.shape)
print(df_train_vec.shape)

(48194, 4)
(48194, 300)


In [39]:
df_train_vec.to_csv('crime_reviews_vectors.csv')

In [40]:
df_crime.to_json('crime_reviews.json')

In [82]:
from sklearn.neighbors import KNeighborsClassifier
random_state = 42

models2 = dict()
models2['SGD'] = SGDClassifier(n_jobs = -1)
models2['LinearSVC'] = LinearSVC(random_state=random_state, max_iter = 5000) 
models2['LogReg'] = LogisticRegression(n_jobs=-1, random_state=random_state)
#models2['MultinomialNB'] = MultinomialNB(alpha = 1e-6)
models2['Ridge'] = RidgeClassifier(random_state=random_state)
models2['D_Tree_CLF'] = DecisionTreeClassifier(max_depth = 50, random_state=random_state)
models2['KNN'] = KNeighborsClassifier()


In [61]:
test_clfs(models2,df_train_vec, df_crime['has_spoiler'] )

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Done with SGD


  _warn_prf(average, modifier, msg_start, len(result))


Done with LinearSVC
Done with LogReg


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Done with Ridge
Done with D_Tree_CLF
Done with KNN


Unnamed: 0,classifier,fit_time,accuracy,recall,precision,roc_auc,f1_macro
0,SGDClassifier(n_jobs=-1),0.415 +/- 0.051,0.925 +/- 0.0,0.0 +/- 0.0,0.0 +/- 0.0,0.759 +/- 0.008,0.48 +/- 0.0
1,"LinearSVC(max_iter=5000, random_state=42)",1.718 +/- 0.043,0.924 +/- 0.0,0.0 +/- 0.0,0.0 +/- 0.0,0.767 +/- 0.008,0.48 +/- 0.0
2,"LogisticRegression(n_jobs=-1, random_state=42)",2.367 +/- 0.614,0.924 +/- 0.0,0.0 +/- 0.001,0.029 +/- 0.057,0.753 +/- 0.007,0.481 +/- 0.001
3,RidgeClassifier(random_state=42),0.212 +/- 0.018,0.925 +/- 0.0,0.0 +/- 0.0,0.0 +/- 0.0,0.764 +/- 0.007,0.48 +/- 0.0
4,"DecisionTreeClassifier(max_depth=50, random_st...",16.162 +/- 1.49,0.854 +/- 0.002,0.175 +/- 0.013,0.136 +/- 0.01,0.542 +/- 0.007,0.537 +/- 0.006
5,KNeighborsClassifier(),1.149 +/- 0.181,0.895 +/- 0.002,0.159 +/- 0.009,0.224 +/- 0.015,0.674 +/- 0.01,0.565 +/- 0.006


In [58]:
df_crime.head()

Unnamed: 0,has_spoiler,sentence_labels,sentence_text_spellchecked,lemmatized,best_genre
36,False,"[0, 0, 0, 0]","[holy cow, i need to read everything by this a...",holy cow need read author head ghost suck end ...,"mystery, thriller, crime"
41,True,[1],[the best one of the three book series partly ...,good book series partly fantastic element,"mystery, thriller, crime"
79,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[sasha stone knows her place first chair clari...,sasha stone know place chair clarinet class ox...,"mystery, thriller, crime"
101,False,"[0, 0]",[it was a pretty riveting story if oddly predi...,pretty riveting story oddly predictably remini...,"mystery, thriller, crime"
107,False,[0],[so strange but flynn is writing is always ama...,strange flynn writing amazing,"mystery, thriller, crime"


In [84]:
# try classification of spoiler sentences vs safe sentences from spoiler reviews

# isolate spoiler reviews
df_spoiler = df_crime[df_crime['has_spoiler']==True]
#
##isolate labels sentences from spoiler reviews
sentences = []
#
for review in tqdm(df_spoiler['sentence_text_spellchecked']):
    for sentence in review:
        sentences.append(sentence)
#
labels = []
for review in tqdm(df_spoiler['sentence_labels']):
    for label in review:
        labels.append(label)
#
#
df_sentences = pd.DataFrame({
    'has_spoiler': labels,
    'sentences': sentences
})

## get sentence vectors from fasttext model
df_spoiler_vec = pd.DataFrame(columns = range(300))
for sen in tqdm(df_sentences['sentences']):
    vec = ft.get_sentence_vector(sen)
    df_spoiler_vec = df_spoiler_vec.append(pd.Series(vec), ignore_index=True )

    # test different classifiers on the dataset
test_clfs(models2, df_spoiler_vec, df_sentences['has_spoiler'])

100%|██████████| 3633/3633 [00:00<00:00, 28306.73it/s]
100%|██████████| 3633/3633 [00:00<00:00, 77521.74it/s]
100%|██████████| 72719/72719 [57:11<00:00, 21.19it/s]  
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Done with SGD
Done with LinearSVC
Done with LogReg
Done with Ridge
Done with D_Tree_CLF
Done with KNN


Unnamed: 0,classifier,fit_time,accuracy,recall,precision,roc_auc,f1_macro
0,SGDClassifier(n_jobs=-1),1.845 +/- 0.139,0.724 +/- 0.0,0.0 +/- 0.0,0.0 +/- 0.0,0.733 +/- 0.008,0.42 +/- 0.0
1,"LinearSVC(max_iter=5000, random_state=42)",6.304 +/- 0.605,0.739 +/- 0.002,0.177 +/- 0.002,0.59 +/- 0.014,0.739 +/- 0.008,0.557 +/- 0.003
2,"LogisticRegression(n_jobs=-1, random_state=42)",7.887 +/- 1.151,0.739 +/- 0.002,0.182 +/- 0.003,0.588 +/- 0.014,0.739 +/- 0.008,0.559 +/- 0.003
3,RidgeClassifier(random_state=42),0.752 +/- 0.127,0.737 +/- 0.001,0.149 +/- 0.002,0.593 +/- 0.011,0.736 +/- 0.007,0.54 +/- 0.002
4,"DecisionTreeClassifier(max_depth=50, random_st...",62.302 +/- 3.782,0.653 +/- 0.004,0.396 +/- 0.006,0.377 +/- 0.007,0.574 +/- 0.005,0.572 +/- 0.005
5,KNeighborsClassifier(),3.208 +/- 2.735,0.709 +/- 0.004,0.392 +/- 0.009,0.468 +/- 0.008,0.676 +/- 0.002,0.616 +/- 0.004


Wenn classifying sentences from spoiler reviews using embedding vectors, KNN shows somewhat promising resuslts. Together with others it might be used for stacking. 

In [None]:
# grid search for KNN
from sklearn.model_selection import GridSearchCV
params = {'n_neighbors': [2,4,6,8,10,15], 
         'weights': ['uniform', 'distance']}

grid_KNN =GridSearchCV(KNeighborsClassifier(), param_grid = params, cv = 5, scoring = 'recall', verbose = 5)
grid_KNN.fit(df_spoiler_vec, df_sentences['has_spoiler'])

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] n_neighbors=2, weights=uniform ..................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ...... n_neighbors=2, weights=uniform, score=0.222, total= 8.2min
[CV] n_neighbors=2, weights=uniform ..................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  8.2min remaining:    0.0s


[CV] ...... n_neighbors=2, weights=uniform, score=0.226, total= 8.0min
[CV] n_neighbors=2, weights=uniform ..................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 16.2min remaining:    0.0s


[CV] ...... n_neighbors=2, weights=uniform, score=0.226, total= 8.1min
[CV] n_neighbors=2, weights=uniform ..................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 24.3min remaining:    0.0s


[CV] ...... n_neighbors=2, weights=uniform, score=0.222, total= 8.1min
[CV] n_neighbors=2, weights=uniform ..................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 32.5min remaining:    0.0s


[CV] ...... n_neighbors=2, weights=uniform, score=0.221, total= 7.8min
[CV] n_neighbors=2, weights=distance .................................
[CV] ..... n_neighbors=2, weights=distance, score=0.427, total= 8.0min
[CV] n_neighbors=2, weights=distance .................................
[CV] ..... n_neighbors=2, weights=distance, score=0.431, total= 8.0min
[CV] n_neighbors=2, weights=distance .................................
[CV] ..... n_neighbors=2, weights=distance, score=0.440, total= 8.1min
[CV] n_neighbors=2, weights=distance .................................
[CV] ..... n_neighbors=2, weights=distance, score=0.418, total= 8.0min
[CV] n_neighbors=2, weights=distance .................................
[CV] ..... n_neighbors=2, weights=distance, score=0.439, total= 8.0min
[CV] n_neighbors=4, weights=uniform ..................................
[CV] ...... n_neighbors=4, weights=uniform, score=0.264, total= 7.7min
[CV] n_neighbors=4, weights=uniform ..................................
[CV] .

In [None]:
from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier,RUSBoostClassifier

models_imblearn = {}
models_imblearn['BalancedRandomForestClassifier'] = BalancedRandomForestClassifier()
models_imblearn['Bagging-SVC'] = BalancedBaggingClassifier(base_estimator= LinearSVC())
models_imblearn['Bagging-KNN'] = BalancedBaggingClassifier(base_estimator= KNeighborsClassifier())
models_imblearn['Bagging-Logistic'] = BalancedBaggingClassifier(base_estimator= LogisticRegression())
models_imblearn['Bagging-Tree'] = BalancedBaggingClassifier(base_estimator= DecisionTreeClassifier())
models_imblearn['Bagging-Ridge'] = BalancedBaggingClassifier(base_estimator= RidgeClassifier())
models_imblearn['RUSBoost'] = RUSBoostClassifier()


test_clfs(models_imblearn, df_spoiler_vec, df_sentences['has_spoiler'])