##  Bag of Words, Bag of Popcorn

[Bag of Words, Bag of Popcorn](https://www.kaggle.com/c/word2vec-nlp-tutorial/data) competition. 

Use NLP feature pre-processing (using, SKLearn, Gensim, Spacy or Hugginface) to build the best classifier you can. Use a  feature pipeline, and gridsearch for your final model.

A succesful project should get 90% or more on a **holdout** dataset you kept for yourself.

In [1]:
################################################
# General imports
import pandas as pd
from bs4 import BeautifulSoup
from matplotlib import pyplot as plt
import numpy as np
import re


################################################
# gensim imports
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

################################################
# Sklearn imports
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline

from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.experimental import enable_halving_search_cv 

from sklearn.model_selection import train_test_split
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import GridSearchCV


from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer 
from sklearn.preprocessing import Normalizer





### Resources:


- doc2vec white paper: https://cs.stanford.edu/~quocle/paragraph_vector.pdf
- https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html
- https://github.com/RaRe-Technologies/gensim/blob/bcee414663bdcbdf6a58684531ee69c6949550bf/docs/src/gallery/howtos/run_doc2vec_imdb.py

In [2]:
# exercise 2
# Import the pandas package, then use the "read_csv" function to read
# the labeled training data
train_data = pd.read_csv('../data/labeledTrainData.tsv',delimiter='\t')
test_data = pd.read_csv('../data/testData.tsv',delimiter='\t')
unlabeled_train_data = pd.read_csv('../data/unlabeledTrainData.tsv',delimiter='\t', on_bad_lines='skip')
sample_submission = pd.read_csv('../data/sampleSubmission.csv')

In [6]:
def text_clean(text):

    text= BeautifulSoup(text, "html.parser")
    text = text.get_text(separator=" ")
    
    
    # # Remove mentions
    # mention_pattern = r'@\w+'
    # text = re.sub(mention_pattern, '', text)  

    #text in lower case
    text = str(text).lower()

    # #remove emojis
    # text = emoji.demojize(text)
    # text = re.sub(r':\w+', '', text)

    #remove words starting with '@'
    text = re.sub(r'@\w+', '', text)

    #remove points
    text = text.replace('.', '') 

    #remove 2 points
    text = text.replace(':', '')

    #remove urls
    text = re.sub('http\S+|www.\S+', '', text)
    
    #convert currency signs to words
    currency_dict = {
    '$': 'dollars',
    '£': 'pounds',
    '€': 'euros',
    '¥': 'yen',
    '₹': 'rupees',
    }
    for symbol, word in currency_dict.items():
        text = text.replace(symbol, word)

    # #convert numbers into words
    # matches = re.findall(r'\d+', text)
    # for match in matches:
    #     word = num2words(int(match))
    #     text = text.replace(match, word)

    #remove numeric values
    text = re.sub(r'\d+', '', text)

    #use the contractions to expand contractions (ex. doesn't to does not)
    # text = contractions.fix(text)

    # #remove punctuation
    # translator = str.maketrans('', '', string.punctuation)
    # text = text.translate(translator)
   

    #remove multiple spaces
    text = re.sub(' +', ' ', text)

    #remove <> and content inside
    text = re.sub('<.*?>+', '', text)

    #remove any ASCII character thats left like ¿
    text = text.replace("¿", '')
    text = re.sub(r'[\\\'"‘’“”()]', '', text)


    text= BeautifulSoup(text, "html.parser")
    text = text.get_text(separator=" ")
    return text

In [7]:
# not used

def remove_html(text):
    text= BeautifulSoup(text, "html.parser")
    text = text.get_text(separator=" ")
    return text

In [8]:
#trying more text cleaning


train_data['review'] = train_data['review'].apply(text_clean)
train_data['tokens'] = train_data['review'].apply(gensim.utils.simple_preprocess)


unlabeled_train_data['review'] = unlabeled_train_data['review'].apply(text_clean)
unlabeled_train_data['tokens'] = unlabeled_train_data['review'].apply(gensim.utils.simple_preprocess)



In [9]:
train_data.sentiment.value_counts()

1    12500
0    12500
Name: sentiment, dtype: int64

In [10]:
X = train_data['tokens']
y = train_data['sentiment']
# stratify as we want 0 and 1s in the train and test altho probably not necessary as there's an equal number of each
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y)
X_train, X_test, y_train, y_test = list(X_train), list(X_test), list(y_train), list(y_test)



In [11]:
# Larger unlabeled training set used to train the gensim model doc2vec
X_un = unlabeled_train_data['tokens']

In [14]:
import multiprocessing
cores = multiprocessing.cpu_count()

In [15]:
doc_train = [TaggedDocument(doc, [i]) for i, doc in enumerate(X_train)]
doc_test = [TaggedDocument(doc, [i]) for i, doc in enumerate(X_test)]
doc_unlabeled = [TaggedDocument(doc, [i]) for i, doc in enumerate(X_un)]
#DBOW is the doc2vec model analogous to Skip-gram model in word2vec
# dm = distrubuted bag of words
# vector_size = num of features
# epoch = number of times the model is run 
# min_count = set to 2 so it doesn't look at words that don't occur oftern
# eg a word needs to be seen twice to be included
model_dbow = Doc2Vec(vector_size=100, 
                     dm = 0, 
                     window=8, 
                     workers=cores,
                     epochs=40,
                     min_count=4)


In [16]:
full_doc = doc_test+doc_train+doc_unlabeled

In [17]:
# buld the vocabulary for the model
model_dbow.build_vocab(full_doc)


In [18]:
model_dbow.train(doc_unlabeled, 
                 total_examples=model_dbow.corpus_count,
                 epochs=model_dbow.epochs
                )

### Vectorizing the training and test sets

In [30]:
train_vec_list = []
for doc_id in range(len(doc_train)):
    inferred_vector = model_dbow.infer_vector(doc_train[doc_id].words)
    train_vec_list.append(inferred_vector)

In [31]:
test_vec_list= []
for doc_id in range(len(doc_test)):
    inferred_vector = model_dbow.infer_vector(doc_test[doc_id].words)
    test_vec_list.append(inferred_vector)

In [32]:
# not used

def doc_to_vector_list(doc,convert=True):
    
    '''# method to covert a doc using the doc2vec model
        if convert=True it will convert the doc (like pandas in_place)
        if convert=False it will return a copy'''
    temp_vector_list=[]
    if convert == True:
        for doc_id in range(len(doc)):
            inferred_vector = model_dbow.infer_vector(doc[doc_id].words)
            temp_vector_list.append(inferred_vector)
        doc = temp_vector_list
        return doc
    else:
        for doc_id in range(len(doc)):
            inferred_vector = model_dbow.infer_vector(doc[doc_id].words)
            temp_vector_list.append(inferred_vector)
        return temp_vector_list

In [33]:
# redefining X_train and X_test and the vectorized versions of themselves

X_train = train_vec_list
X_test = test_vec_list

### Initial classification models

In [34]:
logit_model = LogisticRegression(max_iter=5000)
train_fit_logit = logit_model.fit(X_train,y_train)           
y_pred_logit = logit_model.predict(X_test)

rand_forest_model = RandomForestClassifier(n_estimators=100,
                                           random_state=42,criterion='gini', 
                                           n_jobs = -1,
                                           oob_score=True)
rand_forest_model.fit(X_train,y_train)
y_pred_forest = rand_forest_model.predict(X_test)

Knn_model = KNeighborsClassifier()
Knn_model.fit(X_train,y_train)
y_pred_Knn = Knn_model.predict(X_test)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [35]:
print(classification_report(y_test,y_pred_logit))
print(classification_report(y_test,y_pred_forest))
print(classification_report(y_test,y_pred_Knn))

              precision    recall  f1-score   support

           0       0.88      0.88      0.88      3125
           1       0.88      0.88      0.88      3125

    accuracy                           0.88      6250
   macro avg       0.88      0.88      0.88      6250
weighted avg       0.88      0.88      0.88      6250

              precision    recall  f1-score   support

           0       0.83      0.84      0.84      3125
           1       0.84      0.83      0.84      3125

    accuracy                           0.84      6250
   macro avg       0.84      0.84      0.84      6250
weighted avg       0.84      0.84      0.84      6250

              precision    recall  f1-score   support

           0       0.79      0.79      0.79      3125
           1       0.79      0.79      0.79      3125

    accuracy                           0.79      6250
   macro avg       0.79      0.79      0.79      6250
weighted avg       0.79      0.79      0.79      6250



---
logit model has best acc. will try doing a gridsearch to improve it

will try PCA in the pipeline too

In [56]:
# ########
# #######
# #TAKES A LONG TIME TO RUN
# ########
# #######
# # Define a pipeline to search for the best combination of PCA truncation
# # and classifier regularization.
# pca = PCA()
# # Define a Standard Scaler to normalize inputs
# scaler = RobustScaler()

# # set the tolerance to a large value to make the example faster
# logistic = LogisticRegression(max_iter=1000, tol=0.1,warm_start=True)
# pipe = Pipeline(steps=[("scaler", scaler), ("pca", pca), ("logistic", logistic)])


# # Parameters of pipelines can be set using '__' separated parameter names:
# param_grid = {
#     "pca__n_components": [5, 15, 45, 60, 200],
#     "logistic__C": np.logspace(-8,8,8),
#     'logistic__penalty':['none', 'l1', 'l2', 'elasticnet'],
#     'logistic__solver':['newton-cg', 'lbfgs', 'liblinear']
# }

# search = GridSearchCV(pipe, param_grid, n_jobs=-1,verbose=True)
# search.fit(X_train, y_train)
# y_pred_logit_GS = search.predict(X_Test)
# print("Best parameter (CV score=%0.3f):" % search.best_score_)
# print(search.best_params_)
# print(classification_report(y_test,y_pred_logit_GS))

----
Preparing to test different pre-processing

In [38]:
# Transform training data
X1 = StandardScaler().fit_transform(X_train)
X2 = MinMaxScaler().fit_transform(X_train)
X3 = MaxAbsScaler().fit_transform(X_train)
X4 = RobustScaler().fit_transform(X_train)
X5 = PowerTransformer().fit_transform(X_train)
X6 = QuantileTransformer().fit_transform(X_train)
X7 = Normalizer().fit_transform(X_train)
# Transforming test data
X1_test = StandardScaler().fit_transform(X_test)
X2_test = MinMaxScaler().fit_transform(X_test)
X3_test = MaxAbsScaler().fit_transform(X_test)
X4_test = RobustScaler().fit_transform(X_test)
X5_test = PowerTransformer().fit_transform(X_test)
X6_test = QuantileTransformer().fit_transform(X_test)
X7_test = Normalizer().fit_transform(X_test)


X_lst = [X1,X2,X3,X4,X5,X6,X7]
X_test_list = [X1_test,X2_test,X3_test,X4_test,X5_test,X6_test,X7_test]
X_label = ['Standard','MinMax','MaxAbs','Robust','PowerTransformer','QuantileTransformer','Normalizer']


----
Testing different models

In [39]:
from sklearn.linear_model import SGDClassifier
sgdc_clf = SGDClassifier()
sgdc_clf = sgdc_clf.fit(X1, y_train)
y_pred_sgdc = sgdc_clf.predict(X1_test)
print(classification_report(y_test,y_pred_sgdc))

              precision    recall  f1-score   support

           0       0.87      0.89      0.88      3125
           1       0.88      0.87      0.88      3125

    accuracy                           0.88      6250
   macro avg       0.88      0.88      0.88      6250
weighted avg       0.88      0.88      0.88      6250



In [40]:

Lsvm_clf = svm.LinearSVC()
Lsvm_clf = Lsvm_clf.fit(X1, y_train)
y_pred_Lsvm = Lsvm_clf.predict(X1_test)
print(classification_report(y_test,y_pred_Lsvm))

              precision    recall  f1-score   support

           0       0.88      0.89      0.88      3125
           1       0.89      0.88      0.88      3125

    accuracy                           0.88      6250
   macro avg       0.88      0.88      0.88      6250
weighted avg       0.88      0.88      0.88      6250





In [41]:
from sklearn.tree import DecisionTreeClassifier
dtc_clf = DecisionTreeClassifier()
dtc_clf = dtc_clf.fit(X1, y_train)
y_pred_dtc = dtc_clf.predict(X1_test)
print(classification_report(y_test,y_pred_dtc))

              precision    recall  f1-score   support

           0       0.66      0.67      0.66      3125
           1       0.66      0.65      0.66      3125

    accuracy                           0.66      6250
   macro avg       0.66      0.66      0.66      6250
weighted avg       0.66      0.66      0.66      6250



In [42]:
from sklearn.naive_bayes import BernoulliNB
bnb_clf = BernoulliNB()
bnb_clf = bnb_clf.fit(X1, y_train)
y_pred_bnb = bnb_clf.predict(X1_test)
print(classification_report(y_test,y_pred_bnb))

              precision    recall  f1-score   support

           0       0.82      0.81      0.81      3125
           1       0.81      0.82      0.81      3125

    accuracy                           0.81      6250
   macro avg       0.81      0.81      0.81      6250
weighted avg       0.81      0.81      0.81      6250



In [44]:
import xgboost as xgb
xgb_clf = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
xgb_clf = xgb_clf.fit(X1, y_train)
y_pred_xgb = xgb_clf.predict(X1_test)
print(classification_report(y_test,y_pred_xgb))


              precision    recall  f1-score   support

           0       0.86      0.86      0.86      3125
           1       0.86      0.86      0.86      3125

    accuracy                           0.86      6250
   macro avg       0.86      0.86      0.86      6250
weighted avg       0.86      0.86      0.86      6250



In [45]:
clf = svm.SVC()
clf = clf.fit(X1, y_train)
y_pred_svm = clf.predict(X1_test)
print(classification_report(y_test,y_pred_svm))
print(accuracy_score(y_test,y_pred_svm))
#best one yet

              precision    recall  f1-score   support

           0       0.89      0.89      0.89      3125
           1       0.89      0.89      0.89      3125

    accuracy                           0.89      6250
   macro avg       0.89      0.89      0.89      6250
weighted avg       0.89      0.89      0.89      6250

0.88784


In [46]:
# Best logit model 

logit_model_best=LogisticRegression(C=0.001, penalty= 'l2', solver='newton-cg',max_iter=10000)
logit_model_best = logit_model_best.fit(X1,y_train)
y_pred_best_logit = logit_model_best.predict(X1_test)
print(classification_report(y_test,y_pred_best_logit ))

              precision    recall  f1-score   support

           0       0.88      0.89      0.88      3125
           1       0.89      0.88      0.88      3125

    accuracy                           0.88      6250
   macro avg       0.88      0.88      0.88      6250
weighted avg       0.88      0.88      0.88      6250



----
I went back and forth between the next few cells updating according the best results 
SVC was the best classifier I fond


In [47]:
# finding best scaling method to apply
# standard scaler looks good
for i,j in enumerate(X_lst):
    clf = svm.SVC()
    clf = clf.fit(j, y_train)
    y_pred_svm = clf.predict(X_test_list[i])
    print('***********')
    print(X_label[i])
    print(classification_report(y_test,y_pred_svm))
    print(accuracy_score(y_test,y_pred_svm))
    print('************')

***********
Standard
              precision    recall  f1-score   support

           0       0.89      0.89      0.89      3125
           1       0.89      0.89      0.89      3125

    accuracy                           0.89      6250
   macro avg       0.89      0.89      0.89      6250
weighted avg       0.89      0.89      0.89      6250

0.88784
************
***********
MinMax
              precision    recall  f1-score   support

           0       0.95      0.70      0.81      3125
           1       0.76      0.96      0.85      3125

    accuracy                           0.83      6250
   macro avg       0.86      0.83      0.83      6250
weighted avg       0.86      0.83      0.83      6250

0.83408
************
***********
MaxAbs
              precision    recall  f1-score   support

           0       0.90      0.87      0.88      3125
           1       0.88      0.90      0.89      3125

    accuracy                           0.89      6250
   macro avg       0.89    

In [48]:
# doing a full GridSearchCV takes a long time. While trying a bunch of hyperparameter values this method
# was quicker 
# Default values seem to be the best =/
clf = svm.SVC(max_iter=5000)

svc_param_grid = {
                'C': [0.1, 1, 10, 100],  
                'gamma': ['scale','auto',1, 0.1, 0.01, 0.001, 0.0001], 
                'kernel': ['linear','rbf','sigmoid']
                 }  
search = HalvingGridSearchCV(clf, svc_param_grid, verbose =True,n_jobs=-1,scoring='accuracy')
search.fit(X1, y_train)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)
y_pred_HGS = search.predict(X1_test)
print(classification_report(y_test,y_pred_HGS))
print(accuracy_score(y_test,y_pred_HGS))

n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 5
min_resources_: 231
max_resources_: 18750
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 84
n_resources: 231
Fitting 5 folds for each of 84 candidates, totalling 420 fits
----------
iter: 1
n_candidates: 28
n_resources: 693
Fitting 5 folds for each of 28 candidates, totalling 140 fits
----------
iter: 2
n_candidates: 10
n_resources: 2079
Fitting 5 folds for each of 10 candidates, totalling 50 fits
----------
iter: 3
n_candidates: 4
n_resources: 6237
Fitting 5 folds for each of 4 candidates, totalling 20 fits
----------
iter: 4
n_candidates: 2
n_resources: 18711
Fitting 5 folds for each of 2 candidates, totalling 10 fits




Best parameter (CV score=0.887):
{'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}
              precision    recall  f1-score   support

           0       0.89      0.89      0.89      3125
           1       0.89      0.89      0.89      3125

    accuracy                           0.89      6250
   macro avg       0.89      0.89      0.89      6250
weighted avg       0.89      0.89      0.89      6250

0.888


In [49]:
# Using the finalised hyperparameters I did a full GridSearch
# Tried these parameters but default were better
# clf = svm.SVC(max_iter=5000)

# svc_param_grid = {
#                 'C': [0.1, 1, 10, 100],  
#                 'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
#                 'kernel': ['linear','rbf','sigmoid']
#                  }  
   
# search = GridSearchCV(clf, svc_param_grid, refit = True, verbose =True,n_jobs=-1,scoring='accuracy')
# search.fit(X1, y_train)
# print("Best parameter (CV score=%0.3f):" % search.best_score_)
# print(search.best_params_)
# y_pred_GS = search.predict(X1_test)
# print(classification_report(y_test,y_pred_GS))
# print(accuracy_score(y_test,y_pred_GS))

Fitting 5 folds for each of 32 candidates, totalling 160 fits

Best parameter (CV score=0.876):

{'C': 333.334, 'gamma': 0.001, 'kernel': 'rbf'}

In [50]:
# Default values gave best results  after all that hyperparameter tuning
# I guess they're defaults for a reason . . . 
clf = svm.SVC()
clf = clf.fit(X1, y_train)
y_pred_svm = clf.predict(X1_test)
print(classification_report(y_test,y_pred_svm))
print(accuracy_score(y_test,y_pred_svm))

              precision    recall  f1-score   support

           0       0.89      0.89      0.89      3125
           1       0.89      0.89      0.89      3125

    accuracy                           0.89      6250
   macro avg       0.89      0.89      0.89      6250
weighted avg       0.89      0.89      0.89      6250

0.88784


### Preparing holdout data


In [51]:
test_data['review'] = test_data['review'].apply(text_clean)
test_data['tokens'] = test_data['review'].apply(gensim.utils.simple_preprocess)

X_holdout = test_data['tokens']
doc_holdout = [TaggedDocument(doc, [i]) for i, doc in enumerate(X_holdout)]



In [52]:
holdout_vec_list = []
for doc_id in range(len(doc_holdout)):
    inferred_vector = model_dbow.infer_vector(doc_holdout[doc_id].words)
    holdout_vec_list.append(inferred_vector)

In [53]:
X_holdout = holdout_vec_list
X_holdout = StandardScaler().fit_transform(X_holdout)

In [55]:

clf = svm.SVC()
clf = clf.fit(X1, y_train)
y_pred_holdout = clf.predict(X_holdout)
y_pred_holdout

array([1, 0, 0, ..., 0, 1, 1])

In [57]:
sample_submission = pd.read_csv('../data/sampleSubmission.csv')

In [58]:
sample_submission

Unnamed: 0,id,sentiment
0,12311_10,0
1,8348_2,0
2,5828_4,0
3,7186_2,0
4,12128_7,0
...,...,...
24995,2155_10,0
24996,59_10,0
24997,2531_1,0
24998,7772_8,0


In [59]:
sample_submission['sentiment']= y_pred_holdout

In [61]:
sample_submission.to_csv('sample_sub_CM.csv',index=False)

----
This submission gets an 89 on kaggle

My disappointment is immeasurable and my day is ruined