In [20]:
import numpy as np
import pandas as pd
from imblearn.pipeline import Pipeline as imPipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report, f1_score, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.base import TransformerMixin
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from IPython.display import display, HTML


In [2]:
# from google.colab import drive
# drive.mount('/content/drive')
# root_path = "/content/drive/My Drive/notebooks"
# # For Google colab only

df = pd.read_csv(f"../new_clean_sm_100000.csv")
df = df[df['reviewText'].notna()]
df = df[~df['reviewText'].str.contains(".jpg|.png|.jpeg|.tiff|.gif|.bmp|.heif", regex=True, na=False)]

Split data

In [3]:
### Convert to a two class problem -  optional

df = df[df['overall'] != 2]
df = df[df['overall'] != 4]
df.loc[df['overall'] == 1, 'overall'] = 0
df.loc[df['overall'] == 3, 'overall'] = 1
df.loc[df['overall'] == 5, 'overall'] = 2

In [4]:
X = df.reviewText.values
y = df.overall.values

In [5]:
num = len(df)
X, y  = X[:num], y[:num]
# le = LabelEncoder()
# y = le.fit_transform(y)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [21]:
def train_predict(pipline):
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    score = classification_report(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    accuracy = accuracy_score(y_test, y_pred)
    print(score)
    print(f"f1 score is {f1}, accuracy is {accuracy}")

def train_predict_all(pipeline):
    out = pipeline.fit(X, y).cv_results_
    results_df = pd.DataFrame({'rank': out['rank_test_score'],
                          'params': out['params'],
                           'cv score (mean)': out['mean_test_score']})
    results_df = results_df.sort_values(by=['rank'], ascending=True)
    pd.set_option('display.max_colwidth',100)
    display(HTML(results_df.to_html()))
    return out
    # score = classification_report(y_test, y_pred)
    # f1 = f1_score(y_test, y_pred, average='macro')
    # accuracy = accuracy_score(y_test, y_pred)
    # print(score)
    # print(f"f1 score is {f1}, accuracy is {accuracy}")


Vectorizers


In [25]:


count_vectoriser = Pipeline([
                ('countVectoriser', CountVectorizer())
            ])

tfidf_vectoriser = Pipeline([
                ('tfidfVectoriser', TfidfVectorizer(stop_words=STOP_WORDS
                                                    ))
            ])


### Stantard Models

Naive Bayes -  Fine Tune


In [26]:
param_grid = {'vectoriser__tfidfVectoriser__ngram_range': [(1,1),(1,2)],
              'classifier__classifier__alpha': [1e-5, 1e-4, 1e-2, 1e-1,1]}


# param_grid = {'vectoriser__tfidfVectoriser__ngram_range': [(1,1)],
#               'classifier__classifier__alpha': [1e-5]}

naive_bayes = Pipeline([
    ('classifier', MultinomialNB()),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', naive_bayes)
])


In [27]:
gs_mnb = GridSearchCV(pipeline, param_grid, cv=5, verbose=2, n_jobs=3)
final_results = train_predict_all(gs_mnb)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:  7.8min
[Parallel(n_jobs=3)]: Done  50 out of  50 | elapsed: 12.0min finished
  'stop_words.' % sorted(inconsistent))


Unnamed: 0,rank,params,cv score (mean)
9,1,"{'classifier__classifier__alpha': 1, 'vectoriser__tfidfVectoriser__ngram_range': (1, 2)}",0.782966
7,2,"{'classifier__classifier__alpha': 0.1, 'vectoriser__tfidfVectoriser__ngram_range': (1, 2)}",0.778564
5,3,"{'classifier__classifier__alpha': 0.01, 'vectoriser__tfidfVectoriser__ngram_range': (1, 2)}",0.759132
8,4,"{'classifier__classifier__alpha': 1, 'vectoriser__tfidfVectoriser__ngram_range': (1, 1)}",0.752286
6,5,"{'classifier__classifier__alpha': 0.1, 'vectoriser__tfidfVectoriser__ngram_range': (1, 1)}",0.745731
4,6,"{'classifier__classifier__alpha': 0.01, 'vectoriser__tfidfVectoriser__ngram_range': (1, 1)}",0.737958
2,7,"{'classifier__classifier__alpha': 0.0001, 'vectoriser__tfidfVectoriser__ngram_range': (1, 1)}",0.726656
0,8,"{'classifier__classifier__alpha': 1e-05, 'vectoriser__tfidfVectoriser__ngram_range': (1, 1)}",0.723352
3,9,"{'classifier__classifier__alpha': 0.0001, 'vectoriser__tfidfVectoriser__ngram_range': (1, 2)}",0.722207
1,10,"{'classifier__classifier__alpha': 1e-05, 'vectoriser__tfidfVectoriser__ngram_range': (1, 2)}",0.710515


Max Entropy

In [30]:
from sklearn.linear_model import LogisticRegression, SGDClassifier

In [24]:

max_ent = Pipeline([
    ('classifier', LogisticRegression(penalty='l2', C=1.0)),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', max_ent)
])

In [25]:
train_predict(pipeline)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


              precision    recall  f1-score   support

         1.0       0.21      0.20      0.20      6581
         2.0       0.20      0.20      0.20      6641
         3.0       0.20      0.20      0.20      6575
         4.0       0.20      0.19      0.19      6573
         5.0       0.21      0.23      0.22      6572

    accuracy                           0.21     32942
   macro avg       0.20      0.21      0.20     32942
weighted avg       0.20      0.21      0.20     32942

f1 score is 0.20494872650661872, accuracy is 0.2051484427175035


Logistic Regression - Fine Tune

In [39]:
param_grid={"classifier__classifier__C":np.logspace(-3,3,7), "classifier__classifier__penalty":["l1","l2"],
             'vectoriser__tfidfVectoriser__ngram_range': [(1,1),(1,2)]}

logistic_regression = Pipeline([
    ('classifier', LogisticRegression()),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', logistic_regression)
])

In [40]:
gs_mnb = RandomizedSearchCV(pipeline, param_grid, cv=5, verbose=2, n_jobs=3)
results = train_predict_all(gs_mnb)


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker. The exit codes of the workers are {SIGKILL(-9)}

 Decision Tree

In [26]:
from sklearn.tree import DecisionTreeClassifier

In [27]:
decision_tree = Pipeline([
    ('classifier', DecisionTreeClassifier()),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', decision_tree)
])

In [28]:

train_predict(pipeline)

              precision    recall  f1-score   support

         1.0       0.20      0.20      0.20      6581
         2.0       0.21      0.21      0.21      6641
         3.0       0.19      0.19      0.19      6575
         4.0       0.20      0.20      0.20      6573
         5.0       0.20      0.20      0.20      6572

    accuracy                           0.20     32942
   macro avg       0.20      0.20      0.20     32942
weighted avg       0.20      0.20      0.20     32942

f1 score is 0.2010128486000343, accuracy is 0.2010199745006375


### Bagging Models

Random Forest

In [0]:
# random_forest = Pipeline([
#     ('classifier', RandomizedSearchCV(estimator = RandomForestClassifier(), param_distributions = random_grid,
#                                       n_iter = 100, cv = 3, verbose=2, random_state=42,
#                                       n_jobs = -1)),
# ])

random_forest = Pipeline([
    ('classifier', RandomForestClassifier()),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', random_forest)
])

In [29]:
train_predict(pipeline)

              precision    recall  f1-score   support

         1.0       0.20      0.21      0.20      6581
         2.0       0.21      0.20      0.20      6641
         3.0       0.20      0.20      0.20      6575
         4.0       0.20      0.20      0.20      6573
         5.0       0.20      0.20      0.20      6572

    accuracy                           0.20     32942
   macro avg       0.20      0.20      0.20     32942
weighted avg       0.20      0.20      0.20     32942

f1 score is 0.2028050536791965, accuracy is 0.20281100115354259


### Boosting Models

XGBoost


In [0]:
# !wget https://s3-us-west-2.amazonaws.com/xgboost-wheels/xgboost-0.81-py2.py3-none-manylinux1_x86_64.whl
# !pip uninstall xgboost --yes
# !pip install xgboost-0.81-py2.py3-none-manylinux1_x86_64.whl

# For Google Colab Only

In [18]:
from xgboost import XGBClassifier

In [21]:
params = {
 "num_class": 5,
 "objective": "multi:softmax",
"n_jobs":-1}

xg_boost = Pipeline([
    ('classifier', XGBClassifier(**params))
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', xg_boost)
])


In [22]:
train_predict(pipeline)

              precision    recall  f1-score   support

         0.0       0.72      0.80      0.76    163167
         1.0       0.69      0.66      0.68    162314
         2.0       0.83      0.78      0.80    161558

    accuracy                           0.75    487039
   macro avg       0.75      0.75      0.75    487039
weighted avg       0.75      0.75      0.75    487039

f1 score is 0.7463895964121083, accuracy is 0.7466301466617663


AdaBoost


In [30]:
from sklearn.ensemble import AdaBoostClassifier


In [31]:

adaBoost = Pipeline([
    ('classifier', AdaBoostClassifier()),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', adaBoost)
])


In [32]:
train_predict(pipeline)


              precision    recall  f1-score   support

         1.0       0.19      0.05      0.08      6581
         2.0       0.21      0.09      0.12      6641
         3.0       0.20      0.71      0.31      6575
         4.0       0.20      0.03      0.06      6573
         5.0       0.22      0.13      0.17      6572

    accuracy                           0.20     32942
   macro avg       0.21      0.20      0.15     32942
weighted avg       0.21      0.20      0.15     32942

f1 score is 0.1455947998473248, accuracy is 0.2014449638759031


In [0]:
train_predict(pipeline)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [35]:
from pprint import pprint
pprint(gs_mnb.estimator.get_params().keys())

dict_keys(['memory', 'steps', 'verbose', 'vectoriser', 'classifier', 'vectoriser__memory', 'vectoriser__steps', 'vectoriser__verbose', 'vectoriser__tfidfVectoriser', 'vectoriser__tfidfVectoriser__analyzer', 'vectoriser__tfidfVectoriser__binary', 'vectoriser__tfidfVectoriser__decode_error', 'vectoriser__tfidfVectoriser__dtype', 'vectoriser__tfidfVectoriser__encoding', 'vectoriser__tfidfVectoriser__input', 'vectoriser__tfidfVectoriser__lowercase', 'vectoriser__tfidfVectoriser__max_df', 'vectoriser__tfidfVectoriser__max_features', 'vectoriser__tfidfVectoriser__min_df', 'vectoriser__tfidfVectoriser__ngram_range', 'vectoriser__tfidfVectoriser__norm', 'vectoriser__tfidfVectoriser__preprocessor', 'vectoriser__tfidfVectoriser__smooth_idf', 'vectoriser__tfidfVectoriser__stop_words', 'vectoriser__tfidfVectoriser__strip_accents', 'vectoriser__tfidfVectoriser__sublinear_tf', 'vectoriser__tfidfVectoriser__token_pattern', 'vectoriser__tfidfVectoriser__tokenizer', 'vectoriser__tfidfVectoriser__use_id