In [0]:
import numpy as np
import pandas as pd
from imblearn.pipeline import Pipeline as imPipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.base import TransformerMixin




In [0]:
# from google.colab import drive
# drive.mount('/content/drive')
# root_path = "/content/drive/My Drive/notebooks"
# # For Google colab only

df = pd.read_csv(f"../kindle_reviews_2million.csv")
df = df[df['reviewText'].notna()]
df = df[~df['reviewText'].str.contains(".jpg|.png|.jpeg|.tiff|.gif|.bmp|.heif", regex=True, na=False)]

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  interactivity=interactivity, compiler=compiler, result=result)


Split data

In [0]:
### Convert to a two class problem -  optional

# df = df[df['overall'] != 3]
# df['overall'] = np.where(df['overall'] > 3, 1, 0)

In [0]:
X = df.reviewText.values
y = df.overall.values

In [0]:
num = len(df)
X, y  = X[:num], y[:num]
le = LabelEncoder()
y = le.fit_transform(y)

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [0]:
def train_predict(pipline):
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    score = classification_report(y_test, y_pred)
    #score = roc_auc_score(y_test, y_pred, average="macro")
    #print('AUC: ', roc_auc_score(y_test, y_pred))
    print(score)

Vectorizers


In [0]:
count_vectoriser = Pipeline([
                ('countVectoriser', CountVectorizer())
            ])

tfidf_vectoriser = Pipeline([
                ('tfidfVectoriser', TfidfVectorizer(stop_words=STOP_WORDS
                                                    ))
            ])

class DenseTransformer(TransformerMixin):

    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X, y=None, **fit_params):
        return X.todense()

In [0]:
def make_balance_pipe(steps):

    """Used to make a pipeline using imblearn"""    

    x = list(range(len(steps)))
    pipeline= [(str(x),y) for x,y in zip(x,steps)]

    return imPipeline(pipeline)


### Stantard Models

Naive Bayes


In [0]:
naive_bayes = Pipeline([
    ('classifier', MultinomialNB()),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', naive_bayes)
])

# pipeline = make_balance_pipe([TfidfVectorizer(),
#                               RandomUnderSampler(),
#                               MultinomialNB()])


In [0]:
train_predict(pipeline)

  'stop_words.' % sorted(inconsistent))


              precision    recall  f1-score   support

           0       0.58      0.58      0.58    163059
           1       0.42      0.46      0.44    163100
           2       0.46      0.39      0.42    162996
           3       0.47      0.43      0.45    161668
           4       0.63      0.72      0.67    161597

    accuracy                           0.51    812420
   macro avg       0.51      0.52      0.51    812420
weighted avg       0.51      0.51      0.51    812420



Gaussian Naive Bayes

In [0]:
from sklearn.naive_bayes import GaussianNB

In [0]:
g_naive_bayes = Pipeline([
    ('classifier', GaussianNB()),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('to_dense', DenseTransformer()), 
    ('classifier', g_naive_bayes)
])

In [0]:
train_predict(pipeline)

  'stop_words.' % sorted(inconsistent))


Quadratic Discriminant Analysis

In [0]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [0]:
QDA = Pipeline([
    ('classifier', QuadraticDiscriminantAnalysis()),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('to_dense', DenseTransformer()), 
    ('classifier', QDA)
])

In [0]:
train_predict(pipeline)

Max Entropy

In [0]:
from sklearn.linear_model import LogisticRegression, SGDClassifier

In [0]:

max_ent = Pipeline([
    ('classifier', LogisticRegression(penalty='l2', C=1.0)),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', max_ent)
])

In [0]:
train_predict(pipeline)

  'stop_words.' % sorted(inconsistent))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


              precision    recall  f1-score   support

           0       0.60      0.62      0.61    163059
           1       0.45      0.46      0.45    163100
           2       0.47      0.42      0.44    162996
           3       0.50      0.49      0.49    161668
           4       0.67      0.71      0.69    161597

    accuracy                           0.54    812420
   macro avg       0.54      0.54      0.54    812420
weighted avg       0.54      0.54      0.54    812420



Logistic Regression

In [0]:
# params={"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}
# logistic_regression = Pipeline([
#     ('classifier', GridSearchCV(LogisticRegression(), params, cv=10)),
# ])

logistic_regression = Pipeline([
    ('classifier', LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42)),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', logistic_regression)
])

In [0]:
train_predict(pipeline)

              precision    recall  f1-score   support

           0       0.60      0.64      0.62    163059
           1       0.46      0.44      0.45    163100
           2       0.48      0.44      0.46    162996
           3       0.51      0.52      0.51    161668
           4       0.69      0.73      0.71    161597

    accuracy                           0.55    812420
   macro avg       0.55      0.55      0.55    812420
weighted avg       0.55      0.55      0.55    812420



 Decision Tree

In [0]:
from sklearn.tree import DecisionTreeClassifier

In [0]:
decision_tree = Pipeline([
    ('classifier', DecisionTreeClassifier(),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', decision_tree)
])

### Bagging Models

Random Forest

In [0]:
# random_forest = Pipeline([
#     ('classifier', RandomizedSearchCV(estimator = RandomForestClassifier(), param_distributions = random_grid,
#                                       n_iter = 100, cv = 3, verbose=2, random_state=42,
#                                       n_jobs = -1)),
# ])

random_forest = Pipeline([
    ('classifier', RandomForestClassifier()),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', random_forest)
])

In [0]:
train_predict(pipeline)

### Boosting Models

XGBoost


In [0]:
# !wget https://s3-us-west-2.amazonaws.com/xgboost-wheels/xgboost-0.81-py2.py3-none-manylinux1_x86_64.whl
# !pip uninstall xgboost --yes
# !pip install xgboost-0.81-py2.py3-none-manylinux1_x86_64.whl

# For Google Colab Only

In [0]:
from xgboost import XGBClassifier

In [0]:
params = {
 "num_class": 5,
 "objective": "multi:softmax"}

xg_boost = Pipeline([
    ('classifier', XGBClassifier(**params))
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', xg_boost)
])


In [0]:
train_predict(pipeline)

  'stop_words.' % sorted(inconsistent))


AdaBoost


In [0]:
from sklearn.ensemble import AdaBoostClassifier


In [0]:

adaBoost = Pipeline([
    ('classifier', AdaBoostClassifier()),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', adaBoost)
])


In [0]:
train_predict(pipeline)

  'stop_words.' % sorted(inconsistent))


              precision    recall  f1-score   support

           0       0.39      0.67      0.49    163059
           1       0.39      0.21      0.27    163100
           2       0.38      0.25      0.31    162996
           3       0.37      0.36      0.36    161668
           4       0.55      0.62      0.58    161597

    accuracy                           0.42    812420
   macro avg       0.42      0.42      0.40    812420
weighted avg       0.42      0.42      0.40    812420




Voting classifier

In [0]:
from sklearn.ensemble import VotingClassifier



In [0]:
models = [
    MultinomialNB(),
    BernoulliNB(),
    LogisticRegression(),
    SGDClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier()
]

m_names = [m.__class__.__name__ for m in models]

models = list(zip(m_names, models))
vc = VotingClassifier(estimators=models)

### Voting Classifier

In [0]:



votingClassifier = Pipeline([
    ('classifier', vc),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', votingClassifier)
])




In [0]:
train_predict(pipeline)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
