In [1]:
import numpy as np
import pandas as pd
from imblearn.pipeline import Pipeline as imPipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report, f1_score, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.base import TransformerMixin
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [24]:
# from google.colab import drive
# drive.mount('/content/drive')
# root_path = "/content/drive/My Drive/notebooks"
# # For Google colab only

df = pd.read_csv(f"../new_clean_sm_100000.csv")
df = df[df['reviewText'].notna()]
df = df[~df['reviewText'].str.contains(".jpg|.png|.jpeg|.tiff|.gif|.bmp|.heif", regex=True, na=False)]

Split data

In [25]:
### Convert to a two class problem -  optional

df = df[df['overall'] != 2]
df = df[df['overall'] != 4]
df.loc[df['overall'] == 1, 'overall'] = 0
df.loc[df['overall'] == 3, 'overall'] = 1
df.loc[df['overall'] == 5, 'overall'] = 2

In [26]:
X = df.reviewText.values
y = df.overall.values

In [27]:
num = len(df)
X, y  = X[:num], y[:num]
# le = LabelEncoder()
# y = le.fit_transform(y)

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [29]:
def train_predict(pipline):
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    score = classification_report(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    accuracy = accuracy_score(y_test, y_pred)
    #score = roc_auc_score(y_test, y_pred, average="macro")
    #print('AUC: ', roc_auc_score(y_test, y_pred))
    f1 = f1_score(y_test, y_pred, average='macro')
    accuracy = accuracy_score(y_test, y_pred)
    print(score)
    print(f"f1 score is {f1}, accuracy is {accuracy}")

Vectorizers


In [36]:
count_vectoriser = Pipeline([
                ('countVectoriser', CountVectorizer())
            ])

tfidf_vectoriser = Pipeline([
                ('tfidfVectoriser', TfidfVectorizer(stop_words=STOP_WORDS
                                                    ))
            ])

class DenseTransformer(TransformerMixin):

    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X, y=None, **fit_params):
        return X.todense()

In [31]:
def make_balance_pipe(steps):

    """Used to make a pipeline using imblearn"""    

    x = list(range(len(steps)))
    pipeline= [(str(x),y) for x,y in zip(x,steps)]

    return imPipeline(pipeline)


### Stantard Models

Naive Bayes


In [37]:
naive_bayes = Pipeline([
    ('classifier', MultinomialNB()),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', naive_bayes)
])

# pipeline = make_balance_pipe([TfidfVectorizer(),
#                               RandomUnderSampler(),
#                               MultinomialNB()])


In [38]:
train_predict(pipeline)

  'stop_words.' % sorted(inconsistent))


              precision    recall  f1-score   support

         0.0       0.76      0.77      0.76     32880
         1.0       0.67      0.67      0.67     32922
         2.0       0.82      0.82      0.82     33069

    accuracy                           0.75     98871
   macro avg       0.75      0.75      0.75     98871
weighted avg       0.75      0.75      0.75     98871

f1 score is 0.7512440533658252, accuracy is 0.7511201464534596


Gaussian Naive Bayes

In [17]:
from sklearn.naive_bayes import GaussianNB

In [24]:
g_naive_bayes = Pipeline([
    ('classifier', GaussianNB()),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('to_dense', DenseTransformer()), 
    ('classifier', g_naive_bayes)
])

In [25]:
train_predict(pipeline)

  'stop_words.' % sorted(inconsistent))


              precision    recall  f1-score   support

           0       0.20      0.21      0.21      6581
           1       0.21      0.23      0.22      6641
           2       0.20      0.19      0.20      6575
           3       0.20      0.19      0.19      6573
           4       0.21      0.21      0.21      6572

    accuracy                           0.21     32942
   macro avg       0.21      0.21      0.21     32942
weighted avg       0.21      0.21      0.21     32942



Quadratic Discriminant Analysis

In [20]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [21]:
QDA = Pipeline([
    ('classifier', QuadraticDiscriminantAnalysis()),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('to_dense', DenseTransformer()), 
    ('classifier', QDA)
])

In [22]:
train_predict(pipeline)

              precision    recall  f1-score   support

           0       0.20      0.23      0.22      6581
           1       0.20      0.20      0.20      6641
           2       0.20      0.19      0.20      6575
           3       0.21      0.18      0.19      6573
           4       0.22      0.22      0.22      6572

    accuracy                           0.21     32942
   macro avg       0.21      0.21      0.21     32942
weighted avg       0.21      0.21      0.21     32942



Max Entropy

In [23]:
from sklearn.linear_model import LogisticRegression, SGDClassifier

In [24]:

max_ent = Pipeline([
    ('classifier', LogisticRegression(penalty='l2', C=1.0)),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', max_ent)
])

In [25]:
train_predict(pipeline)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


              precision    recall  f1-score   support

         1.0       0.21      0.20      0.20      6581
         2.0       0.20      0.20      0.20      6641
         3.0       0.20      0.20      0.20      6575
         4.0       0.20      0.19      0.19      6573
         5.0       0.21      0.23      0.22      6572

    accuracy                           0.21     32942
   macro avg       0.20      0.21      0.20     32942
weighted avg       0.20      0.21      0.20     32942

f1 score is 0.20494872650661872, accuracy is 0.2051484427175035


Logistic Regression

In [0]:
# params={"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}
# logistic_regression = Pipeline([
#     ('classifier', GridSearchCV(LogisticRegression(), params, cv=10)),
# ])

logistic_regression = Pipeline([
    ('classifier', LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42)),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', logistic_regression)
])

In [0]:
train_predict(pipeline)

              precision    recall  f1-score   support

           0       0.60      0.64      0.62    163059
           1       0.46      0.44      0.45    163100
           2       0.48      0.44      0.46    162996
           3       0.51      0.52      0.51    161668
           4       0.69      0.73      0.71    161597

    accuracy                           0.55    812420
   macro avg       0.55      0.55      0.55    812420
weighted avg       0.55      0.55      0.55    812420



 Decision Tree

In [26]:
from sklearn.tree import DecisionTreeClassifier

In [27]:
decision_tree = Pipeline([
    ('classifier', DecisionTreeClassifier()),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', decision_tree)
])

In [28]:

train_predict(pipeline)

              precision    recall  f1-score   support

         1.0       0.20      0.20      0.20      6581
         2.0       0.21      0.21      0.21      6641
         3.0       0.19      0.19      0.19      6575
         4.0       0.20      0.20      0.20      6573
         5.0       0.20      0.20      0.20      6572

    accuracy                           0.20     32942
   macro avg       0.20      0.20      0.20     32942
weighted avg       0.20      0.20      0.20     32942

f1 score is 0.2010128486000343, accuracy is 0.2010199745006375


### Bagging Models

Random Forest

In [0]:
# random_forest = Pipeline([
#     ('classifier', RandomizedSearchCV(estimator = RandomForestClassifier(), param_distributions = random_grid,
#                                       n_iter = 100, cv = 3, verbose=2, random_state=42,
#                                       n_jobs = -1)),
# ])

random_forest = Pipeline([
    ('classifier', RandomForestClassifier()),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', random_forest)
])

In [29]:
train_predict(pipeline)

              precision    recall  f1-score   support

         1.0       0.20      0.21      0.20      6581
         2.0       0.21      0.20      0.20      6641
         3.0       0.20      0.20      0.20      6575
         4.0       0.20      0.20      0.20      6573
         5.0       0.20      0.20      0.20      6572

    accuracy                           0.20     32942
   macro avg       0.20      0.20      0.20     32942
weighted avg       0.20      0.20      0.20     32942

f1 score is 0.2028050536791965, accuracy is 0.20281100115354259


### Boosting Models

XGBoost


In [0]:
# !wget https://s3-us-west-2.amazonaws.com/xgboost-wheels/xgboost-0.81-py2.py3-none-manylinux1_x86_64.whl
# !pip uninstall xgboost --yes
# !pip install xgboost-0.81-py2.py3-none-manylinux1_x86_64.whl

# For Google Colab Only

In [18]:
from xgboost import XGBClassifier

In [21]:
params = {
 "num_class": 5,
 "objective": "multi:softmax",
"n_jobs":-1}

xg_boost = Pipeline([
    ('classifier', XGBClassifier(**params))
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', xg_boost)
])


In [22]:
train_predict(pipeline)

              precision    recall  f1-score   support

         0.0       0.72      0.80      0.76    163167
         1.0       0.69      0.66      0.68    162314
         2.0       0.83      0.78      0.80    161558

    accuracy                           0.75    487039
   macro avg       0.75      0.75      0.75    487039
weighted avg       0.75      0.75      0.75    487039

f1 score is 0.7463895964121083, accuracy is 0.7466301466617663


AdaBoost


In [30]:
from sklearn.ensemble import AdaBoostClassifier


In [31]:

adaBoost = Pipeline([
    ('classifier', AdaBoostClassifier()),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', adaBoost)
])


In [32]:
train_predict(pipeline)

              precision    recall  f1-score   support

         1.0       0.19      0.05      0.08      6581
         2.0       0.21      0.09      0.12      6641
         3.0       0.20      0.71      0.31      6575
         4.0       0.20      0.03      0.06      6573
         5.0       0.22      0.13      0.17      6572

    accuracy                           0.20     32942
   macro avg       0.21      0.20      0.15     32942
weighted avg       0.21      0.20      0.15     32942

f1 score is 0.1455947998473248, accuracy is 0.2014449638759031



Voting classifier

In [0]:
from sklearn.ensemble import VotingClassifier



In [0]:
models = [
    MultinomialNB(),
    BernoulliNB(),
    LogisticRegression(),
    SGDClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier()
]

m_names = [m.__class__.__name__ for m in models]

models = list(zip(m_names, models))
vc = VotingClassifier(estimators=models)

### Voting Classifier

In [0]:



votingClassifier = Pipeline([
    ('classifier', vc),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', votingClassifier)
])




In [0]:
train_predict(pipeline)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
