In [36]:

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
import pandas as pd


In [9]:
df = pd.read_csv("../cleaned_sm.csv")
df = df.dropna()


Split data

In [10]:
X = df.reviewText.values
y = df.overall.values

In [None]:
num = len(df)
X, y  = X[:num], y[:num]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [30]:

def train_predict(pipline):
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    score = classification_report(y_test, y_pred)
    print(score)


Vectorizers


In [None]:
count_vectoriser = Pipeline([
                ('countVectoriser', CountVectorizer(max_features=1000, binary=True))
            ])

tfidf_vectoriser = Pipeline([
                ('tfidfVectoriser', TfidfVectorizer(max_features=1000, binary=True))
            ])


Naive Bayes


In [None]:
naive_bayes = Pipeline([
    ('classifier', MultinomialNB()),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', naive_bayes)
])


In [22]:
train_predict(pipeline)

Max Entropy

In [27]:
from sklearn.linear_model import LogisticRegression

In [28]:

max_ent = Pipeline([
    ('classifier', LogisticRegression(penalty='l2', C=1.0)),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', max_ent)
])

In [31]:
train_predict(pipeline)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


              precision    recall  f1-score   support

           1       0.47      0.30      0.37      7500
           2       0.38      0.18      0.24     11286
           3       0.46      0.31      0.37     31451
           4       0.50      0.32      0.39     83840
           5       0.72      0.90      0.80    188902

    accuracy                           0.65    322979
   macro avg       0.50      0.40      0.43    322979
weighted avg       0.62      0.65      0.62    322979



Logistic Regression

In [35]:
params={"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}
logistic_regression = Pipeline([
    ('classifier', GridSearchCV(LogisticRegression(), params, cv=10)),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', logistic_regression)
])

In [33]:
train_predict(pipeline)




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


              precision    recall  f1-score   support

           1       0.47      0.30      0.37      7500
           2       0.38      0.18      0.24     11286
           3       0.46      0.31      0.37     31451
           4       0.50      0.32      0.39     83840
           5       0.72      0.90      0.80    188902

    accuracy                           0.65    322979
   macro avg       0.50      0.40      0.43    322979
weighted avg       0.62      0.65      0.62    322979



Random Forest

In [37]:
from sklearn.model_selection import RandomizedSearchCV# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [38]:
random_forest = Pipeline([
    ('classifier', RandomizedSearchCV(estimator = RandomForestClassifier(), param_distributions = random_grid, 
                                      n_iter = 100, cv = 3, verbose=2, random_state=42, 
                                      n_jobs = -1)),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', random_forest)
])

In [39]:
train_predict(pipeline)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


KeyboardInterrupt: 