In [2]:
import numpy as np
import pandas as pd
from IPython.display import display, HTML
from scipy import stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from spacy.lang.en.stop_words import STOP_WORDS


In [3]:
# from google.colab import drive
# drive.mount('/content/drive')
# root_path = "/content/drive/My Drive/notebooks"
# # For Google colab only

df = pd.read_csv(f"../new_clean_sm_100000.csv")
df = df[df['reviewText'].notna()]
df = df[~df['reviewText'].str.contains(".jpg|.png|.jpeg|.tiff|.gif|.bmp|.heif", regex=True, na=False)]

Split data

In [4]:
### Convert to a two class problem -  optional

df = df[df['overall'] != 2]
df = df[df['overall'] != 4]
df.loc[df['overall'] == 1, 'overall'] = 0
df.loc[df['overall'] == 3, 'overall'] = 1
df.loc[df['overall'] == 5, 'overall'] = 2

In [5]:
X = df.reviewText.values
y = df.overall.values

In [6]:
num = len(df)
X, y  = X[:num], y[:num]
# le = LabelEncoder()
# y = le.fit_transform(y)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [8]:
def train_predict(pipline):
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    score = classification_report(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    accuracy = accuracy_score(y_test, y_pred)
    print(score)
    print(f"f1 score is {f1}, accuracy is {accuracy}")

def train_predict_all(pipeline):
    out = pipeline.fit(X, y).cv_results_
    results_df = pd.DataFrame({'rank': out['rank_test_score'],
                          'params': out['params'],
                           'cv score (mean)': out['mean_test_score']})
    results_df = results_df.sort_values(by=['rank'], ascending=True)
    pd.set_option('display.max_colwidth',100)
    display(HTML(results_df.to_html()))
    return out
    # score = classification_report(y_test, y_pred)
    # f1 = f1_score(y_test, y_pred, average='macro')
    # accuracy = accuracy_score(y_test, y_pred)
    # print(score)
    # print(f"f1 score is {f1}, accuracy is {accuracy}")


Vectorizers


In [None]:


count_vectoriser = Pipeline([
                ('countVectoriser', CountVectorizer())
            ])

tfidf_vectoriser = Pipeline([
                ('tfidfVectoriser', TfidfVectorizer(stop_words=STOP_WORDS, ngram_range = (1,2)
                                                    ))
            ])


### Stantard Models

Naive Bayes -  Fine Tune


In [None]:
param_grid = {'vectoriser__tfidfVectoriser__ngram_range': [(1,1),(1,2)],
              'classifier__classifier__alpha': [1e-5, 1e-4, 1e-2, 1e-1,1]}


# param_grid = {'vectoriser__tfidfVectoriser__ngram_range': [(1,1)],
#               'classifier__classifier__alpha': [1e-5]}

naive_bayes = Pipeline([
    ('classifier', MultinomialNB()),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', naive_bayes)
])


In [None]:
gs_mnb = GridSearchCV(pipeline, param_grid, cv=5, verbose=2, n_jobs=3)
final_results = train_predict_all(gs_mnb)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:  7.8min
[Parallel(n_jobs=3)]: Done  50 out of  50 | elapsed: 12.0min finished
  'stop_words.' % sorted(inconsistent))


Unnamed: 0,rank,params,cv score (mean)
9,1,"{'classifier__classifier__alpha': 1, 'vectoriser__tfidfVectoriser__ngram_range': (1, 2)}",0.782966
7,2,"{'classifier__classifier__alpha': 0.1, 'vectoriser__tfidfVectoriser__ngram_range': (1, 2)}",0.778564
5,3,"{'classifier__classifier__alpha': 0.01, 'vectoriser__tfidfVectoriser__ngram_range': (1, 2)}",0.759132
8,4,"{'classifier__classifier__alpha': 1, 'vectoriser__tfidfVectoriser__ngram_range': (1, 1)}",0.752286
6,5,"{'classifier__classifier__alpha': 0.1, 'vectoriser__tfidfVectoriser__ngram_range': (1, 1)}",0.745731
4,6,"{'classifier__classifier__alpha': 0.01, 'vectoriser__tfidfVectoriser__ngram_range': (1, 1)}",0.737958
2,7,"{'classifier__classifier__alpha': 0.0001, 'vectoriser__tfidfVectoriser__ngram_range': (1, 1)}",0.726656
0,8,"{'classifier__classifier__alpha': 1e-05, 'vectoriser__tfidfVectoriser__ngram_range': (1, 1)}",0.723352
3,9,"{'classifier__classifier__alpha': 0.0001, 'vectoriser__tfidfVectoriser__ngram_range': (1, 2)}",0.722207
1,10,"{'classifier__classifier__alpha': 1e-05, 'vectoriser__tfidfVectoriser__ngram_range': (1, 2)}",0.710515


Max Entropy

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:

max_ent = Pipeline([
    ('classifier', LogisticRegression(penalty='l2', C=1.0)),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', max_ent)
])

In [None]:
train_predict(pipeline)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


              precision    recall  f1-score   support

         1.0       0.21      0.20      0.20      6581
         2.0       0.20      0.20      0.20      6641
         3.0       0.20      0.20      0.20      6575
         4.0       0.20      0.19      0.19      6573
         5.0       0.21      0.23      0.22      6572

    accuracy                           0.21     32942
   macro avg       0.20      0.21      0.20     32942
weighted avg       0.20      0.21      0.20     32942

f1 score is 0.20494872650661872, accuracy is 0.2051484427175035


Logistic Regression - Fine Tune

In [None]:
param_grid={"classifier__classifier__C":np.logspace(-3,3,7), "classifier__classifier__penalty":["l1","l2"],
             'vectoriser__tfidfVectoriser__ngram_range': [(1,1),(1,2)]}

logistic_regression = Pipeline([
    ('classifier', LogisticRegression(n_jobs=-1)),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', logistic_regression)
])

In [None]:
gs_mnb = GridSearchCV(pipeline, param_grid, cv=5, verbose=2)
results = train_predict_all(gs_mnb)


Fitting 5 folds for each of 28 candidates, totalling 140 fits
[CV] classifier__classifier__C=0.001, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  'stop_words.' % sorted(inconsistent))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   16.1s remaining:    0.0s
  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.001, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 1), total=  16.1s
[CV] classifier__classifier__C=0.001, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 1) 


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.001, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 1), total=  15.4s
[CV] classifier__classifier__C=0.001, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 1) 


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.001, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 1), total=  16.2s
[CV] classifier__classifier__C=0.001, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 1) 


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.001, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 1), total=  15.8s
[CV] classifier__classifier__C=0.001, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 1) 


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.001, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 1), total=  15.6s
[CV] classifier__classifier__C=0.001, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 2) 


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV]  classifier__classifier__C=0.001, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 2), total= 1.2min


  'stop_words.' % sorted(inconsistent))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] classifier__classifier__C=0.001, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 2) 
[CV]  classifier__classifier__C=0.001, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 2), total= 1.2min


  'stop_words.' % sorted(inconsistent))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] classifier__classifier__C=0.001, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 2) 
[CV]  classifier__classifier__C=0.001, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 2), total= 1.2min


  'stop_words.' % sorted(inconsistent))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] classifier__classifier__C=0.001, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 2) 
[CV]  classifier__classifier__C=0.001, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 2), total= 1.1min
[CV] classifier__classifier__C=0.001, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 2) 


  'stop_words.' % sorted(inconsistent))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV]  classifier__classifier__C=0.001, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 2), total= 1.3min
[CV] classifier__classifier__C=0.001, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 1) 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.001, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 1), total=  27.9s
[CV] classifier__classifier__C=0.001, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 1) 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.001, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 1), total=  27.0s
[CV] classifier__classifier__C=0.001, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 1) 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.001, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 1), total=  27.6s
[CV] classifier__classifier__C=0.001, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 1) 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.001, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 1), total=  26.0s
[CV] classifier__classifier__C=0.001, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 1) 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.001, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 1), total=  25.6s
[CV] classifier__classifier__C=0.001, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 2) 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.001, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 2), total= 1.9min
[CV] classifier__classifier__C=0.001, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 2) 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.001, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 2), total= 2.0min
[CV] classifier__classifier__C=0.001, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 2) 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.001, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 2), total= 2.1min
[CV] classifier__classifier__C=0.001, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 2) 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.001, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 2), total= 2.1min
[CV] classifier__classifier__C=0.001, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 2) 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.001, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 2), total= 2.2min
[CV] classifier__classifier__C=0.01, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 1) 


  'stop_words.' % sorted(inconsistent))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.01, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 1), total=  16.0s
[CV] classifier__classifier__C=0.01, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 1) 


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.01, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 1), total=  15.8s
[CV] classifier__classifier__C=0.01, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 1) 


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.01, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 1), total=  15.8s
[CV] classifier__classifier__C=0.01, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 1) 


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.01, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 1), total=  15.9s
[CV] classifier__classifier__C=0.01, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 1) 


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.01, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 1), total=  15.7s
[CV] classifier__classifier__C=0.01, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 2) 


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV]  classifier__classifier__C=0.01, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 2), total= 1.1min
[CV] classifier__classifier__C=0.01, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 2) 


  'stop_words.' % sorted(inconsistent))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV]  classifier__classifier__C=0.01, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 2), total= 1.1min
[CV] classifier__classifier__C=0.01, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 2) 


  'stop_words.' % sorted(inconsistent))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV]  classifier__classifier__C=0.01, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 2), total= 1.1min


  'stop_words.' % sorted(inconsistent))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] classifier__classifier__C=0.01, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 2) 
[CV]  classifier__classifier__C=0.01, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 2), total= 1.1min


  'stop_words.' % sorted(inconsistent))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] classifier__classifier__C=0.01, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 2) 
[CV]  classifier__classifier__C=0.01, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 2), total= 1.2min


  'stop_words.' % sorted(inconsistent))


[CV] classifier__classifier__C=0.01, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 1) 
[CV]  classifier__classifier__C=0.01, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 1), total=  45.8s
[CV] classifier__classifier__C=0.01, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 1) 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.01, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 1), total=  41.3s
[CV] classifier__classifier__C=0.01, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 1) 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.01, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 1), total=  45.8s
[CV] classifier__classifier__C=0.01, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 1) 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.01, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 1), total=  41.9s
[CV] classifier__classifier__C=0.01, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 1) 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.01, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 1), total=  44.7s
[CV] classifier__classifier__C=0.01, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 2) 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.01, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 2), total= 4.3min
[CV] classifier__classifier__C=0.01, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 2) 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.01, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 2), total= 4.1min
[CV] classifier__classifier__C=0.01, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 2) 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.01, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 2), total= 4.3min
[CV] classifier__classifier__C=0.01, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 2) 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.01, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 2), total= 3.3min
[CV] classifier__classifier__C=0.01, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 2) 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.01, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 2), total= 3.7min
[CV] classifier__classifier__C=0.1, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 1) 


  'stop_words.' % sorted(inconsistent))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.1, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 1), total=  14.8s
[CV] classifier__classifier__C=0.1, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 1) 


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.1, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 1), total=  14.8s
[CV] classifier__classifier__C=0.1, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 1) 


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.1, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 1), total=  14.9s
[CV] classifier__classifier__C=0.1, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 1) 


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.1, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 1), total=  14.6s
[CV] classifier__classifier__C=0.1, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 1) 


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.1, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 1), total=  14.9s
[CV] classifier__classifier__C=0.1, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 2) 


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV]  classifier__classifier__C=0.1, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 2), total= 1.0min
[CV] classifier__classifier__C=0.1, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 2) 


  'stop_words.' % sorted(inconsistent))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV]  classifier__classifier__C=0.1, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 2), total= 1.1min
[CV] classifier__classifier__C=0.1, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 2) 


  'stop_words.' % sorted(inconsistent))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV]  classifier__classifier__C=0.1, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 2), total= 1.0min


  'stop_words.' % sorted(inconsistent))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] classifier__classifier__C=0.1, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 2) 
[CV]  classifier__classifier__C=0.1, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 2), total= 1.0min


  'stop_words.' % sorted(inconsistent))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] classifier__classifier__C=0.1, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 2) 
[CV]  classifier__classifier__C=0.1, classifier__classifier__penalty=l1, vectoriser__tfidfVectoriser__ngram_range=(1, 2), total= 1.0min


  'stop_words.' % sorted(inconsistent))


[CV] classifier__classifier__C=0.1, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 1) 
[CV]  classifier__classifier__C=0.1, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 1), total=  51.6s
[CV] classifier__classifier__C=0.1, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 1) 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.1, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 1), total=  50.9s
[CV] classifier__classifier__C=0.1, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 1) 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.1, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 1), total=  50.3s
[CV] classifier__classifier__C=0.1, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 1) 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.1, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 1), total=  50.2s
[CV] classifier__classifier__C=0.1, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 1) 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.1, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 1), total=  49.6s
[CV] classifier__classifier__C=0.1, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 2) 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__C=0.1, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 2), total= 7.0min
[CV] classifier__classifier__C=0.1, classifier__classifier__penalty=l2, vectoriser__tfidfVectoriser__ngram_range=(1, 2) 


  'stop_words.' % sorted(inconsistent))




 Decision Tree - Fine Tune

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
param_grid = {"decisiontreeregressor__min_samples_leaf" : [1, 5, 10, 20, 50, 100],
               'vectoriser__tfidfVectoriser__ngram_range': [(1,1),(1,2)]}


decision_tree = Pipeline([
    ('classifier', DecisionTreeClassifier(max_depth=25, n_jobs=-1)),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', decision_tree)
])

In [None]:
gs_mnb = GridSearchCV(pipeline, param_grid, cv=5, verbose=2)
results = train_predict_all(gs_mnb)

              precision    recall  f1-score   support

         1.0       0.20      0.20      0.20      6581
         2.0       0.21      0.21      0.21      6641
         3.0       0.19      0.19      0.19      6575
         4.0       0.20      0.20      0.20      6573
         5.0       0.20      0.20      0.20      6572

    accuracy                           0.20     32942
   macro avg       0.20      0.20      0.20     32942
weighted avg       0.20      0.20      0.20     32942

f1 score is 0.2010128486000343, accuracy is 0.2010199745006375


### Bagging Models

Random Forest - Fine Tune

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]# Create the random grid
random_grid = {'classifier__classifier__n_estimators': n_estimators,
               'classifier__classifier__max_features': max_features,
               'classifier__classifier__min_samples_split': min_samples_split,
               'classifier__classifier__min_samples_leaf': min_samples_leaf,
               'classifier__classifier__bootstrap': bootstrap}

random_forest = Pipeline([
    ('classifier', RandomForestClassifier(max_depth=20, n_jobs=-1)),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', random_forest)
])

In [None]:
train_predict(pipeline)

              precision    recall  f1-score   support

         1.0       0.20      0.21      0.20      6581
         2.0       0.21      0.20      0.20      6641
         3.0       0.20      0.20      0.20      6575
         4.0       0.20      0.20      0.20      6573
         5.0       0.20      0.20      0.20      6572

    accuracy                           0.20     32942
   macro avg       0.20      0.20      0.20     32942
weighted avg       0.20      0.20      0.20     32942

f1 score is 0.2028050536791965, accuracy is 0.20281100115354259


### Boosting Models



XGBoost - Fine Tune


In [None]:
# !wget https://s3-us-west-2.amazonaws.com/xgboost-wheels/xgboost-0.81-py2.py3-none-manylinux1_x86_64.whl
# !pip uninstall xgboost --yes
# !pip install xgboost-0.81-py2.py3-none-manylinux1_x86_64.whl

# For Google Colab Only

In [10]:
from xgboost import XGBClassifier

Environment variables with the 'NUMBAPRO' prefix are deprecated and consequently ignored, found use of NUMBAPRO_NVVM=/usr/local/cuda/nvvm/lib64/libnvvm.so.

For more information about alternatives visit: ('http://numba.pydata.org/numba-doc/latest/cuda/overview.html', '#cudatoolkit-lookup')[0m
Environment variables with the 'NUMBAPRO' prefix are deprecated and consequently ignored, found use of NUMBAPRO_LIBDEVICE=/usr/local/cuda/nvvm/libdevice.

For more information about alternatives visit: ('http://numba.pydata.org/numba-doc/latest/cuda/overview.html', '#cudatoolkit-lookup')[0m


In [31]:

xgb_clf = XGBClassifier()# = "gpu_hist")
# Create parameter grid
parameters = {"classifier__classifier__learning_rate": [0.1, 0.01, 0.001],
               "classifier__classifier__gamma" : [0.01, 0.1, 0.3, 0.5, 1, 1.5, 2],
               "classifier__classifier__max_depth": [2, 4, 7, 10],
               "classifier__classifier__colsample_bytree": [0.3, 0.6, 0.8, 1.0],
               "classifier__classifier__reg_alpha": [0, 0.5, 1],
               "classifier__classifier__reg_lambda": [1, 1.5, 2, 3, 4.5],
               "classifier__classifier__min_child_weight": [1, 3, 5, 7],
               "classifier__classifier__n_estimators": [100, 250, 500, 1000]}


pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', xgb_clf)
])

In [32]:
# gs_mnb = RandomizedSearchCV(pipeline, parameters, scoring = "f1_micro",
#                              cv = 5, verbose = 2)
# results = train_predict_all(gs_mnb)
train_predict(xgb_clf)

  'stop_words.' % sorted(inconsistent))


KeyboardInterrupt: 

AdaBoost


In [33]:
from sklearn.ensemble import AdaBoostClassifier


In [45]:

param_grid = {
 'classifier__classifier__n_estimators': [50, 100],
 'classifier__classifier__learning_rate' : [0.01,0.05,0.1,0.3,1],
 'classifier__classifier__algorithm' : ['SAMME', 'SAMME.R']
 }


adaBoost = Pipeline([
    ('classifier', AdaBoostClassifier()),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', adaBoost)
])


In [None]:
gs_mnb = RandomizedSearchCV(pipeline, param_grid, cv=5, verbose=2, n_jobs=-1)
results = train_predict_all(gs_mnb)


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


In [None]:
train_predict(pipeline)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [44]:
from pprint import pprint
pprint(gs_mnb.estimator.get_params().keys())

dict_keys(['memory', 'steps', 'verbose', 'vectoriser', 'classifier', 'vectoriser__memory', 'vectoriser__steps', 'vectoriser__verbose', 'vectoriser__tfidfVectoriser', 'vectoriser__tfidfVectoriser__analyzer', 'vectoriser__tfidfVectoriser__binary', 'vectoriser__tfidfVectoriser__decode_error', 'vectoriser__tfidfVectoriser__dtype', 'vectoriser__tfidfVectoriser__encoding', 'vectoriser__tfidfVectoriser__input', 'vectoriser__tfidfVectoriser__lowercase', 'vectoriser__tfidfVectoriser__max_df', 'vectoriser__tfidfVectoriser__max_features', 'vectoriser__tfidfVectoriser__min_df', 'vectoriser__tfidfVectoriser__ngram_range', 'vectoriser__tfidfVectoriser__norm', 'vectoriser__tfidfVectoriser__preprocessor', 'vectoriser__tfidfVectoriser__smooth_idf', 'vectoriser__tfidfVectoriser__stop_words', 'vectoriser__tfidfVectoriser__strip_accents', 'vectoriser__tfidfVectoriser__sublinear_tf', 'vectoriser__tfidfVectoriser__token_pattern', 'vectoriser__tfidfVectoriser__tokenizer', 'vectoriser__tfidfVectoriser__use_id