In [0]:
import numpy as np
import pandas as pd
from imblearn.pipeline import Pipeline as imPipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report, f1_score, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.base import TransformerMixin
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from IPython.display import display, HTML
from scipy.stats import uniform




In [0]:
from google.colab import drive
drive.mount('/content/drive')
root_path = "/content/drive/My Drive/notebooks"
# For Google colab only

df = pd.read_csv(f"{root_path}/new_clean_sm_100000.csv")
df = df[df['reviewText'].notna()]
df = df[~df['reviewText'].str.contains(".jpg|.png|.jpeg|.tiff|.gif|.bmp|.heif", regex=True, na=False)]

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Split data

In [0]:
### Convert to a two class problem -  optional

df = df[df['overall'] != 2]
df = df[df['overall'] != 4]
df.loc[df['overall'] == 1, 'overall'] = 0
df.loc[df['overall'] == 3, 'overall'] = 1
df.loc[df['overall'] == 5, 'overall'] = 2

In [0]:
X = df.reviewText.values
y = df.overall.values

In [0]:
num = len(df)
X, y  = X[:num], y[:num]
# le = LabelEncoder()
# y = le.fit_transform(y)

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [0]:
def train_predict(pipline):
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    score = classification_report(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    accuracy = accuracy_score(y_test, y_pred)
    print(score)
    print(f"f1 score is {f1}, accuracy is {accuracy}")

def train_predict_all(pipeline):
    out = pipeline.fit(X, y).cv_results_
    results_df = pd.DataFrame({'rank': out['rank_test_score'],
                          'params': out['params'],
                           'cv score (mean)': out['mean_test_score']})
    results_df = results_df.sort_values(by=['rank'], ascending=True)
    pd.set_option('display.max_colwidth',100)
    display(HTML(results_df.to_html()))
    return out
    # score = classification_report(y_test, y_pred)
    # f1 = f1_score(y_test, y_pred, average='macro')
    # accuracy = accuracy_score(y_test, y_pred)
    # print(score)
    # print(f"f1 score is {f1}, accuracy is {accuracy}")


Vectorizers


In [0]:


count_vectoriser = Pipeline([
                ('countVectoriser', CountVectorizer())
            ])

tfidf_vectoriser = Pipeline([
                ('tfidfVectoriser', TfidfVectorizer(stop_words=STOP_WORDS, ngram_range = (1,2)
                                                    ))
            ])


### Stantard Models

Naive Bayes -  Fine Tune


In [0]:
param_grid = {'vectoriser__tfidfVectoriser__ngram_range': [(1,1),(1,2)],
              'classifier__classifier__alpha': [1e-5, 1e-4, 1e-2, 1e-1,1]}


# param_grid = {'vectoriser__tfidfVectoriser__ngram_range': [(1,1)],
#               'classifier__classifier__alpha': [1e-5]}

naive_bayes = Pipeline([
    ('classifier', MultinomialNB()),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', naive_bayes)
])


In [0]:
gs_mnb = GridSearchCV(pipeline, param_grid, cv=5, verbose=2, n_jobs=3)
final_results = train_predict_all(gs_mnb)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:  7.8min
[Parallel(n_jobs=3)]: Done  50 out of  50 | elapsed: 12.0min finished
  'stop_words.' % sorted(inconsistent))


Unnamed: 0,rank,params,cv score (mean)
9,1,"{'classifier__classifier__alpha': 1, 'vectoriser__tfidfVectoriser__ngram_range': (1, 2)}",0.782966
7,2,"{'classifier__classifier__alpha': 0.1, 'vectoriser__tfidfVectoriser__ngram_range': (1, 2)}",0.778564
5,3,"{'classifier__classifier__alpha': 0.01, 'vectoriser__tfidfVectoriser__ngram_range': (1, 2)}",0.759132
8,4,"{'classifier__classifier__alpha': 1, 'vectoriser__tfidfVectoriser__ngram_range': (1, 1)}",0.752286
6,5,"{'classifier__classifier__alpha': 0.1, 'vectoriser__tfidfVectoriser__ngram_range': (1, 1)}",0.745731
4,6,"{'classifier__classifier__alpha': 0.01, 'vectoriser__tfidfVectoriser__ngram_range': (1, 1)}",0.737958
2,7,"{'classifier__classifier__alpha': 0.0001, 'vectoriser__tfidfVectoriser__ngram_range': (1, 1)}",0.726656
0,8,"{'classifier__classifier__alpha': 1e-05, 'vectoriser__tfidfVectoriser__ngram_range': (1, 1)}",0.723352
3,9,"{'classifier__classifier__alpha': 0.0001, 'vectoriser__tfidfVectoriser__ngram_range': (1, 2)}",0.722207
1,10,"{'classifier__classifier__alpha': 1e-05, 'vectoriser__tfidfVectoriser__ngram_range': (1, 2)}",0.710515


In [0]:
from sklearn.linear_model import LogisticRegression

Logistic Regression - Fine Tune

In [0]:


param_grid = {'vectoriser__tfidfVectoriser__ngram_range': [(1,1),(1,2)],
              'classifier__classifier__C': np.logspace(0, 4, num=3),
              'classifier__classifier__penalty': ['l1', 'l2']}

logistic_regression = Pipeline([
    ('classifier', LogisticRegression(n_jobs=-1)),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', logistic_regression)
])

In [0]:
gs_mnb = RandomizedSearchCV(pipeline, param_grid, cv=5, verbose=2)
results = train_predict_all(gs_mnb)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l2, classifier__classifier__C=100.0 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  'stop_words.' % sorted(inconsistent))


[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l2, classifier__classifier__C=100.0, total=  51.8s
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l2, classifier__classifier__C=100.0 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   51.8s remaining:    0.0s
  'stop_words.' % sorted(inconsistent))


[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l2, classifier__classifier__C=100.0, total=  51.4s
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l2, classifier__classifier__C=100.0 


  'stop_words.' % sorted(inconsistent))


[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l2, classifier__classifier__C=100.0, total=  51.5s
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l2, classifier__classifier__C=100.0 


  'stop_words.' % sorted(inconsistent))


[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l2, classifier__classifier__C=100.0, total=  51.1s
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l2, classifier__classifier__C=100.0 


  'stop_words.' % sorted(inconsistent))


[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l2, classifier__classifier__C=100.0, total=  50.1s
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l1, classifier__classifier__C=100.0 


  'stop_words.' % sorted(inconsistent))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l1, classifier__classifier__C=100.0, total= 1.0min


  'stop_words.' % sorted(inconsistent))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  'stop_words.' % sorted(inconsistent))


[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l1, classifier__classifier__C=100.0 
[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l1, classifier__classifier__C=100.0, total= 1.0min
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l1, classifier__classifier__C=100.0 


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l1, classifier__classifier__C=100.0, total= 1.0min
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l1, classifier__classifier__C=100.0 


  'stop_words.' % sorted(inconsistent))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l1, classifier__classifier__C=100.0, total= 1.0min


  'stop_words.' % sorted(inconsistent))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l1, classifier__classifier__C=100.0 
[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l1, classifier__classifier__C=100.0, total= 1.0min


  'stop_words.' % sorted(inconsistent))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  'stop_words.' % sorted(inconsistent))


[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l1, classifier__classifier__C=1.0 
[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l1, classifier__classifier__C=1.0, total=  15.6s
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l1, classifier__classifier__C=1.0 


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  'stop_words.' % sorted(inconsistent))


[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l1, classifier__classifier__C=1.0, total=  15.7s
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l1, classifier__classifier__C=1.0 


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  'stop_words.' % sorted(inconsistent))


[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l1, classifier__classifier__C=1.0, total=  15.7s
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l1, classifier__classifier__C=1.0 


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  'stop_words.' % sorted(inconsistent))


[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l1, classifier__classifier__C=1.0, total=  15.6s
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l1, classifier__classifier__C=1.0 


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  'stop_words.' % sorted(inconsistent))


[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l1, classifier__classifier__C=1.0, total=  15.7s
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l1, classifier__classifier__C=100.0 


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  'stop_words.' % sorted(inconsistent))


[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l1, classifier__classifier__C=100.0, total=  15.5s
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l1, classifier__classifier__C=100.0 


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  'stop_words.' % sorted(inconsistent))


[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l1, classifier__classifier__C=100.0, total=  15.7s
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l1, classifier__classifier__C=100.0 


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  'stop_words.' % sorted(inconsistent))


[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l1, classifier__classifier__C=100.0, total=  15.5s
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l1, classifier__classifier__C=100.0 


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  'stop_words.' % sorted(inconsistent))


[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l1, classifier__classifier__C=100.0, total=  15.6s
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l1, classifier__classifier__C=100.0 


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  'stop_words.' % sorted(inconsistent))


[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l1, classifier__classifier__C=100.0, total=  15.7s
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l2, classifier__classifier__C=1.0 
[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l2, classifier__classifier__C=1.0, total= 6.8min
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l2, classifier__classifier__C=1.0 


  'stop_words.' % sorted(inconsistent))


[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l2, classifier__classifier__C=1.0, total= 6.7min
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l2, classifier__classifier__C=1.0 


  'stop_words.' % sorted(inconsistent))


[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l2, classifier__classifier__C=1.0, total= 6.8min
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l2, classifier__classifier__C=1.0 


  'stop_words.' % sorted(inconsistent))


[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l2, classifier__classifier__C=1.0, total= 6.7min
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l2, classifier__classifier__C=1.0 


  'stop_words.' % sorted(inconsistent))


[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l2, classifier__classifier__C=1.0, total= 6.8min
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l2, classifier__classifier__C=1.0 


  'stop_words.' % sorted(inconsistent))


[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l2, classifier__classifier__C=1.0, total= 1.0min
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l2, classifier__classifier__C=1.0 


  'stop_words.' % sorted(inconsistent))


[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l2, classifier__classifier__C=1.0, total=  51.7s
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l2, classifier__classifier__C=1.0 


  'stop_words.' % sorted(inconsistent))


[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l2, classifier__classifier__C=1.0, total=  51.9s
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l2, classifier__classifier__C=1.0 


  'stop_words.' % sorted(inconsistent))


[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l2, classifier__classifier__C=1.0, total=  53.1s
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l2, classifier__classifier__C=1.0 


  'stop_words.' % sorted(inconsistent))


[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l2, classifier__classifier__C=1.0, total= 1.0min
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l2, classifier__classifier__C=10000.0 


  'stop_words.' % sorted(inconsistent))


[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l2, classifier__classifier__C=10000.0, total=  51.3s
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l2, classifier__classifier__C=10000.0 


  'stop_words.' % sorted(inconsistent))


[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l2, classifier__classifier__C=10000.0, total=  52.0s
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l2, classifier__classifier__C=10000.0 


  'stop_words.' % sorted(inconsistent))


[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l2, classifier__classifier__C=10000.0, total=  51.2s
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l2, classifier__classifier__C=10000.0 


  'stop_words.' % sorted(inconsistent))


[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l2, classifier__classifier__C=10000.0, total= 1.0min
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l2, classifier__classifier__C=10000.0 


  'stop_words.' % sorted(inconsistent))


[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l2, classifier__classifier__C=10000.0, total=  51.2s
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l1, classifier__classifier__C=10000.0 


  'stop_words.' % sorted(inconsistent))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  'stop_words.' % sorted(inconsistent))


[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l1, classifier__classifier__C=10000.0, total=  15.6s
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l1, classifier__classifier__C=10000.0 


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  'stop_words.' % sorted(inconsistent))


[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l1, classifier__classifier__C=10000.0, total=  15.7s
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l1, classifier__classifier__C=10000.0 


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  'stop_words.' % sorted(inconsistent))


[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l1, classifier__classifier__C=10000.0, total=  15.7s
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l1, classifier__classifier__C=10000.0 


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  'stop_words.' % sorted(inconsistent))


[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l1, classifier__classifier__C=10000.0, total=  15.7s
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l1, classifier__classifier__C=10000.0 


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  'stop_words.' % sorted(inconsistent))


[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 1), classifier__classifier__penalty=l1, classifier__classifier__C=10000.0, total=  15.8s
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l1, classifier__classifier__C=1.0 


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l1, classifier__classifier__C=1.0, total= 1.1min
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l1, classifier__classifier__C=1.0 


  'stop_words.' % sorted(inconsistent))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l1, classifier__classifier__C=1.0, total= 1.0min


  'stop_words.' % sorted(inconsistent))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  'stop_words.' % sorted(inconsistent))


[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l1, classifier__classifier__C=1.0 
[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l1, classifier__classifier__C=1.0, total= 1.0min
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l1, classifier__classifier__C=1.0 


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l1, classifier__classifier__C=1.0, total=  58.9s
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l1, classifier__classifier__C=1.0 


  'stop_words.' % sorted(inconsistent))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  'stop_words.' % sorted(inconsistent))


[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l1, classifier__classifier__C=1.0, total= 1.0min
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l2, classifier__classifier__C=10000.0 
[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l2, classifier__classifier__C=10000.0, total= 6.7min
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l2, classifier__classifier__C=10000.0 


  'stop_words.' % sorted(inconsistent))


[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l2, classifier__classifier__C=10000.0, total= 6.7min
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l2, classifier__classifier__C=10000.0 


  'stop_words.' % sorted(inconsistent))


[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l2, classifier__classifier__C=10000.0, total= 6.8min
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l2, classifier__classifier__C=10000.0 


  'stop_words.' % sorted(inconsistent))


[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l2, classifier__classifier__C=10000.0, total= 6.7min
[CV] vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l2, classifier__classifier__C=10000.0 


  'stop_words.' % sorted(inconsistent))


[CV]  vectoriser__tfidfVectoriser__ngram_range=(1, 2), classifier__classifier__penalty=l2, classifier__classifier__C=10000.0, total= 6.7min


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed: 95.1min finished
  'stop_words.' % sorted(inconsistent))


Unnamed: 0,rank,params,cv score (mean)
4,1,"{'vectoriser__tfidfVectoriser__ngram_range': (1, 2), 'classifier__classifier__penalty': 'l2', 'classifier__classifier__C': 1.0}",0.800092
9,2,"{'vectoriser__tfidfVectoriser__ngram_range': (1, 2), 'classifier__classifier__penalty': 'l2', 'classifier__classifier__C': 10000.0}",0.789472
5,3,"{'vectoriser__tfidfVectoriser__ngram_range': (1, 1), 'classifier__classifier__penalty': 'l2', 'classifier__classifier__C': 1.0}",0.787372
0,4,"{'vectoriser__tfidfVectoriser__ngram_range': (1, 1), 'classifier__classifier__penalty': 'l2', 'classifier__classifier__C': 100.0}",0.780099
6,5,"{'vectoriser__tfidfVectoriser__ngram_range': (1, 1), 'classifier__classifier__penalty': 'l2', 'classifier__classifier__C': 10000.0}",0.778698
1,6,"{'vectoriser__tfidfVectoriser__ngram_range': (1, 2), 'classifier__classifier__penalty': 'l1', 'classifier__classifier__C': 100.0}",
2,7,"{'vectoriser__tfidfVectoriser__ngram_range': (1, 1), 'classifier__classifier__penalty': 'l1', 'classifier__classifier__C': 1.0}",
3,8,"{'vectoriser__tfidfVectoriser__ngram_range': (1, 1), 'classifier__classifier__penalty': 'l1', 'classifier__classifier__C': 100.0}",
7,9,"{'vectoriser__tfidfVectoriser__ngram_range': (1, 1), 'classifier__classifier__penalty': 'l1', 'classifier__classifier__C': 10000.0}",
8,10,"{'vectoriser__tfidfVectoriser__ngram_range': (1, 2), 'classifier__classifier__penalty': 'l1', 'classifier__classifier__C': 1.0}",




 Decision Tree - Fine Tune

In [0]:
from sklearn.tree import DecisionTreeClassifier

In [0]:
param_grid = {"classifier__classifier__min_samples_leaf" : [1, 5, 10, 20, 50, 100],
               'vectoriser__tfidfVectoriser__ngram_range': [(1,1),(1,2)]}


decision_tree = Pipeline([
    ('classifier', DecisionTreeClassifier(max_depth=25)),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', decision_tree)
])

In [0]:
gs_mnb = GridSearchCV(pipeline, param_grid, cv=5, verbose=2, n_jobs=-1)
results = train_predict_all(gs_mnb)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 169.5min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 250.2min finished
  'stop_words.' % sorted(inconsistent))


Unnamed: 0,rank,params,cv score (mean)
9,1,"{'classifier__classifier__min_samples_leaf': 50, 'vectoriser__tfidfVectoriser__ngram_range': (1, 2)}",0.634169
7,2,"{'classifier__classifier__min_samples_leaf': 20, 'vectoriser__tfidfVectoriser__ngram_range': (1, 2)}",0.633832
11,3,"{'classifier__classifier__min_samples_leaf': 100, 'vectoriser__tfidfVectoriser__ngram_range': (1, 2)}",0.632276
6,4,"{'classifier__classifier__min_samples_leaf': 20, 'vectoriser__tfidfVectoriser__ngram_range': (1, 1)}",0.632143
8,5,"{'classifier__classifier__min_samples_leaf': 50, 'vectoriser__tfidfVectoriser__ngram_range': (1, 1)}",0.632033
1,6,"{'classifier__classifier__min_samples_leaf': 1, 'vectoriser__tfidfVectoriser__ngram_range': (1, 2)}",0.631392
10,7,"{'classifier__classifier__min_samples_leaf': 100, 'vectoriser__tfidfVectoriser__ngram_range': (1, 1)}",0.630971
5,8,"{'classifier__classifier__min_samples_leaf': 10, 'vectoriser__tfidfVectoriser__ngram_range': (1, 2)}",0.630784
0,9,"{'classifier__classifier__min_samples_leaf': 1, 'vectoriser__tfidfVectoriser__ngram_range': (1, 1)}",0.629336
4,10,"{'classifier__classifier__min_samples_leaf': 10, 'vectoriser__tfidfVectoriser__ngram_range': (1, 1)}",0.628461


### Bagging Models

Random Forest - Fine Tune

In [0]:
import numpy as np
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]# Create the random grid
param_grid = {'classifier__classifier__n_estimators': n_estimators,
               'classifier__classifier__max_features': max_features,
               'classifier__classifier__min_samples_split': min_samples_split,
               'classifier__classifier__min_samples_leaf': min_samples_leaf,
               'classifier__classifier__bootstrap': bootstrap}

random_forest = Pipeline([
    ('classifier', RandomForestClassifier(max_depth=20, n_jobs=-1)),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', random_forest)
])

In [0]:
gs_mnb = RandomizedSearchCV(pipeline, param_grid, cv=5, verbose=2)
results = train_predict_all(gs_mnb)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] classifier__classifier__n_estimators=1800, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=sqrt, classifier__classifier__bootstrap=False 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=1800, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=sqrt, classifier__classifier__bootstrap=False, total= 4.0min
[CV] classifier__classifier__n_estimators=1800, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=sqrt, classifier__classifier__bootstrap=False 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  4.0min remaining:    0.0s
  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=1800, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=sqrt, classifier__classifier__bootstrap=False, total= 4.0min
[CV] classifier__classifier__n_estimators=1800, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=sqrt, classifier__classifier__bootstrap=False 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=1800, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=sqrt, classifier__classifier__bootstrap=False, total= 4.0min
[CV] classifier__classifier__n_estimators=1800, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=sqrt, classifier__classifier__bootstrap=False 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=1800, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=sqrt, classifier__classifier__bootstrap=False, total= 4.0min
[CV] classifier__classifier__n_estimators=1800, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=sqrt, classifier__classifier__bootstrap=False 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=1800, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=sqrt, classifier__classifier__bootstrap=False, total= 4.0min
[CV] classifier__classifier__n_estimators=400, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=400, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True, total= 1.0min
[CV] classifier__classifier__n_estimators=400, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=400, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True, total= 1.0min
[CV] classifier__classifier__n_estimators=400, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=400, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True, total= 1.0min
[CV] classifier__classifier__n_estimators=400, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=400, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True, total= 1.0min
[CV] classifier__classifier__n_estimators=400, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=400, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True, total= 1.0min
[CV] classifier__classifier__n_estimators=1400, classifier__classifier__min_samples_split=5, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=sqrt, classifier__classifier__bootstrap=True 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=1400, classifier__classifier__min_samples_split=5, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=sqrt, classifier__classifier__bootstrap=True, total= 3.2min
[CV] classifier__classifier__n_estimators=1400, classifier__classifier__min_samples_split=5, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=sqrt, classifier__classifier__bootstrap=True 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=1400, classifier__classifier__min_samples_split=5, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=sqrt, classifier__classifier__bootstrap=True, total= 3.2min
[CV] classifier__classifier__n_estimators=1400, classifier__classifier__min_samples_split=5, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=sqrt, classifier__classifier__bootstrap=True 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=1400, classifier__classifier__min_samples_split=5, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=sqrt, classifier__classifier__bootstrap=True, total= 3.2min
[CV] classifier__classifier__n_estimators=1400, classifier__classifier__min_samples_split=5, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=sqrt, classifier__classifier__bootstrap=True 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=1400, classifier__classifier__min_samples_split=5, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=sqrt, classifier__classifier__bootstrap=True, total= 3.2min
[CV] classifier__classifier__n_estimators=1400, classifier__classifier__min_samples_split=5, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=sqrt, classifier__classifier__bootstrap=True 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=1400, classifier__classifier__min_samples_split=5, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=sqrt, classifier__classifier__bootstrap=True, total= 3.2min
[CV] classifier__classifier__n_estimators=2000, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=4, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=2000, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=4, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False, total= 3.9min
[CV] classifier__classifier__n_estimators=2000, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=4, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=2000, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=4, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False, total= 3.9min
[CV] classifier__classifier__n_estimators=2000, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=4, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=2000, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=4, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False, total= 3.9min
[CV] classifier__classifier__n_estimators=2000, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=4, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=2000, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=4, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False, total= 3.9min
[CV] classifier__classifier__n_estimators=2000, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=4, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=2000, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=4, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False, total= 3.9min
[CV] classifier__classifier__n_estimators=1200, classifier__classifier__min_samples_split=5, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=1200, classifier__classifier__min_samples_split=5, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True, total= 2.8min
[CV] classifier__classifier__n_estimators=1200, classifier__classifier__min_samples_split=5, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=1200, classifier__classifier__min_samples_split=5, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True, total= 2.8min
[CV] classifier__classifier__n_estimators=1200, classifier__classifier__min_samples_split=5, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=1200, classifier__classifier__min_samples_split=5, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True, total= 2.8min
[CV] classifier__classifier__n_estimators=1200, classifier__classifier__min_samples_split=5, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=1200, classifier__classifier__min_samples_split=5, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True, total= 2.8min
[CV] classifier__classifier__n_estimators=1200, classifier__classifier__min_samples_split=5, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=1200, classifier__classifier__min_samples_split=5, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True, total= 2.8min
[CV] classifier__classifier__n_estimators=600, classifier__classifier__min_samples_split=10, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=600, classifier__classifier__min_samples_split=10, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False, total= 1.7min
[CV] classifier__classifier__n_estimators=600, classifier__classifier__min_samples_split=10, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=600, classifier__classifier__min_samples_split=10, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False, total= 1.7min
[CV] classifier__classifier__n_estimators=600, classifier__classifier__min_samples_split=10, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=600, classifier__classifier__min_samples_split=10, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False, total= 1.7min
[CV] classifier__classifier__n_estimators=600, classifier__classifier__min_samples_split=10, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=600, classifier__classifier__min_samples_split=10, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False, total= 1.7min
[CV] classifier__classifier__n_estimators=600, classifier__classifier__min_samples_split=10, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=600, classifier__classifier__min_samples_split=10, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False, total= 1.7min
[CV] classifier__classifier__n_estimators=1600, classifier__classifier__min_samples_split=10, classifier__classifier__min_samples_leaf=4, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=1600, classifier__classifier__min_samples_split=10, classifier__classifier__min_samples_leaf=4, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True, total= 2.7min
[CV] classifier__classifier__n_estimators=1600, classifier__classifier__min_samples_split=10, classifier__classifier__min_samples_leaf=4, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=1600, classifier__classifier__min_samples_split=10, classifier__classifier__min_samples_leaf=4, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True, total= 2.7min
[CV] classifier__classifier__n_estimators=1600, classifier__classifier__min_samples_split=10, classifier__classifier__min_samples_leaf=4, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=1600, classifier__classifier__min_samples_split=10, classifier__classifier__min_samples_leaf=4, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True, total= 2.7min
[CV] classifier__classifier__n_estimators=1600, classifier__classifier__min_samples_split=10, classifier__classifier__min_samples_leaf=4, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=1600, classifier__classifier__min_samples_split=10, classifier__classifier__min_samples_leaf=4, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True, total= 2.7min
[CV] classifier__classifier__n_estimators=1600, classifier__classifier__min_samples_split=10, classifier__classifier__min_samples_leaf=4, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=1600, classifier__classifier__min_samples_split=10, classifier__classifier__min_samples_leaf=4, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True, total= 2.7min
[CV] classifier__classifier__n_estimators=1600, classifier__classifier__min_samples_split=5, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=1600, classifier__classifier__min_samples_split=5, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False, total= 3.6min
[CV] classifier__classifier__n_estimators=1600, classifier__classifier__min_samples_split=5, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=1600, classifier__classifier__min_samples_split=5, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False, total= 3.6min
[CV] classifier__classifier__n_estimators=1600, classifier__classifier__min_samples_split=5, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=1600, classifier__classifier__min_samples_split=5, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False, total= 3.7min
[CV] classifier__classifier__n_estimators=1600, classifier__classifier__min_samples_split=5, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=1600, classifier__classifier__min_samples_split=5, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False, total= 3.6min
[CV] classifier__classifier__n_estimators=1600, classifier__classifier__min_samples_split=5, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=1600, classifier__classifier__min_samples_split=5, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False, total= 3.7min
[CV] classifier__classifier__n_estimators=2000, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=2000, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False, total= 4.6min
[CV] classifier__classifier__n_estimators=2000, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=2000, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False, total= 4.5min
[CV] classifier__classifier__n_estimators=2000, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=2000, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False, total= 4.5min
[CV] classifier__classifier__n_estimators=2000, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=2000, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False, total= 4.4min
[CV] classifier__classifier__n_estimators=2000, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=2000, classifier__classifier__min_samples_split=2, classifier__classifier__min_samples_leaf=2, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=False, total= 4.5min
[CV] classifier__classifier__n_estimators=400, classifier__classifier__min_samples_split=10, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=400, classifier__classifier__min_samples_split=10, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True, total= 1.1min
[CV] classifier__classifier__n_estimators=400, classifier__classifier__min_samples_split=10, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=400, classifier__classifier__min_samples_split=10, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True, total= 1.1min
[CV] classifier__classifier__n_estimators=400, classifier__classifier__min_samples_split=10, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=400, classifier__classifier__min_samples_split=10, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True, total= 1.1min
[CV] classifier__classifier__n_estimators=400, classifier__classifier__min_samples_split=10, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=400, classifier__classifier__min_samples_split=10, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True, total= 1.1min
[CV] classifier__classifier__n_estimators=400, classifier__classifier__min_samples_split=10, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True 


  'stop_words.' % sorted(inconsistent))


[CV]  classifier__classifier__n_estimators=400, classifier__classifier__min_samples_split=10, classifier__classifier__min_samples_leaf=1, classifier__classifier__max_features=auto, classifier__classifier__bootstrap=True, total= 1.1min


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed: 143.5min finished
  'stop_words.' % sorted(inconsistent))


Unnamed: 0,rank,params,cv score (mean)
4,1,"{'classifier__classifier__n_estimators': 1200, 'classifier__classifier__min_samples_split': 5, 'classifier__classifier__min_samples_leaf': 1, 'classifier__classifier__max_features': 'auto', 'classifier__classifier__bootstrap': True}",0.727314
2,2,"{'classifier__classifier__n_estimators': 1400, 'classifier__classifier__min_samples_split': 5, 'classifier__classifier__min_samples_leaf': 1, 'classifier__classifier__max_features': 'sqrt', 'classifier__classifier__bootstrap': True}",0.727137
7,3,"{'classifier__classifier__n_estimators': 1600, 'classifier__classifier__min_samples_split': 5, 'classifier__classifier__min_samples_leaf': 2, 'classifier__classifier__max_features': 'auto', 'classifier__classifier__bootstrap': False}",0.726943
3,4,"{'classifier__classifier__n_estimators': 2000, 'classifier__classifier__min_samples_split': 2, 'classifier__classifier__min_samples_leaf': 4, 'classifier__classifier__max_features': 'auto', 'classifier__classifier__bootstrap': False}",0.72689
6,5,"{'classifier__classifier__n_estimators': 1600, 'classifier__classifier__min_samples_split': 10, 'classifier__classifier__min_samples_leaf': 4, 'classifier__classifier__max_features': 'auto', 'classifier__classifier__bootstrap': True}",0.72679
8,6,"{'classifier__classifier__n_estimators': 2000, 'classifier__classifier__min_samples_split': 2, 'classifier__classifier__min_samples_leaf': 2, 'classifier__classifier__max_features': 'auto', 'classifier__classifier__bootstrap': False}",0.726486
0,7,"{'classifier__classifier__n_estimators': 1800, 'classifier__classifier__min_samples_split': 2, 'classifier__classifier__min_samples_leaf': 2, 'classifier__classifier__max_features': 'sqrt', 'classifier__classifier__bootstrap': False}",0.726362
5,8,"{'classifier__classifier__n_estimators': 600, 'classifier__classifier__min_samples_split': 10, 'classifier__classifier__min_samples_leaf': 1, 'classifier__classifier__max_features': 'auto', 'classifier__classifier__bootstrap': False}",0.725364
9,9,"{'classifier__classifier__n_estimators': 400, 'classifier__classifier__min_samples_split': 10, 'classifier__classifier__min_samples_leaf': 1, 'classifier__classifier__max_features': 'auto', 'classifier__classifier__bootstrap': True}",0.723939
1,10,"{'classifier__classifier__n_estimators': 400, 'classifier__classifier__min_samples_split': 2, 'classifier__classifier__min_samples_leaf': 2, 'classifier__classifier__max_features': 'auto', 'classifier__classifier__bootstrap': True}",0.723399


### Boosting Models



XGBoost - Fine Tune


In [0]:
# !wget https://s3-us-west-2.amazonaws.com/xgboost-wheels/xgboost-0.81-py2.py3-none-manylinux1_x86_64.whl
# !pip uninstall xgboost --yes
# !pip install xgboost-0.81-py2.py3-none-manylinux1_x86_64.whl

# For Google Colab Only

In [0]:
from xgboost import XGBClassifier

In [0]:

param_grid = {
        'classifier__classifier__colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'classifier__classifier__colsample_bylevel': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'classifier__classifier__gamma': [0, 0.25, 0.5, 1.0]}


xg_boost = Pipeline([
    ('classifier', XGBClassifier(objective = 'multi:softmax', num_class=  3, tree_method='exact', subsample=0.1 ))
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', xg_boost)
])


In [0]:
rs_clf = RandomizedSearchCV(pipeline, param_grid, n_iter=10,
                            n_jobs=1, verbose=2, cv=5,
                            scoring='neg_log_loss', refit=False, random_state=42)
results = train_predict_all(rs_clf)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] classifier__classifier__gamma=1.0, classifier__classifier__colsample_bytree=1.0, classifier__classifier__colsample_bylevel=0.8 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  'stop_words.' % sorted(inconsistent))


KeyboardInterrupt: ignored

AdaBoost


In [0]:
from sklearn.ensemble import AdaBoostClassifier


In [0]:

adaBoost = Pipeline([
    ('classifier', AdaBoostClassifier()),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', adaBoost)
])


In [0]:
train_predict(pipeline)
results = train_predict_all(gs_mnb)

              precision    recall  f1-score   support

         1.0       0.19      0.05      0.08      6581
         2.0       0.21      0.09      0.12      6641
         3.0       0.20      0.71      0.31      6575
         4.0       0.20      0.03      0.06      6573
         5.0       0.22      0.13      0.17      6572

    accuracy                           0.20     32942
   macro avg       0.21      0.20      0.15     32942
weighted avg       0.21      0.20      0.15     32942

f1 score is 0.1455947998473248, accuracy is 0.2014449638759031


In [0]:
train_predict(pipeline)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [0]:
from pprint import pprint
pprint(gs_mnb.estimator.get_params().keys())

dict_keys(['memory', 'steps', 'verbose', 'vectoriser', 'classifier', 'vectoriser__memory', 'vectoriser__steps', 'vectoriser__verbose', 'vectoriser__tfidfVectoriser', 'vectoriser__tfidfVectoriser__analyzer', 'vectoriser__tfidfVectoriser__binary', 'vectoriser__tfidfVectoriser__decode_error', 'vectoriser__tfidfVectoriser__dtype', 'vectoriser__tfidfVectoriser__encoding', 'vectoriser__tfidfVectoriser__input', 'vectoriser__tfidfVectoriser__lowercase', 'vectoriser__tfidfVectoriser__max_df', 'vectoriser__tfidfVectoriser__max_features', 'vectoriser__tfidfVectoriser__min_df', 'vectoriser__tfidfVectoriser__ngram_range', 'vectoriser__tfidfVectoriser__norm', 'vectoriser__tfidfVectoriser__preprocessor', 'vectoriser__tfidfVectoriser__smooth_idf', 'vectoriser__tfidfVectoriser__stop_words', 'vectoriser__tfidfVectoriser__strip_accents', 'vectoriser__tfidfVectoriser__sublinear_tf', 'vectoriser__tfidfVectoriser__token_pattern', 'vectoriser__tfidfVectoriser__tokenizer', 'vectoriser__tfidfVectoriser__use_id