In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearnex import patch_sklearn
patch_sklearn()
from sklearn.preprocessing import MinMaxScaler
from importlib import reload
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import acquire as a
import prepare as p
from nltk.sentiment import SentimentIntensityAnalyzer
import model as m
import nltk

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
train, validate, test = p.prepare_michelin(a.get_michelin_pages())

In [3]:

dt_params = {
    'max_depth':[2,3,4,5]
}
rf_params = {
    'n_estimators':[50,100,150],
    'max_depth':[3,4,5],
    'min_samples_leaf':[3,4],
    'random_state':[27]
}
xg_params = {
    'n_estimators':[50,100,150],
    'max_depth':[3,4,5],
    'min_samples_leaf':[3,4],
    'random_state': [27]
}
lr_params = {
    'solver':['liblinear'],
    'penalty':['l1','l2'],
    'C':[1.0,0.5,.05,.1],
    'random_state':[27]
}


In [4]:
train_validate = pd.concat([train, validate]).sort_index()
tfidf = TfidfVectorizer()
scaler = MinMaxScaler()
trainx, trainy = m.get_features_and_target(train_validate,scaler, tfidf)

In [5]:
model = GradientBoostingClassifier()
m.tune_model(model,trainx,trainy,xg_params)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] END max_depth=3, min_samples_leaf=3, n_estimators=50, random_state=27; total time= 4.6min
[CV] END max_depth=3, min_samples_leaf=3, n_estimators=50, random_state=27; total time= 4.7min
[CV] END max_depth=3, min_samples_leaf=3, n_estimators=50, random_state=27; total time= 4.8min
[CV] END max_depth=3, min_samples_leaf=3, n_estimators=50, random_state=27; total time= 4.8min
[CV] END max_depth=3, min_samples_leaf=3, n_estimators=50, random_state=27; total time= 4.8min
[CV] END max_depth=3, min_samples_leaf=3, n_estimators=100, random_state=27; total time=10.6min
[CV] END max_depth=3, min_samples_leaf=3, n_estimators=100, random_state=27; total time=10.8min
[CV] END max_depth=3, min_samples_leaf=3, n_estimators=100, random_state=27; total time=11.1min
[CV] END max_depth=3, min_samples_leaf=3, n_estimators=100, random_state=27; total time=11.0min
[CV] END max_depth=3, min_samples_leaf=3, n_estimators=100, random_state=27; tot

{'max_depth': 4, 'min_samples_leaf': 4, 'n_estimators': 50, 'random_state': 27}

In [11]:
model = DecisionTreeClassifier()
m.tune_model(model,trainx,trainy,dt_params)
#already run. max_depth of 2 performs best

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END ........................................max_depth=2; total time=  19.3s
[CV] END ........................................max_depth=2; total time=  19.4s
[CV] END ........................................max_depth=3; total time=  16.1s
[CV] END ........................................max_depth=3; total time=  16.1s
[CV] END ........................................max_depth=2; total time=  37.6s
[CV] END ........................................max_depth=2; total time=  37.6s
[CV] END ........................................max_depth=2; total time=  37.5s
[CV] END ........................................max_depth=3; total time=  15.4s
[CV] END ........................................max_depth=3; total time=  15.6s
[CV] END ........................................max_depth=4; total time=  15.8s
[CV] END ........................................max_depth=4; total time=  16.0s
[CV] END ........................................

{'max_depth': 2}

In [9]:
model = RandomForestClassifier()
m.tune_model(model,trainx,trainy,rf_params)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] END max_depth=3, min_samples_leaf=3, n_estimators=50, random_state=27; total time= 1.7min
[CV] END max_depth=3, min_samples_leaf=3, n_estimators=50, random_state=27; total time= 1.7min
[CV] END max_depth=3, min_samples_leaf=3, n_estimators=50, random_state=27; total time= 1.7min
[CV] END max_depth=3, min_samples_leaf=3, n_estimators=50, random_state=27; total time= 1.7min
[CV] END max_depth=3, min_samples_leaf=3, n_estimators=50, random_state=27; total time= 1.7min
[CV] END max_depth=3, min_samples_leaf=3, n_estimators=100, random_state=27; total time= 2.9min
[CV] END max_depth=3, min_samples_leaf=3, n_estimators=100, random_state=27; total time= 2.9min
[CV] END max_depth=3, min_samples_leaf=3, n_estimators=100, random_state=27; total time= 2.9min
[CV] END max_depth=3, min_samples_leaf=3, n_estimators=100, random_state=27; total time= 2.9min
[CV] END max_depth=3, min_samples_leaf=3, n_estimators=100, random_state=27; tot

{'max_depth': 5, 'min_samples_leaf': 3, 'n_estimators': 50, 'random_state': 27}

In [10]:
model = LogisticRegression()
m.tune_model(model,trainx,trainy,lr_params)

Fitting 5 folds for each of 8 candidates, totalling 40 fits




[CV] END C=1.0, penalty=l1, random_state=27, solver=liblinear; total time=  30.2s
[CV] END C=1.0, penalty=l1, random_state=27, solver=liblinear; total time=  30.4s
[CV] END C=1.0, penalty=l2, random_state=27, solver=liblinear; total time=  26.9s
[CV] END C=1.0, penalty=l2, random_state=27, solver=liblinear; total time=  27.0s
[CV] END C=1.0, penalty=l2, random_state=27, solver=liblinear; total time=  24.8s
[CV] END C=1.0, penalty=l2, random_state=27, solver=liblinear; total time=  24.8s
[CV] END C=1.0, penalty=l1, random_state=27, solver=liblinear; total time=  56.4s
[CV] END C=1.0, penalty=l1, random_state=27, solver=liblinear; total time=  57.8s
[CV] END C=1.0, penalty=l1, random_state=27, solver=liblinear; total time=  58.0s
[CV] END C=1.0, penalty=l2, random_state=27, solver=liblinear; total time=  23.2s
[CV] END C=0.5, penalty=l1, random_state=27, solver=liblinear; total time=  24.0s
[CV] END C=0.5, penalty=l2, random_state=27, solver=liblinear; total time=  23.6s
[CV] END C=0.5, 

{'C': 0.05, 'penalty': 'l1', 'random_state': 27, 'solver': 'liblinear'}