In [166]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, QuantileTransformer, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import recall_score, precision_score, precision_recall_curve, f1_score, multilabel_confusion_matrix

model_df = pd.read_csv('../Data/model_data.csv', index_col = 0)
target_encoding = LabelEncoder()
model_df = model_df.groupby('industry').filter(lambda x : x['industry'].shape[0] >= 4)
y = model_df['industry']
y = target_encoding.fit_transform(y)
model_df = model_df.drop(['industry', 'office'], axis = 1)
df_columns = model_df.columns
X = model_df[df_columns]

In [167]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

In [168]:
# scaler_standard = StandardScaler()

scaler_quantile = QuantileTransformer(output_distribution = 'normal')

pca = PCA()

random_forest = RandomForestClassifier()

features = FeatureUnion([('quantile_scaler', scaler_quantile), ('PCA', pca)])

In [172]:
pipeline = Pipeline([('features', features), ('Random_Forest', random_forest)])
pipeline

In [179]:
pipeline = Pipeline([('features', features), ('Random_Forest', random_forest)])

param_grid_round_1 = {'features__quantile_scaler__output_distribution': ['uniform', 'normal'],
              'features__PCA__n_components': [5, 6, 7, 8]}
            #   'Random_Forest__n_estimators': [50, 100, 200],
            #   'Random_Forest__criterion': ['gini', 'entropy']}

grid_search_round_1 = GridSearchCV(pipeline, param_grid = param_grid_round_1, scoring = 'f1_weighted', verbose = 3, cv = 3)

grid_search_round_1.fit(X_train, y_train)

print(grid_search_round_1.best_params_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV 1/3] END features__PCA__n_components=5, features__quantile_scaler__output_distribution=uniform;, score=0.187 total time=  28.4s
[CV 2/3] END features__PCA__n_components=5, features__quantile_scaler__output_distribution=uniform;, score=0.179 total time=  27.5s
[CV 3/3] END features__PCA__n_components=5, features__quantile_scaler__output_distribution=uniform;, score=0.194 total time=  28.3s
[CV 1/3] END features__PCA__n_components=5, features__quantile_scaler__output_distribution=normal;, score=0.187 total time=  28.0s
[CV 2/3] END features__PCA__n_components=5, features__quantile_scaler__output_distribution=normal;, score=0.184 total time=  27.4s
[CV 3/3] END features__PCA__n_components=5, features__quantile_scaler__output_distribution=normal;, score=0.193 total time=  27.8s
[CV 1/3] END features__PCA__n_components=6, features__quantile_scaler__output_distribution=uniform;, score=0.186 total time=  28.3s
[CV 2/3] END featur

In [183]:
param_grid_round_2 = {'Random_Forest__n_estimators': [50, 100, 200],
                      'Random_Forest__criterion': ['gini', 'entropy']}

In [201]:
best_params = {}

for key, value in grid_search_round_1.best_params_.items():
    value_to_list = [value]
    param_grid_round_2.update({key: value_to_list})

{'features__PCA__n_components': [5],
 'features__quantile_scaler__output_distribution': ['normal']}

In [203]:
best_params_round_1 = grid_search_round_1.best_params_

param_grid_round_2 = {'Random_Forest__n_estimators': [50, 100, 200],
                      'Random_Forest__criterion': ['gini', 'entropy']}

for key, value in grid_search_round_1.best_params_.items():
    value_to_list = [value]
    param_grid_round_2.update({key: value_to_list})

grid_search_round_2 = GridSearchCV(pipeline, param_grid = param_grid_round_2, scoring = 'f1_weighted', verbose = 3, cv = 3)

grid_search_round_2.fit(X_train, y_train)

print(grid_search_round_2.best_params_)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV 1/3] END Random_Forest__criterion=gini, Random_Forest__n_estimators=50, features__PCA__n_components=5, features__quantile_scaler__output_distribution=normal;, score=0.177 total time=  14.1s
[CV 2/3] END Random_Forest__criterion=gini, Random_Forest__n_estimators=50, features__PCA__n_components=5, features__quantile_scaler__output_distribution=normal;, score=0.174 total time=  13.9s
[CV 3/3] END Random_Forest__criterion=gini, Random_Forest__n_estimators=50, features__PCA__n_components=5, features__quantile_scaler__output_distribution=normal;, score=0.185 total time=  14.0s
[CV 1/3] END Random_Forest__criterion=gini, Random_Forest__n_estimators=100, features__PCA__n_components=5, features__quantile_scaler__output_distribution=normal;, score=0.184 total time=  28.0s
[CV 2/3] END Random_Forest__criterion=gini, Random_Forest__n_estimators=100, features__PCA__n_components=5, features__quantile_scaler__output_distribution=normal;,

9 fits failed out of a total of 18.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
9 fits failed with the following error:
Traceback (most recent call last):
  File "y:\Anaconda\envs\LHLenvironment\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "y:\Anaconda\envs\LHLenvironment\Lib\site-packages\sklearn\pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "y:\Anaconda\envs\LHLenvironment\Lib\site-packages\sklearn\ensemble\_forest.py", line 473, in fit
    trees = Parallel(
            ^^^^^^^^^
  File "y:\Anaconda\envs\LHLenvironment\Lib\site-packages\sklearn\utils\parallel.py", line 63, in __call

MemoryError: could not allocate 90439680 bytes

In [180]:
y_pred = pipeline.predict(X_test)


print(recall_score(y_test, y_pred, average = 'weighted'))
print(precision_score(y_test, y_pred, average = 'weighted', zero_division = 0))
print(f1_score(y_test, y_pred, average = 'weighted'))

NotFittedError: This QuantileTransformer instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [147]:
from sklearn.dummy import DummyClassifier

dummy = DummyClassifier(strategy = 'stratified')

dummy.fit(X_train, y_train)

dummy_pred = dummy.predict(X_test)

print(recall_score(y_test, dummy_pred, average = 'weighted', zero_division = 0))
print(precision_score(y_test, dummy_pred, average = 'weighted', zero_division = 0))
print(f1_score(y_test, dummy_pred, average = 'weighted'))

0.015493581230633024
0.01538736515568725
0.015408673522316948


In [135]:
multilabel_confusion_matrix(y_test, y_pred)

array([[[4515,    1],
        [   2,    0]],

       [[4492,   12],
        [  12,    2]],

       [[4514,    1],
        [   2,    1]],

       ...,

       [[4514,    2],
        [   1,    1]],

       [[4515,    1],
        [   0,    2]],

       [[4514,    1],
        [   3,    0]]], dtype=int64)