In [166]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, QuantileTransformer, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import recall_score, precision_score, precision_recall_curve, f1_score, multilabel_confusion_matrix

model_df = pd.read_csv('../Data/model_data.csv', index_col = 0)
target_encoding = LabelEncoder()
model_df = model_df.groupby('industry').filter(lambda x : x['industry'].shape[0] >= 4)
y = model_df['industry']
y = target_encoding.fit_transform(y)
model_df = model_df.drop(['industry', 'office'], axis = 1)
df_columns = model_df.columns
X = model_df[df_columns]

In [167]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

In [168]:
# scaler_standard = StandardScaler()

scaler_quantile = QuantileTransformer(output_distribution = 'normal')

pca = PCA()

random_forest = RandomForestClassifier()

features = FeatureUnion([('quantile_scaler', scaler_quantile), ('PCA', pca)])

In [172]:
pipeline = Pipeline([('features', features), ('Random_Forest', random_forest)])
pipeline

In [175]:
param_grid = {'features__quantile_scaler__output_distribution': ['uniform', 'normal'],
              'features__PCA__n_components': [5, 6, 7, 8],
              'Random_Forest__n_estimators': [50, 100, 200],
              'Random_Forest__criterion': ['gini', 'entropy']}

In [177]:
grid_search = GridSearchCV(pipeline, param_grid = param_grid, scoring = 'f1_weighted', verbose = 3, cv = 3)

grid_search.fit(X_train, y_train)

print(grid_search.best_params_)

Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV 1/3] END Random_Forest__criterion=gini, Random_Forest__n_estimators=50, features__PCA__n_components=5, features__quantile_scaler__output_distribution=uniform;, score=0.172 total time=  14.1s
[CV 2/3] END Random_Forest__criterion=gini, Random_Forest__n_estimators=50, features__PCA__n_components=5, features__quantile_scaler__output_distribution=uniform;, score=0.174 total time=  14.0s
[CV 3/3] END Random_Forest__criterion=gini, Random_Forest__n_estimators=50, features__PCA__n_components=5, features__quantile_scaler__output_distribution=uniform;, score=0.184 total time=  14.4s
[CV 1/3] END Random_Forest__criterion=gini, Random_Forest__n_estimators=50, features__PCA__n_components=5, features__quantile_scaler__output_distribution=normal;, score=0.176 total time=  14.2s
[CV 2/3] END Random_Forest__criterion=gini, Random_Forest__n_estimators=50, features__PCA__n_components=5, features__quantile_scaler__output_distribution=norma

In [145]:
pipeline.fit(X_train, y_train)

[Pipeline] .......... (step 1 of 2) Processing features, total=   0.0s
[Pipeline] ..... (step 2 of 2) Processing random_forest, total=  55.0s


In [146]:
y_pred = pipeline.predict(X_test)


print(recall_score(y_test, y_pred, average = 'weighted'))
print(precision_score(y_test, y_pred, average = 'weighted', zero_division = 0))
print(f1_score(y_test, y_pred, average = 'weighted'))

0.22620628596724215
0.23193468037553674
0.218253861613946


In [147]:
from sklearn.dummy import DummyClassifier

dummy = DummyClassifier(strategy = 'stratified')

dummy.fit(X_train, y_train)

dummy_pred = dummy.predict(X_test)

print(recall_score(y_test, dummy_pred, average = 'weighted', zero_division = 0))
print(precision_score(y_test, dummy_pred, average = 'weighted', zero_division = 0))
print(f1_score(y_test, dummy_pred, average = 'weighted'))

0.015493581230633024
0.01538736515568725
0.015408673522316948


In [135]:
multilabel_confusion_matrix(y_test, y_pred)

array([[[4515,    1],
        [   2,    0]],

       [[4492,   12],
        [  12,    2]],

       [[4514,    1],
        [   2,    1]],

       ...,

       [[4514,    2],
        [   1,    1]],

       [[4515,    1],
        [   0,    2]],

       [[4514,    1],
        [   3,    0]]], dtype=int64)