In [57]:
# Conducting preliminary steps such as importing modules and reading in the data.

import pickle

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, precision_score, f1_score, multilabel_confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import LabelEncoder, QuantileTransformer

# Load the modelling table; the first CSV column becomes the row index.
model_df = pd.read_csv('../Data/model_data.csv', index_col = 0)

# Discard every industry that has fewer than 4 rows.
model_df = model_df.groupby('industry').filter(lambda industry_rows : len(industry_rows) >= 4)
print(f'Number of unique industries for classification : {len(model_df["industry"].unique())}')

# Encode the industry labels as integers to form the target vector.
target_encoding = LabelEncoder()
y = target_encoding.fit_transform(model_df['industry'])

# Everything except the target and the 'office' column is used as a feature.
model_df = model_df.drop(columns = ['industry', 'office'])
df_columns = model_df.columns
X = model_df[df_columns]

Number of unique industries for classification : 345


In [2]:
# Hold out 20% of the rows as a test set, using a fixed seed so the split is repeatable.
# Stratifying on the target keeps the industry distribution consistent between the training and test portions.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    stratify = y,
    random_state = 42,
)

In [3]:
# Initializing features to be used in the pipeline.
# QuantileTransformer will be used to scale the numeric variables.
# Principal Component Analysis derives a small set of components from the input columns;
# FeatureUnion places those components alongside the quantile-scaled columns rather than replacing them.
# Random Forest will be the model trained and tested on the data.
# Every stochastic estimator is seeded (same seed as the train/test split) so reruns give the same scores.

scaler_quantile = QuantileTransformer(output_distribution = 'normal', random_state = 42)

pca = PCA(random_state = 42)

random_forest = RandomForestClassifier(random_state = 42)

features = FeatureUnion([('quantile_scaler', scaler_quantile), ('PCA', pca)])

In [4]:
# Chain the combined feature step and the classifier into one estimator,
# then display it as the cell's output.

pipeline = Pipeline(
    steps = [
        ('features', features),
        ('Random_Forest', random_forest),
    ]
)
pipeline

In [14]:
# Running the first round of grid search for the ideal parameters for PCA.
# The grid search process is split into several rounds to save what would otherwise be an overwhelming amount of computational power.

pipeline = Pipeline([('features', features), ('Random_Forest', random_forest)])

# Round 1 varies only the feature step; the forest's n_jobs is pinned to a single value (3 parallel jobs).
param_grid_round_1 = {'features__quantile_scaler__output_distribution': ['uniform', 'normal'],
              'features__PCA__n_components': [5, 6, 7, 8],
              'Random_Forest__n_jobs': [3]}


grid_search_round_1 = GridSearchCV(pipeline, param_grid = param_grid_round_1, scoring = 'f1_weighted', verbose = 3, cv = 3)

grid_search_round_1.fit(X_train, y_train)

# Persist the fitted search so later rounds can reload it instead of refitting.
# The context manager guarantees the file handle is flushed and closed, even if pickling raises.
with open('./Model_Files_Pickle/grid_search_round_1.pickle', 'wb') as pickle_file:
    pickle.dump(grid_search_round_1, pickle_file)

print(grid_search_round_1.best_params_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV 1/3] END features__PCA__n_components=5, features__quantile_scaler__output_distribution=uniform;, score=0.187 total time=  27.7s
[CV 2/3] END features__PCA__n_components=5, features__quantile_scaler__output_distribution=uniform;, score=0.185 total time=  27.2s
[CV 3/3] END features__PCA__n_components=5, features__quantile_scaler__output_distribution=uniform;, score=0.190 total time=  27.7s
[CV 1/3] END features__PCA__n_components=5, features__quantile_scaler__output_distribution=normal;, score=0.183 total time=  27.8s
[CV 2/3] END features__PCA__n_components=5, features__quantile_scaler__output_distribution=normal;, score=0.184 total time=  27.4s
[CV 3/3] END features__PCA__n_components=5, features__quantile_scaler__output_distribution=normal;, score=0.195 total time=  27.4s
[CV 1/3] END features__PCA__n_components=6, features__quantile_scaler__output_distribution=uniform;, score=0.182 total time=  28.2s
[CV 2/3] END featur

In [15]:
# The second round will focus on the important parameters for the Random Forest model.

# grid_search_round_1 = pickle.load(open('./Model_Files_Pickle/grid_search_round_1.pickle', 'rb'))

best_params_round_1 = grid_search_round_1.best_params_

param_grid_round_2 = {'Random_Forest__n_estimators': [50, 100, 200],
                      'Random_Forest__criterion': ['gini', 'entropy'],
                      'Random_Forest__n_jobs': [3]}

# Carry the round-1 winners forward as fixed settings.
# Each grid entry must be a list of candidates, so every best value is wrapped in a one-element list.
param_grid_round_2.update({key: [value] for key, value in best_params_round_1.items()})

grid_search_round_2 = GridSearchCV(pipeline, param_grid = param_grid_round_2, scoring = 'f1_weighted', verbose = 3, cv = 3)

grid_search_round_2.fit(X_train, y_train)

# Persist the fitted search; the context manager closes the file handle even if pickling raises.
with open('./Model_Files_Pickle/grid_search_round_2.pickle', 'wb') as pickle_file:
    pickle.dump(grid_search_round_2, pickle_file)

print(grid_search_round_2.best_params_)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV 1/3] END Random_Forest__criterion=gini, Random_Forest__n_estimators=50, features__PCA__n_components=5, features__quantile_scaler__output_distribution=uniform;, score=0.175 total time=  13.9s
[CV 2/3] END Random_Forest__criterion=gini, Random_Forest__n_estimators=50, features__PCA__n_components=5, features__quantile_scaler__output_distribution=uniform;, score=0.174 total time=  13.7s
[CV 3/3] END Random_Forest__criterion=gini, Random_Forest__n_estimators=50, features__PCA__n_components=5, features__quantile_scaler__output_distribution=uniform;, score=0.186 total time=  13.8s
[CV 1/3] END Random_Forest__criterion=gini, Random_Forest__n_estimators=100, features__PCA__n_components=5, features__quantile_scaler__output_distribution=uniform;, score=0.183 total time=  27.9s
[CV 2/3] END Random_Forest__criterion=gini, Random_Forest__n_estimators=100, features__PCA__n_components=5, features__quantile_scaler__output_distribution=unif

In [5]:
# The third round of the grid search is run, maintaining several optimal parameters from earlier rounds.
# This round will rerun previously covered settings because several of the determined optimal parameters were at the bounds of the defined range.

# grid_search_round_2 = pickle.load(open('./Model_Files_Pickle/grid_search_round_2.pickle', 'rb'))

# Criterion and output distribution are held at a single value each;
# n_estimators and n_components are extended past the edges of the ranges searched earlier.
param_grid_round_3 = {'Random_Forest__n_estimators': [200, 250, 300],
                      'Random_Forest__criterion': ['entropy'],
                      'Random_Forest__n_jobs': [3],
                      'features__quantile_scaler__output_distribution': ['uniform'],
                      'features__PCA__n_components': [3, 4, 5]}

grid_search_round_3 = GridSearchCV(pipeline, param_grid = param_grid_round_3, scoring = 'f1_weighted', verbose = 3, cv = 3)

grid_search_round_3.fit(X_train, y_train)

# Persist the fitted search; the context manager closes the file handle even if pickling raises.
with open('./Model_Files_Pickle/grid_search_round_3.pickle', 'wb') as pickle_file:
    pickle.dump(grid_search_round_3, pickle_file)

print(grid_search_round_3.best_params_)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV 1/3] END Random_Forest__criterion=entropy, Random_Forest__n_estimators=200, Random_Forest__n_jobs=3, features__PCA__n_components=3, features__quantile_scaler__output_distribution=uniform;, score=0.169 total time=  52.0s
[CV 2/3] END Random_Forest__criterion=entropy, Random_Forest__n_estimators=200, Random_Forest__n_jobs=3, features__PCA__n_components=3, features__quantile_scaler__output_distribution=uniform;, score=0.193 total time=  50.8s
[CV 3/3] END Random_Forest__criterion=entropy, Random_Forest__n_estimators=200, Random_Forest__n_jobs=3, features__PCA__n_components=3, features__quantile_scaler__output_distribution=uniform;, score=0.203 total time=  50.6s
[CV 1/3] END Random_Forest__criterion=entropy, Random_Forest__n_estimators=200, Random_Forest__n_jobs=3, features__PCA__n_components=4, features__quantile_scaler__output_distribution=uniform;, score=0.165 total time=  53.2s
[CV 2/3] END Random_Forest__criterion=entrop

OSError: [Errno 28] No space left on device

In [47]:
# The best pipeline found by the grid search is evaluated on the held-out test set.

# grid_search_round_3 = pickle.load(open('./Model_Files_Pickle/grid_search_round_3.pickle', 'rb'))

final_params = grid_search_round_3.best_params_

# The search was created with GridSearchCV's default refit=True, so best_estimator_ has already been
# refit on the full training data with the winning parameters; fitting it again would only repeat that work.
final_pipeline = grid_search_round_3.best_estimator_

y_pred = final_pipeline.predict(X_test)

# zero_division = 0 is passed to every metric so ill-defined per-class scores count as 0 without emitting warnings.
print(f'Precision Score : {precision_score(y_test, y_pred, average = "weighted", zero_division = 0)}')
print(f'Recall Score : {recall_score(y_test, y_pred, average = "weighted", zero_division = 0)}')
print(f'F1 Score : {f1_score(y_test, y_pred, average = "weighted", zero_division = 0)}')

Precision Score : 0.24388087138496128
Recall Score : 0.24451827242524918
F1 Score : 0.23616679502604565


In [45]:
# These scores provide a baseline from which to compare any models.
# The DummyClassifier is naive, guessing mostly randomly, only taking the frequency of the industries into account for its predictions.
# This is realistically the lowest score that a genuine model would output.

# The 'stratified' strategy draws random predictions, so it is seeded to make the baseline repeatable.
dummy = DummyClassifier(strategy = 'stratified', random_state = 42)

dummy.fit(X_train, y_train)

dummy_pred = dummy.predict(X_test)

# zero_division = 0 is passed to every metric so ill-defined per-class scores count as 0 without emitting warnings.
print(f'Precision Score : {precision_score(y_test, dummy_pred, average = "weighted", zero_division = 0)}')
print(f'Recall Score : {recall_score(y_test, dummy_pred, average = "weighted", zero_division = 0)}')
print(f'F1 Score : {f1_score(y_test, dummy_pred, average = "weighted", zero_division = 0)}')

Precision Score : 0.016284361333351653
Recall Score : 0.016389811738648948
F1 Score : 0.016294857731529073


In [46]:
# This is a vanilla Random Forest model for score comparison.
# It is trained directly on the unscaled features with default hyperparameters;
# only the seed is fixed so the comparison score is repeatable.

vanilla = RandomForestClassifier(random_state = 42)

vanilla.fit(X_train, y_train)

vanilla_pred = vanilla.predict(X_test)

# zero_division = 0 is passed to every metric so ill-defined per-class scores count as 0 without emitting warnings.
print(f'Precision Score : {precision_score(y_test, vanilla_pred, average = "weighted", zero_division = 0)}')
print(f'Recall Score : {recall_score(y_test, vanilla_pred, average = "weighted", zero_division = 0)}')
print(f'F1 Score : {f1_score(y_test, vanilla_pred, average = "weighted", zero_division = 0)}')

Precision Score : 0.2160195781693903
Recall Score : 0.21461794019933556
F1 Score : 0.20580463480091718


In [42]:
# Build one 2x2 confusion matrix per encoded industry from the tuned pipeline's test-set predictions
# and show the stacked array as the cell output.
# A plotted alternative (ConfusionMatrixDisplay.from_predictions) was left disabled here.
per_class_confusion = multilabel_confusion_matrix(y_test, y_pred)
per_class_confusion

array([[[4512,    1],
        [   2,    0]],

       [[4495,    6],
        [  13,    1]],

       [[4510,    2],
        [   2,    1]],

       ...,

       [[4511,    2],
        [   1,    1]],

       [[4513,    0],
        [   0,    2]],

       [[4510,    2],
        [   3,    0]]], dtype=int64)