## 3rd Section: Optimization of the chosen algorithm and validation on test set

### Import libraries

In [2]:
# Basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [13]:
# Additional Scikit Learn libraries
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, precision_score, make_scorer
from sklearn.preprocessing import StandardScaler

In [159]:
# Miscelaneous libraries
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
import time
import pickle

#### 3.4 (re) Load dataset and perform one transformation

In [5]:
## Quick function to load the data and make the few transformations
def load_transform(filepath):
    '''
    Quick function to load the data and drop one column
    INPUT:
    filepath of data to be loaded
    OUTPUT:
    dataframe ready for ML Pipeline
        '''
    
    # load dataframe
    df = pd.read_csv(filepath, sep = ";")
    
    # drop column coming from saving date
    df.drop("Unnamed: 0", axis = 1, inplace = True)
    
    return df

In [6]:
# Load 2017 data
df = load_transform("cleaned_data_2017.csv")

#### 3.5 (re) Prepare X, y as well as training and testing set

In [7]:
# Function to prepare training and testing set
def prep_train_test(df, test_size = 0.25):
    '''
    Extract X and y, Split data into training and testing set
    INPUT:
    df: dataframe
    test_size (float): share of testing set 
    OUTPUT:
    X_train, X_test, y_train, y_test
    '''
    
    # Extract X and y
    y = df['rad'].values
    X = df.drop(['rad', 'siren'], axis = 1).values
    
    # Split training and testing set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, random_state = 1)
    
    return X_train, X_test, y_train, y_test
    

In [8]:
# Prepare training and testing set
X_train, X_test, y_train, y_test = prep_train_test(df)

#### 3.6 Model optimization using GridSearchCV

In the previous section we saw that the RandomForestClassifier is having a relatively good performance (aka in our case precision). We will now try to further optimize by tuning the hyperparameters of the algorithm.

In [43]:
# Define the pipeline model with a scaler and a RandomForestClassifier
model = Pipeline([
    ('scale', StandardScaler()),
    ('clf', RandomForestClassifier())
])

In [116]:
# New test to try on precision
param_grid_new = {"clf__n_estimators" : [10,20], 
              "clf__criterion": ["gini", "entropy"],
              "clf__max_depth": [None, 5],
              "clf__bootstrap": [True, False],
}

In [117]:
# Instantiate GridSearchCV
model_final_prec = GridSearchCV(model, param_grid_new, verbose = 5, cv=2, scoring = 'precision')
model_final_prec.fit(X_train, y_train)

Fitting 2 folds for each of 16 candidates, totalling 32 fits
[CV] clf__bootstrap=True, clf__criterion=gini, clf__max_depth=None, clf__n_estimators=10 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  clf__bootstrap=True, clf__criterion=gini, clf__max_depth=None, clf__n_estimators=10, score=0.08004778972520908, total=  51.3s
[CV] clf__bootstrap=True, clf__criterion=gini, clf__max_depth=None, clf__n_estimators=10 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   54.0s remaining:    0.0s


[CV]  clf__bootstrap=True, clf__criterion=gini, clf__max_depth=None, clf__n_estimators=10, score=0.07981220657276995, total=  45.8s
[CV] clf__bootstrap=True, clf__criterion=gini, clf__max_depth=None, clf__n_estimators=20 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.7min remaining:    0.0s


[CV]  clf__bootstrap=True, clf__criterion=gini, clf__max_depth=None, clf__n_estimators=20, score=0.0886426592797784, total= 1.6min
[CV] clf__bootstrap=True, clf__criterion=gini, clf__max_depth=None, clf__n_estimators=20 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  3.3min remaining:    0.0s


[CV]  clf__bootstrap=True, clf__criterion=gini, clf__max_depth=None, clf__n_estimators=20, score=0.09117647058823529, total= 1.4min
[CV] clf__bootstrap=True, clf__criterion=gini, clf__max_depth=5, clf__n_estimators=10 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  4.8min remaining:    0.0s


[CV]  clf__bootstrap=True, clf__criterion=gini, clf__max_depth=5, clf__n_estimators=10, score=0.0, total=  10.0s
[CV] clf__bootstrap=True, clf__criterion=gini, clf__max_depth=5, clf__n_estimators=10 
[CV]  clf__bootstrap=True, clf__criterion=gini, clf__max_depth=5, clf__n_estimators=10, score=0.0, total=  10.3s
[CV] clf__bootstrap=True, clf__criterion=gini, clf__max_depth=5, clf__n_estimators=20 
[CV]  clf__bootstrap=True, clf__criterion=gini, clf__max_depth=5, clf__n_estimators=20, score=0.0, total=  13.8s
[CV] clf__bootstrap=True, clf__criterion=gini, clf__max_depth=5, clf__n_estimators=20 
[CV]  clf__bootstrap=True, clf__criterion=gini, clf__max_depth=5, clf__n_estimators=20, score=0.0, total=  14.5s
[CV] clf__bootstrap=True, clf__criterion=entropy, clf__max_depth=None, clf__n_estimators=10 
[CV]  clf__bootstrap=True, clf__criterion=entropy, clf__max_depth=None, clf__n_estimators=10, score=0.08870967741935484, total=  48.3s
[CV] clf__bootstrap=True, clf__criterion=entropy, clf__max_

[Parallel(n_jobs=1)]: Done  32 out of  32 | elapsed: 26.6min finished


GridSearchCV(cv=2, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
      ...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'clf__n_estimators': [10, 20], 'clf__criterion': ['gini', 'entropy'], 'clf__max_depth': [None, 5], 'clf__bootstrap': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='precision', verbose=5)

In [119]:
# Print out parameters of best model to avoid having t re-run the exercise
print("Parameters of the best model are: ",model_final_prec.best_params_)

Parameters of the best model are:  {'clf__bootstrap': True, 'clf__criterion': 'entropy', 'clf__max_depth': None, 'clf__n_estimators': 20}


In [120]:
# Extract feature importance to analyse which feature is most relevant
importances_new = pd.DataFrame(model_final_prec.best_estimator_.named_steps["clf"].feature_importances_)

In [121]:
# Extract feature names from original dataframe
X_index = df.drop(['rad', 'siren'], axis = 1).columns.transpose()

In [122]:
# Assign new index to importances dataframe
importances_new.index = X_index
importances_new.columns = ["weight"]
feature_importance_new = importances
feature_importance_new.head()

Unnamed: 0,weight
publish,0.016618
delay,0.141137
age,0.225692
num_dept_2,0.003522
num_dept_3,0.0


In [123]:
# Analyse which features are most important (sum is equal to 1)
feature_importance_new = feature_importance_new.sort_values(by = "weight", ascending = False)

In [124]:
# How many make the most of the weight
feature_importance_new[:10].sum()

weight    0.861987
dtype: float64

In [125]:
feature_importance_new[:10]

Unnamed: 0,weight
age,0.225692
legal_form_simple_Societe par actions simplifiee a associe unique,0.151295
delay,0.141137
new_ape_47,0.090943
new_ape_64,0.063907
legal_form_simple_Societe par actions simplifiee,0.054712
new_ape_43,0.043968
new_ape_62,0.038121
num_dept_49,0.027228
num_dept_35,0.024984


**CONCLUSION**

As expected the age of the company play a major role (which we could guess from the exploration phase as most of the companies going out of business were quite young).

The legal form (as a proxy for capital structure and governance) also plays a role.

The so called delay feature is also relevant, e.g. how long does it take a company to publish its results (often if those are bad, it takes longer to publish as it needs more alignment with auditors).

#### 3.7 Model validation

In [126]:
# Predict on test set
y_preds = model_final_prec.predict(X_test)

In [135]:
# Get performance on test set
print("Precision on test set: {} %".format(np.round(precision_score(y_test, y_preds)*100,3)))
print("Accuracy on test set: {} %".format(np.round(accuracy_score(y_test, y_preds)*100,3)))

Precision on test set: 13.002 %
Accuracy on test set: 97.439 %


In [141]:
# Compute performance lift-off vs basic assumption
print("Basic Precision on test set: {} %".format(np.round(y_test.mean()*100,3)))
print("Basic Accuracy on test set: {} %".format(np.round((1-y_test.mean())*100,3)))

Basic Precision on test set: 2.349 %
Basic Accuracy on test set: 97.651 %


In [146]:
# Lift off
perf_lift_off_prec = precision_score(y_test, y_preds) / y_test.mean() - 1
perf_lift_off_acc = accuracy_score(y_test, y_preds) / (1-y_test.mean()) -1

print("Performance lift-off in terms of precision: {} %".format(np.round(perf_lift_off_prec*100,1)))
print("Performance lift-off in terms of accuracy: {} %".format(np.round(perf_lift_off_acc*100,1)))

Performance lift-off in terms of precision: 453.5 %
Performance lift-off in terms of accuracy: -0.2 %


#### 3.8 Test model on other years

This is also interesting to see if the model also works for other years, as we would like to use it to predict on a year for which we don't yet have results.

In [147]:
# Test on 2016 financial information (and 2018 so called radiations)
df_2016 = load_transform("cleaned_data_2016.csv")

In [152]:
# Extract X and y (across the entire set, the entire set is now a test set)
X_2016 = df_2016.drop(['siren', 'rad'], axis = 1)
y_2016 = df_2016['rad']

In [153]:
# check sizes
print(X_2016.shape)
print(y_2016.shape)

(827720, 189)
(827720,)


In [154]:
# Predict using model
y_preds_2016 = model_final_prec.predict(X_2016)

In [156]:
# Test on entire set
print("Precision on 2016 data is then: {} %".format(np.round(precision_score(y_2016, y_preds_2016)*100,3)))
print("Accuracy on 2016 data is then: {} %".format(np.round(accuracy_score(y_2016, y_preds_2016)*100,3)))

Precision on 2016 data is then: 3.168 %
Accuracy on 2016 data is then: 97.079 %


In [157]:
# Compute performance lift-off vs basic assumption
print("Basic Precision on test set: {} %".format(np.round(y_2016.mean()*100,3)))
print("Basic Accuracy on test set: {} %".format(np.round((1-y_2016.mean())*100,3)))

Basic Precision on test set: 2.667 %
Basic Accuracy on test set: 97.333 %


In [158]:
# Lift off
perf_lift_off_prec_2016 = precision_score(y_2016, y_preds_2016) / y_2016.mean() - 1
perf_lift_off_acc_2016 = accuracy_score(y_2016, y_preds_2016) / (1-y_2016.mean()) - 1

print("Performance lift-off in terms of precision: {} %".format(np.round(perf_lift_off_prec_2016*100,1)))
print("Performance lift-off in terms of accuracy: {} %".format(np.round(perf_lift_off_acc_2016*100,1)))

Performance lift-off in terms of precision: 18.8 %
Performance lift-off in terms of accuracy: -0.3 %


#### 3.9 Save model

for potential reuse without having to re-run it all

In [160]:
# Version 1
filename = 'finalized_model_2017-19.sav'
pickle.dump(model_final_prec, open(filename, 'wb'))

In [161]:
# Version 2
pkl_filename = "pickle_model_2017-19.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(model_final_prec, file)
    
'''
# Load from file
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)
    
'''

"\n# Load from file\nwith open(pkl_filename, 'rb') as file:\n    pickle_model = pickle.load(file)\n    \n"