## 3rd Section: Optimization of the chosen algorithm and validation on test set

### Import libraries

In [1]:
# Basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Additional Scikit Learn libraries
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, precision_score, make_scorer
from sklearn.preprocessing import StandardScaler

In [3]:
# Miscelaneous libraries
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
import time
import pickle

#### 3.4 (re) Load dataset and perform one transformation

In [4]:
## Quick function to load the data and make the few transformations
def load_transform(filepath):
    '''
    Quick function to load the data and drop one column
    INPUT:
    filepath of data to be loaded
    OUTPUT:
    dataframe ready for ML Pipeline
        '''
    
    # load dataframe
    df = pd.read_csv(filepath, sep = ";")
    
    # drop column coming from saving date
    df.drop("Unnamed: 0", axis = 1, inplace = True)
    
    return df

In [5]:
# Load 2017 data
df = load_transform("cleaned_data_2016.csv")

In [6]:
np.array(df.columns)

array(['siren', 'publish', 'delay', 'rad', 'age', 'num_dept_2',
       'num_dept_3', 'num_dept_4', 'num_dept_5', 'num_dept_6',
       'num_dept_7', 'num_dept_8', 'num_dept_9', 'num_dept_10',
       'num_dept_11', 'num_dept_12', 'num_dept_13', 'num_dept_14',
       'num_dept_15', 'num_dept_16', 'num_dept_17', 'num_dept_18',
       'num_dept_19', 'num_dept_20', 'num_dept_21', 'num_dept_22',
       'num_dept_23', 'num_dept_24', 'num_dept_25', 'num_dept_26',
       'num_dept_27', 'num_dept_28', 'num_dept_29', 'num_dept_30',
       'num_dept_31', 'num_dept_32', 'num_dept_33', 'num_dept_34',
       'num_dept_35', 'num_dept_36', 'num_dept_37', 'num_dept_38',
       'num_dept_39', 'num_dept_40', 'num_dept_41', 'num_dept_42',
       'num_dept_43', 'num_dept_44', 'num_dept_45', 'num_dept_46',
       'num_dept_47', 'num_dept_48', 'num_dept_49', 'num_dept_50',
       'num_dept_51', 'num_dept_52', 'num_dept_53', 'num_dept_54',
       'num_dept_55', 'num_dept_56', 'num_dept_58', 'num_dept_59',
     

#### 3.5 (re) Prepare X, y as well as training and testing set

In [7]:
# Function to prepare training and testing set
def prep_train_test(df, test_size = 0.25):
    '''
    Extract X and y, Split data into training and testing set
    INPUT:
    df: dataframe
    test_size (float): share of testing set 
    OUTPUT:
    X_train, X_test, y_train, y_test
    '''
    
    col_keep = ['publish', 'delay', 'age', 'legal_form_simple_Societe a responsabilite limitee a associe unique',
                'legal_form_simple_Societe par actions simplifiee', 
                'legal_form_simple_Societe par actions simplifiee a associe unique',
                'legal_form_simple_others']
    
    # Extract X and y
    y = df['rad'].values
    X = df.loc[:,col_keep].values
    
    # Split training and testing set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, random_state = 1)
    
    return X_train, X_test, y_train, y_test
    

In [8]:
# Prepare training and testing set
X_train, X_test, y_train, y_test = prep_train_test(df)

#### 3.6 Model optimization using GridSearchCV

In the previous section we saw that the RandomForestClassifier is having a relatively good performance (aka in our case precision). We will now try to further optimize by tuning the hyperparameters of the algorithm.

In [9]:
# Define the pipeline model with a scaler and a RandomForestClassifier
model = Pipeline([
    ('scale', StandardScaler()),
    ('clf', RandomForestClassifier())
])

In [10]:
# New test to try on precision
param_grid_new = {"clf__n_estimators" : [10,20], 
              "clf__criterion": ["gini", "entropy"],
              "clf__max_depth": [None, 5],
              "clf__bootstrap": [True, False],
}

In [11]:
# Instantiate GridSearchCV
model_final_prec = GridSearchCV(model, param_grid_new, verbose = 5, cv=2, scoring = 'precision')
model_final_prec.fit(X_train, y_train)

Fitting 2 folds for each of 16 candidates, totalling 32 fits
[CV] clf__bootstrap=True, clf__criterion=gini, clf__max_depth=None, clf__n_estimators=10 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  clf__bootstrap=True, clf__criterion=gini, clf__max_depth=None, clf__n_estimators=10, score=0.042275172943889314, total=   7.5s
[CV] clf__bootstrap=True, clf__criterion=gini, clf__max_depth=None, clf__n_estimators=10 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    8.5s remaining:    0.0s


[CV]  clf__bootstrap=True, clf__criterion=gini, clf__max_depth=None, clf__n_estimators=10, score=0.033541341653666144, total=   7.8s
[CV] clf__bootstrap=True, clf__criterion=gini, clf__max_depth=None, clf__n_estimators=20 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   17.2s remaining:    0.0s


[CV]  clf__bootstrap=True, clf__criterion=gini, clf__max_depth=None, clf__n_estimators=20, score=0.037347560975609755, total=  15.6s
[CV] clf__bootstrap=True, clf__criterion=gini, clf__max_depth=None, clf__n_estimators=20 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   34.6s remaining:    0.0s


[CV]  clf__bootstrap=True, clf__criterion=gini, clf__max_depth=None, clf__n_estimators=20, score=0.03553719008264463, total=  15.4s
[CV] clf__bootstrap=True, clf__criterion=gini, clf__max_depth=5, clf__n_estimators=10 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   52.0s remaining:    0.0s
  'precision', 'predicted', average, warn_for)


[CV]  clf__bootstrap=True, clf__criterion=gini, clf__max_depth=5, clf__n_estimators=10, score=0.0, total=   2.9s
[CV] clf__bootstrap=True, clf__criterion=gini, clf__max_depth=5, clf__n_estimators=10 


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  clf__bootstrap=True, clf__criterion=gini, clf__max_depth=5, clf__n_estimators=10, score=0.0, total=   3.0s
[CV] clf__bootstrap=True, clf__criterion=gini, clf__max_depth=5, clf__n_estimators=20 


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  clf__bootstrap=True, clf__criterion=gini, clf__max_depth=5, clf__n_estimators=20, score=0.0, total=   5.3s
[CV] clf__bootstrap=True, clf__criterion=gini, clf__max_depth=5, clf__n_estimators=20 


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  clf__bootstrap=True, clf__criterion=gini, clf__max_depth=5, clf__n_estimators=20, score=0.0, total=   4.7s
[CV] clf__bootstrap=True, clf__criterion=entropy, clf__max_depth=None, clf__n_estimators=10 




[CV]  clf__bootstrap=True, clf__criterion=entropy, clf__max_depth=None, clf__n_estimators=10, score=0.040172166427546625, total=   7.6s
[CV] clf__bootstrap=True, clf__criterion=entropy, clf__max_depth=None, clf__n_estimators=10 




[CV]  clf__bootstrap=True, clf__criterion=entropy, clf__max_depth=None, clf__n_estimators=10, score=0.037539936102236424, total=   7.5s
[CV] clf__bootstrap=True, clf__criterion=entropy, clf__max_depth=None, clf__n_estimators=20 




[CV]  clf__bootstrap=True, clf__criterion=entropy, clf__max_depth=None, clf__n_estimators=20, score=0.0347758887171561, total=  14.7s
[CV] clf__bootstrap=True, clf__criterion=entropy, clf__max_depth=None, clf__n_estimators=20 




[CV]  clf__bootstrap=True, clf__criterion=entropy, clf__max_depth=None, clf__n_estimators=20, score=0.03709949409780776, total=  14.8s
[CV] clf__bootstrap=True, clf__criterion=entropy, clf__max_depth=5, clf__n_estimators=10 


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  clf__bootstrap=True, clf__criterion=entropy, clf__max_depth=5, clf__n_estimators=10, score=0.0, total=   2.7s
[CV] clf__bootstrap=True, clf__criterion=entropy, clf__max_depth=5, clf__n_estimators=10 


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  clf__bootstrap=True, clf__criterion=entropy, clf__max_depth=5, clf__n_estimators=10, score=0.0, total=   2.8s
[CV] clf__bootstrap=True, clf__criterion=entropy, clf__max_depth=5, clf__n_estimators=20 


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  clf__bootstrap=True, clf__criterion=entropy, clf__max_depth=5, clf__n_estimators=20, score=0.0, total=   5.3s
[CV] clf__bootstrap=True, clf__criterion=entropy, clf__max_depth=5, clf__n_estimators=20 


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  clf__bootstrap=True, clf__criterion=entropy, clf__max_depth=5, clf__n_estimators=20, score=0.0, total=   5.4s
[CV] clf__bootstrap=False, clf__criterion=gini, clf__max_depth=None, clf__n_estimators=10 




[CV]  clf__bootstrap=False, clf__criterion=gini, clf__max_depth=None, clf__n_estimators=10, score=0.030791788856304986, total=   7.8s
[CV] clf__bootstrap=False, clf__criterion=gini, clf__max_depth=None, clf__n_estimators=10 




[CV]  clf__bootstrap=False, clf__criterion=gini, clf__max_depth=None, clf__n_estimators=10, score=0.03554868624420402, total=   7.5s
[CV] clf__bootstrap=False, clf__criterion=gini, clf__max_depth=None, clf__n_estimators=20 




[CV]  clf__bootstrap=False, clf__criterion=gini, clf__max_depth=None, clf__n_estimators=20, score=0.027426160337552744, total=  14.6s
[CV] clf__bootstrap=False, clf__criterion=gini, clf__max_depth=None, clf__n_estimators=20 




[CV]  clf__bootstrap=False, clf__criterion=gini, clf__max_depth=None, clf__n_estimators=20, score=0.03205629397967162, total=  15.1s
[CV] clf__bootstrap=False, clf__criterion=gini, clf__max_depth=5, clf__n_estimators=10 


  'precision', 'predicted', average, warn_for)


[CV]  clf__bootstrap=False, clf__criterion=gini, clf__max_depth=5, clf__n_estimators=10, score=0.0, total=   2.4s
[CV] clf__bootstrap=False, clf__criterion=gini, clf__max_depth=5, clf__n_estimators=10 


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  clf__bootstrap=False, clf__criterion=gini, clf__max_depth=5, clf__n_estimators=10, score=0.0, total=   2.8s
[CV] clf__bootstrap=False, clf__criterion=gini, clf__max_depth=5, clf__n_estimators=20 


  'precision', 'predicted', average, warn_for)


[CV]  clf__bootstrap=False, clf__criterion=gini, clf__max_depth=5, clf__n_estimators=20, score=0.0, total=   4.5s
[CV] clf__bootstrap=False, clf__criterion=gini, clf__max_depth=5, clf__n_estimators=20 


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  clf__bootstrap=False, clf__criterion=gini, clf__max_depth=5, clf__n_estimators=20, score=0.0, total=   4.9s
[CV] clf__bootstrap=False, clf__criterion=entropy, clf__max_depth=None, clf__n_estimators=10 




[CV]  clf__bootstrap=False, clf__criterion=entropy, clf__max_depth=None, clf__n_estimators=10, score=0.02855051244509517, total=   8.2s
[CV] clf__bootstrap=False, clf__criterion=entropy, clf__max_depth=None, clf__n_estimators=10 




[CV]  clf__bootstrap=False, clf__criterion=entropy, clf__max_depth=None, clf__n_estimators=10, score=0.03320158102766799, total=   8.1s
[CV] clf__bootstrap=False, clf__criterion=entropy, clf__max_depth=None, clf__n_estimators=20 




[CV]  clf__bootstrap=False, clf__criterion=entropy, clf__max_depth=None, clf__n_estimators=20, score=0.028409090909090908, total=  16.1s
[CV] clf__bootstrap=False, clf__criterion=entropy, clf__max_depth=None, clf__n_estimators=20 




[CV]  clf__bootstrap=False, clf__criterion=entropy, clf__max_depth=None, clf__n_estimators=20, score=0.03168469860896445, total=  15.9s
[CV] clf__bootstrap=False, clf__criterion=entropy, clf__max_depth=5, clf__n_estimators=10 


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  clf__bootstrap=False, clf__criterion=entropy, clf__max_depth=5, clf__n_estimators=10, score=0.0, total=   2.6s
[CV] clf__bootstrap=False, clf__criterion=entropy, clf__max_depth=5, clf__n_estimators=10 


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  clf__bootstrap=False, clf__criterion=entropy, clf__max_depth=5, clf__n_estimators=10, score=0.0, total=   2.6s
[CV] clf__bootstrap=False, clf__criterion=entropy, clf__max_depth=5, clf__n_estimators=20 


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  clf__bootstrap=False, clf__criterion=entropy, clf__max_depth=5, clf__n_estimators=20, score=0.0, total=   4.7s
[CV] clf__bootstrap=False, clf__criterion=entropy, clf__max_depth=5, clf__n_estimators=20 


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Done  32 out of  32 | elapsed:  4.6min finished


[CV]  clf__bootstrap=False, clf__criterion=entropy, clf__max_depth=5, clf__n_estimators=20, score=0.0, total=   4.8s




GridSearchCV(cv=2, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
      ...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'clf__n_estimators': [10, 20], 'clf__criterion': ['gini', 'entropy'], 'clf__max_depth': [None, 5], 'clf__bootstrap': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='precision', verbose=5)

In [12]:
# Print out parameters of best model to avoid having t re-run the exercise
print("Parameters of the best model are: ",model_final_prec.best_params_)

Parameters of the best model are:  {'clf__bootstrap': True, 'clf__criterion': 'entropy', 'clf__max_depth': None, 'clf__n_estimators': 10}


In [13]:
# Extract feature importance to analyse which feature is most relevant
importances_new = pd.DataFrame(model_final_prec.best_estimator_.named_steps["clf"].feature_importances_)

In [16]:
# Extract feature names from original dataframe
col_keep = ['publish', 'delay', 'age', 'legal_form_simple_Societe a responsabilite limitee a associe unique',
                'legal_form_simple_Societe par actions simplifiee', 
                'legal_form_simple_Societe par actions simplifiee a associe unique',
                'legal_form_simple_others']

X_index = df.loc[:, col_keep].columns.transpose()

In [17]:
# Assign new index to importances dataframe
importances_new.index = X_index
importances_new.columns = ["weight"]
feature_importance_new = importances_new
feature_importance_new.head()

Unnamed: 0,weight
publish,0.004429
delay,0.798521
age,0.188033
legal_form_simple_Societe a responsabilite limitee a associe unique,0.003107
legal_form_simple_Societe par actions simplifiee,0.002423


In [18]:
# Analyse which features are most important (sum is equal to 1)
feature_importance_new = feature_importance_new.sort_values(by = "weight", ascending = False)

In [19]:
# How many make the most of the weight
feature_importance_new[:10].sum()

weight    1.0
dtype: float64

In [20]:
feature_importance_new[:10]

Unnamed: 0,weight
delay,0.798521
age,0.188033
publish,0.004429
legal_form_simple_Societe a responsabilite limitee a associe unique,0.003107
legal_form_simple_Societe par actions simplifiee,0.002423
legal_form_simple_others,0.001832
legal_form_simple_Societe par actions simplifiee a associe unique,0.001655


**CONCLUSION**

As expected the age of the company play a major role (which we could guess from the exploration phase as most of the companies going out of business were quite young).

The legal form (as a proxy for capital structure and governance) also plays a role.

The so called delay feature is also relevant, e.g. how long does it take a company to publish its results (often if those are bad, it takes longer to publish as it needs more alignment with auditors).

#### 3.7 Model validation

In [21]:
# Predict on test set
y_preds = model_final_prec.predict(X_test)



In [22]:
# Get performance on test set
print("Precision on test set: {} %".format(np.round(precision_score(y_test, y_preds)*100,3)))
print("Accuracy on test set: {} %".format(np.round(accuracy_score(y_test, y_preds)*100,3)))

Precision on test set: 3.617 %
Accuracy on test set: 97.1 %


In [23]:
# Compute performance lift-off vs basic assumption
print("Basic Precision on test set: {} %".format(np.round(y_test.mean()*100,3)))
print("Basic Accuracy on test set: {} %".format(np.round((1-y_test.mean())*100,3)))

Basic Precision on test set: 2.652 %
Basic Accuracy on test set: 97.348 %


In [24]:
# Lift off
perf_lift_off_prec = precision_score(y_test, y_preds) / y_test.mean() - 1
perf_lift_off_acc = accuracy_score(y_test, y_preds) / (1-y_test.mean()) -1

print("Performance lift-off in terms of precision: {} %".format(np.round(perf_lift_off_prec*100,1)))
print("Performance lift-off in terms of accuracy: {} %".format(np.round(perf_lift_off_acc*100,1)))

Performance lift-off in terms of precision: 36.4 %
Performance lift-off in terms of accuracy: -0.3 %


#### 3.8 Test model on other years

This is also interesting to see if the model also works for other years, as we would like to use it to predict on a year for which we don't yet have results.

In [25]:
# Test on 2016 financial information (and 2018 so called radiations)
df_2017 = load_transform("cleaned_data_2017.csv")

In [27]:
# Extract X and y (across the entire set, the entire set is now a test set)
X_2017 = df_2017.loc[:, col_keep]
y_2017 = df_2017['rad']

In [28]:
# check sizes
print(X_2017.shape)
print(y_2017.shape)

(871669, 7)
(871669,)


In [29]:
# Predict using model
y_preds_2017 = model_final_prec.predict(X_2017)

  Xt = transform.transform(Xt)


In [30]:
# Test on entire set
print("Precision on 2016 data is then: {} %".format(np.round(precision_score(y_2017, y_preds_2017)*100,3)))
print("Accuracy on 2016 data is then: {} %".format(np.round(accuracy_score(y_2017, y_preds_2017)*100,3)))

Precision on 2016 data is then: 2.808 %
Accuracy on 2016 data is then: 97.275 %


In [31]:
# Compute performance lift-off vs basic assumption
print("Basic Precision on test set: {} %".format(np.round(y_2017.mean()*100,3)))
print("Basic Accuracy on test set: {} %".format(np.round((1-y_2017.mean())*100,3)))

Basic Precision on test set: 2.363 %
Basic Accuracy on test set: 97.637 %


In [32]:
# Lift off
perf_lift_off_prec_2017 = precision_score(y_2017, y_preds_2017) / y_2017.mean() - 1
perf_lift_off_acc_2017 = accuracy_score(y_2017, y_preds_2017) / (1-y_2017.mean()) - 1

print("Performance lift-off in terms of precision: {} %".format(np.round(perf_lift_off_prec_2017*100,1)))
print("Performance lift-off in terms of accuracy: {} %".format(np.round(perf_lift_off_acc_2017*100,1)))

Performance lift-off in terms of precision: 18.8 %
Performance lift-off in terms of accuracy: -0.4 %


#### 3.9 Save model

for potential reuse without having to re-run it all

In [None]:
# Version 1
filename = 'finalized_model_2016-18.sav'
pickle.dump(model_final_prec, open(filename, 'wb'))

In [None]:
# Version 2
pkl_filename = "pickle_model_2016-18.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(model_final_prec, file)
    
'''
# Load from file
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)
    
'''