In [78]:
# Libraries
import pandas as pd, numpy as np
from utils import *
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Credit Risk Analysis - Machine Learning models <br>

The objective of this analysis is to understand which factors most affect the probability that a given company has a high credit risk, initially using only the 2019 values of the explanatory variables to predict the 2020 credit risk (true values vs. forecasts). <br>
Then new 'delayed' (lagged) explanatory variables can be added, e.g. leverage for 2018, 2017, etc. <br>
Having understood some general behavior (through the previous notebook), this notebook aims to build different models that predict each company's credit risk level (low/high) in a given year, and to compare the presented models (in order to both interpret them and assess their accuracy).

**Financial Data Science (UniPV) - Prof. Paolo Giudici** <br>
*Computer Engineering (Data Science) - A.Y. 2022/23* <br>
*Francesco Amato, 507767*

## Import the dataset & data manipulations

In [53]:
# Load the dataset (gzip-compressed CSV)
companies_df = pd.read_csv('../dataset/cleaned-credit-risk.tar.gz', compression='gzip')

# For every 'MScore.<year>' column, add a companion '<column>.int' column holding
# the value returned by MScore_to_int (helper imported from utils) for each entry.
mscore_columns = [name for name in companies_df.columns if name.startswith('MScore.')]
for score_col in mscore_columns:
    companies_df[score_col + '.int'] = companies_df[score_col].apply(MScore_to_int)

companies_df

Unnamed: 0,Company name,Turnover.2020,Turnover.2019,Turnover.2018,Turnover.2017,Turnover.2016,Turnover.2015,EBIT.2020,EBIT.2019,EBIT.2018,...,TAsset.2018,TAsset.2017,TAsset.2016,TAsset.2015,MScore.2020.int,MScore.2019.int,MScore.2018.int,MScore.2017.int,MScore.2016.int,MScore.2015.int
0,LENDLEASE S.R.L.,29458,16716,9612,8097,7941.0,5600.0,-1556.0,-4540.0,623.0,...,15455,15992,13597.0,11659.0,1,1,1,1,0,1
1,PRICEWATERHOUSECOOPERS BUSINESS SERVICES SRL (...,16731,16403,16843,12241,9252.0,9515.0,1838.0,841.0,2738.0,...,16468,10773,6697.0,8933.0,0,0,0,1,1,1
2,EVISO S.P.A.,48568,43039,34302,25791,19760.0,6941.0,1661.0,1464.0,976.0,...,7371,5432,4170.0,2862.0,0,0,0,0,0,0
3,CASA SERVICE MACHINE,47999,43484,43043,41682,51267.0,52584.0,416.0,255.0,-855.0,...,25729,21632,25403.0,24941.0,0,0,1,0,0,0
4,PANFERTIL SPA,45948,47336,45626,48222,57074.0,62263.0,44.0,713.0,-672.0,...,36205,38423,41847.0,41323.0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121003,ASTOR VILLAGE S.R.L.,3161,4635,4742,4499,4277.0,3650.0,985.0,1818.0,1790.0,...,14438,13054,12243.0,11695.0,0,0,0,0,0,0
121004,ODONE & SLOA S.R.L.,3161,2562,2559,2334,3692.0,2537.0,60.0,101.0,27.0,...,2351,2521,2797.0,3152.0,0,1,1,1,1,1
121005,GARRIDO MURO SOCIEDAD LIMITADA,3161,3146,2989,3101,2746.0,3154.0,260.0,13.0,48.0,...,1692,1843,1773.0,1699.0,0,0,0,0,0,0
121006,CENTRO INGROSSO JOLLY S.R.L.,3161,2519,2290,2244,1761.0,1821.0,74.0,48.0,60.0,...,2604,2474,1546.0,1222.0,0,0,0,0,0,1


### Balancing the companies (use the same number of high and low credit risk companies)

In [54]:
# Fix the unbalanced case by undersampling the larger class so that both
# classes (as defined by the 2019 score, 'MScore.2019.int') have the same size.
# NOTE(review): the models below predict 'MScore.2020.int'; the recorded test-set
# supports (3167 vs 1697) suggest the 2020 target is still unbalanced after this
# step — confirm that balancing on the 2019 score is what is intended.
high_risk_df = companies_df[companies_df['MScore.2019.int'] == 1]
low_risk_df = companies_df[companies_df['MScore.2019.int'] == 0]

# Downsample whichever class is larger. Sampling only the low-risk rows would raise
# a ValueError if they were the minority (cannot draw more rows than are available
# without replacement).
n_per_class = min(len(high_risk_df), len(low_risk_df))
if len(low_risk_df) > n_per_class:
    low_risk_df = low_risk_df.sample(n=n_per_class, random_state=0)
if len(high_risk_df) > n_per_class:
    high_risk_df = high_risk_df.sample(n=n_per_class, random_state=0)

# Concatenate and restore the original row order; no in-place mutation, so the
# cell can be re-run safely.
restricted_df = pd.concat([low_risk_df, high_risk_df]).sort_index()

## Consider 2019 company data to predict 2020 credit risk level

### Models

#### Easiest model: credit risk score of 2019 to predict the one of 2020

In [55]:
# Predictor: the 2019 credit risk score; target: the 2020 score (both kept as one-column frames)
X = restricted_df.loc[:, ['MScore.2019.int']]
y = restricted_df.loc[:, ['MScore.2020.int']]

# Hold out 20% of the rows for testing; the seeded shuffle keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

##### Logistic Regression - Decision Tree - Random Forest - Support Vector Classifier

In [56]:
# Logistic regression (liblinear solver, fixed seed).
# y_train is a one-column DataFrame, so it is flattened to 1-D before fitting.
lr = LogisticRegression(random_state=0, solver='liblinear')
lr.fit(X_train, y_train.to_numpy().ravel())
print_performances('Logistic Regression', lr, X_train, y_train, X_test, y_test)

Logistic Regression
 - Train accuracy: 77.8%
 - Test accuracy: 77.7%

Test               precision    recall  f1-score   support

           0       0.92      0.72      0.81      3167
           1       0.63      0.89      0.74      1697

    accuracy                           0.78      4864
   macro avg       0.78      0.80      0.77      4864
weighted avg       0.82      0.78      0.78      4864



In [57]:
# Decision tree with a fixed seed: without random_state the fitted tree can differ
# between runs, unlike the other models in this notebook, which are all seeded with 0.
dtc = DecisionTreeClassifier(random_state=0)
# y_train is a one-column DataFrame; np.ravel flattens it to the 1-D shape fit() expects
dtc.fit(X_train, np.ravel(y_train))
print_performances('Decision Tree Classifier', dtc, X_train, y_train, X_test, y_test)

Decision Tree Classifier
 - Train accuracy: 77.8%
 - Test accuracy: 77.7%

Test               precision    recall  f1-score   support

           0       0.92      0.72      0.81      3167
           1       0.63      0.89      0.74      1697

    accuracy                           0.78      4864
   macro avg       0.78      0.80      0.77      4864
weighted avg       0.82      0.78      0.78      4864



In [58]:
# Random forest of 100 trees with a fixed seed: bootstrap and feature sampling are
# random, so without random_state the results are not reproducible across runs.
rfc = RandomForestClassifier(n_estimators=100, random_state=0)
# y_train is a one-column DataFrame; np.ravel flattens it to the 1-D shape fit() expects
rfc.fit(X_train, np.ravel(y_train))
print_performances('Random Forest Classifier', rfc, X_train, y_train, X_test, y_test)

Random Forest Classifier
 - Train accuracy: 77.8%
 - Test accuracy: 77.7%

Test               precision    recall  f1-score   support

           0       0.92      0.72      0.81      3167
           1       0.63      0.89      0.74      1697

    accuracy                           0.78      4864
   macro avg       0.78      0.80      0.77      4864
weighted avg       0.82      0.78      0.78      4864



In [59]:
# Support vector classifier with an RBF kernel (fixed seed).
# y_train is a one-column DataFrame, so it is flattened to 1-D before fitting.
svc = SVC(random_state=0, kernel='rbf')
svc.fit(X_train, y_train.to_numpy().ravel())
print_performances('Support Vector Classifier', svc, X_train, y_train, X_test, y_test)

Support Vector Classifier
 - Train accuracy: 77.8%
 - Test accuracy: 77.7%

Test               precision    recall  f1-score   support

           0       0.92      0.72      0.81      3167
           1       0.63      0.89      0.74      1697

    accuracy                           0.78      4864
   macro avg       0.78      0.80      0.77      4864
weighted avg       0.82      0.78      0.78      4864



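The overall performance can be considered good, but what happens if we add other features?<br>
Furthermore, switching between models does not make a significant difference (at least in the performance metrics): with a single binary predictor, every classifier can only learn a mapping from the two possible input values to a class, so all four models end up making the same predictions.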

#### Add another feature: credit risk score + turnover of 2019, to predict credit score of 2020

In [91]:
# Predictors: the 2019 credit risk score and 2019 turnover; target: the 2020 score
X = restricted_df.loc[:, ['MScore.2019.int', 'Turnover.2019']]
y = restricted_df.loc[:, ['MScore.2020.int']]

# Hold out 20% of the rows for testing; the seeded shuffle keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

##### Logistic Regression

In [92]:
# Logistic regression (liblinear solver, fixed seed).
# y_train is a one-column DataFrame, so it is flattened to 1-D before fitting.
lr = LogisticRegression(random_state=0, solver='liblinear')
lr.fit(X_train, y_train.to_numpy().ravel())
print_performances('Logistic Regression', lr, X_train, y_train, X_test, y_test)

Logistic Regression
 - Train accuracy: 77.7%
 - Test accuracy: 77.7%

Test               precision    recall  f1-score   support

           0       0.92      0.72      0.81      3167
           1       0.63      0.89      0.73      1697

    accuracy                           0.78      4864
   macro avg       0.77      0.80      0.77      4864
weighted avg       0.82      0.78      0.78      4864



Adding the Turnover does not really make a difference in the performance metrics. The same happens when using different models.

#### Add another feature: credit risk score + EBIT of 2019, to predict the one of 2020

In [103]:
# Predictors: the 2019 credit risk score and 2019 EBIT; target: the 2020 score
X = restricted_df.loc[:, ['MScore.2019.int', 'EBIT.2019']]
y = restricted_df.loc[:, ['MScore.2020.int']]

# Hold out 20% of the rows for testing; the seeded shuffle keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

##### Logistic Regression

In [104]:
# Logistic regression (liblinear solver, fixed seed).
# y_train is a one-column DataFrame, so it is flattened to 1-D before fitting.
lr = LogisticRegression(random_state=0, solver='liblinear')
lr.fit(X_train, y_train.to_numpy().ravel())
print_performances('Logistic Regression', lr, X_train, y_train, X_test, y_test)

Logistic Regression
 - Train accuracy: 77.6%
 - Test accuracy: 77.4%

Test               precision    recall  f1-score   support

           0       0.92      0.72      0.81      3167
           1       0.63      0.88      0.73      1697

    accuracy                           0.77      4864
   macro avg       0.77      0.80      0.77      4864
weighted avg       0.82      0.77      0.78      4864



#### Add another feature: credit risk score + PLTax of 2019, to predict the one of 2020

In [107]:
# Predictors: the 2019 credit risk score and the 2019 'PLTax' column; target: the 2020 score
X = restricted_df.loc[:, ['MScore.2019.int', 'PLTax.2019']]
y = restricted_df.loc[:, ['MScore.2020.int']]

# Hold out 20% of the rows for testing; the seeded shuffle keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

In [108]:
# Logistic regression (liblinear solver, fixed seed).
# y_train is a one-column DataFrame, so it is flattened to 1-D before fitting.
lr = LogisticRegression(random_state=0, solver='liblinear')
lr.fit(X_train, y_train.to_numpy().ravel())
print_performances('Logistic Regression', lr, X_train, y_train, X_test, y_test)

Logistic Regression
 - Train accuracy: 77.7%
 - Test accuracy: 77.6%

Test               precision    recall  f1-score   support

           0       0.92      0.72      0.81      3167
           1       0.63      0.88      0.73      1697

    accuracy                           0.78      4864
   macro avg       0.77      0.80      0.77      4864
weighted avg       0.82      0.78      0.78      4864



## Also consider company data older than 2019 to predict the 2020 credit risk level

### Models

#### Easiest model: credit risk score of past years to predict the one of 2020

In [65]:
# Predictors: the credit risk scores of 2015 through 2019; target: the 2020 score
X = restricted_df.loc[:, [
    'MScore.2015.int',
    'MScore.2016.int',
    'MScore.2017.int',
    'MScore.2018.int',
    'MScore.2019.int',
]]
y = restricted_df.loc[:, ['MScore.2020.int']]

# Hold out 20% of the rows for testing; the seeded shuffle keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

##### Logistic Regression - Decision Tree - Random Forest - Support Vector Classifier

In [66]:
# Logistic regression (liblinear solver, fixed seed).
# y_train is a one-column DataFrame, so it is flattened to 1-D before fitting.
lr = LogisticRegression(random_state=0, solver='liblinear')
lr.fit(X_train, y_train.to_numpy().ravel())
print_performances('Logistic Regression', lr, X_train, y_train, X_test, y_test)

Logistic Regression
 - Train accuracy: 78.9%
 - Test accuracy: 79.6%

Test               precision    recall  f1-score   support

           0       0.86      0.83      0.84      3167
           1       0.70      0.74      0.72      1697

    accuracy                           0.80      4864
   macro avg       0.78      0.78      0.78      4864
weighted avg       0.80      0.80      0.80      4864



In [67]:
# Decision tree with a fixed seed: without random_state the fitted tree can differ
# between runs, unlike the other models in this notebook, which are all seeded with 0.
dtc = DecisionTreeClassifier(random_state=0)
# y_train is a one-column DataFrame; np.ravel flattens it to the 1-D shape fit() expects
dtc.fit(X_train, np.ravel(y_train))
print_performances('Decision Tree Classifier', dtc, X_train, y_train, X_test, y_test)

Decision Tree Classifier
 - Train accuracy: 79.0%
 - Test accuracy: 79.6%

Test               precision    recall  f1-score   support

           0       0.84      0.85      0.84      3167
           1       0.71      0.70      0.71      1697

    accuracy                           0.80      4864
   macro avg       0.78      0.77      0.77      4864
weighted avg       0.80      0.80      0.80      4864



In [68]:
# Random forest of 100 trees with a fixed seed: bootstrap and feature sampling are
# random, so without random_state the results are not reproducible across runs.
rfc = RandomForestClassifier(n_estimators=100, random_state=0)
# y_train is a one-column DataFrame; np.ravel flattens it to the 1-D shape fit() expects
rfc.fit(X_train, np.ravel(y_train))
print_performances('Random Forest Classifier', rfc, X_train, y_train, X_test, y_test)

Random Forest Classifier
 - Train accuracy: 79.0%
 - Test accuracy: 79.6%

Test               precision    recall  f1-score   support

           0       0.84      0.85      0.84      3167
           1       0.71      0.70      0.71      1697

    accuracy                           0.80      4864
   macro avg       0.78      0.77      0.77      4864
weighted avg       0.80      0.80      0.80      4864



In [69]:
# Support vector classifier with an RBF kernel (fixed seed).
# y_train is a one-column DataFrame, so it is flattened to 1-D before fitting.
svc = SVC(random_state=0, kernel='rbf')
svc.fit(X_train, y_train.to_numpy().ravel())
print_performances('Support Vector Classifier', svc, X_train, y_train, X_test, y_test)

Support Vector Classifier
 - Train accuracy: 79.0%
 - Test accuracy: 79.5%

Test               precision    recall  f1-score   support

           0       0.85      0.84      0.84      3167
           1       0.70      0.72      0.71      1697

    accuracy                           0.80      4864
   macro avg       0.77      0.78      0.78      4864
weighted avg       0.80      0.80      0.80      4864

