In [166]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, RocCurveDisplay, roc_auc_score


In [170]:
import os
# NOTE(review): machine-specific absolute path — the notebook only runs unchanged on
# a machine where this folder exists.
path = r'C:\Users\tomas\OneDrive - London School of Economics\Tomás Gil Mata\Faculdade\LSE\3rd Year\ST310\Project'
# Make the project folder the current working directory.
os.chdir(path)

In [171]:
# Read the input CSV from the project folder set in the previous cell (`path`),
# so the absolute location is written down in one place only.
data = pd.read_csv(os.path.join(path, "compustat_df_1980_deduplicated_extended_winsor.csv"))

Now I remove every default observation whose previous observation (for the same firm) is also a default.

In [172]:
# Previous row's TL_flag within each firm (gvkey); NaN on a firm's first row.
# Kept as a standalone Series so the raw `data` frame is not modified
# (the original aliased `data1 = data` and added a helper column to it).
# Assumes rows are already ordered chronologically within each gvkey — TODO confirm.
tl_flag_prev = data.groupby("gvkey")["TL_flag"].shift(1)

# A "repeated default" is a default row whose preceding row for the same firm
# is also a default; those rows are removed.
repeated_default = (data["TL_flag"] == 1) & (tl_flag_prev == 1)
data1 = data.loc[~repeated_default].copy()

data1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 486799 entries, 0 to 487605
Data columns (total 61 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   gvkey                         486799 non-null  int64  
 1   datadate                      486799 non-null  object 
 2   fyear                         486799 non-null  int64  
 3   conm                          486799 non-null  object 
 4   tic                           486596 non-null  object 
 5   naicsh                        358347 non-null  float64
 6   revt                          396315 non-null  float64
 7   oibdp                         350713 non-null  float64
 8   ni                            355511 non-null  float64
 9   xint                          376906 non-null  float64
 10  dp                            384709 non-null  float64
 11  at                            398858 non-null  float64
 12  act                           333878 non-null  fl

Now we set Y (the variable we want to predict) and X (the features).

But since we want to predict forward in time — Y at time t using only features available up to t-1 — the features have to be lagged by one period.

In [173]:
def x_and_y(data, features_to_drop):
    """Build the target and the one-row-lagged feature matrix.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns "TL_flag" and "gvkey" plus the feature columns.
    features_to_drop : list of str
        Columns removed from `data` before lagging; everything left is a feature.

    Returns
    -------
    (y, X) : tuple
        `y` is the "TL_flag" column and `X` the lagged features, both restricted
        to the rows where every lagged feature and the target are non-missing.

    Notes
    -----
    The lag is "previous row within the same gvkey", so it presumably relies on
    the rows being in chronological order per firm — TODO confirm upstream sort.
    """
    target = data["TL_flag"]

    # Shift the remaining columns down by one row inside each gvkey group, so each
    # row carries the feature values of that firm's preceding row.
    lagged = data.drop(columns=features_to_drop).groupby(data["gvkey"]).shift(1)

    # Complete-case filter: discard a row if any lagged feature or the target is missing
    # (a firm's first row is always discarded because its lag is all-NaN).
    incomplete = lagged.isna().any(axis=1) | target.isna()
    keep = ~incomplete

    return target.loc[keep], lagged.loc[keep]


For each Y (TL_flag), the corresponding X are the features from the period immediately before Y is observed.

Now I split the data into training set and testing set as well as standardize the features

In [174]:
def split_standardize(data, features_to_drop, test_size=0.15, random_state=2):
    """Split into train/test sets and standardize the features.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame passed straight to `x_and_y`.
    features_to_drop : list of str
        Columns excluded from the feature matrix (passed to `x_and_y`).
    test_size : float, default 0.15
        Fraction of rows held out for testing.
    random_state : int, default 2
        Seed for the shuffle, so the split is reproducible.

    Returns
    -------
    (y_train, y_test, X_train_sc, X_test_sc) : tuple
        Targets as returned by `train_test_split`, and the scaled feature arrays.

    Notes
    -----
    NOTE(review): the split is random and stratified on y only; rows of the same
    gvkey can land on both sides of the split — confirm this is acceptable
    for the forecasting setup.
    """
    # Build target and lagged features once (the original called x_and_y twice,
    # repeating the whole groupby/shift just to pick one element each time).
    y, X = x_and_y(data, features_to_drop)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        stratify=y,  # keep the class proportions equal in both sets
        random_state=random_state)

    # Fit the scaler on the training rows only, then apply the same
    # transformation to the test rows.
    scaler = StandardScaler()
    X_train_sc = scaler.fit_transform(X_train)
    X_test_sc = scaler.transform(X_test)

    return y_train, y_test, X_train_sc, X_test_sc


LDA

In [180]:
def lda(data, features_to_drop):
    """Fit Linear Discriminant Analysis and print its test-set metrics.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame passed to `split_standardize`.
    features_to_drop : list of str
        Columns excluded from the feature matrix.

    Returns
    -------
    None
        The error rate, confusion matrix, classification report and AUC are
        printed, not returned.
    """
    # One split for all four pieces. The original called split_standardize once
    # per element, redoing the lagging, splitting and scaling four times.
    y_train, y_test, X_train_sc, X_test_sc = split_standardize(data, features_to_drop)

    # Named `model` so the local variable does not shadow this function's own name.
    model = LinearDiscriminantAnalysis()
    model.fit(X_train_sc, y_train)

    # Class predictions on the held-out rows
    y_pred_lda = model.predict(X_test_sc)

    # Evaluate the model
    print("Test error rate LDA:")
    print(f"{(1-accuracy_score(y_test, y_pred_lda)) * 100:.2f}%")
    print("\nConfusion Matrix LDA:")
    print(confusion_matrix(y_test, y_pred_lda))
    print("\nClassification report LDA:")
    print(classification_report(y_test, y_pred_lda, digits=4))

    # AUC is computed from the predicted probability of the second class (column 1)
    y_prob_lda = model.predict_proba(X_test_sc)[:, 1]
    auc_lda = roc_auc_score(y_test, y_prob_lda)
    print("\nAUC LDA:")
    print(f"{auc_lda * 100:.2f}%")

    return

QDA

In [181]:
def qda(data, features_to_drop):
    """Fit Quadratic Discriminant Analysis and print its test-set metrics.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame passed to `split_standardize`.
    features_to_drop : list of str
        Columns excluded from the feature matrix.

    Returns
    -------
    None
        The error rate, confusion matrix, classification report and AUC are
        printed, not returned.
    """
    # One split for all four pieces. The original called split_standardize once
    # per element, redoing the lagging, splitting and scaling four times.
    y_train, y_test, X_train_sc, X_test_sc = split_standardize(data, features_to_drop)

    # Named `model` so the local variable does not shadow this function's own name.
    model = QuadraticDiscriminantAnalysis()
    model.fit(X_train_sc, y_train)

    # Class predictions on the held-out rows
    y_pred_qda = model.predict(X_test_sc)

    # Evaluate the model
    print("Test error rate QDA:")
    print(f"{(1-accuracy_score(y_test, y_pred_qda)) * 100:.2f}%")
    print("\nConfusion Matrix QDA:")
    print(confusion_matrix(y_test, y_pred_qda))
    print("\nClassification report QDA:")
    print(classification_report(y_test, y_pred_qda, digits=4))

    # AUC is computed from the predicted probability of the second class (column 1)
    y_prob_qda = model.predict_proba(X_test_sc)[:, 1]
    auc_qda = roc_auc_score(y_test, y_prob_qda)
    print("\nAUC QDA:")
    print(f"{auc_qda * 100:.2f}%")

    return

In [182]:
# From here on `data` refers to the filtered frame (repeated defaults removed),
# not the frame originally read from the CSV.
data = data1

Now I experiment with various combinations of features.

0

In [183]:
# Experiment 0 (baseline): drop only the target "TL_flag" plus gvkey, datadate, fyear,
# conm, tic and at_fn; every other column is used as a feature.
# NOTE(review): "naicsh" is not in this list (nor in the later ones), so it is lagged and
# used as a numeric feature, and rows where its lag is missing are discarded by
# x_and_y — confirm this is intended.
features_to_drop = ["TL_flag","gvkey","datadate","fyear","conm","tic", "at_fn"]

In [184]:
lda(data, features_to_drop)

Test error rate LDA:
1.07%

Confusion Matrix LDA:
[[17460   130]
 [   59     4]]

Classification report LDA:
              precision    recall  f1-score   support

           0     0.9966    0.9926    0.9946     17590
           1     0.0299    0.0635    0.0406        63

    accuracy                         0.9893     17653
   macro avg     0.5132    0.5281    0.5176     17653
weighted avg     0.9932    0.9893    0.9912     17653


AUC LDA:
88.61%


In [185]:
qda(data, features_to_drop)

Test error rate QDA:
43.45%

Confusion Matrix QDA:
[[9932 7658]
 [  12   51]]

Classification report QDA:
              precision    recall  f1-score   support

           0     0.9988    0.5646    0.7214     17590
           1     0.0066    0.8095    0.0131        63

    accuracy                         0.5655     17653
   macro avg     0.5027    0.6871    0.3673     17653
weighted avg     0.9953    0.5655    0.7189     17653


AUC QDA:
78.06%


1

In [186]:
# Experiment 1 - traditional ratio-based discriminant analysis: drops the raw levels
# and all 1-, 2- and 5-year percentage changes, leaving the ratio-type columns as features.
features_to_drop = ["TL_flag","gvkey","datadate","fyear","conm","tic", "at_fn",
    # raw levels
    "revt","oibdp","ni","xint","dp","at","act","che","lt","lct",
    "dlc","dltt","ceq","csho","dv","total_debt",
    
    # percent changes
    "revt_1_year_pct_change","at_1_year_pct_change","ni_1_year_pct_change",
    "oibdp_1_year_pct_change","ceq_1_year_pct_change","total_debt_1_year_pct_change",
    "revt_2_year_pct_change","at_2_year_pct_change","ni_2_year_pct_change",
    "oibdp_2_year_pct_change","ceq_2_year_pct_change","total_debt_2_year_pct_change",
    "revt_5_year_pct_change","at_5_year_pct_change","ni_5_year_pct_change",
    "oibdp_5_year_pct_change","ceq_5_year_pct_change","total_debt_5_year_pct_change"]


In [187]:
lda(data, features_to_drop)

Test error rate LDA:
0.55%

Confusion Matrix LDA:
[[29506    71]
 [   93     0]]

Classification report LDA:
              precision    recall  f1-score   support

           0     0.9969    0.9976    0.9972     29577
           1     0.0000    0.0000    0.0000        93

    accuracy                         0.9945     29670
   macro avg     0.4984    0.4988    0.4986     29670
weighted avg     0.9937    0.9945    0.9941     29670


AUC LDA:
79.29%


In [188]:
qda(data, features_to_drop)

Test error rate QDA:
11.64%

Confusion Matrix QDA:
[[26164  3413]
 [   41    52]]

Classification report QDA:
              precision    recall  f1-score   support

           0     0.9984    0.8846    0.9381     29577
           1     0.0150    0.5591    0.0292        93

    accuracy                         0.8836     29670
   macro avg     0.5067    0.7219    0.4837     29670
weighted avg     0.9954    0.8836    0.9352     29670


AUC QDA:
79.53%


2

In [189]:
# Experiment 2 - short-term financial deterioration signals: same as experiment 1,
# except that the 1-year percentage changes are kept (only the 2- and 5-year changes
# are dropped, together with the raw levels).
features_to_drop =["TL_flag","gvkey","datadate","fyear","conm","tic", "at_fn",
    # raw levels
    "revt","oibdp","ni","xint","dp","at","act","che","lt","lct",
    "dlc","dltt","ceq","csho","dv","total_debt",
    
    # 2Y + 5Y changes
    "revt_2_year_pct_change","at_2_year_pct_change","ni_2_year_pct_change",
    "oibdp_2_year_pct_change","ceq_2_year_pct_change","total_debt_2_year_pct_change",
    "revt_5_year_pct_change","at_5_year_pct_change","ni_5_year_pct_change",
    "oibdp_5_year_pct_change","ceq_5_year_pct_change","total_debt_5_year_pct_change"]

In [190]:
lda(data, features_to_drop)

Test error rate LDA:
0.67%

Confusion Matrix LDA:
[[25675    89]
 [   84     4]]

Classification report LDA:
              precision    recall  f1-score   support

           0     0.9967    0.9965    0.9966     25764
           1     0.0430    0.0455    0.0442        88

    accuracy                         0.9933     25852
   macro avg     0.5199    0.5210    0.5204     25852
weighted avg     0.9935    0.9933    0.9934     25852


AUC LDA:
81.52%


In [191]:
qda(data, features_to_drop)

Test error rate QDA:
17.40%

Confusion Matrix QDA:
[[21294  4470]
 [   29    59]]

Classification report QDA:
              precision    recall  f1-score   support

           0     0.9986    0.8265    0.9045     25764
           1     0.0130    0.6705    0.0256        88

    accuracy                         0.8260     25852
   macro avg     0.5058    0.7485    0.4650     25852
weighted avg     0.9953    0.8260    0.9015     25852


AUC QDA:
76.99%


3

In [192]:
# Experiment 3 - hypothesis: liquidity and leverage dominate default risk.
# In addition to the growth (percentage-change) columns and the raw levels,
# the profitability/efficiency (performance) measures are dropped as well.
features_to_drop =["TL_flag","gvkey","datadate","fyear","conm","tic", "at_fn",
    # profitability / efficiency
    "gross_margin","net_profit_margin","roa","roe","asset_turnover",
    
    # growth
    "revt_1_year_pct_change","at_1_year_pct_change","ni_1_year_pct_change",
    "oibdp_1_year_pct_change","ceq_1_year_pct_change","total_debt_1_year_pct_change",
    "revt_2_year_pct_change","at_2_year_pct_change","ni_2_year_pct_change",
    "oibdp_2_year_pct_change","ceq_2_year_pct_change","total_debt_2_year_pct_change",
    "revt_5_year_pct_change","at_5_year_pct_change","ni_5_year_pct_change",
    "oibdp_5_year_pct_change","ceq_5_year_pct_change","total_debt_5_year_pct_change",
    
    # raw levels
    "revt","oibdp","ni","xint","dp","at","act","che","lt","lct",
    "dlc","dltt","ceq","csho","dv","total_debt"]

In [193]:
lda(data, features_to_drop)

Test error rate LDA:
0.50%

Confusion Matrix LDA:
[[30837    60]
 [   94     0]]

Classification report LDA:
              precision    recall  f1-score   support

           0     0.9970    0.9981    0.9975     30897
           1     0.0000    0.0000    0.0000        94

    accuracy                         0.9950     30991
   macro avg     0.4985    0.4990    0.4988     30991
weighted avg     0.9939    0.9950    0.9945     30991


AUC LDA:
77.81%


In [194]:
qda(data, features_to_drop)

Test error rate QDA:
7.70%

Confusion Matrix QDA:
[[28569  2328]
 [   59    35]]

Classification report QDA:
              precision    recall  f1-score   support

           0     0.9979    0.9247    0.9599     30897
           1     0.0148    0.3723    0.0285        94

    accuracy                         0.9230     30991
   macro avg     0.5064    0.6485    0.4942     30991
weighted avg     0.9950    0.9230    0.9571     30991


AUC QDA:
79.40%


4

In [195]:
# Experiment 4 - pure trend-based early-warning system: drops all ratios and the
# raw levels, so the percentage-change columns are what remains as features.
features_to_drop =["TL_flag","gvkey","datadate","fyear","conm","tic", "at_fn",
    # all ratios
    "gross_margin","net_profit_margin","roa","roe","asset_turnover",
    "cash_to_assets","fixed_asset_intensity","current_ratio","quick_ratio",
    "debt_to_assets","debt_to_equity","liabilities_to_assets",
    "interest_coverage","long_term_debt_ratio",
    "book_value_per_share","earnings_per_share",
    "dividend_payout_ratio","dividend_yield","retention_ratio",
    
    # raw levels
    "revt","oibdp","ni","xint","dp","at","act","che","lt","lct",
    "dlc","dltt","ceq","csho","dv","total_debt"]

In [196]:
lda(data, features_to_drop)

Test error rate LDA:
1.08%

Confusion Matrix LDA:
[[19729   150]
 [   66     2]]

Classification report LDA:
              precision    recall  f1-score   support

           0     0.9967    0.9925    0.9946     19879
           1     0.0132    0.0294    0.0182        68

    accuracy                         0.9892     19947
   macro avg     0.5049    0.5109    0.5064     19947
weighted avg     0.9933    0.9892    0.9912     19947


AUC LDA:
84.87%


In [197]:
qda(data, features_to_drop)

Test error rate QDA:
6.24%

Confusion Matrix QDA:
[[18685  1194]
 [   50    18]]

Classification report QDA:
              precision    recall  f1-score   support

           0     0.9973    0.9399    0.9678     19879
           1     0.0149    0.2647    0.0281        68

    accuracy                         0.9376     19947
   macro avg     0.5061    0.6023    0.4980     19947
weighted avg     0.9940    0.9376    0.9646     19947


AUC QDA:
69.87%


5

In [198]:
# Experiment 5 - market-independent accounting fundamentals: drops the equity /
# payout / per-share columns (the ones treated here as related to the financial
# market) together with the raw levels.
features_to_drop =["TL_flag","gvkey","datadate","fyear","conm","tic", "at_fn",
    # equity / payout / per-share
    "book_value_per_share",
    "earnings_per_share",
    "dividend_payout_ratio",
    "dividend_yield",
    "retention_ratio",
    "roe",
    "debt_to_equity",
    
    # raw levels
    "revt","oibdp","ni","xint","dp","at","act","che","lt","lct",
    "dlc","dltt","ceq","csho","dv","total_debt"]

In [199]:
lda(data, features_to_drop)

Test error rate LDA:
1.26%

Confusion Matrix LDA:
[[17956   167]
 [   62     4]]

Classification report LDA:
              precision    recall  f1-score   support

           0     0.9966    0.9908    0.9937     18123
           1     0.0234    0.0606    0.0338        66

    accuracy                         0.9874     18189
   macro avg     0.5100    0.5257    0.5137     18189
weighted avg     0.9930    0.9874    0.9902     18189


AUC LDA:
82.74%


In [200]:
qda(data, features_to_drop)

Test error rate QDA:
9.57%

Confusion Matrix QDA:
[[16424  1699]
 [   41    25]]

Classification report QDA:
              precision    recall  f1-score   support

           0     0.9975    0.9063    0.9497     18123
           1     0.0145    0.3788    0.0279        66

    accuracy                         0.9043     18189
   macro avg     0.5060    0.6425    0.4888     18189
weighted avg     0.9939    0.9043    0.9463     18189


AUC QDA:
78.96%
