출처: https://www.kaggle.com/code/jacopoferretti/company-bankruptcy-classif-w-feature-selection

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read while streaming each download.
CHUNK_SIZE = 40960
# Comma-separated '<directory>:<percent-encoded download URL>' entries; the download loop
# below splits every entry on ':' and URL-decodes the second part.
# NOTE(review): the encoded query carries X-Goog-Date=20240930T035042Z and X-Goog-Expires=259200,
# so this is presumably a time-limited signed URL -- regenerate it from Kaggle if the download fails.
DATA_SOURCE_MAPPING = 'company-bankruptcy-prediction:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F1111894%2F1938459%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240930%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240930T035042Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D166c5752a58ec4eb085b430696ad65c45e2d3514870db692c4240b6082877aa63677945830b0f2e129d3f10701f7e909e3953e591732c8e2c0bdb79c97e485877cb85f5d1b6e6c16209156b1ad5bfdac3b2bec935517741eb92698f8caedaeabd88231162e73b27000eca54abfce653e341456bb7d42ad724ec766c72a7e7d8aa5efad18890185bade211d5582df25375a89d32b1ca8fd7a95068536409e71cffaba421ba68d73b386e02696bf3652b2cbbb7bed22bba22f75eb6661fc6a0c563054afa766ecd62c72e3b05cb0ecf2fbc709cb1eb1b0df0577ecba59beea72d8c691b98efa0abaf5a80ec38268f9475f4a84f8314ce4e3114dfd6252e11c9c75'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source and unpack it below KAGGLE_INPUT_PATH.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is '<directory>:<percent-encoded URL>'; split on the first colon only
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The Content-Length header may be absent; 0 means "size unknown" and
            # keeps the progress bar empty instead of raising on int(None)
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / total_bytes) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by name
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name so the tarfile module is not shadowed
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Checked before OSError because HTTPError is one of its subclasses
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


---
요약

1. 소개

2. 데이터 읽기
3. 탐색적 데이터 분석
4. 특성 공학(Feature Engineering)
5. 이진 분류

---

# 1. 소개

여기에 사용된 데이터는 1999년부터 2009년까지 대만 경제 저널에서 수집한 것입니다. 회사 파산은 대만 증권거래소의 비즈니스 규정을 기반으로 정의되었습니다.
<br>이 데이터 세트에는 96개의 열이 있습니다. 하나는 목표 변수 *y*(즉, *파산?*)를 포함하고 나머지 95개는 예측 변수입니다. 이 노트의 목적은 (이진) 분류 기법을 통해 파산한 회사를 예측하는 것입니다.

예측 변수의 수가 매우 많기 때문에 이진 분류를 수행하기 전에 *X* 행렬의 특징 수, 즉 목표 변수가 포함된 열 옆의 95개 열 수를 줄여야 합니다. 이를 위해 다음을 활용하겠습니다:
* 상호 상관관계가 과도한 변수를 식별하기 위한 상관 행렬입니다.
* 차원 축소를 수행하기 위한 PCA(주성분 분석).

# 2. 데이터 읽기 및 EDA 요약


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,KFold,cross_val_score,StratifiedKFold
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
from sklearn.model_selection import KFold,cross_val_score,StratifiedKFold,GridSearchCV
from sklearn.preprocessing import MinMaxScaler

from xgboost import XGBClassifier

from warnings import simplefilter
simplefilter("ignore")

In [None]:
# Option to display all columns
pd.set_option('display.max_columns', None)

# Read the bankruptcy dataset from the directory populated by the download cell
data = pd.read_csv('/kaggle/input/company-bankruptcy-prediction/data.csv')

# Preview the first rows
data.head()

In [None]:
# Report the number of rows and columns of the loaded frame
print(f'The dataset has {data.shape[0]} rows and {data.shape[1]} columns.')

**Null values and duplicate rows**

In [None]:
# Names of the columns that contain at least one null value (empty result = no nulls)
data.columns[data.isnull().any()]

In [None]:
# Number of rows that are exact duplicates of an earlier row
data.duplicated().sum()

데이터 세트에 null 값이 없고 중복 행이 없습니다.

**Columns and Their Meaning**

* Y - Bankrupt?: Class label
* X1 - ROA(C) before interest and depreciation before interest: Return On Total Assets(C)
* X2 - ROA(A) before interest and % after tax: Return On Total Assets(A)
* X3 - ROA(B) before interest and depreciation after tax: Return On Total Assets(B)
* X4 - Operating Gross Margin: Gross Profit/Net Sales
* X5 - Realized Sales Gross Margin: Realized Gross Profit/Net Sales
* X6 - Operating Profit Rate: Operating Income/Net Sales
* X7 - Pre-tax net Interest Rate: Pre-Tax Income/Net Sales
* X8 - After-tax net Interest Rate: Net Income/Net Sales
* X9 - Non-industry income and expenditure/revenue: Net Non-operating Income Ratio
* X10 - Continuous interest rate (after tax): Net Income-Exclude Disposal Gain or Loss/Net Sales
* X11 - Operating Expense Rate: Operating Expenses/Net Sales
* X12 - Research and development expense rate: (Research and Development Expenses)/Net Sales
* X13 - Cash flow rate: Cash Flow from Operating/Current Liabilities
* X14 - Interest-bearing debt interest rate: Interest-bearing Debt/Equity
* X15 - Tax rate (A): Effective Tax Rate
* X16 - Net Value Per Share (B): Book Value Per Share(B)
* X17 - Net Value Per Share (A): Book Value Per Share(A)
* X18 - Net Value Per Share (C): Book Value Per Share(C)
* X19 - Persistent EPS in the Last Four Seasons: EPS-Net Income
* X20 - Cash Flow Per Share
* X21 - Revenue Per Share (Yuan ¥): Sales Per Share
* X22 - Operating Profit Per Share (Yuan ¥): Operating Income Per Share
* X23 - Per Share Net profit before tax (Yuan ¥): Pretax Income Per Share
* X24 - Realized Sales Gross Profit Growth Rate
* X25 - Operating Profit Growth Rate: Operating Income Growth
* X26 - After-tax Net Profit Growth Rate: Net Income Growth
* X27 - Regular Net Profit Growth Rate: Continuing Operating Income after Tax Growth
* X28 - Continuous Net Profit Growth Rate: Net Income-Excluding Disposal Gain or Loss Growth
* X29 - Total Asset Growth Rate: Total Asset Growth
* X30 - Net Value Growth Rate: Total Equity Growth
* X31 - Total Asset Return Growth Rate Ratio: Return on Total Asset Growth
* X32 - Cash Reinvestment %: Cash Reinvestment Ratio
* X33 - Current Ratio
* X34 - Quick Ratio: Acid Test
* X35 - Interest Expense Ratio: Interest Expenses/Total Revenue
* X36 - Total debt/Total net worth: Total Liability/Equity Ratio
* X37 - Debt ratio %: Liability/Total Assets
* X38 - Net worth/Assets: Equity/Total Assets
* X39 - Long-term fund suitability ratio (A): (Long-term Liability+Equity)/Fixed Assets
* X40 - Borrowing dependency: Cost of Interest-bearing Debt
* X41 - Contingent liabilities/Net worth: Contingent Liability/Equity
* X42 - Operating profit/Paid-in capital: Operating Income/Capital
* X43 - Net profit before tax/Paid-in capital: Pretax Income/Capital
* X44 - Inventory and accounts receivable/Net value: (Inventory+Accounts Receivables)/Equity
* X45 - Total Asset Turnover
* X46 - Accounts Receivable Turnover
* X47 - Average Collection Days: Days Receivable Outstanding
* X48 - Inventory Turnover Rate (times)
* X49 - Fixed Assets Turnover Frequency
* X50 - Net Worth Turnover Rate (times): Equity Turnover
* X51 - Revenue per person: Sales Per Employee
* X52 - Operating profit per person: Operation Income Per Employee
* X53 - Allocation rate per person: Fixed Assets Per Employee
* X54 - Working Capital to Total Assets
* X55 - Quick Assets/Total Assets
* X56 - Current Assets/Total Assets
* X57 - Cash/Total Assets
* X58 - Quick Assets/Current Liability
* X59 - Cash/Current Liability
* X60 - Current Liability to Assets
* X61 - Operating Funds to Liability
* X62 - Inventory/Working Capital
* X63 - Inventory/Current Liability
* X64 - Current Liabilities/Liability
* X65 - Working Capital/Equity
* X66 - Current Liabilities/Equity
* X67 - Long-term Liability to Current Assets
* X68 - Retained Earnings to Total Assets
* X69 - Total income/Total expense
* X70 - Total expense/Assets
* X71 - Current Asset Turnover Rate: Current Assets to Sales
* X72 - Quick Asset Turnover Rate: Quick Assets to Sales
* X73 - Working capitcal Turnover Rate: Working Capital to Sales
* X74 - Cash Turnover Rate: Cash to Sales
* X75 - Cash Flow to Sales
* X76 - Fixed Assets to Assets
* X77 - Current Liability to Liability
* X78 - Current Liability to Equity
* X79 - Equity to Long-term Liability
* X80 - Cash Flow to Total Assets
* X81 - Cash Flow to Liability
* X82 - CFO to Assets
* X83 - Cash Flow to Equity
* X84 - Current Liability to Current Assets
* X85 - Liability-Assets Flag: 1 if Total Liability exceeds Total Assets, 0 otherwise
* X86 - Net Income to Total Assets
* X87 - Total assets to GNP price
* X88 - No-credit Interval
* X89 - Gross Profit to Sales
* X90 - Net Income to Stockholder's Equity
* X91 - Liability to Equity
* X92 - Degree of Financial Leverage (DFL)
* X93 - Interest Coverage Ratio (Interest expense to EBIT)
* X94 - Net Income Flag: 1 if Net Income is Negative for the last two years, 0 otherwise
* X95 - Equity to Liability

# 3. 탐색적 데이터 분석

### 3.1 데이터의 균형성 또는 불균형성?

목표 변수가 균형을 이루는지 여부를 확인하기 위해 대상 변수를 살펴보겠습니다.

In [None]:
# Row count per target class, turned into a two-column frame for plotting.
# NOTE(review): the pie-chart cell reads columns 'Bankrupt?' and 'count'; the label of the
# first column after reset_index presumably depends on the pandas version -- TODO confirm.
val = data['Bankrupt?'].value_counts().reset_index(name='count')

val

In [None]:
def label_function(val):
    """Format a pie-slice percentage as a whole number followed by '%'."""
    return '{:.0f}%'.format(val)

labels = ['No Bankrupt','Bankrupt']

# Keyword arguments of the pie chart, collected in one place
pie_options = dict(kind='pie',autopct=label_function,textprops={'fontsize':15},
                   labels=labels,colors=['lightsteelblue','orange'])
val.plot(x='Bankrupt?',y='count',**pie_options)

# Drop the default y-axis label and add a heading
plt.ylabel('')
plt.title('Bankrupt Classes',fontsize=20)

plt.tight_layout()

그림에서 볼 수 있듯이 *파산?* 열의 값은 매우 불균형합니다. 따라서 결과의 정확도를 높이기 위해 목표 변수에서 보다 균형 잡힌 클래스(파산 클래스 간의 50대 50 분할 정도)를 얻으려면 데이터를 다시 샘플링해야 할 것입니다.

### 3.2 Outliers

예측 변수의 상자 그림을 통해 이상치가 있는지 여부를 확인합니다.

In [None]:
# Predictor names in dataset order (X1, X2, ...)
features = data.drop('Bankrupt?',axis=1).columns

# Boxplots are drawn four per figure; the number of figures follows from the number of
# features instead of being hard-coded, so a partially filled last figure is handled too.
n_cols = 4
n_figures = -(-len(features) // n_cols)  # ceiling division

for i in range(n_figures):
    fig,axes = plt.subplots(ncols=n_cols,figsize=(12,5))
    for j,ax in enumerate(axes):
        k = i*n_cols + j
        ax.set_facecolor('gold')
        # Panels past the last feature stay empty (background only)
        if k < len(features):
            sns.boxplot(data[features[k]],ax=ax)
            ax.set_title(f'Boxplot of X{k+1}',fontsize=20)

    fig.suptitle("Boxplots of the Outliers",fontsize=24)

    plt.tight_layout()
    fig.set_facecolor('darkgrey')

몇 가지 예외를 제외하고 *X* 변수에는 많은 수의 특이치가 있습니다.

### 3.3 상관관계

In [None]:
f,ax = plt.subplots(figsize=(30, 25))

# Spearman rank correlation between every pair of columns
mat = data.corr(method='spearman')
# Boolean mask covering the diagonal and the upper triangle, which mirror the lower one
mask = np.triu(np.ones_like(mat,dtype=bool))
cmap = sns.diverging_palette(230,20,as_cmap=True)

sns.heatmap(mat,mask=mask,cmap=cmap,vmax=1,center=0,square=True,linewidths=.5,
            cbar_kws={"shrink":.5},ax=ax)

ax.set_title('Correlation Heatmap',fontsize=35)

plt.tight_layout()
# Grey background for both the figure and the plotting area
f.patch.set_facecolor('darkgrey')
ax.set_facecolor('darkgrey')

plt.show()

변수 간에는 몇 가지 강력한 상관관계가 있습니다. 위 히트맵은 스피어만(Spearman) 상관계수로 그렸지만, 아래에서는 `data.corr()`의 기본값인 피어슨(Pearson) 상관계수를 기준으로 *R* > 0.9 또는 *R* < -0.9인 특징 쌍을 살펴봅시다.

In [None]:
# Pearson correlation matrix (the pandas default), computed once up front instead of
# once per feature pair inside the loops.
corr_matrix = data.corr()
count = 1

for pos,col1 in enumerate(features):
    # Pair col1 only with the features that follow it, so each pair is reported once
    for col2 in features[pos+1:]:
        R_corr = corr_matrix.loc[col1,col2]
        if R_corr > 0.9 or R_corr < -0.9:
            print(count,'.  ',col1,' -- ',col2,';  R = {:.3f}'.format(R_corr))
            count += 1

At least one of the elements in each pair of features with a high value of the Pearson coefficient *R* will have to be dropped.

# 4. Feature Engineering
<a id="4"></a>

### 4.1 Outliers Capping

이상점은 나머지 데이터에서 크게 눈에 띄는 데이터 포인트입니다. 다른 관측치에 비해 매우 높거나 낮은 값일 수 있으며 측정 오류, 데이터의 자연스러운 변화 또는 예기치 않은 발견으로 인해 발생할 수도 있습니다. 데이터 세트의 이상점(분류 또는 회귀 기법을 통해 연구하고자 함)이 예측 성능을 저하시킬 수 있으므로 이를 처리해야 한다는 것은 잘 알려져 있습니다.

이상값은 데이터의 첫 번째 및 세 번째 사분위수로 정의된 (각 열에 대한) 하한 및 상한 임계값을 설정하여 제한됩니다.
<br>캡핑 절차는 실수형(float) 변수에 대해서만 수행해야 합니다. 정수형 변수는 이 절차에서 제외해야 합니다.

In [None]:
# Function for outliers capping
def outlier_imputer(data,features):
    """Return a copy of ``data`` with the given columns capped at the IQR fences.

    For each column in ``features`` the fences are Q1 - 1.5*IQR and Q3 + 1.5*IQR,
    where Q1/Q3 are the column's 25% and 75% quantiles. Values below the lower
    fence are set to it, values above the upper fence are set to that one.
    The input frame itself is left untouched.
    """
    capped = data.copy()

    for name in features:
        # Quartiles and inter-quartile spread of the current column
        first_q, third_q = (capped[name].quantile(q) for q in (0.25, 0.75))
        spread = third_q - first_q
        floor = first_q - 1.5 * spread
        ceiling = third_q + 1.5 * spread

        # Pull the low outliers up to the floor ...
        too_low = capped[name] < floor
        capped.loc[too_low,name] = floor
        # ... and the high outliers down to the ceiling
        too_high = capped[name] > ceiling
        capped.loc[too_high,name] = ceiling

    return capped

# Outliers capping is carried out on the float columns only
# NOTE(review): the capping thresholds are computed on the full dataset, before the
# train/test split done in a later cell, so test rows influence the caps -- consider
# deriving the thresholds from the training rows only.
data_in     = data.select_dtypes(include='float')
features_in = data_in.columns

capped_data = outlier_imputer(data_in,features_in)

In [None]:
# Integer-type columns are concatenated to the capped dataset
# (a later cell reads 'Bankrupt?' from data2, so the target is expected to be among them;
# the column order becomes: capped float columns first, integer columns last)
data_int = data.select_dtypes(include='int')

data2 = pd.concat([capped_data,data_int],axis=1)

data2.head()

In [None]:
# Redraw the boxplots on the capped data, four per figure. The number of figures follows
# from the number of features instead of being hard-coded, so a partially filled last
# figure is handled too.
n_cols = 4
n_figures = -(-len(features) // n_cols)  # ceiling division

for i in range(n_figures):
    fig,axes = plt.subplots(ncols=n_cols,figsize=(12,5))
    for j,ax in enumerate(axes):
        k = i*n_cols + j
        ax.set_facecolor('gold')
        # Panels past the last feature stay empty (background only)
        if k < len(features):
            sns.boxplot(data2[features[k]],ax=ax)
            ax.set_title(f'Boxplot of X{k+1}',fontsize=20)

    fig.suptitle("Boxplots of the Outliers",fontsize=24)

    plt.tight_layout()
    fig.set_facecolor('darkgrey')

위의 상자 그림은 특이치 캡핑 절차가 성공했음을 보여줍니다.

### 4.2 리샘플링

목표 값의 약 97%가 이진 클래스 중 하나에 속하기 때문에 데이터는 매우 불균형합니다. 보다 정확한 예측을 얻으려면 학습(train) 데이터에서 이러한 클래스를 50-50으로 맞춰야 합니다. 테스트 데이터는 리샘플링하지 않고 원래의 클래스 분포를 그대로 유지합니다.

**Train-Test Splitting**

*X* 및 *y* 변수를 정의하고 있습니다.

In [None]:
# Predictors: every column except the target
X = data2.drop('Bankrupt?',axis=1)
# Target: the bankruptcy label
y = data2['Bankrupt?']

Then, I am performing the train-test splitting.

In [None]:
# Train-test split
# 30% of the rows are held out; stratify=y keeps the class proportions of y in both parts,
# and the fixed random_state makes the split repeatable
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,stratify=y,random_state=42)

**Oversampling**

오버샘플링 절차부터 시작하겠습니다. 소수 클래스에 속하는 행의 수를 늘려 다수 클래스에 속하는 행의 수와 동일하게 만드는 것입니다. 여기서 사용하는 SMOTE는 기존 행을 단순히 복사하는 대신, 소수 클래스 표본과 그 최근접 이웃 사이를 보간하여 새로운 합성 표본을 생성합니다.

In [None]:
from imblearn.over_sampling import SMOTE

# Fixed seed so the synthetic minority samples, and every score computed from them,
# are reproducible; 42 matches the seed used for the train/test split
smote = SMOTE(random_state=42)

# Balancing the data (training part only; the test set is left untouched)
X_overSampled,y_overSampled = smote.fit_resample(X_train,y_train)

In [None]:
# Training-set size before and after oversampling
len(X_train), len(X_overSampled)

**Undersampling**

언더샘플링은 오버샘플링과는 정반대입니다. 이 두 번째 경우, 다수 클래스에 속하는 목표 변수 값을 가진 행의 수를 잘라내어 소수 클래스에 속하는 목표 변수 값을 가진 행의 수와 동일하게 만듭니다.

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Create a RandomUnderSampler object
# (seeded for repeatability; sampling_strategy='majority' targets the majority class)
rus = RandomUnderSampler(random_state=42,sampling_strategy='majority')

# Balancing the data (training part only; the test set is left untouched)
X_underSampled,y_underSampled = rus.fit_resample(X_train,y_train)

In [None]:
# Training-set size before and after undersampling
len(X_train), len(X_underSampled)

# 5. Binary Classification with Feature Selection
<a id="5"></a>

저는 상대적인 특징 중요도가 높고(랜덤 포레스트 분류기를 사용하여 계산) 다른 변수와 강한 상관관계가 없는 특징을 '수동으로' 선택하는 특징 선택 방법을 사용하고 있습니다.

**Scaling**

In [None]:
# Work on a copy so the unscaled X_test stays available for the oversampling branch
X_test_under = X_test.copy()

### Undersampled data ###
LABELS = X_underSampled.columns

## Scaling ##
mm_scaler = MinMaxScaler()

# Apply Min-Max Scaling
# The scaler is fitted on the undersampled training rows only and then reused for the
# test rows. Note that X_underSampled is overwritten in place with its scaled values.
X_underSampled[LABELS] = mm_scaler.fit_transform(X_underSampled[LABELS])
X_test_under[LABELS] = mm_scaler.transform(X_test_under[LABELS])

In [None]:
# Separate copy of the test set, scaled with the oversampled-data scaler below
X_test_over = X_test.copy()

### Oversampled data ###
LABELS = X_overSampled.columns

## Scaling ##
# Rebinds mm_scaler: from here on it refers to the scaler fitted on the oversampled data
mm_scaler = MinMaxScaler()

# Apply Min-Max Scaling
# Fitted on the oversampled training rows only and then reused for the test rows.
# Note that X_overSampled is overwritten in place with its scaled values.
X_overSampled[LABELS] = mm_scaler.fit_transform(X_overSampled[LABELS])
X_test_over[LABELS] = mm_scaler.transform(X_test_over[LABELS])

**Feature Importance**

저는 랜덤 포레스트 분류기를 사용하여 예측 변수의 상대적 중요도를 계산하고 있습니다. 이 방법은 100% 신뢰할 수는 없지만 분류 단계에서 어떤 특징이 더 유용하고 어떤 특징이 덜 유용한지에 대한 단서를 제공할 수 있습니다. 저는 **언더샘플링된 학습(train) 데이터**를 사용할 것입니다.

In [None]:
# Random Forest Model
# Shallow (max_depth=4), seeded forest fitted on the scaled, undersampled training data;
# it is used here only to rank the features
random_forest = RandomForestClassifier(random_state=1,max_depth=4)
random_forest.fit(X_underSampled,y_underSampled)

# One row per feature with its importance rounded to 3 decimals, most important first
importances = pd.DataFrame({'feature':X_underSampled.columns,'importance':np.round(random_forest.feature_importances_,3)})
importances = importances.sort_values('importance',ascending=False)

importances

상대적 중요도가 2%를 초과하는 특징만 유지합니다.

In [None]:
# Keep only the features whose (rounded) importance is strictly greater than 0.02
importances = importances[importances['importance'] > 0.02]

importances

**Dropping some features**

*feature* 열에 표시되는 특징들을 리스트로 변환합니다.

In [None]:
# Names of the retained features, in the order of the importances table
list_features = importances['feature'].to_list()

list_features

Some of these features have strong mutual correlations, like ROA(C) vs ROA(B). This is why some of them must be dropped. I will drop those that are here in the list below and also have lower feature importance.

1.    ROA(C) before interest and depreciation before interest  --   ROA(A) before interest and % after tax ;  R = 0.940
2.    ROA(C) before interest and depreciation before interest  --   ROA(B) before interest and depreciation after tax ;  R = 0.987
3.    ROA(A) before interest and % after tax  --   ROA(B) before interest and depreciation after tax ;  R = 0.956
4.    ROA(A) before interest and % after tax  --   Net Income to Total Assets ;  R = 0.962
5.    ROA(B) before interest and depreciation after tax  --   Net Income to Total Assets ;  R = 0.912
6.    Operating Gross Margin  --   Realized Sales Gross Margin ;  R = 1.000
7.    Operating Gross Margin  --   Gross Profit to Sales ;  R = 1.000
8.    Realized Sales Gross Margin  --   Gross Profit to Sales ;  R = 1.000
9.    Operating Profit Rate  --   Pre-tax net Interest Rate ;  R = 0.916
10.    Operating Profit Rate  --   Continuous interest rate (after tax) ;  R = 0.916
11.    Pre-tax net Interest Rate  --   After-tax net Interest Rate ;  R = 0.986
12.    Pre-tax net Interest Rate  --   Continuous interest rate (after tax) ;  R = 0.994
13.    After-tax net Interest Rate  --   Continuous interest rate (after tax) ;  R = 0.984
14.    Net Value Per Share (B)  --   Net Value Per Share (A) ;  R = 0.999
15.    Net Value Per Share (B)  --   Net Value Per Share (C) ;  R = 0.999
16.    Net Value Per Share (A)  --   Net Value Per Share (C) ;  R = 1.000
17.    Persistent EPS in the Last Four Seasons  --   Per Share Net profit before tax (Yuan ¥) ;  R = 0.956
18.    Persistent EPS in the Last Four Seasons  --   Net profit before tax/Paid-in capital ;  R = 0.959
19.    Operating Profit Per Share (Yuan ¥)  --   Operating profit/Paid-in capital ;  R = 0.999
20.    Per Share Net profit before tax (Yuan ¥)  --   Net profit before tax/Paid-in capital ;  R = 0.963
21.    After-tax Net Profit Growth Rate  --   Regular Net Profit Growth Rate ;  R = 0.996
22.    Debt ratio %  --   Net worth/Assets ;  R = -1.000
23.    Borrowing dependency  --   Liability to Equity ;  R = 0.956
24.    Current Liabilities/Liability  --   Current Liability to Liability ;  R = 1.000
25.    Current Liabilities/Equity  --   Current Liability to Equity ;  R = 1.000
26.    Current Liabilities/Equity  --   Liability to Equity ;  R = 0.964
27.    Working capitcal Turnover Rate  --   Cash Flow to Sales ;  R = 0.948
28.    Current Liability to Equity  --   Liability to Equity ;  R = 0.964

I will drop:
* ROA(A)
* ROA(C)
* Per Share Net profit before tax (Yuan ¥)
* Net profit before tax/Paid-in capital
* Debt ratio %
* Liability to Equity

In [None]:
# Strongly correlated features to discard (the names carry the dataset's leading space)
list_remove = [' ROA(A) before interest and % after tax',
               ' ROA(C) before interest and depreciation before interest',
               ' Per Share Net profit before tax (Yuan ¥)',
               ' Net profit before tax/Paid-in capital',
               ' Debt ratio %',
               ' Liability to Equity']

# Filter with a comprehension rather than a set difference: the existing order of
# list_features is kept and the displayed list is identical on every run
features_to_remove = set(list_remove)
list_features = [feature for feature in list_features if feature not in features_to_remove]

list_features

Then, I am keeping in the *X* matrix only the features that are displayed in this list.

In [None]:
# Restrict every design matrix to the columns named in list_features.
# US = undersampled branch, OS = oversampled branch; each test matrix was scaled with the
# scaler of its own branch.
X_train_US2 = X_underSampled[X_underSampled.columns.intersection(list_features)]
X_test_US2  = X_test_under[X_test_under.columns.intersection(list_features)]

X_train_OS2 = X_overSampled[X_overSampled.columns.intersection(list_features)]
X_test_OS2  = X_test_over[X_test_over.columns.intersection(list_features)]

X_test_US2.head()

**Binary Classification with a Random Forest Classifier on Undersampled Data**

Now, I am performing a binary classification with a random forest classifier on undersampled data.

I am defining a function to get the most common scores on the classification outcome.

In [None]:
def get_test_scores(model_name:str,preds,y_test_data):
    '''
    Summarise the quality of a set of predictions in a one-row table.

    In:
        model_name (string): label shown in the 'model' column of the table
        preds: predicted class labels
        y_test_data: true class labels the predictions are compared against

    Out:
        a pandas df with the columns model, precision, recall, F1 and accuracy;
        precision, recall and F1 are macro-averaged over the classes
    '''
    # Metrics that take the macro average, keyed by their column name
    macro_metrics = {'precision': precision_score,'recall': recall_score,'F1': f1_score}

    row = {'model': [model_name]}
    for column,metric in macro_metrics.items():
        row[column] = [metric(y_test_data,preds,average='macro')]
    row['accuracy'] = [accuracy_score(y_test_data,preds)]

    return pd.DataFrame(row)

I am fitting the model on the train data, calculating the scores on the train data ...

In [None]:
# Seeded so the fitted forest and the scores reported below are reproducible across runs
random_forest = RandomForestClassifier(class_weight='balanced',random_state=42)

random_forest.fit(X_train_US2,y_underSampled)

# Use the model to predict on train data
rf_train_preds = random_forest.predict(X_train_US2)

# Scores on the data the model was fitted on (optimistic by construction)
rf_train_results = get_test_scores('RF (train, undersampled)',rf_train_preds,y_underSampled)
rf_train_results

... then, I am calculating the scores on the test data.

In [None]:
# Use the model to predict on test data
rf_test_preds = random_forest.predict(X_test_US2)

# Scores against the original, non-resampled test labels
rf_test_results = get_test_scores('RF (test, undersampling)',rf_test_preds,y_test)
rf_test_results

Precision and F1 scores are not very good. Let's try to improve them.

**Binary Classification with a XGBoost Classifier on Undersampled Data**

I am doing the same as before, but this time I will use XGBoost instead of a random forest classifier.

In [None]:
# Instantiate the XGBoost classifier
xgb1 = XGBClassifier(objective='binary:logistic',random_state=42)

# Fit on the undersampled training data restricted to the selected features
xgb1.fit(X_train_US2,y_underSampled)

# Use the model to predict on train data
xgb_train_preds = xgb1.predict(X_train_US2)

# Scores on the data the model was fitted on (optimistic by construction)
xgb_train_results = get_test_scores('XGB (train, undersampled)',xgb_train_preds,y_underSampled)
xgb_train_results

In [None]:
# Use the model to predict on test data
xgb_test_preds = xgb1.predict(X_test_US2)

# Scores against the original, non-resampled test labels
xgb_test_results = get_test_scores('XGB (test, undersampling)',xgb_test_preds,y_test)
xgb_test_results

The results are worse than the random forest ones.

**Binary Classification with a XGBoost Classifier (and GridSearch + Cross Validation) on Undersampled Data**

I am using a XGBoost classifier with GridSearch and cross validation. This will make it possible to vary the parameter values and find the best combination of them.

In [None]:
# 1. Instantiate the XGBoost classifier
xgb2 = XGBClassifier(objective='binary:logistic',random_state=42)

# 2. Create a dictionary of hyperparameters to tune
cv_params = {'max_depth':[None,2,3,4,5],'min_child_weight':[4,5,6],'learning_rate': [0.01,0.05,0.1,0.2],
             'n_estimators':[150,200,300]}

# 3. Define the scoring metrics to capture.
#    A list is used instead of a set: GridSearchCV documents str, callable, list, tuple or
#    dict for `scoring`, and recent scikit-learn versions reject a set.
scoring = ['accuracy','precision','recall','f1']

# 4. Instantiate the GridSearchCV object
#    4-fold cross validation; the best parameter set by accuracy is refitted on all the data
xgb_cv = GridSearchCV(xgb2,cv_params,scoring=scoring,cv=4,refit='accuracy')

In [None]:
# Model fitting on the train data
# Runs the whole hyperparameter grid with cross validation on the undersampled training set
xgb_cv.fit(X_train_US2,y_underSampled)

In [None]:
# Use the model to predict on the test data
# best_estimator_ is the model refitted with the best parameter combination of the search
xgb_cv_test_preds = xgb_cv.best_estimator_.predict(X_test_US2)

# Scores against the original, non-resampled test labels
xgb_cv_test_results = get_test_scores('XGB CV (test, undersampling)',xgb_cv_test_preds,y_test)

xgb_cv_test_results

These results are not better than those obtained with the random forest classifier.

**Confusion Matrix**

In [None]:
# Confusion matrices of the three models trained on undersampled data
cm_rf_test     = confusion_matrix(y_test,rf_test_preds,labels=random_forest.classes_)
cm_xgb_test    = confusion_matrix(y_test,xgb_test_preds,labels=xgb1.classes_)
cm_xgb_cv_test = confusion_matrix(y_test,xgb_cv_test_preds,labels=xgb_cv.classes_)

### Creating the confusion matrices ###
fig,(ax1,ax2,ax3) = plt.subplots(ncols=3,figsize=(12,4))

# One heatmap per model: (axes, matrix, panel title)
tick_names = ['No Bankrupt','Bankrupt']
panels = [(ax1,cm_rf_test,'Random Forest (Test)'),
          (ax2,cm_xgb_test,'XGBoost (Test)'),
          (ax3,cm_xgb_cv_test,'XGBoost CV (Test)')]

for panel_ax,matrix,heading in panels:
    sns.heatmap(matrix,annot=True,ax=panel_ax,fmt='.4g')
    panel_ax.xaxis.set_ticklabels(tick_names)
    panel_ax.yaxis.set_ticklabels(tick_names)
    panel_ax.set_title(heading,fontsize=18)

fig.suptitle("Confusion Matrices of Different Models (Undersampling)",fontsize=24)

plt.tight_layout()
fig.set_facecolor('darkgrey')

The random forest classifier provides the best results with undersampling.

**Binary Classification with a Random Forest Classifier on Oversampled Data**

I am doing basically the same as before, but I will use oversampled instead of undersampled data.

In [None]:
# Seeded so the fitted forest and the scores reported below are reproducible across runs.
# Note: this rebinds random_forest, which held the undersampled-data model until now.
random_forest = RandomForestClassifier(class_weight='balanced',random_state=42)

random_forest.fit(X_train_OS2,y_overSampled)

# Use the model to predict on train data
rf_train_preds = random_forest.predict(X_train_OS2)

# Scores on the data the model was fitted on (optimistic by construction)
rf_train_results = get_test_scores('RF (train, oversampled)',rf_train_preds,y_overSampled)
rf_train_results

In [None]:
# Use the model to predict on test data
# Note: rebinds rf_test_preds, which held the undersampled-model predictions until now
rf_test_preds = random_forest.predict(X_test_OS2)

# Scores against the original, non-resampled test labels
rf_test_over_results = get_test_scores('RF (test, oversampling)',rf_test_preds,y_test)
rf_test_over_results

It seems that the use of an oversampled train set improves the results.

**Binary Classification with a XGBoost Classifier (and GridSearch + Cross Validation) on Oversampled Data**

In [None]:
%%time
# Re-run the same XGBoost grid search, this time on the oversampled training set.
# The xgb_cv object from the undersampling section is reused, so its earlier fit results
# are presumably overwritten here.
xgb_cv.fit(X_train_OS2,y_overSampled)

In [None]:
# Use the model to predict on test data
# Note: rebinds xgb_cv_test_preds, which held the undersampling predictions until now
xgb_cv_test_preds = xgb_cv.best_estimator_.predict(X_test_OS2)

# Scores against the original, non-resampled test labels
xgb_cv_test_over_results = get_test_scores('XGB CV (test, oversampling)',xgb_cv_test_preds,y_test)

xgb_cv_test_over_results

**Binary Classification with a Random Forest Classifier (and GridSearch + Cross Validation) on Oversampled Data**

In [None]:
# 1. Instantiate the random forest classifier
rf = RandomForestClassifier(random_state=42)

# 2. Create a dictionary of hyperparameters to tune
cv_params = {'max_depth': [2,3,4,5,None],'max_features': [1.0],'max_samples': [1.0],
             'min_samples_leaf': [2,3,4],'min_samples_split': [2,3,4],'n_estimators': [200,300,400]}

# 3. Define the scoring metrics to capture.
#    A list is used instead of a set: GridSearchCV documents str, callable, list, tuple or
#    dict for `scoring`, and recent scikit-learn versions reject a set.
scoring = ['accuracy','precision','recall','f1']

# 4. Instantiate the GridSearchCV object
#    4-fold cross validation; the best parameter set by accuracy is refitted on all the data
rf_cv = GridSearchCV(rf,cv_params,scoring=scoring,cv=4,refit='accuracy')

In [None]:
%%time
# Run the random-forest grid search with cross validation on the oversampled training set
rf_cv.fit(X_train_OS2,y_overSampled)

In [None]:
# Use the model to predict on test data
# best_estimator_ is the model refitted with the best parameter combination of the search
rf_grid_test_preds = rf_cv.best_estimator_.predict(X_test_OS2)

# Scores against the original, non-resampled test labels
rf_grid_test_over_results = get_test_scores('RF Grid (test, oversampling)',rf_grid_test_preds,y_test)

rf_grid_test_over_results

In [None]:
# Confusion matrices of the three models trained on oversampled data
cm_rf2_test     = confusion_matrix(y_test,rf_test_preds,labels=random_forest.classes_)
cm_xgb2_test    = confusion_matrix(y_test,xgb_cv_test_preds,labels=xgb_cv.classes_)
cm_rf2_cv_test  = confusion_matrix(y_test,rf_grid_test_preds,labels=rf_cv.classes_)

### Creating the confusion matrices ###
fig,(ax1,ax2,ax3) = plt.subplots(ncols=3,figsize=(12,4))

# One heatmap per model: (axes, matrix, panel title).
# The middle panel shows the grid-searched XGBoost predictions, so it is titled
# 'XGBoost CV (Test)' rather than 'XGBoost (Test)'.
tick_names = ['No Bankrupt','Bankrupt']
panels = [(ax1,cm_rf2_test,'Random Forest (Test)'),
          (ax2,cm_xgb2_test,'XGBoost CV (Test)'),
          (ax3,cm_rf2_cv_test,'Random Forest CV (Test)')]

for panel_ax,matrix,heading in panels:
    sns.heatmap(matrix,annot=True,ax=panel_ax,fmt='.4g')
    panel_ax.xaxis.set_ticklabels(tick_names)
    panel_ax.yaxis.set_ticklabels(tick_names)
    panel_ax.set_title(heading,fontsize=18)

fig.suptitle("Confusion Matrices of Different Models (Oversampling)",fontsize=24)

plt.tight_layout()
fig.set_facecolor('darkgrey')

**Comparison Between the Different Models**

In [None]:
# Stack the one-row test-score tables of all six models (three per resampling strategy)
results = pd.concat([rf_test_results,xgb_test_results,xgb_cv_test_results,
                     rf_test_over_results,xgb_cv_test_over_results,rf_grid_test_over_results],axis=0)

# Rank the models by test accuracy, best first.
# NOTE(review): the test set keeps the original class imbalance, so accuracy alone can be
# misleading -- consider also ranking by the macro F1 column.
results.sort_values(by=['accuracy'],ascending=False)

# Useful Reads

[1] Jacopo Ferretti, [*Bank Marketing Campaign (Segmentation + Binary Classification)*](https://www.kaggle.com/code/jacopoferretti/bank-marketing-campaign-segment-binary-classif), Kaggle.

[2] Jacopo Ferretti, [*Credit Card Fraud Detection (Resampling + Cross Validation)*](https://www.kaggle.com/code/jacopoferretti/credit-card-fraud-detection-resampling-cross-val), Kaggle.