In [1]:
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
from scipy.stats import shapiro
from scipy.stats import probplot
from scipy.stats import mannwhitneyu
from scipy.stats import chi2_contingency
import seaborn as sns

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (roc_auc_score, roc_curve, auc, confusion_matrix, \
                             accuracy_score, classification_report, plot_confusion_matrix, \
                             plot_precision_recall_curve, precision_recall_curve, recall_score,
                             plot_roc_curve)

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
def evaluate_preds(model, X_train, X_test, y_train, y_test):
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    get_classification_report(y_train, y_train_pred, y_test, y_test_pred)

In [6]:
def get_classification_report(y_train_true, y_train_pred, y_test_true, y_test_pred):
    print('TRAIN\n\n' + classification_report(y_train_true, y_train_pred))
    print('TEST\n\n' + classification_report(y_test_true, y_test_pred))
    print('CONFUSION MATRIX\n')
    print(pd.crosstab(y_test_true, y_test_pred))

In [7]:
TRAIN_DATASET_PATH = './train.csv'
TEST_DATASET_PATH = './test.csv'

In [8]:
df = pd.read_csv(TRAIN_DATASET_PATH)
df.head()

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
0,Own Home,482087.0,,0.0,11.0,26.3,685960.0,1.0,,1.0,debt consolidation,Short Term,99999999.0,47386.0,7914.0,749.0,0
1,Own Home,1025487.0,10+ years,0.0,15.0,15.3,1181730.0,0.0,,0.0,debt consolidation,Long Term,264968.0,394972.0,18373.0,737.0,1
2,Home Mortgage,751412.0,8 years,0.0,11.0,35.0,1182434.0,0.0,,0.0,debt consolidation,Short Term,99999999.0,308389.0,13651.0,742.0,0
3,Own Home,805068.0,6 years,0.0,8.0,22.5,147400.0,1.0,,1.0,debt consolidation,Short Term,121396.0,95855.0,11338.0,694.0,0
4,Rent,776264.0,8 years,0.0,13.0,13.6,385836.0,1.0,,0.0,debt consolidation,Short Term,125840.0,93309.0,7180.0,719.0,0


In [9]:
tf = pd.read_csv(TEST_DATASET_PATH)
tf.head()

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score
0,Rent,,4 years,0.0,9.0,12.5,220968.0,0.0,70.0,0.0,debt consolidation,Short Term,162470.0,105906.0,6813.0,
1,Rent,231838.0,1 year,0.0,6.0,32.7,55946.0,0.0,8.0,0.0,educational expenses,Short Term,78298.0,46037.0,2318.0,699.0
2,Home Mortgage,1152540.0,3 years,0.0,10.0,13.7,204600.0,0.0,,0.0,debt consolidation,Short Term,200178.0,146490.0,18729.0,7260.0
3,Home Mortgage,1220313.0,10+ years,0.0,16.0,17.0,456302.0,0.0,70.0,0.0,debt consolidation,Short Term,217382.0,213199.0,27559.0,739.0
4,Home Mortgage,2340952.0,6 years,0.0,11.0,23.6,1207272.0,0.0,,0.0,debt consolidation,Long Term,777634.0,425391.0,42605.0,706.0


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Home Ownership                7500 non-null   object 
 1   Annual Income                 5943 non-null   float64
 2   Years in current job          7129 non-null   object 
 3   Tax Liens                     7500 non-null   float64
 4   Number of Open Accounts       7500 non-null   float64
 5   Years of Credit History       7500 non-null   float64
 6   Maximum Open Credit           7500 non-null   float64
 7   Number of Credit Problems     7500 non-null   float64
 8   Months since last delinquent  3419 non-null   float64
 9   Bankruptcies                  7486 non-null   float64
 10  Purpose                       7500 non-null   object 
 11  Term                          7500 non-null   object 
 12  Current Loan Amount           7500 non-null   float64
 13  Cur

In [11]:
df.describe(include='object')

Unnamed: 0,Home Ownership,Years in current job,Purpose,Term
count,7500,7129,7500,7500
unique,4,11,15,2
top,Home Mortgage,10+ years,debt consolidation,Short Term
freq,3637,2332,5944,5556


In [12]:
df['Term'].unique()

array(['Short Term', 'Long Term'], dtype=object)

In [13]:
k = 1
for i in df['Term'].unique():
    df.loc[df.Term == i, 'Term'] = k
    k = k + 1
    
df['Term']=  pd.to_numeric(df.Term, errors='coerce')

In [14]:
t = ['Home Ownership', 'Years in current job', 'Purpose']
for tt in t:
    k = 1
    for i in df[tt].unique():
        df.loc[df[tt] == i, tt] = k
        k = k + 1
    df[tt]=  pd.to_numeric(df[tt], errors='coerce')

In [15]:
for tt in t:
    print(df[tt].unique())

[1 2 3 4]
[nan  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12.]
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]


In [16]:
df['Years in current job'].fillna(0, inplace=True)
df['Home Ownership'].fillna(0, inplace=True)
df['Years of Credit History'].fillna(0, inplace=True)

In [17]:
TARGET_NAME = 'Credit Default'
BASE_FEATURE_NAMES = df.columns.drop(TARGET_NAME).tolist()
NEW_FEATURE_NAMES = df.columns.drop([TARGET_NAME] + BASE_FEATURE_NAMES).tolist()

In [18]:
df[TARGET_NAME].value_counts()

0    5387
1    2113
Name: Credit Default, dtype: int64

In [19]:
NUM_FEATURE_NAMES = ['Annual Income', 'Years in current job', 'Tax Liens', 
                     'Number of Open Accounts', 'Years of Credit History', 'Maximum Open Credit', 
                     'Number of Credit Problems', 'Bankruptcies',
                     'Current Loan Amount', 'Current Credit Balance', 'Monthly Debt', 'Credit Score']

CAT_FEATURE_NAMES = ['Home Ownership', 'Purpose','Term']

SELECTED_FEATURE_NAMES = NUM_FEATURE_NAMES + CAT_FEATURE_NAMES + NEW_FEATURE_NAMES

In [20]:
for i in SELECTED_FEATURE_NAMES:
    df[i].fillna(df[i].mode()[0], inplace=True)    

In [21]:
df['Annual Income'].mode()[0]

969475.0

In [22]:
#corr_with_target = df[BASE_FEATURE_NAMES + [TARGET_NAME]].corr().iloc[:-1, -1].sort_values(ascending=False)
#plt.figure(figsize=(10, 8))
#sns.barplot(x=corr_with_target.values, y=corr_with_target.index)
#plt.title('Correlation with target variable')
#plt.show()

In [23]:
#plt.figure(figsize = (20,15))

#sns.set(font_scale=1.4)

#corr_matrix = df[BASE_FEATURE_NAMES].corr()
#corr_matrix = np.round(corr_matrix, 2)
#corr_matrix[np.abs(corr_matrix) < 0.3] = 0

#sns.heatmap(corr_matrix, annot=True, linewidths=.5, cmap='GnBu')

#plt.title('Correlation matrix')
#plt.show()

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Home Ownership                7500 non-null   int64  
 1   Annual Income                 7500 non-null   float64
 2   Years in current job          7500 non-null   float64
 3   Tax Liens                     7500 non-null   float64
 4   Number of Open Accounts       7500 non-null   float64
 5   Years of Credit History       7500 non-null   float64
 6   Maximum Open Credit           7500 non-null   float64
 7   Number of Credit Problems     7500 non-null   float64
 8   Months since last delinquent  3419 non-null   float64
 9   Bankruptcies                  7500 non-null   float64
 10  Purpose                       7500 non-null   int64  
 11  Term                          7500 non-null   int64  
 12  Current Loan Amount           7500 non-null   float64
 13  Cur

In [25]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [26]:
scaler = StandardScaler()

df_norm = df.copy()
df_norm[NUM_FEATURE_NAMES] = scaler.fit_transform(df_norm[NUM_FEATURE_NAMES])

df = df_norm.copy()

Save current dataset

In [27]:
UPDATED_DATASET_PATH = './new_train.csv'
df.to_csv(UPDATED_DATASET_PATH, index=False, encoding='utf-8')

Model learning

In [28]:
X = df[SELECTED_FEATURE_NAMES]
y = df[TARGET_NAME]

In [29]:
from lightgbm import LGBMClassifier

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=21, stratify=y)

display(y_train.value_counts(normalize=True), y_test.value_counts(normalize=True))

0    0.718286
1    0.281714
Name: Credit Default, dtype: float64

0    0.718222
1    0.281778
Name: Credit Default, dtype: float64

In [35]:
%%time
tree = LGBMClassifier(random_state=21, 
                                 class_weight={0:1, 1:3.6},
                                 n_estimators=100
                                )
tree.fit(X_train, y_train)

evaluate_preds(tree, X_train, X_test, y_train, y_test)

TRAIN

              precision    recall  f1-score   support

           0       0.99      0.87      0.92      3771
           1       0.74      0.98      0.85      1479

    accuracy                           0.90      5250
   macro avg       0.87      0.92      0.88      5250
weighted avg       0.92      0.90      0.90      5250

TEST

              precision    recall  f1-score   support

           0       0.82      0.71      0.76      1616
           1       0.45      0.60      0.52       634

    accuracy                           0.68      2250
   macro avg       0.64      0.66      0.64      2250
weighted avg       0.72      0.68      0.69      2250

CONFUSION MATRIX

col_0              0    1
Credit Default           
0               1150  466
1                251  383
CPU times: user 850 ms, sys: 608 µs, total: 851 ms
Wall time: 266 ms


In [36]:
pred_train = tree.predict(X_train)
pred_test = tree.predict(X_test)

pred_proba_test = tree.predict_proba(X_test)

WEGHTED

In [37]:
from sklearn.metrics import f1_score, precision_score, recall_score, \
                            classification_report, confusion_matrix, accuracy_score, f1_score

In [38]:
accuracy_train = accuracy_score(y_train, pred_train)
accuracy_test = accuracy_score(y_test, pred_test)
print(f'Accuracy на трейне {accuracy_train}')
print(f'Accuracy на тесте {accuracy_test}')

Accuracy на трейне 0.8984761904761904
Accuracy на тесте 0.6813333333333333


In [39]:
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

           0       0.82      0.71      0.76      1616
           1       0.45      0.60      0.52       634

    accuracy                           0.68      2250
   macro avg       0.64      0.66      0.64      2250
weighted avg       0.72      0.68      0.69      2250



In [40]:
from sklearn.metrics import r2_score

In [41]:
r2_v = r2_score(y_test, pred_test)
r2_v

-0.5746029453103036

In [42]:
tf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Home Ownership                2500 non-null   object 
 1   Annual Income                 1987 non-null   float64
 2   Years in current job          2414 non-null   object 
 3   Tax Liens                     2500 non-null   float64
 4   Number of Open Accounts       2500 non-null   float64
 5   Years of Credit History       2500 non-null   float64
 6   Maximum Open Credit           2500 non-null   float64
 7   Number of Credit Problems     2500 non-null   float64
 8   Months since last delinquent  1142 non-null   float64
 9   Bankruptcies                  2497 non-null   float64
 10  Purpose                       2500 non-null   object 
 11  Term                          2500 non-null   object 
 12  Current Loan Amount           2500 non-null   float64
 13  Cur

In [43]:
tf.describe(include='object')

Unnamed: 0,Home Ownership,Years in current job,Purpose,Term
count,2500,2414,2500,2500
unique,4,11,14,2
top,Home Mortgage,10+ years,debt consolidation,Short Term
freq,1225,810,1973,1861


In [44]:
k = 1
for i in tf['Term'].unique():
    tf.loc[tf.Term == i, 'Term'] = k
    k = k + 1
tf['Term']=  pd.to_numeric(tf.Term, errors='coerce')

In [45]:
t = ['Home Ownership', 'Years in current job', 'Purpose']
for tt in t:
    k = 1
    for i in tf[tt].unique():
        tf.loc[tf[tt] == i, tt] = k
        k = k + 1
    tf[tt]=  pd.to_numeric(tf[tt], errors='coerce')

In [46]:
for tt in t:
    print(tf[tt].unique())

[1 2 3 4]
[ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. nan]
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14]


In [47]:
tf['Years in current job'].fillna(0, inplace=True)
tf['Home Ownership'].fillna(0, inplace=True)
tf['Years of Credit History'].fillna(0, inplace=True)

In [48]:
tf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Home Ownership                2500 non-null   int64  
 1   Annual Income                 1987 non-null   float64
 2   Years in current job          2500 non-null   float64
 3   Tax Liens                     2500 non-null   float64
 4   Number of Open Accounts       2500 non-null   float64
 5   Years of Credit History       2500 non-null   float64
 6   Maximum Open Credit           2500 non-null   float64
 7   Number of Credit Problems     2500 non-null   float64
 8   Months since last delinquent  1142 non-null   float64
 9   Bankruptcies                  2497 non-null   float64
 10  Purpose                       2500 non-null   int64  
 11  Term                          2500 non-null   int64  
 12  Current Loan Amount           2500 non-null   float64
 13  Cur

In [49]:
for i in SELECTED_FEATURE_NAMES:
    tf[i].fillna(tf[i].mode()[0], inplace=True)    

In [50]:
tf_norm = tf.copy()
tf_norm[NUM_FEATURE_NAMES] = scaler.fit_transform(tf_norm[NUM_FEATURE_NAMES])

tf = tf_norm.copy()

In [56]:
XT = tf[SELECTED_FEATURE_NAMES]

In [57]:
XT

Unnamed: 0,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Bankruptcies,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Home Ownership,Purpose,Term
0,-0.327678,-1.451245,-0.091141,-0.450224,-0.803220,-0.323879,-0.289398,-0.299225,-0.371832,-0.503642,-0.992788,-0.248330,1,1,1
1,-1.284626,-1.096487,-0.091141,-1.068551,1.981755,-0.433044,-0.289398,-0.299225,-0.374441,-0.661931,-1.382250,-0.278390,1,2,1
2,-0.174004,-0.741728,-0.091141,-0.244115,-0.637776,-0.334707,-0.289398,-0.299225,-0.370663,-0.396341,0.039653,3.917860,2,1,1
3,-0.092251,-0.386970,-0.091141,0.992538,-0.182805,-0.168202,-0.289398,-0.299225,-0.370129,-0.219968,0.804714,-0.252807,2,1,1
4,1.259550,-0.032212,-0.091141,-0.038006,0.727138,0.328576,-0.289398,-0.299225,-0.352759,0.341051,2.108349,-0.273913,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,-0.333820,-0.386970,-0.091141,0.580320,1.485423,-0.100166,1.497011,2.502503,2.723472,-0.353391,-0.279454,-0.248969,2,1,1
2496,-0.327678,1.032062,-0.091141,0.786429,-0.182805,0.679515,-0.289398,-0.299225,-0.362343,3.021663,1.275793,-0.248330,2,1,1
2497,-0.150764,1.032062,-0.091141,7.588024,-0.761859,0.658776,-0.289398,-0.299225,-0.363522,1.004799,1.615088,-0.280948,2,1,1
2498,-0.691522,-0.386970,-0.091141,0.580320,1.444062,0.155595,-0.289398,-0.299225,-0.368877,0.250779,-0.412885,-0.249609,1,1,1


In [58]:
ytv_pred = tree.predict(XT)
ytv_pred.shape

(2500,)

In [59]:
ytv_pred

array([0, 0, 1, ..., 0, 0, 1])

In [60]:
ytv_pred = pd.DataFrame(ytv_pred, columns = ['Credit Default'])

In [61]:
ytv_pred.index.names = ['Id']

In [62]:
ytv_pred.to_csv('Avoronkov_predictions_lgbm.csv', encoding='utf-8')