In [1]:
# One-off environment setup; uncomment on a fresh environment.
# %pip installs into the kernel's own environment (unlike !pip).
# The notebook also imports `imblearn`, which ships as the imbalanced-learn package.
# %pip install tensorflow
# %pip install Keras
# %pip install imbalanced-learn

In [2]:
# Import required libraries
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import sklearn
import tensorflow as tf

# Import necessary modules
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report, f1_score
from sklearn.metrics import mean_squared_error
from math import sqrt

# Keras specific
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical 

In [3]:
# Input CSV locations (relative paths, resolved against the working directory)
TRAIN_FILEPATH = 'train.csv'
TEST_FILEPATH = 'test.csv'

# Name of the label column in the training data
target = 'Credit Default'

# Read both splits, train first and test second
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_FILEPATH, TEST_FILEPATH))

# Show how many rows fall into each class of the label
train_df[target].value_counts()

0    5387
1    2113
Name: Credit Default, dtype: int64

In [4]:
def clean_df(df):
    """Return a cleaned copy of a raw credit dataframe.

    The caller's dataframe is left untouched; all changes are applied to a
    new frame that is returned.

    Steps performed:
      * drop the 'Months since last delinquent' column;
      * fill missing 'Annual Income' and 'Credit Score' with the column median
        (medians are computed on ``df`` itself, before any other change);
      * fill missing 'Years in current job' with '< 1 year' and missing
        'Bankruptcies' with 0;
      * replace 'Annual Income' values above 4e6 with the income median;
      * replace 'Current Loan Amount' values equal to 1e8 with the largest
        amount found below 1e8;
      * floor-divide 'Credit Score' values >= 3000 by 10.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns named above.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame (one column fewer than the input).

    Raises
    ------
    KeyError
        If one of the expected columns is missing.
    """
    income_cap = 4 * 10**6            # incomes above this are treated as outliers
    loan_placeholder = 1 * 10**8      # NOTE(review): presumably a "no amount" sentinel — confirm the exact value in the data
    score_scale_threshold = 3000      # scores at/above this are floor-divided by 10

    # Dropping first yields a new frame, so none of the assignments below
    # leak back into the dataframe owned by the caller.
    df = df.drop(columns=['Months since last delinquent'])

    ann_inc_median = df['Annual Income'].median()
    cred_score_median = df['Credit Score'].median()
    curr_loan_max = df.loc[df['Current Loan Amount'] < loan_placeholder, 'Current Loan Amount'].max()

    # Missing values
    df['Annual Income'] = df['Annual Income'].fillna(ann_inc_median)
    df['Years in current job'] = df['Years in current job'].fillna('< 1 year')
    df['Bankruptcies'] = df['Bankruptcies'].fillna(0)
    df['Credit Score'] = df['Credit Score'].fillna(cred_score_median)

    # Outliers / placeholder values
    df.loc[df['Annual Income'] > income_cap, 'Annual Income'] = ann_inc_median
    # NOTE(review): the previous version also "clamped" 'Maximum Open Credit'
    # to its own maximum; no value can exceed the column maximum, so that
    # statement never changed anything and was removed. If a real cap was
    # intended, apply it here with an explicit threshold.
    df.loc[df['Current Loan Amount'] == loan_placeholder, 'Current Loan Amount'] = curr_loan_max
    df.loc[df['Credit Score'] >= score_scale_threshold, 'Credit Score'] //= 10

    return df

In [5]:
# Light cleaning of both splits.
# NOTE: this cell rebinds train_df/test_df, so it is meant to run once per
# kernel session, right after the data-loading cell.
train_df = clean_df(train_df)
test_df = clean_df(test_df)

# Turn categorical columns into separate indicator features
train_df = pd.get_dummies(train_df, drop_first=True)
test_df = pd.get_dummies(test_df, drop_first=True)

# The scaler and the network consume columns by position, so both splits
# must expose exactly the same features in exactly the same order.
# Instead of dropping one hard-coded dummy column from the training frame,
# keep the label plus every feature that exists in both splits.
test_columns = set(test_df.columns)
kept_train_columns = [col for col in train_df.columns
                      if col == target or col in test_columns]
train_df = train_df[kept_train_columns]
test_df = test_df[[col for col in kept_train_columns if col != target]]

In [6]:
# Summarise the training frame after cleaning and one-hot encoding:
# column names, non-null counts and dtypes.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 39 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Annual Income                   7500 non-null   float64
 1   Tax Liens                       7500 non-null   float64
 2   Number of Open Accounts         7500 non-null   float64
 3   Years of Credit History         7500 non-null   float64
 4   Maximum Open Credit             7500 non-null   float64
 5   Number of Credit Problems       7500 non-null   float64
 6   Bankruptcies                    7500 non-null   float64
 7   Current Loan Amount             7500 non-null   float64
 8   Current Credit Balance          7500 non-null   float64
 9   Monthly Debt                    7500 non-null   float64
 10  Credit Score                    7500 non-null   float64
 11  Credit Default                  7500 non-null   int64  
 12  Home Ownership_Home Mortgage    75

In [7]:
# Separate the features from the label
X = train_df.drop(target, axis=1)
y = train_df[target]

# Scaler
scaler = MinMaxScaler()

# Learn the per-feature min/max on the training features only ...
X = scaler.fit_transform(X)
# ... and reuse those same statistics for the test features. Refitting on the
# test split would scale train and test differently. Test values outside the
# training range may therefore fall outside [0, 1].
test_df = scaler.transform(test_df)

X.shape, test_df.shape

((7500, 38), (2500, 38))

In [8]:
# NOTE(review): consider moving this import to the import cell at the top.
from imblearn.over_sampling import ADASYN

# Class counts before oversampling (index = class label).
# np.bincount needs non-negative integer labels; the label here is 0/1.
print('Original dataset shape %s' % (np.bincount(np.asarray(y)),))

# Oversample with ADASYN; the fixed random_state keeps the resampling repeatable.
# NOTE: this cell rebinds X and y, so it is meant to run once per kernel session.
ada = ADASYN(random_state=42)
X, y = ada.fit_resample(X, y)

print('Resampled dataset shape %s' % (np.bincount(np.asarray(y)),))

In [9]:
# One-hot encode the labels so that each class gets its own column.
# NOTE: this rebinds y; re-running the cell would encode the already encoded labels.
y = to_categorical(y)

# Number of classes = number of columns in the encoded label matrix
count_classes = y.shape[1]
print(count_classes)

2


In [10]:
# Seed TensorFlow so weight initialisation is repeatable between runs.
# NOTE(review): full determinism may need further settings — confirm for your TF version.
tf.random.set_seed(42)

# Take the layer sizes from the data instead of hard-coding them, so the
# network still matches if the number of features or classes changes.
n_features = X.shape[1]

# Define the layers
model = Sequential()
model.add(Dense(64, activation='relu', input_dim=n_features))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(count_classes, activation='softmax'))

# Compile the model.
# The labels are one-hot encoded and the head is a softmax with one unit per class.
# The metric is taken from the same `keras` package that builds the model,
# rather than mixing in `tf.keras` objects.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=[keras.metrics.BinaryAccuracy()])

In [11]:
# Train the model on the full (resampled) training set
model.fit(X, y, epochs=100)

# Predict on the test features; column 1 of the output is rounded to the
# nearest integer to obtain the predicted label
pred_test = model.predict(test_df)
y_pred_test = np.rint(pred_test[:, 1])

# Export: fill the sample submission with the predictions and write it out
submit = pd.read_csv('sample_submission.csv')
submit['Credit Default'] = y_pred_test.astype('int8')
submit.to_csv(
    'Credit_default_prediction3.csv',
    index=False,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100


Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
