# Курсовой проект для курса "Python для Data Science. Продолжение."
#### Исполнитель Васильев А.

### Задача
Требуется, на основании имеющихся данных о клиентах банка, построить модель, используя обучающий датасет, для прогнозирования невыполнения долговых обязательств по текущему кредиту. Выполнить прогноз для примеров из тестового датасета.

In [1]:
import numpy as np
import pandas as pd
import pickle
import random

from scipy.stats import shapiro
from scipy.stats import probplot
from scipy.stats import ttest_ind, mannwhitneyu
from scipy.stats import chi2_contingency
from statsmodels.stats.weightstats import zconfint

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_val_score, learning_curve
from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import lightgbm as lgbm
import catboost as catb

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
pd.options.display.max_columns = 100

import warnings
warnings.filterwarnings('ignore')

In [3]:
def get_classification_report(y_train_true, y_train_pred, y_valid_true, y_valid_pred):
    """Print classification reports for the train and validation splits.

    After both reports, a confusion matrix (cross-tabulation of true vs.
    predicted labels) is printed for the validation split only.

    Args:
        y_train_true: True labels of the training split.
        y_train_pred: Predicted labels of the training split.
        y_valid_true: True labels of the validation split.
        y_valid_pred: Predicted labels of the validation split.
    """
    splits = (
        ('TRAIN', y_train_true, y_train_pred),
        ('VALID', y_valid_true, y_valid_pred),
    )
    for title, y_true, y_pred in splits:
        print(f'{title}\n\n{classification_report(y_true, y_pred)}')

    print('CONFUSION MATRIX\n')
    confusion = pd.crosstab(y_valid_true, y_valid_pred)
    print(confusion)

In [4]:
def balance_df_by_target(df, target_name):
    """Oversample the minority class of ``target_name`` by duplicating its rows.

    The minority-class rows are appended ``floor(major / minor) - 1`` times,
    so the minority count becomes the largest multiple of its original size
    that does not exceed the majority count. The result is shuffled.

    Args:
        df: DataFrame that contains the target column.
        target_name: Name of the target column.

    Returns:
        A shuffled DataFrame. When at least one copy was appended the index
        is rebuilt (``ignore_index=True``) before shuffling.
    """
    target_counts = df[target_name].value_counts()

    # idxmax/idxmin return class *labels*. argmax/argmin return integer
    # positions, which match the labels only by coincidence.
    major_class_name = target_counts.idxmax()
    minor_class_name = target_counts.idxmin()

    disbalance_coeff = int(target_counts.loc[major_class_name] / target_counts.loc[minor_class_name]) - 1

    minor_rows = df[df[target_name] == minor_class_name]
    # Sampling len(minor_rows) rows without replacement yields every minority
    # row exactly once, in random order.
    copies = [minor_rows.sample(len(minor_rows)) for _ in range(disbalance_coeff)]

    if copies:
        # DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
        df = pd.concat([df] + copies, ignore_index=True)

    return df.sample(frac=1)

In [5]:
def get_feature_importances(feature_names, feature_importances, get_top=None):
    """Return feature names ordered from most to least important.

    Args:
        feature_names: Feature names, aligned with ``feature_importances``.
        feature_importances: Importance value for each feature.
        get_top: Number of leading features to return. ``None`` returns the
            whole ranking.

    Returns:
        A list of feature names sorted by descending importance.
    """
    ranking = pd.DataFrame({'feature': feature_names, 'importance': feature_importances})
    ranking = ranking.sort_values('importance', ascending=False)

    # Slicing with None keeps every row, so the default call returns the full
    # ranking rather than an implicit None.
    return ranking['feature'][:get_top].tolist()

**Описание датасета**

* **Home Ownership** - домовладение
* **Annual Income** - годовой доход
* **Years in current job** - количество лет на текущем месте работы
* **Tax Liens** - налоговые обременения (залог имущества в пользу налоговых органов за неуплату налогов)
* **Number of Open Accounts** - количество открытых счетов
* **Years of Credit History** - количество лет кредитной истории
* **Maximum Open Credit** - наибольший открытый кредит
* **Number of Credit Problems** - количество проблем с кредитом
* **Months since last delinquent** - количество месяцев с последней просрочки платежа
* **Bankruptcies** - банкротства
* **Purpose** - цель кредита
* **Term** - срок кредита
* **Current Loan Amount** - текущая сумма кредита
* **Current Credit Balance** - текущий кредитный баланс
* **Monthly Debt** - ежемесячный долг
* **Credit Score** - оценка благонадежности клиента (скоринговый балл, полученный из другого источника)
* **Credit Default** - факт невыполнения кредитных обязательств (0 - погашен вовремя, 1 - просрочка)

In [6]:
TRAIN_DATASET_PATH = 'datasets/course_project_train.csv'
TEST_DATASET_PATH = 'datasets/course_project_test.csv'

PREDICTED_CREDIT_DEFAULT_PATH  = 'AVasilev_predictions.csv'

#### Обзор обучающего датасета

In [7]:
df_train = pd.read_csv(TRAIN_DATASET_PATH)
df_train.shape

(7500, 17)

In [8]:
df_train.head()

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
0,Own Home,482087.0,,0.0,11.0,26.3,685960.0,1.0,,1.0,debt consolidation,Short Term,99999999.0,47386.0,7914.0,749.0,0
1,Own Home,1025487.0,10+ years,0.0,15.0,15.3,1181730.0,0.0,,0.0,debt consolidation,Long Term,264968.0,394972.0,18373.0,737.0,1
2,Home Mortgage,751412.0,8 years,0.0,11.0,35.0,1182434.0,0.0,,0.0,debt consolidation,Short Term,99999999.0,308389.0,13651.0,742.0,0
3,Own Home,805068.0,6 years,0.0,8.0,22.5,147400.0,1.0,,1.0,debt consolidation,Short Term,121396.0,95855.0,11338.0,694.0,0
4,Rent,776264.0,8 years,0.0,13.0,13.6,385836.0,1.0,,0.0,debt consolidation,Short Term,125840.0,93309.0,7180.0,719.0,0


In [9]:
df_train['Credit Default'].value_counts(normalize=True)

0    0.718267
1    0.281733
Name: Credit Default, dtype: float64

In [10]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Home Ownership                7500 non-null   object 
 1   Annual Income                 5943 non-null   float64
 2   Years in current job          7129 non-null   object 
 3   Tax Liens                     7500 non-null   float64
 4   Number of Open Accounts       7500 non-null   float64
 5   Years of Credit History       7500 non-null   float64
 6   Maximum Open Credit           7500 non-null   float64
 7   Number of Credit Problems     7500 non-null   float64
 8   Months since last delinquent  3419 non-null   float64
 9   Bankruptcies                  7486 non-null   float64
 10  Purpose                       7500 non-null   object 
 11  Term                          7500 non-null   object 
 12  Current Loan Amount           7500 non-null   float64
 13  Cur

In [11]:
print('Количество не заполненных значений:')
df_train.isnull().sum()

Количество не заполненных значений:


Home Ownership                     0
Annual Income                   1557
Years in current job             371
Tax Liens                          0
Number of Open Accounts            0
Years of Credit History            0
Maximum Open Credit                0
Number of Credit Problems          0
Months since last delinquent    4081
Bankruptcies                      14
Purpose                            0
Term                               0
Current Loan Amount                0
Current Credit Balance             0
Monthly Debt                       0
Credit Score                    1557
Credit Default                     0
dtype: int64

### Обработка выбросов

**Обучающий датасет**

In [12]:
df_train.describe()

Unnamed: 0,Annual Income,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
count,5943.0,7500.0,7500.0,7500.0,7500.0,7500.0,3419.0,7486.0,7500.0,7500.0,7500.0,5943.0,7500.0
mean,1366392.0,0.030133,11.130933,18.317467,945153.7,0.17,34.6926,0.117152,11873180.0,289833.2,18314.454133,1151.087498,0.281733
std,845339.2,0.271604,4.908924,7.041946,16026220.0,0.498598,21.688806,0.347192,31926120.0,317871.4,11926.764673,1604.451418,0.449874
min,164597.0,0.0,2.0,4.0,0.0,0.0,0.0,0.0,11242.0,0.0,0.0,585.0,0.0
25%,844341.0,0.0,8.0,13.5,279229.5,0.0,16.0,0.0,180169.0,114256.5,10067.5,711.0,0.0
50%,1168386.0,0.0,10.0,17.0,478159.0,0.0,32.0,0.0,309573.0,209323.0,16076.5,731.0,0.0
75%,1640137.0,0.0,14.0,21.8,793501.5,0.0,50.0,0.0,519882.0,360406.2,23818.0,743.0,1.0
max,10149340.0,7.0,43.0,57.7,1304726000.0,7.0,118.0,4.0,100000000.0,6506797.0,136679.0,7510.0,1.0


*Maximum Open Credit*

In [13]:
median_open_credit = df_train['Maximum Open Credit'].median()
df_train.loc[df_train['Maximum Open Credit'] > 1e+07, 'Maximum Open Credit'] = median_open_credit

*Current Loan Amount*

In [14]:
median_loan_ammount = df_train['Current Loan Amount'].median()
df_train.loc[df_train['Current Loan Amount'] > 1e+06, 'Current Loan Amount'] = median_loan_ammount

*Credit Score*

In [15]:
df_train.loc[df_train['Credit Score'] > 751, 'Credit Score'] = df_train.loc[df_train['Credit Score'] > 751, 'Credit Score'] / 10

### Обработка пропусков обучающего датасета

##### Annual Income

In [16]:
# Impute a missing 'Annual Income' with the median income of clients whose
# 'Monthly Debt' is within +/-20 % of this row's monthly debt.
# The median is taken over 'Annual Income' (not 'Monthly Debt') and only over
# rows whose income is actually known, so imputed values do not feed back.
known_income = df_train.loc[df_train['Annual Income'].notnull(), ['Monthly Debt', 'Annual Income']]

for i in df_train.index[df_train['Annual Income'].isnull()]:
    monthly_debt = df_train.at[i, 'Monthly Debt']
    similar_income = known_income.loc[(known_income['Monthly Debt'] < monthly_debt * 1.2) &
                                      (known_income['Monthly Debt'] > monthly_debt * 0.8), 'Annual Income']
    # The median of an empty selection is NaN; such rows are filled with the
    # overall median in the next cell.
    df_train.at[i, 'Annual Income'] = similar_income.median()

In [17]:
med_anual_income = df_train['Annual Income'].median()
df_train.loc[df_train['Annual Income'].isnull(), 'Annual Income'] = med_anual_income

##### Years in current job

In [18]:
job_years = df_train['Years in current job'].mode()[0]
df_train.loc[df_train['Years in current job'].isnull(), 'Years in current job'] = job_years

##### Credit Score

In [19]:
# Median 'Credit Score' for every distinct 'Years of Credit History' value.
credit_score = {
    history_years: df_train.loc[df_train['Years of Credit History'] == history_years, 'Credit Score'].median()
    for history_years in df_train['Years of Credit History'].unique()
}

In [20]:
# Fill each missing 'Credit Score' with the median score of clients that have
# the same 'Years of Credit History'.
rows_missing_score = df_train.index[df_train['Credit Score'].isnull()]
for row in rows_missing_score:
    history_years = df_train.at[row, 'Years of Credit History']
    df_train.at[row, 'Credit Score'] = credit_score[history_years]

In [21]:
# Scores that are still missing (the per-history-length median was itself NaN)
# get the overall median.
median_credit_score = df_train['Credit Score'].median()
df_train.loc[df_train['Credit Score'].isnull(), 'Credit Score'] = median_credit_score

max_credit_score = df_train['Credit Score'].max()
min_credit_score = df_train['Credit Score'].min()

# The former min/max fallback fills were removed: they ran after the median
# fill above, when no missing values are left, so they never changed anything.
# The first of them was also conditioned on the target column.

# Recomputed on the fully imputed column; both values are reused later to fill
# gaps in the test dataset.
credit_score = {}
for value in df_train['Years of Credit History'].unique():
    credit_score[value] = df_train.loc[df_train['Years of Credit History'] == value, 'Credit Score'].median()

median_credit_score = df_train['Credit Score'].median()

##### Bankruptcies

In [22]:
# Median 'Bankruptcies' for every distinct 'Credit Score' value.
bankruptcies = {
    score: df_train.loc[df_train['Credit Score'] == score, 'Bankruptcies'].median()
    for score in df_train['Credit Score'].unique()
}

In [23]:
# Fill each missing 'Bankruptcies' with the median value of clients that have
# the same 'Credit Score'.
rows_missing_bankruptcies = df_train.index[df_train['Bankruptcies'].isnull()]
for row in rows_missing_bankruptcies:
    row_score = df_train.at[row, 'Credit Score']
    df_train.at[row, 'Bankruptcies'] = bankruptcies[row_score]

##### Months since last delinquent

Т.к. пропущенных значений больше половины, удаляю данный признак из датасета

In [24]:
df_train = df_train.drop(columns='Months since last delinquent')

In [25]:
df_train.isnull().sum()

Home Ownership               0
Annual Income                0
Years in current job         0
Tax Liens                    0
Number of Open Accounts      0
Years of Credit History      0
Maximum Open Credit          0
Number of Credit Problems    0
Bankruptcies                 0
Purpose                      0
Term                         0
Current Loan Amount          0
Current Credit Balance       0
Monthly Debt                 0
Credit Score                 0
Credit Default               0
dtype: int64

### Создаю дополнительные признаки

**Is Loan Big - равен 1, если текущая сумма кредита больше медианного значения**

In [26]:
# Flag loans whose current amount is strictly above the training median.
med_cur_ammount = df_train['Current Loan Amount'].median()
above_median = df_train['Current Loan Amount'] > med_cur_ammount
is_big_loan = above_median.astype(int).tolist()
df_train['Is Loan Big'] = is_big_loan

**Mean Home Ownership Income - средний доход домовладений по типам**

In [27]:
home_own_types = df_train['Home Ownership'].unique()

In [28]:
mean_home_own_income_by_type = [df_train.loc[df_train['Home Ownership'] == el, 'Annual Income'].mean() 
                                for el in home_own_types]

In [29]:
df_mean_income = pd.DataFrame([home_own_types, mean_home_own_income_by_type], index=['Home Ownership', 'Mean Annual Income'])
df_mean_income = df_mean_income.T

In [30]:
mean_home_own_income_temp = [df_mean_income.loc[df_mean_income['Home Ownership'] == el, 'Mean Annual Income'].tolist()
                             for el in df_train['Home Ownership']]

mean_home_own_income = [el[0] for el in mean_home_own_income_temp]

In [31]:
df_train['Mean Home Ownership Income'] = mean_home_own_income

**Debt-to-income ratio - Отношение ежемесячного долга домовладения к его ежемесячному доходу**

In [32]:
df_train['Debt-to-income ratio'] = round(df_train['Monthly Debt'] / (df_train['Annual Income'] / 12), 3)

**Is Payment High - Равен 1, если Debt-to-income ratio более 0.43.**  
Считается, что при более высокой доле обязательств по кредиту относительно общего дохода, становится затруднительно отвечать по своим финансовым обязательствам. Данное обстоятельство ведет к повышению риска просрочки платежей по ним.

In [33]:
is_payment_high = [1 if el > 0.43 else 0 for el in df_train['Debt-to-income ratio']]
is_payment_high[:5]

[0, 0, 0, 0, 0]

In [34]:
df_train['Is Payment High'] = is_payment_high

### Обзор категориальных переменных

#### Обучающий датасет

In [35]:
for cat_colname in df_train.select_dtypes(include='object').columns:
    print(str(cat_colname) + '\n\n' + str(df_train[cat_colname].value_counts()) + '\n' + '*' * 100 + '\n')

Home Ownership

Home Mortgage    3637
Rent             3204
Own Home          647
Have Mortgage      12
Name: Home Ownership, dtype: int64
****************************************************************************************************

Years in current job

10+ years    2703
2 years       705
3 years       620
< 1 year      563
5 years       516
1 year        504
4 years       469
6 years       426
7 years       396
8 years       339
9 years       259
Name: Years in current job, dtype: int64
****************************************************************************************************

Purpose

debt consolidation      5944
other                    665
home improvements        412
business loan            129
buy a car                 96
medical bills             71
major purchase            40
take a trip               37
buy house                 34
small business            26
wedding                   15
moving                    11
educational expenses      10
vacation  

In [36]:
df_train['Term Binary'] = df_train['Term'].map({'Short Term':'0', 'Long Term':'1'}).astype(int)

In [37]:
# Encode 'Home Ownership' as an integer code (0-3); the original column is kept.
df_train['Home Ownership digit'] = df_train['Home Ownership'].map({'Home Mortgage':'0', 'Rent':'1',\
                                    'Own Home': '2', 'Have Mortgage': '3'}).astype(int)

# Collapse 'Years in current job' into six ordered buckets:
# 1: up to 1 year, 2: 2-3 years, 3: 4-5 years, 4: 6-7 years, 5: 8-9 years, 6: 10+ years.
# NOTE(review): the values are written in place with .loc, so the column presumably
# keeps its object dtype - the later pd.get_dummies call appears to rely on that
# to one-hot encode this column; confirm before changing how the mapping is done.
df_train.loc[(df_train['Years in current job'] == '< 1 year') | (df_train['Years in current job'] == '1 year'), 'Years in current job'] = 1
df_train.loc[(df_train['Years in current job'] == '2 years') | (df_train['Years in current job'] == '3 years'), 'Years in current job'] = 2
df_train.loc[(df_train['Years in current job'] == '4 years') | (df_train['Years in current job'] == '5 years'), 'Years in current job'] = 3
df_train.loc[(df_train['Years in current job'] == '6 years') | (df_train['Years in current job'] == '7 years'), 'Years in current job'] = 4
df_train.loc[(df_train['Years in current job'] == '8 years') | (df_train['Years in current job'] == '9 years'), 'Years in current job'] = 5
df_train.loc[df_train['Years in current job'] == '10+ years', 'Years in current job'] = 6

In [38]:
obj_columns = ['Purpose', 'Years in current job']

In [39]:
train_dummies = pd.get_dummies(df_train[obj_columns])
df_train = pd.concat([df_train, train_dummies], axis=1)

In [40]:
df_train.shape

(7500, 48)

### Анализ целевой переменной

In [None]:
df_base = pd.read_csv(TRAIN_DATASET_PATH)

In [None]:
df_base.head()

In [None]:
TARGET_NAME = 'Credit Default'
BASE_FEATURE_NAMES = df_base.columns.drop(['Months since last delinquent', 'Credit Default']).tolist()
BASE_FEATURE_NAMES.append('Is Loan Big')
BASE_FEATURE_NAMES.append('Mean Home Ownership Income')
BASE_FEATURE_NAMES.append('Debt-to-income ratio')
BASE_FEATURE_NAMES.append('Is Payment High')
NEW_FEATURE_NAMES = df_train.columns.drop([TARGET_NAME] + BASE_FEATURE_NAMES).tolist()

**Обзор распределения**

In [None]:
df_train[TARGET_NAME].value_counts()

In [None]:
plt.figure(figsize=(8, 5))

sns.countplot(x=TARGET_NAME, data=df_train)

plt.title('Target variable distribution')
plt.show()

**Корреляция с базовыми признаками**

In [None]:
# Correlation is only defined for numeric columns. Selecting them explicitly
# keeps this cell working on pandas >= 2.0, where DataFrame.corr() no longer
# drops string columns silently.
numeric_with_target = df_train[BASE_FEATURE_NAMES + [TARGET_NAME]].select_dtypes(include='number')

# Correlation of every numeric base feature with the target (the target's
# correlation with itself is dropped).
corr_with_target = numeric_with_target.corr()[TARGET_NAME].drop(TARGET_NAME).sort_values(ascending=False)

plt.figure(figsize=(10, 8))

sns.barplot(x=corr_with_target.values, y=corr_with_target.index)

plt.title('Correlation with target variable')
plt.show()

На основании этого графика, принял решение произвести оценку признаков с корреляцией более 0.01 по модулю.

**Оценка признака "Credit Score" в разрезе целевой переменной**

In [None]:
credit_score_with_target_s = df_train[['Credit Score', TARGET_NAME]].sample(2000)
credit_score_s = credit_score_with_target_s['Credit Score']
credit_score_target_0 = credit_score_s[credit_score_with_target_s[TARGET_NAME] == 0]
credit_score_target_1 = credit_score_s[credit_score_with_target_s[TARGET_NAME] == 1]

plt.figure(figsize=(10, 5))

sns.kdeplot(credit_score_target_0, shade=True, label='No default', color='g')
sns.kdeplot(credit_score_target_1, shade=True, label='Default', color='r')

plt.xlabel('Credit Score')
plt.title('Credit Score grouped by target variable')
plt.show()

**Наблюдение**  
Есть вероятность, что между признаком "Credit Score" и целевой переменной существует некоторая функциональная зависимость.

**Гипотеза** 
* Нулевая гипотеза: средние значения в двух независимых выборках равны
* Альтернативная гипотеза: средние значения в двух независимых выборках различаются
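* Критерий: критерий Стьюдента (t-тест), если распределение признака нормально; в противном случае — непараметрический критерий Манна-Уитни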
* Уровень значимости $\normalsize \alpha$: 0.05
* Критическая область: двухсторонняя

*Проверка распределения признака на "нормальность" с помощью критерия Шапиро-Уилка*

In [None]:
shapiro(credit_score_s)

Так как значение p-value равно 0, можно сделать вывод, что гипотеза о "нормальности" отвергается.

*Для достоверности провожу визуальную оценку распределения признака, а также строю QQ-график*

In [None]:
plt.figure(figsize=(12, 6))

ax1 = plt.subplot(121)
ax1.set_xlabel('Credit Score')
ax1.set_ylabel('Count')
ax1.set_title('Credit Score distribution')
credit_score_s.hist()

plt.subplot(122)
probplot(credit_score_s, dist='norm', plot=plt)

plt.show()

*Оцениваю эквивалентность мат. ожиданий в исследуемых группах с помощью критерия Манна-Уитни.*

In [None]:
mannwhitneyu(credit_score_target_0, credit_score_target_1)

Согласно значению p-value, гипотеза о равенстве мат. ожиданий отвергается.

*Строю доверительные интервалы для средних значений каждой из двух групп и сравниваю их для дополнительной проверки.*

In [None]:
plt.figure(figsize=(8, 5))

sns.pointplot(x=TARGET_NAME, y='Credit Score', data=credit_score_with_target_s, capsize=.1)

plt.title('Confidence intervals (95 %) for Credit Score')
plt.show()

График показывает, что интервалы не пересекаются; это подтверждает результаты, полученные с помощью критерия Манна-Уитни.

Полученные результаты означают, что группы, из которых взяты данные выборки, на уровне доверия 95% имеют различные распределения, и этот признак может быть полезен для определения значения целевой переменной.

**Оценка признака "Annual Income" в разрезе целевой переменной**

In [None]:
annual_income_with_target_s = df_train[['Annual Income', TARGET_NAME]].sample(1000)
annual_income_s = annual_income_with_target_s['Annual Income']
annual_income_target_0 = annual_income_s[annual_income_with_target_s[TARGET_NAME] == 0]
annual_income_target_1 = annual_income_s[annual_income_with_target_s[TARGET_NAME] == 1]

**Наблюдение**  
Есть вероятность, что между признаком "Annual Income" и целевой переменной существует некоторая функциональная зависимость.

**Гипотеза** 
* Нулевая гипотеза: средние значения в двух независимых выборках равны
* Альтернативная гипотеза: средние значения в двух независимых выборках различаются
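* Критерий: критерий Стьюдента (t-тест), если распределение признака нормально; в противном случае — непараметрический критерий Манна-Уитни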
* Уровень значимости $\normalsize \alpha$: 0.05
* Критическая область: двухсторонняя

*Проверка распределения признака на "нормальность" с помощью критерия Шапиро-Уилка*

In [None]:
shapiro(annual_income_s)

Так как значение p-value сильно меньше 0.05, можно сделать вывод, что гипотеза о "нормальности" отвергается.

*Оцениваю эквивалентность мат. ожиданий в исследуемых группах с помощью критерия Манна-Уитни.*

In [None]:
mannwhitneyu(annual_income_target_0, annual_income_target_1)

Согласно значению p-value, гипотеза о равенстве мат. ожиданий отвергается.  
Полученные результаты означают, что группы, из которых взяты данные выборки, на уровне доверия 95% имеют различные распределения, и этот признак может быть полезен для определения значения целевой переменной.

### Анализ признакового пространства

**Матрица корреляций**

In [None]:
plt.figure(figsize = (12,8))

sns.set(font_scale=0.5)
# Only numeric columns can be correlated. Selecting them explicitly keeps this
# cell working on pandas >= 2.0, where DataFrame.corr() raises on string columns.
numeric_base_features = df_train[BASE_FEATURE_NAMES].select_dtypes(include='number')
sns.heatmap(numeric_base_features.corr().round(3), annot=True, linewidths=.5, cmap='GnBu')

plt.title('Correlation matrix')
plt.show()

### Отбор признаков

In [None]:
NUM_FEATURE_NAMES = ['Annual Income', 'Years in current job', 'Tax Liens', 'Number of Open Accounts', 'Years of Credit History', 'Maximum Open Credit',
                     'Number of Credit Problems', 'Bankruptcies', 'Current Loan Amount', 'Current Credit Balance', 
                     'Monthly Debt', 'Credit Score', 'Mean Home Ownership Income', 'Debt-to-income ratio']

CAT_FEATURE_NAMES = ['Home Ownership', 'Purpose', 'Term']

NEW_BINARY_FEATURES = ['Is Loan Big', 'Is Payment High']

SELECTED_FEATURE_NAMES = NUM_FEATURE_NAMES + NEW_BINARY_FEATURES + NEW_FEATURE_NAMES

### Приведение типов для модели CatBoost

In [None]:
for colname in CAT_FEATURE_NAMES:
    df_train[colname] = pd.Categorical(df_train[colname])
    
df_train[CAT_FEATURE_NAMES].dtypes

### Нормализация данных

In [None]:
# Standardize the numeric feature columns; the fitted scaler is reused later
# for the test dataset.
scaler = StandardScaler()

df_norm = df_train.copy()
# NOTE(review): the scaler is fitted on every row of df_train, i.e. before the
# train/valid split performed further down, so validation rows contribute to
# the scaling statistics.
df_norm[NUM_FEATURE_NAMES] = scaler.fit_transform(df_norm[NUM_FEATURE_NAMES])

# From here on the NUM_FEATURE_NAMES columns of df_train hold scaled values,
# not the raw ones.
df_train = df_norm.copy()
df_train.head()

### Разбиение на train и valid

In [None]:
X = df_train[SELECTED_FEATURE_NAMES]
y = df_train[TARGET_NAME]

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, shuffle=True, test_size=0.3, random_state=27)

### Балансировка целевой переменной

In [None]:
df_for_balancing = pd.concat([X_train, y_train], axis=1)
df_balanced = balance_df_by_target(df_for_balancing, TARGET_NAME)
    
df_balanced[TARGET_NAME].value_counts()

In [None]:
X_train = df_balanced.drop(columns=TARGET_NAME)
y_train = df_balanced[TARGET_NAME]

### Обучение и оценка модели на валидационных данных

### Обучение модели

In [None]:
%%time

final_model = catb.CatBoostClassifier(l2_leaf_reg=10, silent=True, random_state=27)
final_model.fit(X_train, y_train)

y_train_pred = final_model.predict(X_train)
y_valid_pred = final_model.predict(X_valid)
y_valid_pred_probs = final_model.predict_proba(X_valid)

get_classification_report(y_train, y_train_pred, y_valid, y_valid_pred)

### Обучение финальной модели

In [None]:
important_features_top = get_feature_importances(X_train.columns, final_model.feature_importances_, get_top=15)

**Для обучения финальной модели использую только 15 самых важных признаков датасета**

In [None]:
%%time

final_model = catb.CatBoostClassifier(l2_leaf_reg=30, silent=True, random_state=27)
final_model.fit(X_train[important_features_top], y_train)

y_train_pred = final_model.predict(X_train[important_features_top])
y_valid_pred = final_model.predict(X_valid[important_features_top])
y_valid_pred_probs = final_model.predict_proba(X_valid[important_features_top])

get_classification_report(y_train, y_train_pred, y_valid, y_valid_pred)

### Подготовка тестового датасета

#### Обзор тестового датасета

In [None]:
df_test = pd.read_csv(TEST_DATASET_PATH)
df_test.shape

In [None]:
df_test.head()

In [None]:
df_test.info()

In [None]:
df_test.isnull().sum()

**Обработка выбросов. Тестовый датасет**

In [None]:
df_test.describe()

*Maximum Open Credit*

In [None]:
df_test.loc[df_test['Maximum Open Credit'] > 1e+07, 'Maximum Open Credit'] = median_open_credit

*Current Loan Amount*

In [None]:
df_test.loc[df_test['Current Loan Amount'] > 1e+06, 'Current Loan Amount'] = median_loan_ammount

*Credit Score*

In [None]:
df_test.loc[df_test['Credit Score'] > 751, 'Credit Score'] = df_test.loc[df_test['Credit Score'] > 751, 'Credit Score'] / 10

### Обработка пропусков тестового датасета

##### Annual Income

In [None]:
# Impute a missing test 'Annual Income' with the median income of training
# clients whose 'Monthly Debt' is within +/-20 % of this row's monthly debt.
# The reference values come from df_base (the raw training CSV) because the
# numeric columns of df_train have already been standardized at this point.
train_known_income = df_base.loc[df_base['Annual Income'].notnull(), ['Monthly Debt', 'Annual Income']]

for i in df_test.index[df_test['Annual Income'].isnull()]:
    # Use this row's own monthly debt, not a value left over from an earlier loop.
    test_monthly_debt = df_test.at[i, 'Monthly Debt']
    similar_income = train_known_income.loc[(train_known_income['Monthly Debt'] < test_monthly_debt * 1.2) &
                                            (train_known_income['Monthly Debt'] > test_monthly_debt * 0.8),
                                            'Annual Income']
    # The median of an empty selection is NaN; such rows are filled with the
    # training median in the next cell.
    df_test.at[i, 'Annual Income'] = similar_income.median()

In [None]:
df_test.loc[df_test['Annual Income'].isnull(), 'Annual Income'] = med_anual_income

##### Years in current job

In [None]:
df_test.loc[df_test['Years in current job'].isnull(), 'Years in current job'] = job_years

##### Credit Score

In [None]:
# Fill each missing test 'Credit Score' from the training lookup keyed by
# 'Years of Credit History'. Not every history length seen in the test set is
# present in the lookup, so unknown keys fall back to the training median.
# The fallback is applied per row: with a try/except around the whole loop the
# first unknown key would abort the lookups for all remaining rows.
for i in df_test.index[df_test['Credit Score'].isnull()]:
    cred_history_years_1 = df_test.at[i, 'Years of Credit History']
    df_test.at[i, 'Credit Score'] = credit_score.get(cred_history_years_1, median_credit_score)

In [None]:
df_test.loc[df_test['Credit Score'].isnull(), 'Credit Score'] = median_credit_score

##### Bankruptcies

In [None]:
# Fill each missing test 'Bankruptcies' from the training lookup keyed by
# 'Credit Score'. Scores that never occurred in training fall back to 0.
# The fallback is applied per row, so one unknown score no longer stops the
# lookups for the remaining rows.
for i in df_test.index[df_test['Bankruptcies'].isnull()]:
    cred_score = df_test.at[i, 'Credit Score']
    df_test.at[i, 'Bankruptcies'] = bankruptcies.get(cred_score, 0)

In [None]:
df_test.loc[df_test['Bankruptcies'].isnull(), 'Bankruptcies'] = 0

##### Months since last delinquent

Т.к. пропущенных значений больше половины, удаляю данный признак из датасета

In [None]:
df_test = df_test.drop(columns='Months since last delinquent')

In [None]:
df_test.isnull().sum()

### Создаю дополнительные признаки

**Is Loan Big - равен 1, если текущая сумма кредита больше медианного значения**

In [None]:
is_big_loan_test = [1 if el > med_cur_ammount else 0 for el in df_test['Current Loan Amount']]

In [None]:
df_test['Is Loan Big'] = is_big_loan_test

**Mean Home Ownership Income - средний доход домовладений по типам**

In [None]:
# Mean 'Annual Income' of the test rows for each home-ownership type.
# The row mask must be built from df_test: a mask built from df_train describes
# training rows and would select unrelated test rows.
mean_home_own_income_by_type = [df_test.loc[df_test['Home Ownership'] == el, 'Annual Income'].mean()
                                for el in home_own_types]

In [None]:
df_mean_income = pd.DataFrame([home_own_types, mean_home_own_income_by_type], index=['Home Ownership', 'Mean Annual Income'])
df_mean_income = df_mean_income.T

In [None]:
mean_home_own_income_temp_test = [df_mean_income.loc[df_mean_income['Home Ownership'] == el, 
                                'Mean Annual Income'].tolist() for el in df_test['Home Ownership']]

mean_home_own_income_test = [el[0] for el in mean_home_own_income_temp_test]

In [None]:
df_test['Mean Home Ownership Income'] = mean_home_own_income_test

**Debt-to-income ratio - Отношение ежемесячного долга домовладения к его ежемесячному доходу**

In [None]:
df_test['Debt-to-income ratio'] = round(df_test['Monthly Debt'] / (df_test['Annual Income'] / 12), 3)

**Is Payment High - Равен 1, если Debt-to-income ratio более 0.43.**  
Считается, что при более высокой доле обязательств по кредиту относительно общего дохода, становится затруднительно отвечать по своим финансовым обязательствам. Данное обстоятельство ведет к повышению риска просрочки платежей по ним.

In [None]:
is_payment_high_test = [1 if el > 0.43 else 0 for el in df_test['Debt-to-income ratio']]

In [None]:
df_test['Is Payment High'] = is_payment_high_test

### Обзор категориальных переменных. Тестовый датасет

In [None]:
for cat_colname in df_test.select_dtypes(include='object').columns:
    print(str(cat_colname) + '\n\n' + str(df_test[cat_colname].value_counts()) + '\n' + '*' * 100 + '\n')

In [None]:
df_test['Term Binary'] = df_test['Term'].map({'Short Term':'0', 'Long Term':'1'}).astype(int)

In [None]:
df_test['Home Ownership digit'] = df_test['Home Ownership'].map({'Home Mortgage':'0', 'Rent':'1',\
                                    'Own Home': '2', 'Have Mortgage': '3'}).astype(int)

In [None]:
df_test.loc[(df_test['Years in current job'] == '< 1 year') | (df_test['Years in current job'] == '1 year'), 'Years in current job'] = 1
df_test.loc[(df_test['Years in current job'] == '2 years') | (df_test['Years in current job'] == '3 years'), 'Years in current job'] = 2
df_test.loc[(df_test['Years in current job'] == '4 years') | (df_test['Years in current job'] == '5 years'), 'Years in current job'] = 3
df_test.loc[(df_test['Years in current job'] == '6 years') | (df_test['Years in current job'] == '7 years'), 'Years in current job'] = 4
df_test.loc[(df_test['Years in current job'] == '8 years') | (df_test['Years in current job'] == '9 years'), 'Years in current job'] = 5
df_test.loc[df_test['Years in current job'] == '10+ years', 'Years in current job'] = 6

In [None]:
test_dummies = pd.get_dummies(df_test[obj_columns])
df_test = pd.concat([df_test, test_dummies], axis=1)

In [None]:
df_test['Purpose_renewable energy'] = 0

### Нормализация данных

In [None]:
# Scale the *test* features with the scaler fitted on the training data.
# Copying df_train here would make the model predict on re-scaled training
# rows instead of the test rows.
df_norm_test = df_test.copy()
df_norm_test[NUM_FEATURE_NAMES] = scaler.transform(df_norm_test[NUM_FEATURE_NAMES])

df_test = df_norm_test.copy()

### Делаю предсказание на тестовом датасете

In [None]:
y_test_pred = final_model.predict(df_test[important_features_top])

In [None]:
df_test_preds = pd.DataFrame(data=y_test_pred, columns=[TARGET_NAME])
df_test_preds.shape

In [None]:
df_test_preds.head(10)

**Сохраняю получившийся датасет в файл .csv**

In [None]:
df_test_preds.to_csv(PREDICTED_CREDIT_DEFAULT_PATH, index=False, encoding='utf-8')