In [1]:
# Загрузка библиотек

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings("ignore")

In [24]:
# Загрузка файла с данными

data = pd.read_csv('credit_scoring_dataset.csv', index_col = 0)

# EDA

In [25]:
# Обзорная таблица
data.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [27]:
# Количество строк и переменных
data.shape

(150000, 11)

In [28]:
# Типы переменных - ни одной текстовой
data.dtypes

SeriousDlqin2yrs                          int64
RevolvingUtilizationOfUnsecuredLines    float64
age                                       int64
NumberOfTime30-59DaysPastDueNotWorse      int64
DebtRatio                               float64
MonthlyIncome                           float64
NumberOfOpenCreditLinesAndLoans           int64
NumberOfTimes90DaysLate                   int64
NumberRealEstateLoansOrLines              int64
NumberOfTime60-89DaysPastDueNotWorse      int64
NumberOfDependents                      float64
dtype: object

In [29]:
# Проверка пропущенных значений. Обнаружены у переменных MonthlyIncome и NumberOfDependents
data.isna().sum()

SeriousDlqin2yrs                            0
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           29731
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       3924
dtype: int64

In [30]:
# Заполнение пропущенных значений медианной
data['MonthlyIncome'].fillna(data['MonthlyIncome'].median(), inplace = True)
data['NumberOfDependents'].fillna(data['NumberOfDependents'].median(), inplace = True)

In [31]:
# Повторная проверка пропущенных значений
data.isna().sum()

SeriousDlqin2yrs                        0
RevolvingUtilizationOfUnsecuredLines    0
age                                     0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberOfTimes90DaysLate                 0
NumberRealEstateLoansOrLines            0
NumberOfTime60-89DaysPastDueNotWorse    0
NumberOfDependents                      0
dtype: int64

In [38]:
# Описательная статистика данных. Настораживает большое стандартное отклонение в переменной "Общий баланс средств" и 
# большое максимальное значение в переменных количества дней просрочки и количества займов
data.describe()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0
mean,0.06684,6.048438,52.295207,0.421033,353.005076,6418.455,8.45276,0.265973,1.01824,0.240387,0.737413
std,0.249746,249.755371,14.771866,4.192781,2037.818523,12890.4,5.145951,4.169304,1.129771,4.155179,1.107021
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.029867,41.0,0.0,0.175074,3903.0,5.0,0.0,0.0,0.0,0.0
50%,0.0,0.154181,52.0,0.0,0.366508,5400.0,8.0,0.0,1.0,0.0,0.0
75%,0.0,0.559046,63.0,0.0,0.868254,7400.0,11.0,0.0,2.0,0.0,1.0
max,1.0,50708.0,109.0,98.0,329664.0,3008750.0,58.0,98.0,54.0,98.0,20.0


In [44]:
# Проверка переменной "Общий баланс средств". Странных выбросов нет. 
# Получается, такова специфика - абсолютное большинство клиентов имеет низкий баланс на счете и малая часть - высокий.
data_sorted = data.sort_values('RevolvingUtilizationOfUnsecuredLines', ascending=False)
data_sorted[:20]

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
85489,0,50708.0,55,0,0.221757,38000.0,7,0,2,0,0.0
31414,0,29110.0,58,0,0.40256,10000.0,7,0,1,0,0.0
16956,0,22198.0,38,0,2312.0,5400.0,5,0,2,0,0.0
149160,0,22000.0,38,0,1.08002,3973.0,7,0,2,0,3.0
149279,0,20514.0,42,0,0.062102,9902.0,1,1,0,0,2.0
117315,0,18300.0,45,0,0.221582,12500.0,5,0,1,0,2.0
21978,0,17441.0,51,1,0.354072,14770.0,14,0,2,0,0.0
124533,0,13930.0,45,0,4902.0,5400.0,4,0,2,0,0.0
72592,0,13498.0,38,0,0.347428,4800.0,6,0,1,0,2.0
71705,0,13400.0,43,0,0.995801,5000.0,11,0,1,0,1.0


In [50]:
# Проверка переменной "Количество просрочек 30-59 дней за последние 2 года". 
# Очень резкий переход с неадекватных значений 96-98 до более адекватных 0-13. 
# При этом почти у всех этих клиентов нет открытых кредитов. Лучше эти строки удалить.
data_sorted = data.sort_values('NumberOfTime30-59DaysPastDueNotWorse', ascending=False)
data_sorted[260:280]

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
98603,0,1.0,52,98,0.0,9016.0,0,98,0,98,2.0
136682,1,1.0,34,98,0.0,3144.0,0,98,0,98,2.0
81460,1,1.0,30,98,0.0,800.0,0,98,0,98,0.0
73213,1,1.0,32,98,0.0,1800.0,0,98,0,98,0.0
94106,1,1.0,34,96,0.0,3500.0,0,96,0,96,1.0
120049,0,1.0,46,96,0.051765,4616.0,0,96,0,96,0.0
84167,1,1.0,29,96,0.0,2800.0,0,96,0,96,2.0
69478,1,1.0,33,96,0.008047,4100.0,0,96,0,96,0.0
41944,1,1.0,26,96,0.0,5400.0,0,96,0,96,0.0
80533,1,0.998833,48,13,0.348064,12083.0,15,0,1,0,3.0


In [56]:
# Проверка переменных количества кредитов. Оказывается, это не выброс - таких значений много.
data_sorted = data.sort_values('NumberOfOpenCreditLinesAndLoans', ascending=False)
data_sorted[:30]

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
30587,0,0.003032,53,0,5.967504,8000.0,58,0,54,0,0.0
137095,1,0.050855,42,0,0.100814,42156.0,57,0,1,0,2.0
22683,0,0.020452,51,0,0.032213,3600.0,57,0,0,0,0.0
107427,0,0.013016,48,0,0.025088,7373.0,56,0,0,0,0.0
51026,0,0.020226,70,0,0.074935,8900.0,56,0,0,0,2.0
66979,0,0.028601,62,0,0.711833,5416.0,54,0,5,0,0.0
54115,0,0.052349,60,0,0.637685,12800.0,54,0,2,0,2.0
105480,0,0.477322,49,0,6.006888,9000.0,54,0,7,0,3.0
16537,0,0.098127,61,0,0.323779,10500.0,54,0,2,0,0.0
112934,0,0.911758,67,0,0.8655,20750.0,53,0,5,0,3.0


In [53]:
# Значит, удаляем только строки с неадекватным значением количества фактов просрочки за последние 2 года
data_altered = data[data['NumberOfTime30-59DaysPastDueNotWorse'] < 15]

In [54]:
# Описательная статистика обновленной таблицы
data_altered.describe()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,149731.0,149731.0,149731.0,149731.0,149731.0,149731.0,149731.0,149731.0,149731.0,149731.0,149731.0
mean,0.065978,6.057508,52.327634,0.245794,353.628957,6423.095,8.467932,0.090456,1.020069,0.064823,0.73815
std,0.248245,249.979529,14.754942,0.69778,2039.594737,12901.08,5.138094,0.485527,1.12996,0.330073,1.107375
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.029771,41.0,0.0,0.175998,3915.0,5.0,0.0,0.0,0.0,0.0
50%,0.0,0.153491,52.0,0.0,0.367119,5400.0,8.0,0.0,1.0,0.0,0.0
75%,0.0,0.555601,63.0,0.0,0.870021,7400.0,11.0,0.0,2.0,0.0,1.0
max,1.0,50708.0,109.0,13.0,329664.0,3008750.0,58.0,17.0,54.0,11.0,20.0


In [57]:
# Корреляционная матрица
# Целевая переменная наиболее зависима от количества фактов просрочки 90+ за последние 2 года 
# И наименее зависима от баланса средств
# Очевидно, что зависимы переменные количества открытых кредитов зависимы друг от друга, как и переменные числа фактов просрочки
corr = data_altered.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
SeriousDlqin2yrs,1.0,-0.00174379,-0.112968,0.27455,-0.00706909,-0.0165306,-0.0242312,0.314535,-0.00395866,0.26813,0.0483181
RevolvingUtilizationOfUnsecuredLines,-0.00174379,1.0,-0.00595472,-0.0028186,0.00395512,0.00650612,-0.0113679,-0.00180749,0.00620652,-0.00244142,0.00118031
age,-0.112968,-0.00595472,1.0,-0.0718673,0.0238598,0.0271794,0.144747,-0.0823839,0.0312583,-0.0698462,-0.217253
NumberOfTime30-59DaysPastDueNotWorse,0.27455,-0.0028186,-0.0718673,1.0,0.00349106,2.65989e-05,0.080069,0.218147,0.0427624,0.305876,0.0656184
DebtRatio,-0.00706909,0.00395512,0.0238598,0.00349106,1.0,-0.0180698,0.0491833,-0.00984797,0.119861,-0.00419532,-0.044615
MonthlyIncome,-0.0165306,0.00650612,0.0271794,2.65989e-05,-0.0180698,1.0,0.0865742,-0.0177579,0.116041,-0.00991151,0.0661921
NumberOfOpenCreditLinesAndLoans,-0.0242312,-0.0113679,0.144747,0.080069,0.0491833,0.0865742,1.0,-0.0938686,0.432664,-0.0219063,0.0731504
NumberOfTimes90DaysLate,0.314535,-0.00180749,-0.0823839,0.218147,-0.00984797,-0.0177579,-0.0938686,1.0,-0.062472,0.294638,0.0313338
NumberRealEstateLoansOrLines,-0.00395866,0.00620652,0.0312583,0.0427624,0.119861,0.116041,0.432664,-0.062472,1.0,-0.0206965,0.128968
NumberOfTime60-89DaysPastDueNotWorse,0.26813,-0.00244142,-0.0698462,0.305876,-0.00419532,-0.00991151,-0.0219063,0.294638,-0.0206965,1.0,0.0376314


In [61]:
# Среднее значение переменной NumberOfTimes90DaysLate в зависимости от значения целевой переменной
data.groupby('SeriousDlqin2yrs')['NumberOfTimes90DaysLate'].mean()

SeriousDlqin2yrs
0    0.135225
1    2.091362
Name: NumberOfTimes90DaysLate, dtype: float64

In [58]:
# Уберем переменную числа открытых кредитов под залог жилья
del data_altered['NumberRealEstateLoansOrLines']

In [59]:
# Итоговая таблица состоит из 10 переменных и 149731 строк
data_altered.describe()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,149731.0,149731.0,149731.0,149731.0,149731.0,149731.0,149731.0,149731.0,149731.0,149731.0
mean,0.065978,6.057508,52.327634,0.245794,353.628957,6423.095,8.467932,0.090456,0.064823,0.73815
std,0.248245,249.979529,14.754942,0.69778,2039.594737,12901.08,5.138094,0.485527,0.330073,1.107375
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.029771,41.0,0.0,0.175998,3915.0,5.0,0.0,0.0,0.0
50%,0.0,0.153491,52.0,0.0,0.367119,5400.0,8.0,0.0,0.0,0.0
75%,0.0,0.555601,63.0,0.0,0.870021,7400.0,11.0,0.0,0.0,1.0
max,1.0,50708.0,109.0,13.0,329664.0,3008750.0,58.0,17.0,11.0,20.0


# Логистическая регрессия