# [SF-DST] Credit Scoring
Прогнозирование вероятности дефолта заемщика

### Import

In [None]:
import numpy as np
import pandas as pd
import re
import random
import os
import math

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import f_classif

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import mean_squared_error, roc_curve, roc_auc_score, auc
from sklearn.metrics import balanced_accuracy_score, confusion_matrix
from sklearn.metrics import cohen_kappa_score

from sklearn.feature_selection import f_classif, mutual_info_classif
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Decide where the data files live: Google Colab (Drive mount) or the local
# working directory. Colab is recognised by a `sample_data` entry in the
# current directory listing.
if 'sample_data' not in os.listdir():
    # Local run: files are addressed relative to the working directory.
    project_dir = ''
    print('НЕ обнаружена среда выполнения Google Colab. Выбран режим локальной работы.')
else:
    # Colab run (alternative location kept for reference: r'/content/').
    project_dir = r'/content/drive/MyDrive/Colab Notebooks/module_5/'
    print('Обнаружена среда выполнения Google Colab.')
    print('project_dir =>', project_dir)

In [None]:
# Show the contents of the project directory.
# In local mode project_dir is the empty string, and os.listdir('') raises
# FileNotFoundError, so fall back to the current directory in that case.
os.listdir(project_dir or '.')

# START

In [None]:
# Load the labelled training set from the project directory.
file_patch = f'{project_dir}train.csv'

train = pd.read_csv(file_patch, low_memory=False, encoding='ISO-8859-1')
# Display (rows, columns) as the cell output.
train.shape

In [None]:
# Load the test set from the project directory.
file_patch = f'{project_dir}test.csv'

test = pd.read_csv(file_patch, low_memory=False, encoding='ISO-8859-1')
# Display (rows, columns) as the cell output.
test.shape

In [None]:
# Show the column names of the training frame.
train.columns

In [None]:
# Mark every training row with train == 1 so it can be told apart after the
# two frames are combined.
train['train'] = 1

In [None]:
# Mark every test row with train == 0 (counterpart of the flag set on `train`).
test['train'] = 0

In [None]:
# Stack train and test rows into a single frame so that the same cleaning
# steps are applied to both; the `train` flag column tells them apart.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so use
# pd.concat. ignore_index=True gives the combined frame a unique 0..n-1 index
# instead of repeating the row labels of both inputs (train rows keep 0..len-1).
data = pd.concat([train, test], ignore_index=True)

In [None]:
# Eyeball nine randomly chosen rows of the combined frame.
data.sample(9)

In [None]:
# Count missing values per column.
data.isna().sum()

In [None]:
# Bar chart of how many rows fall into each value of the target `default`.
sns.countplot(x='default', data=data)

In [None]:
# Most frequent target value and the number of rows that carry it.
default_counts = data['default'].value_counts()
default_counts.idxmax(), default_counts.max()

In [None]:
# Class-balance ratio: size of the rarest target class over the most common one.
default_counts = data['default'].value_counts()
default_counts.min() / default_counts.max()

In [None]:
# Frequency of each education level (missing values are not counted).
data['education'].value_counts()

In [None]:
# Replace missing education values with the integer 0 as an "unknown" marker,
# then re-check the level frequencies.
data['education'] = data['education'].fillna(0)
data['education'].value_counts()

In [None]:
# ['client_id', 'education', 'sex', 'age', 'car', 'car_type',
#        'decline_app_cnt', 'good_work', 'bki_request_cnt', 'home_address',
#        'work_address', 'income', 'foreign_passport', 'default']


In [None]:

# Column groups that drive the preprocessing below.

# Two-valued features, later label-encoded.
bin_cols = [
    'sex',
    'car',
    'car_type',
    'foreign_passport',
    'good_work',
]

# Multi-valued categorical features, later one-hot encoded.
cat_cols = [
    'education',
    'home_address',
    'work_address',
]

# Numeric features, later standardized.
num_cols = [
    'age',
    'decline_app_cnt',
    'income',
    'bki_request_cnt',
    'score_bki',
    'sna',
    'region_rating',
]

# Columns that are not fed to the model.
drop_cols = [
    'client_id',
    'train',
    'app_date',
    'first_time',
]

# Prediction target.
target_cols = ['default']



# for i in num_cols:
#     plt.figure()
#     plt.hist(data[i], bins=10)
#     plt.title(i)
#     plt.show()

# set(data.columns) - set(bin_cols + cat_cols + num_cols + drop_cols + target_cols)
# {'app_date', 'first_time', 'region_rating', 'score_bki', 'sna'}

In [None]:
# Ordinal codes for the education levels.
# NOTE(review): the '0' key is a string, while missing values were filled with
# the integer 0 earlier in the file, so this entry presumably never matches;
# those rows stay 0 only because replace() leaves unmatched values untouched.
education_dict = {
    'SCH': 1,
    'GRD': 2,
    'UGR': 3,
    'PGR': 4,
    'ACD': 5,
    '0': 0,
}

data['education'] = data['education'].replace(education_dict)
# Re-check the level frequencies after recoding.
data['education'].value_counts()

## `app_date`, `first_time`

In [None]:
# Keep only the labelled training rows (train == 1) for analysis and modelling.
# .copy() makes the result an independent frame: columns are assigned into it
# later, and writing into a filtered slice triggers SettingWithCopyWarning and
# can silently operate on a temporary.
data = data[data['train'] == 1].copy()

In [None]:
# data.sample(9)

## оценка корреляции

In [None]:
# Heatmap of absolute pairwise correlations between the numeric features.
sns.set(font_scale=1)
plt.subplots(figsize=(10, 10))
abs_corr = data[num_cols].corr().abs()
sns.heatmap(
    abs_corr,
    vmin=0,
    vmax=1,
    square=True,
    annot=True,
    fmt=".2f",
    linewidths=0.1,
)

## Значимость непрерывных переменных

In [None]:
# ANOVA F-statistic of every numeric feature against the target, plotted in
# ascending order as horizontal bars.
f_stat, _p_values = f_classif(data[num_cols], data['default'])
imp_num = pd.Series(f_stat, index=num_cols).sort_values()
plt.subplots(figsize=(8, 8))
imp_num.plot(kind='barh')

##  Категориальные переменные

In [None]:
from sklearn.preprocessing import LabelEncoder
# import sklearn.preprocessing

In [None]:
# Demonstrate LabelEncoder on the `sex` column and print the code -> label map.
# NOTE(review): despite its name, `mapped_education` holds the encoded `sex`
# column; the name is kept because it is a module-level variable.
label_encoder = LabelEncoder()

sex_codes = label_encoder.fit_transform(data['sex'])
mapped_education = pd.Series(sex_codes)
print({code: label for code, label in enumerate(label_encoder.classes_)})

In [None]:
# Binary features are converted to integer codes with LabelEncoder.
# One encoder instance is re-fitted per column, so after the loop it holds the
# classes of the last column only.
label_encoder = LabelEncoder()

for bin_col in bin_cols:
    encoded = label_encoder.fit_transform(data[bin_col])
    data[bin_col] = encoded

# Confirm the conversion by looking at the first rows.
data.head()

In [None]:
# List the distinct education codes present, ordered by frequency.
data['education'].value_counts().index

In [None]:
# Mutual information between each binary/categorical feature and the target,
# plotted in ascending order as horizontal bars.
discrete_cols = bin_cols + cat_cols
mi_scores = mutual_info_classif(
    data[discrete_cols],
    data['default'],
    discrete_features=True,
)
imp_cat = pd.Series(mi_scores, index=discrete_cols).sort_values()

plt.subplots(figsize=(8, 8))
imp_cat.plot(kind='barh')

## Подготовка данных к машинному обучению

In [None]:
# One-hot encode the categorical columns into a dense array.
# The `sparse` keyword was deprecated in scikit-learn 1.2 (renamed to
# `sparse_output`) and removed in 1.4. Taking the encoder's default sparse
# output and densifying it with .toarray() works on both old and new versions.
X_cat = OneHotEncoder().fit_transform(data[cat_cols].values).toarray()
# Display (rows, one-hot columns) as the cell output.
X_cat.shape

In [None]:
# Standardize the numeric features.
scaler = StandardScaler()
X_num = scaler.fit_transform(data[num_cols].values)
# Display (rows, numeric columns) as the cell output.
X_num.shape

In [None]:
# Assemble the model inputs: scaled numeric, encoded binary and one-hot
# categorical columns side by side, plus the target vector.
Y = data['default'].values
X = np.concatenate([X_num, data[bin_cols].values, X_cat], axis=1)

# Display the shape of the final feature matrix as the cell output.
X.shape

In [None]:
# Hold out 1% of the rows for validation; the fixed seed keeps the split
# reproducible.
split = train_test_split(X, Y, test_size=0.01, random_state=42)
X_train, X_test, y_train, y_test = split
X_train.shape, X_test.shape

In [None]:
# Baseline: logistic regression with default settings, plus hard class
# predictions on the hold-out rows.
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

## Оценка качества модели

In [None]:
# Fit the default logistic regression and score it on the hold-out rows.
model = LogisticRegression().fit(X_train, y_train)

# Second column of predict_proba is the predicted probability of class 1.
probs = model.predict_proba(X_test)[:, 1]

# ROC AUC and the points of the ROC curve for the plot that follows.
roc_auc = roc_auc_score(y_test, probs)
fpr, tpr, threshold = roc_curve(y_test, probs)


In [None]:
# ROC curve of the fitted model against the chance diagonal.
# A single plt.subplots() call creates the figure. The previous extra
# plt.figure() left an empty stray figure in the cell output.
fig, ax = plt.subplots(figsize=(8, 8))
# Chance-level reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], label='Baseline', linestyle='--')
ax.plot(fpr, tpr, label='Regression')
ax.set_title('Logistic Regression ROC AUC = %0.3f' % roc_auc)
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
ax.legend(loc='lower right')
plt.show()

## Гиперпараметры

In [None]:
from sklearn.model_selection import GridSearchCV

# Regularization types to compare.
penalty = ['l1', 'l2']

# Candidate inverse regularization strengths: 10 values, log-spaced over 1..1e4.
C = np.logspace(0, 4, 10)

# Search grid.
hyperparameters = dict(C=C, penalty=penalty)

# The default 'lbfgs' solver does not support the l1 penalty, so with it every
# l1 candidate fails and is never really evaluated. 'liblinear' supports both
# l1 and l2. No pre-fit is needed: GridSearchCV clones and fits the estimator.
model = LogisticRegression(solver='liblinear')

# 5-fold cross-validated grid search. Candidates are ranked by ROC AUC, the
# metric this notebook reports, rather than the default accuracy, which is
# uninformative when the target classes are imbalanced.
clf = GridSearchCV(model, hyperparameters, cv=5, scoring='roc_auc', verbose=0)

best_model = clf.fit(X_train, y_train)


In [None]:
# Report the winning hyperparameter combination found by the grid search.
best_params = best_model.best_params_
print('Лучшее Penalty:', best_params['penalty'])
print('Лучшее C:', best_params['C'])

In [None]:
# Refit logistic regression with a fixed regularization strength and score it
# on the hold-out rows.
# NOTE(review): best_c is hard-coded, presumably copied from an earlier grid
# search run; confirm it still matches the current search result.
best_c = 2.7825594022071245

model = LogisticRegression(C=best_c, penalty='l2').fit(X_train, y_train)

# Second column of predict_proba is the predicted probability of class 1.
probs = model.predict_proba(X_test)[:, 1]

# ROC AUC and the points of the ROC curve for the plot that follows.
roc_auc = roc_auc_score(y_test, probs)
fpr, tpr, threshold = roc_curve(y_test, probs)

In [None]:
# ROC curve of the refitted model against the chance diagonal.
# A single plt.subplots() call creates the figure. The previous extra
# plt.figure() left an empty stray figure in the cell output.
fig, ax = plt.subplots(figsize=(8, 8))
# Chance-level reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], label='Baseline', linestyle='--')
ax.plot(fpr, tpr, label='Regression')
ax.set_title('Logistic Regression ROC AUC = %0.3f' % roc_auc)
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
ax.legend(loc='lower right')
plt.show()

# END