# [SF-DST] Credit Scoring
Прогнозирование вероятности дефолта заемщика

### Import

In [None]:
import numpy as np
import pandas as pd
import re
import random
import os
import math
import datetime

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.feature_selection import f_classif

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import mean_squared_error, roc_curve, roc_auc_score, auc
from sklearn.metrics import balanced_accuracy_score, confusion_matrix
from sklearn.metrics import cohen_kappa_score

from sklearn.feature_selection import f_classif, mutual_info_classif

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler



import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
def get_iqr(column_name, print_report=False):
    """Summarise a numeric column and compute its 1.5 * IQR outlier fences.

    Args:
        column_name: pandas Series (a DataFrame column) to analyse.
        print_report: when True, also print the median and the fences.

    Returns:
        For a numeric (non-boolean) column, a tuple of two lists:
        ``[min, median, max]`` and ``[range_left, iqr, range_right]``, where
        ``range_left = Q1 - 1.5 * IQR`` and ``range_right = Q3 + 1.5 * IQR``.
        For any other dtype a message is printed and ``np.nan`` is returned.
    """
    # is_numeric_dtype covers every integer/float width (int32, float32, ...),
    # unlike a comparison against the platform-dependent 'int'/'float' names.
    # Booleans are excluded because quartiles are meaningless for them.
    is_numeric = (pd.api.types.is_numeric_dtype(column_name)
                  and not pd.api.types.is_bool_dtype(column_name))
    if not is_numeric:
        print('Для данного типа IQR не возможен')
        return np.nan

    perc25 = column_name.quantile(0.25)
    perc75 = column_name.quantile(0.75)
    iqr = perc75 - perc25
    range_left = perc25 - 1.5 * iqr
    range_right = perc75 + 1.5 * iqr
    median = column_name.median()

    if print_report:
        print('|  Медиана: ', median)
        # Print the actual values, not just their names.
        print('|  range_left, range_right, iqr: ', range_left, range_right, iqr)

    return [column_name.min(), median, column_name.max()], [range_left, iqr, range_right]

# get_iqr(data['age'], print_report=True)
# get_iqr(data['age'])[2]

In [None]:
# Decide whether the notebook runs in Google Colab or locally. The check looks
# for a 'sample_data' entry in the current working directory.
if 'sample_data' not in os.listdir():
    project_dir = ''
    print('НЕ обнаружена среда выполнения Google Colab. Выбран режим локальной работы.')
else:
    # Alternative location when the files sit in the Colab root: r'/content/'
    project_dir = r'/content/drive/MyDrive/Colab Notebooks/module_5/'
    print('Обнаружена среда выполнения Google Colab.')
    print('project_dir =>', project_dir)

In [None]:
# List the project directory. An empty project_dir means "current directory";
# os.listdir('') raises FileNotFoundError, so fall back to '.' in that case.
os.listdir(project_dir or '.')

# START

In [None]:
# Load the training set from the project directory and show its shape.
file_patch = f'{project_dir}train.csv'

train = pd.read_csv(
    file_patch,
    encoding='ISO-8859-1',
    low_memory=False,
)
train.shape

In [None]:
# Load the test set from the project directory and show its shape.
file_patch = f'{project_dir}test.csv'

test = pd.read_csv(
    file_patch,
    encoding='ISO-8859-1',
    low_memory=False,
)
test.shape

In [None]:
# Show the column names of the training frame.
train.columns

In [None]:
# Flag training rows so they can be told apart after train and test are combined.
train['train'] = 1

In [None]:
# Flag test rows with train == 0.
test['train'] = 0
# Placeholder target for test rows (presumably the test file has no 'default'
# column of its own - confirm).
test['default'] = -1

## Очистка данных

In [None]:
# Columns inspected for outliers.
warn_column = [
    'score_bki',
    'sna',
    'decline_app_cnt',
    'region_rating',
    'bki_request_cnt',
    'income',
    'age',
]

# For each column print [min, median, max] and [left fence, IQR, right fence].
for item in warn_column:
    print('-' * 44)
    print('  ', item)
    summary, fences = get_iqr(train.loc[train['train'] == 1, item])
    print(summary, fences, sep='\n')


In [None]:
# Count training rows per decline_app_cnt value above 24.
train.query('decline_app_cnt > 24')['decline_app_cnt'].value_counts()

In [None]:

# Remove, in place, training rows with more than 24 declined applications.
train.drop(index=train.index[train['decline_app_cnt'] > 24], inplace=True)

In [None]:
# Remove, in place, training rows whose score_bki is below -3.294.
# A two-sided variant considered earlier:
#   'score_bki > -0.535 or score_bki < -3.294'
filter_con = 'score_bki < -3.294'
train.drop(index=train.query(filter_con).index, inplace=True)

In [None]:
# Remove, in place, training rows with region_rating below 35.
filter_con2 = 'region_rating < 35'
train.drop(index=train.query(filter_con2).index, inplace=True)

In [None]:
# bki_request_cnt: remove, in place, training rows with more than 20 requests.
filter_con3 = 'bki_request_cnt > 20'
train.drop(index=train.query(filter_con3).index, inplace=True)

In [None]:
# income: remove, in place, training rows with income below 22 000.
filter_con4 = 'income < 22_000'
train.drop(index=train.query(filter_con4).index, inplace=True)


In [None]:
# age: remove, in place, training rows with age >= 68 or age <= 20.
filter_con5 = 'age >= 68 or age <= 20'
train.drop(index=train.query(filter_con5).index, inplace=True)



## Объединение df

In [None]:
# Stack train and test into one frame so feature engineering is applied to both.
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent and,
# like append, keeps the original row labels of each frame.
data = pd.concat([train, test])

In [None]:
# Show 9 random rows of the combined frame.
data.sample(9)

In [None]:
# Count missing values per column.
data.isna().sum()

In [None]:
# Bar chart of row counts per 'default' value over the combined frame
# (test rows appear under the -1 placeholder).
sns.countplot(x='default', data=data)

In [None]:
# Most frequent target value among training rows and its row count.
# The 'train' flag is 0/1, so the previous filter 'train != -1' kept every row,
# including test rows with the -1 placeholder target; select train == 1 instead.
(
    data.query('train == 1')['default'].value_counts().idxmax(),
    data.query('train == 1')['default'].value_counts().max(),
)

In [None]:
# Ratio of the rarest to the most frequent target class among training rows.
# Only train == 1 rows are used: the previous filter 'train != -1' kept the test
# rows with their -1 placeholder target as well.
target_counts = data.query('train == 1')['default'].value_counts()
round(target_counts.min() / target_counts.max(), 2)

In [None]:
# Frequency of each education level (missing values are not counted).
data['education'].value_counts()

In [None]:
# Replace missing education with the integer 0, then re-check the frequencies.
data['education'] = data['education'].fillna(0)
data['education'].value_counts()

In [None]:
# ['client_id', 'education', 'sex', 'age', 'car', 'car_type',
#        'decline_app_cnt', 'good_work', 'bki_request_cnt', 'home_address',
#        'work_address', 'income', 'foreign_passport', 'default']


In [None]:

# Feature groups used throughout the notebook.

# Two-valued features.
bin_cols = [
    'sex',
    'car',
    'car_type',
    'foreign_passport',
    'good_work',
]

# Categorical features.
cat_cols = [
    'education',
    'home_address',
    'work_address',
]

# Numeric features.
num_cols = [
    'age',
    'decline_app_cnt',
    'income',
    'bki_request_cnt',
    'score_bki',
    'sna',
    'region_rating',
    'first_time',
]

# Columns kept out of the feature groups.
# Earlier variant: ['client_id', 'train', 'app_date']
drop_cols = ['app_date']

# Target column.
target_cols = ['default']



# for i in num_cols:
#     plt.figure()
#     plt.hist(data[i], bins=10)
#     plt.title(i)
#     plt.show()

# {'app_date', 'first_time', 'region_rating', 'score_bki', 'sna'}

In [None]:
# Columns of the combined frame that are not assigned to any of the groups above.
set(data.columns) - set(bin_cols + cat_cols + num_cols + drop_cols + target_cols)

### education

In [None]:
# Numeric codes for the education levels; 'ACD' and '0' both map to 0.
education_dict = {
    'SCH': 1,
    'GRD': 2,
    'UGR': 3,
    'PGR': 4,
    'ACD': 0,
    '0': 0,
}

# Values that are not keys of the mapping are left unchanged by replace().
data['education'] = data['education'].replace(education_dict)
data['education'].value_counts()

### `app_date`

In [None]:
# Number of rows with application date 01JAN2014 (KeyError if that date is absent).
data['app_date'].value_counts().loc['01JAN2014']

In [None]:
# Parse dates such as '01JAN2014' into datetimes. The vectorised pd.to_datetime
# replaces a row-wise strptime call and, unlike it, does not fail when the cell
# is re-run on an already converted column.
data['app_date'] = pd.to_datetime(data['app_date'], format='%d%b%Y')

In [None]:
# Earliest and latest application dates in the combined frame.
data['app_date'].min(), data['app_date'].max()

In [None]:
# Derive calendar features from the application date:
# month number, day of month and weekday (Monday == 0).
date_parts = data['app_date'].dt
data['app_month'] = date_parts.month
data['app_day'] = date_parts.day
data['app_wday'] = date_parts.weekday

In [None]:
# Spot-check 9 random rows: derived day and month next to the source date.
data[['app_day', 'app_month', 'app_date']].sample(9)

In [None]:
# Register the date-derived features as numeric columns. Names already present
# are skipped so that re-running this cell does not duplicate features.
num_cols += [col for col in ('app_day', 'app_month', 'app_wday') if col not in num_cols]
# num_cols

## Качество данных

## Оценка корреляции

In [None]:
# Heatmap of absolute pairwise correlations between numeric features (training rows).
sns.set(font_scale=1)
fig, ax = plt.subplots(figsize=(10, 10))
corr_abs = data.query('train == 1')[num_cols].corr().abs()
sns.heatmap(
    corr_abs,
    vmin=0,
    vmax=1,
    square=True,
    annot=True,
    fmt=".2f",
    linewidths=0.1,
    ax=ax,
)

## Значимость непрерывных переменных

In [None]:
# Show the current list of numeric feature names.
num_cols

In [None]:
# ANOVA F-statistic of every numeric feature against the target (training rows),
# sorted ascending and drawn as a horizontal bar chart.
train_part = data.query('train == 1')
f_stat, p_values = f_classif(train_part[num_cols], train_part['default'])
imp_num = pd.Series(f_stat, index=num_cols).sort_values()

plt.subplots(figsize=(8, 8))
imp_num.plot(kind='barh')

In [None]:
# Names of the 9 numeric features with the highest F-statistic.
imp_num.sort_values(ascending=False)[:9].index

##  Категориальные переменные

In [None]:
# Demonstrate LabelEncoder on 'sex' and print the code -> label mapping.
# NOTE: despite its name, mapped_education holds the encoded 'sex' column.
label_encoder = LabelEncoder().fit(data['sex'])

mapped_education = pd.Series(label_encoder.transform(data['sex']))
print({code: label for code, label in enumerate(label_encoder.classes_)})

In [None]:
# Binary features are encoded with LabelEncoder.

label_encoder = LabelEncoder()

# The same encoder object is re-fitted for every column, so after the loop it
# only remembers the classes of the last column in bin_cols.
for column in bin_cols:
    data[column] = label_encoder.fit_transform(data[column])
    
# Check the result of the conversion.
data.head()

In [None]:
# Mutual information between each binary/categorical feature and the target
# (training rows), sorted ascending and drawn as a horizontal bar chart.
discrete_cols = bin_cols + cat_cols
train_rows = data.query('train == 1')
mi_scores = mutual_info_classif(
    train_rows[discrete_cols],
    train_rows['default'],
    discrete_features=True,
)
imp_cat = pd.Series(mi_scores, index=discrete_cols).sort_values()

plt.subplots(figsize=(8, 8))
imp_cat.plot(kind='barh')

## Подготовка данных к машинному обучению

In [None]:
# One-hot encode the categorical features. The encoder is fitted on the training
# rows only and then reused for the test rows, so both matrices share the same
# columns in the same order. Fitting a second encoder on the test rows gives a
# different layout whenever the two sets do not contain identical categories.
# handle_unknown='ignore' encodes categories unseen in training as all zeros.
# The default sparse output is densified with .toarray(), which avoids the
# `sparse=` keyword that newer scikit-learn releases no longer accept.
cat_encoder = OneHotEncoder(handle_unknown='ignore')
X_cat = cat_encoder.fit_transform(data.query('train == 1')[cat_cols].values).toarray()
X_cat_test = cat_encoder.transform(data.query('train == 0')[cat_cols].values).toarray()
X_cat.shape

In [None]:
# Standardise the numeric features. The scaler learns mean and variance from the
# training rows only; the test rows are transformed with those same statistics.
# Scaling the test rows with their own statistics would put them on a different
# scale from the one the model was trained on.
num_scaler = StandardScaler()
X_num = num_scaler.fit_transform(data.query('train == 1')[num_cols].values)
X_num_test = num_scaler.transform(data.query('train == 0')[num_cols].values)
X_num.shape

In [None]:
# Assemble the design matrices: scaled numeric, binary and one-hot blocks
# side by side, in that order.
train_bin = data.query('train == 1')[bin_cols].to_numpy()
X = np.concatenate((X_num, train_bin, X_cat), axis=1)
Y = data.query('train == 1')['default'].to_numpy()

test_bin = data.query('train == 0')[bin_cols].to_numpy()
X_valid = np.concatenate((X_num_test, test_bin, X_cat_test), axis=1)

X.shape, X_valid.shape

In [None]:
# Hold out 30% of the labelled rows for evaluation; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
X_train.shape, X_test.shape

In [None]:
# Baseline logistic regression with default hyperparameters.
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

# Hard class predictions on the hold-out split.
y_pred = model.predict(X_test)

## Оценка качества модели

In [None]:
# Score the hold-out split with the model fitted above.
# Only the second predict_proba column is kept (class 1 for a 0/1 target).
probs = model.predict_proba(X_test)[:, 1]

roc_auc = roc_auc_score(y_test, probs)
fpr, tpr, threshold = roc_curve(y_test, probs)

print('Logistic Regression ROC AUC = %0.3f' % roc_auc)

In [None]:
# ROC curve of the model against the diagonal baseline.
# plt.subplots already creates the figure; a preceding plt.figure() call only
# left an extra empty figure behind.
plt.subplots(figsize=(8, 8))
plt.plot([0, 1], label='Baseline', linestyle='--')
plt.plot(fpr, tpr, label = 'Regression')
plt.title('Logistic Regression ROC AUC = %0.5f' % roc_auc)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc = 'lower right')
plt.show()

## Гиперпараметры

In [None]:

# Regularisation types to search over.
# NOTE(review): the estimator below uses LogisticRegression's default solver.
# In scikit-learn releases where that default is 'lbfgs', 'l1' is not supported,
# so every 'l1' candidate fails to fit and only 'l2' is actually compared.
# Enabling 'l1' needs a solver such as 'liblinear' here AND wherever the final
# model is rebuilt from the best parameters - confirm the intended behaviour.
penalty = ['l1', 'l2']

# Candidate values for the regularisation parameter C.
C = np.logspace(0, 4, 10)

# Parameter grid.
hyperparameters = dict(C=C, penalty=penalty)

iter_count = 300

# GridSearchCV clones the estimator for every fit, so it is passed unfitted.
model = LogisticRegression(random_state=42, max_iter=iter_count)

# 5-fold cross-validated grid search. Candidates are ranked by ROC AUC, the
# metric this notebook evaluates with, rather than by the default accuracy.
clf = GridSearchCV(model, hyperparameters, cv=5, scoring='roc_auc', verbose=0)

best_model = clf.fit(X_train, y_train)

# Hold-out ROC AUC of the refitted best estimator
# (second predict_proba column: class 1 for a 0/1 target).
probs = best_model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, probs)


### ROC AUC

In [None]:
# Report the hyperparameters picked by the grid search and the hold-out score.
best_params = best_model.best_params_
best_penalty = best_params['penalty']
best_c = best_params['C']

print('Лучшее Penalty:', best_penalty, '\nMax iter:', iter_count)
print('Лучшее C:', best_c)
print('Logistic Regression ROC AUC = %0.5f' % roc_auc)

In [None]:

# Refit on all labelled rows with the selected hyperparameters, then score the
# test rows (second predict_proba column: class 1 for a 0/1 target).
model = LogisticRegression(
    penalty=best_penalty,
    C=best_c,
    random_state=42,
    max_iter=iter_count,
)
model.fit(X, Y)

probs = model.predict_proba(X_valid)[:, 1]


In [None]:
# Sanity check: number of predictions next to the shape of the test matrix.
len(probs), X_valid.shape

## Выгрузка файла `submission`

In [None]:
# Client ids of the test rows, in the same row order as X_valid.
client_id_test = data.loc[data['train'] == 0, 'client_id']

In [None]:
# Build the submission table (client id + predicted probability) and write it to disk.
my_submission = client_id_test.to_frame('client_id').assign(default=probs)
my_submission.to_csv('submission.csv', index=False)

# The submission must have 36349 rows.
my_submission.shape

In [None]:
# Final guard: the submission must contain exactly 36349 rows.
if my_submission.shape[0] != 36349:
    print('='*54)
    print('\t ERROR'*5)
    print('='*54)
else:
    # Preview the first 7 rows plus 7 random ones. DataFrame.append was removed
    # in pandas 2.0, so the two pieces are combined with pd.concat.
    display(pd.concat([my_submission.head(7), my_submission.sample(7)]))

# END