In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
from pandas import Series
import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_selection import f_classif, mutual_info_classif
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


from sklearn.metrics import confusion_matrix
from sklearn.metrics import auc, roc_auc_score, roc_curve

In [3]:
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings raised
# through the `warnings` module by the libraries used below.
warnings.filterwarnings("ignore")

In [4]:
# Load the competition files.
# DATA_DIR already ends with a slash, so the file names are joined with
# os.path.join instead of being prefixed with another '/', which produced
# paths containing '//'.
DATA_DIR = '/kaggle/input/sf-scoring/'
train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
sample_submission = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))

**Предварительная обработка**

In [5]:
# Count the training rows with default == 1 and the remaining rows.
is_default = train['default'] == 1
n_default = int(is_default.sum())
n_no_default = len(train) - n_default
data_names = ['был дефолт', 'не было дефолта']
data_values = np.array([n_default, n_no_default])

# 1024x768-pixel figure (size in inches = pixels / dpi) and a global font size of 10.
dpi = 100
fig = plt.figure(figsize=(1024 / dpi, (384 * 2) / dpi), dpi=dpi)
mpl.rcParams['font.size'] = 10

def make_autopct(values):
    """Build a pie-wedge label formatter for the given wedge sizes.

    Args:
        values: iterable of wedge sizes; their sum is taken as 100 %.

    Returns:
        A function that takes a percentage and returns a string holding
        that percentage with two decimals followed by the corresponding
        absolute count (rounded to the nearest integer) in parentheses.
    """
    def my_autopct(pct):
        # Convert the percentage back into an absolute count.
        count = int(round(sum(values) * pct / 100.0))
        return '{p:.2f}%  ({v:d})'.format(p=pct, v=count)
    return my_autopct

# Pie chart of the two counts; each wedge label is produced by make_autopct
# (percentage plus absolute count).
plt.title('Распределение дефолтов')
wedge_label = make_autopct(data_values)
plt.pie(data_values, labels=data_names, autopct=wedge_label)

In [6]:
# Preview the first rows of the training frame.
train.head()

Получим информацию о датасетах


In [7]:
# Column names, dtypes and non-null counts of the training frame.
train.info()

In [8]:
# Column names, dtypes and non-null counts of the test frame.
test.info()

Видно, что пропуски есть только у education

In [9]:
# Number of missing values in each training column.
train.isna().sum()

In [10]:
# Number of missing values in each test column.
test.isna().sum()

In [11]:
# Horizontal bar chart of how often each education value occurs in train.
education_counts = train['education'].value_counts()
education_counts.plot(kind='barh')

In [12]:
# Replace missing education values with 'SCH', which the original author
# identified as the most frequent category.
# The result is assigned back to the frame: calling fillna(..., inplace=True)
# on `df.education` operates on an intermediate Series and does not update
# the frame when pandas Copy-on-Write is active.
train['education'] = train['education'].fillna('SCH')
test['education'] = test['education'].fillna('SCH')

In [13]:
# Number of missing education values remaining in train.
train['education'].isna().sum()

In [14]:
# Convert application dates to numbers: days elapsed since 2000-01-01.
from datetime import datetime

REFERENCE_DATE = datetime(2000, 1, 1)
SECONDS_PER_DAY = 24 * 3600


def _app_date_to_days(raw_date):
    """Parse a '%d%b%Y' date string and return the days since REFERENCE_DATE as a float."""
    elapsed = datetime.strptime(raw_date, '%d%b%Y') - REFERENCE_DATE
    return elapsed.total_seconds() / SECONDS_PER_DAY


train['app_date'] = train['app_date'].apply(_app_date_to_days)
test['app_date'] = test['app_date'].apply(_app_date_to_days)
train.head()

In [15]:
# Group the feature columns by type.

# Numeric features. 'app_date' and 'region_rating' are treated as numeric here;
# an earlier variant of this cell kept 'region_rating' among the categorical
# features and did not use 'app_date'.
num_cols = [
    'app_date',
    'age',
    'decline_app_cnt',
    'score_bki',
    'bki_request_cnt',
    'income',
    'region_rating',
]

# Binary features.
bin_cols = [
    'sex',
    'car',
    'car_type',
    'good_work',
    'foreign_passport',
]

# Categorical features.
cat_cols = [
    'education',
    'first_time',
    'sna',
    'work_address',
    'home_address',
]

**Числовые признаки**

In [16]:
# One histogram per numeric feature, each in its own figure.
for col in num_cols:
    hist_fig, hist_ax = plt.subplots()
    sns.distplot(train[col], kde=False, rug=False, ax=hist_ax)
    hist_ax.set_title(col)
    plt.show()

большие хвосты -  прологарифмируем (кроме score_bki, region_rating, app_date)

In [17]:
# Apply log(x + 1) to the numeric features in both frames, except the three
# columns listed below, which are left unchanged.
skip_log = ('score_bki', 'region_rating', 'app_date')
log_cols = [col for col in num_cols if col not in skip_log]
for col in log_cols:
    train[col] = np.log(train[col] + 1)
    test[col] = np.log(test[col] + 1)

# Re-draw the histogram of every numeric feature after the transform.
for col in num_cols:
    hist_fig, hist_ax = plt.subplots()
    sns.distplot(train[col], kde=False, rug=False, ax=hist_ax)
    hist_ax.set_title(col)
    plt.show()
    

**Одномерные зависимости**

In [18]:
# Distribution of age for each value of the target.
sns.boxplot(x='default', y='age', data=train)

с возрастом клиенты становятся более надёжными

In [19]:
# Distribution of income for each value of the target.
sns.boxplot(x='default', y='income', data=train)

клиенты с большим доходом более надёжные

In [20]:
# Distribution of bki_request_cnt for each value of the target.
sns.boxplot(x='default', y='bki_request_cnt', data=train)

кто чаще брал кредиты - те более надёжные

In [21]:
# Distribution of decline_app_cnt for each value of the target.
sns.boxplot(x='default', y='decline_app_cnt', data=train)

чем больше количество отказов у заёмщика - тем менее он надёжен

In [22]:
# Absolute pairwise correlations of the numeric features, colour scale fixed to [0, 1].
abs_corr = train[num_cols].corr().abs()
sns.heatmap(abs_corr, vmin=0, vmax=1)

числовые переменные имеют слабую корреляцию

In [23]:
# Score each numeric feature against the target with f_classif (first element
# of its result) and plot the scores in ascending order.
f_scores, _p_values = f_classif(train[num_cols], train['default'])
imp_num = pd.Series(f_scores, index=num_cols).sort_values()
imp_num.plot(kind='barh')

самый значимый из числовых признаков - оценка заёмщика

**бинарные и категориальные признаки**

In [24]:
# Integer-encode the binary features.
# The encoder is fitted on the training column only and the same mapping is
# then applied to the test column. Re-fitting on test (as before) could assign
# different codes to the same value; with a shared mapping an unseen test value
# raises an error instead of being silently mis-encoded.
label_encoder = LabelEncoder()

for column in bin_cols:
    train[column] = label_encoder.fit_transform(train[column])
    test[column] = label_encoder.transform(test[column])
    

In [25]:
# Preview the training frame again.
train.head()

In [26]:
# Integer-encode education: fit the mapping on train and reuse it for test so
# that a given education level gets the same code in both frames.
train['education'] = label_encoder.fit_transform(train['education'])
test['education'] = label_encoder.transform(test['education'])

In [27]:
# Preview the training frame again.
train.head()

значимость категориальных и бинарных признаков

In [28]:
# Mutual information between each binary/categorical feature and the target
# (all features declared discrete), plotted in ascending order.
discrete_cols = bin_cols + cat_cols
mi_scores = mutual_info_classif(
    train[discrete_cols],
    train['default'],
    discrete_features=True,
)
imp_cat = Series(mi_scores, index=discrete_cols).sort_values()
imp_cat.plot(kind='barh')

In [29]:
# удалим незначимые признаки, оставим значимые во всех категориях признаков
# num_cols = ['decline_app_cnt', 'score_bki','bki_request_cnt', 'region_rating']
# bin_cols = ['foreign_passport']
# cat_cols = ['education', 'first_time', 'sna', 'home_address']

# в этом случае результат хуже

In [30]:
# One-hot encode every column in cat_cols (education included).
# A single encoder is fitted on train and reused for test so both matrices have
# the same columns in the same order; fitting a second encoder on test could
# yield a different layout. Categories that appear only in test are encoded as
# all zeros (handle_unknown='ignore').
# The encoder's default sparse output is converted with .toarray() rather than
# passing `sparse=False`, a keyword that was renamed in later scikit-learn
# releases.
cat_encoder = OneHotEncoder(handle_unknown='ignore')
X_cat = cat_encoder.fit_transform(train[cat_cols].values).toarray()
Xv_cat = cat_encoder.transform(test[cat_cols].values).toarray()

In [31]:
# Standardise the numeric features.
# The scaler learns mean and standard deviation from train only and applies the
# same statistics to test, so both matrices share one scale; previously test
# was scaled with its own statistics.
num_scaler = StandardScaler()
X_num = num_scaler.fit_transform(train[num_cols].values)
Xv_num = num_scaler.transform(test[num_cols].values)

**Объединение преобразованных данных и моделирование**

In [32]:
# Training design matrix: scaled numeric, binary and one-hot columns side by
# side, plus the target vector.
train_parts = (X_num, train[bin_cols].to_numpy(), X_cat)
X = np.concatenate(train_parts, axis=1)
y = train['default'].to_numpy()



In [33]:
# Test design matrix, built with the same column layout as X.
test_parts = (Xv_num, test[bin_cols].to_numpy(), Xv_cat)
test_data = np.concatenate(test_parts, axis=1)

In [34]:
# Split the labelled data with train_test_split.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the seed fixes the shuffle.
VALIDATION_FRACTION = 0.2
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=VALIDATION_FRACTION,
    random_state=SPLIT_SEED,
)

In [35]:
from sklearn.linear_model import LogisticRegression # инструмент для создания и обучения модели
from sklearn import metrics # инструменты для оценки точности модели

In [36]:
# Baseline logistic regression fitted on the training split, then used to
# predict class labels for the validation split.
logreg = LogisticRegression(solver='liblinear', max_iter=1000).fit(X_train, y_train)
y_pred = logreg.predict(X_test)

In [37]:
from sklearn.metrics import classification_report

# Keep the report text under its own name. Assigning it to
# `classification_report` replaced the imported function with a string, so any
# later call to the function failed unless it was imported again.
baseline_report = classification_report(y_test, y_pred)
print(baseline_report)

In [38]:
# Column 1 of predict_proba: the probability of the second class in logreg.classes_.
proba = logreg.predict_proba(X_test)[:, 1]

fpr, tpr, threshold = roc_curve(y_test, proba)
roc_auc = roc_auc_score(y_test, proba)

# ROC curve of the baseline model against a dashed diagonal reference line.
roc_fig, roc_ax = plt.subplots()
roc_ax.plot([0, 1], linestyle='--', label='Baseline')
roc_ax.plot(fpr, tpr, label='Regression')
roc_ax.set_title('Logistic Regression ROC AUC = %0.3f' % roc_auc)
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.legend(loc='lower right')
plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV

# Cross-validated search over logistic-regression hyper-parameters on the
# training split; the candidate with the best F1 score is kept as `model`.
model = LogisticRegression()

iter_ = 300
epsilon_stop = 1e-3

# Settings shared by every sub-grid.
# class_weight uses the None object for "no weighting": the string 'none' is
# not a documented value for this parameter.
# NOTE(review): 'multi_class' is deprecated in recent scikit-learn releases;
# it is kept as in the original grid - confirm against the installed version.
shared_grid = {
    'class_weight': [None, 'balanced'],
    'multi_class': ['auto', 'ovr'],
    'max_iter': [iter_],
    'tol': [epsilon_stop],
}

# Inverse regularisation strengths tried for every penalised variant.
c_values = [0.01, 0.1, 0.5, 1, 2, 10, 100]

# One sub-grid per penalty, each listing only the solvers paired with it.
param_grid = [
    {
        **shared_grid,
        'penalty': ['l1'],
        'solver': ['liblinear', 'saga'],
        'C': c_values,
    },
    {
        **shared_grid,
        'penalty': ['l2'],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'C': c_values,
    },
    {
        # NOTE(review): newer scikit-learn releases spell the unpenalised
        # option as penalty=None - confirm against the installed version.
        **shared_grid,
        'penalty': ['none'],
        'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
    },
    {
        # The elastic-net penalty requires l1_ratio; without it every fit of
        # this sub-grid raises an error.
        **shared_grid,
        'penalty': ['elasticnet'],
        'solver': ['saga'],
        'C': c_values,
        'l1_ratio': [0.25, 0.5, 0.75],
    },
]

gridsearch = GridSearchCV(model, param_grid, scoring='f1', n_jobs=-1, cv=5)

gridsearch.fit(X_train, y_train)
model = gridsearch.best_estimator_



In [40]:
# Print every parameter of the selected estimator, sorted by parameter name.
best_parameters = model.get_params()
for param_name, param_value in sorted(best_parameters.items()):
    print('\t%s: %r' % (param_name, param_value))

In [41]:
# Fit the selected estimator on the training split and predict the validation split.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Imported here because an earlier cell may have rebound the name
# `classification_report` to a string.
from sklearn.metrics import classification_report

# Keep the report text under its own name so the function is not overwritten again.
tuned_report = classification_report(y_test, y_pred)
print(tuned_report)

***после поиска лучших гиперпараметров результат стал лучше!***

**финальный результат**

In [42]:
# `logreg_final` is a second name for the same estimator object as `model`
# (no copy is made), so fitting it on all labelled data (X, y) also refits `model`.
logreg_final = model
logreg_final.fit(X, y)

In [43]:
# Class labels predicted for the test matrix.
# NOTE(review): the notebook evaluates ROC AUC earlier; if the competition is
# scored on ROC AUC, predict_proba(test_data)[:, 1] is presumably what should
# be submitted - confirm against the competition rules.
predict_submission = logreg_final.predict(test_data)

In [None]:
# Put the predictions into the 'default' column of the submission frame, save
# it without the index, and show the first ten rows.
# NOTE(review): assumes the rows of test are in the same order as
# sample_submission - confirm.
sample_submission = sample_submission.assign(default=predict_submission)
sample_submission.to_csv('submission.csv', index=False)
sample_submission.head(10)