In [309]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from datetime import datetime # работа с датами
import matplotlib.pyplot as plt # графики
import seaborn as sns
%matplotlib inline

from sklearn.feature_selection import f_classif, mutual_info_classif # оценка значимости признаков
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder # преобразование признаков
from sklearn.linear_model import LogisticRegression # логистическая регрессия
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, GridSearchCV # разделение данных и подбор гиперпараметров
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score # метрики

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [310]:
# Location of the competition files; paths are built with os.path.join so the
# trailing slash of DATA_DIR no longer produces '.../sf-scoring//train.csv'.
DATA_DIR = '/kaggle/input/sf-scoring/'
df_train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
df_test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
sample_submission = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))

In [311]:
# IMPORTANT: train and test are stacked into one frame so that every feature
# transformation is applied to both parts identically.
df_train['sample'] = 1  # marks training rows
df_test['sample'] = 0   # marks test rows
df_test['default'] = 0  # the test target is unknown (it is what we predict); zero is only a placeholder

# DataFrame.append was removed in pandas 2.0; pd.concat builds the same frame (test rows first).
data = pd.concat([df_test, df_train], sort=False).reset_index(drop=True)

In [72]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

Рассмотрим наши данные повнимательнее

In [312]:
# Column dtypes and non-null counts of the combined train+test frame.
data.info()

In [7]:
# Preview the first rows of the combined frame.
data.head()

In [29]:
# Number of distinct values per column; dropna=False counts NaN as a value of its own.
data.nunique(dropna=False)

In [39]:
# Number of missing values in each column.
data.isna().sum()

In [109]:
# Keep an untouched copy of the combined frame; each preprocessing variant below
# restarts from it via data_back.copy().
data_back = data.copy()

**Начинаем анализ данных**

In [8]:
# First look at the raw application-date values.
data.app_date.head()

In [12]:
# Parse the application date (day + abbreviated month name + 4-digit year).
# pd.to_datetime is vectorised and, unlike a row-wise datetime.strptime, does not
# fail when this cell is re-run on an already parsed column.
data['app_date'] = pd.to_datetime(data['app_date'], format='%d%b%Y')

In [13]:
# Which years occur in the sample?
data.app_date.dt.year.value_counts()

In [14]:
# Which months occur?
data.app_date.dt.month.value_counts()

In [15]:
# The year is left out; derive day of week, day of month, day of year and
# month from the parsed application date.
app_date_parts = data['app_date'].dt
data['day_of_week'] = app_date_parts.day_of_week
data['day_of_month'] = app_date_parts.day
data['day_of_year'] = app_date_parts.day_of_year
data['month'] = app_date_parts.month

In [16]:
# education: frequency of each level.
data['education'].value_counts()

In [40]:
# Number of missing education values.
data['education'].isna().sum()

Видим 478 пропусков. Позже подумаем, как заполнить пропуски

In [18]:
# Frequency of each value of sex.
data['sex'].value_counts()

Тут всё нормально

In [19]:
# Histogram of age.
data['age'].hist()

In [20]:
# What does the distribution look like after a log transform?
np.log(data['age']).hist()

Так больше похоже на нормальное распределение

In [21]:
# Frequency of each value of car.
data['car'].value_counts()

In [22]:
# Frequency of each value of car_type.
data['car_type'].value_counts()

In [23]:
# Frequency of each value of decline_app_cnt.
data['decline_app_cnt'].value_counts()

На мой взгляд, здесь нет выбросов и неверных данных. По межквартильным расстояниям можно, конечно, удалить некоторые данные, потом попробуем разные подходы.

In [25]:
# Frequency of each value of good_work.
data['good_work'].value_counts()

In [26]:
# Histogram of score_bki.
data['score_bki'].hist()

In [27]:
# Frequency of each value of bki_request_cnt.
data['bki_request_cnt'].value_counts()

Тоже попробуем разные подходы

In [28]:
# Frequency of each value of region_rating.
data['region_rating'].value_counts()

Можно подойти к этой переменной как к числовой или как к категориальной.

In [30]:
# Frequency of each value of home_address.
data['home_address'].value_counts()

In [31]:
# Frequency of each value of work_address.
data['work_address'].value_counts()

In [32]:
# Histogram of income.
data['income'].hist()

In [33]:
# Histogram of log(income).
np.log(data['income']).hist()

Так намного лучше

In [34]:
# Frequency of each value of sna.
data['sna'].value_counts()

In [35]:
# Frequency of each value of first_time.
data['first_time'].value_counts()

In [37]:
# Frequency of each value of foreign_passport.
data['foreign_passport'].value_counts()

In [42]:
# The target must be inspected on the training frame only: in the combined
# frame the test rows carry a placeholder value.
df_train['default'].value_counts()

Классы несбалансированы, поэтому при разбиении на обучающую и валидационную выборки нужна стратификация по целевой переменной

In [44]:
# Frequency of each day of week.
data['day_of_week'].value_counts()

In [48]:
# Histogram of the day of month.
data['day_of_month'].hist()

In [47]:
# Histogram of the day of year.
data['day_of_year'].hist()

In [49]:
# Frequency of each month.
data['month'].value_counts()

Итого, в плане предобработки данных какие у меня есть сомнения:
- как заполнить пропуски в education? может быть отбросить лишнее?
- какие конкретно данные брать из даты подачи заявки?
- делать что-то с "выбросами" в decline_app_cnt и bki_request_cnt?

**Посмотрим корреляции:**

In [53]:
# Absolute pairwise correlation of the numeric features, drawn as a heatmap on a 0..1 scale.
num_cols = ['age', 'decline_app_cnt', 'score_bki', 'bki_request_cnt', 'income', 'day_of_month', 'day_of_year']
abs_corr = data[num_cols].corr().abs()
sns.heatmap(abs_corr, vmin=0, vmax=1)

In [54]:
# The same absolute correlation matrix as a table.
data[num_cols].corr().abs()

Есть корреляции, но не настолько сильные, чтобы отбросить какие-то колонки

In [58]:
# Derive the calendar features on the training frame, which holds the real labels.
# pd.to_datetime is vectorised and safe to re-run, unlike a row-wise datetime.strptime
# that raises once the column already holds timestamps.
df_train['app_date'] = pd.to_datetime(df_train['app_date'], format='%d%b%Y')
df_train['day_of_week'] = df_train['app_date'].dt.day_of_week
df_train['day_of_month'] = df_train['app_date'].dt.day
df_train['day_of_year'] = df_train['app_date'].dt.day_of_year
df_train['month'] = df_train['app_date'].dt.month

# f_classif returns (F-statistics, p-values); only the F-statistic of each numeric
# feature against the target is plotted, in ascending order.
f_scores, _p_values = f_classif(df_train[num_cols], df_train['default'])
imp_num = pd.Series(f_scores, index=num_cols).sort_values()
imp_num.plot(kind='barh')

Кандидаты на вылет:
- day_of_month, age
- day_of_year, income
- bki_request_cnt

In [78]:
# For now fill missing education with 'SCH' (treated in this notebook as the most
# frequent level). The result is assigned back: `df.col.fillna(..., inplace=True)`
# works on a temporary Series and does not update the frame under pandas Copy-on-Write.
df_train['education'] = df_train['education'].fillna('SCH')

# Encode education as integers for the importance estimate only; the modelling
# cells later treat it as a categorical feature.
label_encoder = LabelEncoder()
df_train['education'] = label_encoder.fit_transform(df_train['education'])

bin_cols = ['sex', 'car', 'car_type', 'good_work', 'foreign_passport']
# Binary features are label-encoded as well.
for column in bin_cols:
    df_train[column] = label_encoder.fit_transform(df_train[column])
# region_rating is re-coded to consecutive integers too, so that all discrete
# features share one scale (the author was unsure whether this matters here).
df_train['region_rating'] = label_encoder.fit_transform(df_train['region_rating'])

cat_cols = ['education', 'region_rating', 'home_address', 'work_address', 'sna', 'first_time', 'day_of_week', 'month']

In [79]:
# Preview the training frame after encoding.
df_train.head()

In [80]:
# Mutual information between every binary/categorical feature and the target,
# plotted in ascending order.
discrete_cols = bin_cols + cat_cols
mi_scores = mutual_info_classif(df_train[discrete_cols], df_train['default'],
                                discrete_features=True)
imp_cat = pd.Series(mi_scores, index=discrete_cols).sort_values()
imp_cat.plot(kind='barh')

Кандидаты на вылет:
- day_of_week, sex
- month, car, good_work
- car_type, foreign_passport

Теперь я хочу заняться выбором итогового метода обработки данных.
С чем можно поиграть:
- заполнение пропусков в education
- удаление "выбросов" в decline_app_cnt и bki_request_cnt
- отбрасывание признаков

**Обработка данных**

**Версия 1:**
- образование заполняем самым частым значением
- ничего не отбрасываем

In [157]:
# Preprocessing, variant 1: missing education -> 'SCH', every feature is kept.
data = data_back.copy()

# Parse the application date in one vectorised call and derive calendar features.
data['app_date'] = pd.to_datetime(data['app_date'], format='%d%b%Y')
data['day_of_week'] = data['app_date'].dt.day_of_week
data['day_of_month'] = data['app_date'].dt.day
data['day_of_year'] = data['app_date'].dt.day_of_year
data['month'] = data['app_date'].dt.month

data = data.drop(columns=['client_id', 'app_date'])

# Assign the result back: `data.education.fillna(..., inplace=True)` acts on a
# temporary Series and leaves the frame unchanged under pandas Copy-on-Write.
data['education'] = data['education'].fillna('SCH')

bin_cols = ['sex', 'car', 'car_type', 'good_work', 'foreign_passport']
cat_cols = ['education', 'region_rating', 'home_address', 'work_address', 'sna', 'first_time', 'day_of_week', 'month']
num_cols = ['age', 'decline_app_cnt', 'score_bki', 'bki_request_cnt', 'income', 'day_of_month', 'day_of_year']

# Binary features -> 0/1 codes.
label_encoder = LabelEncoder()
for column in bin_cols:
    data[column] = label_encoder.fit_transform(data[column])

# Categorical features -> one-hot columns.
data = pd.get_dummies(data, columns=cat_cols)

# Log-transform age and income before scaling.
data[['age', 'income']] = np.log(data[['age', 'income']])

# Z-score scaling; mean and std are taken over the combined train+test frame.
data[num_cols] = (data[num_cols] - data[num_cols].mean()) / data[num_cols].std()

data.head()

Проверять выбранные методы обработки будем на одной модели. Параметры модели были подобраны заранее, пока будем использовать их.

In [158]:
# Separate the labelled rows (sample == 1) from the rows to be predicted (sample == 0).
train_data = data.query('sample == 1').drop(columns=['sample'])
test_data = data.query('sample == 0').drop(columns=['sample', 'default'])

# Select the target by name rather than by a hard-coded column position (was
# iloc[:, 49]), which silently points at another column when the dummy set changes.
features = train_data.drop(columns=['default'])
target = train_data['default'].values

# Stratified 70/30 hold-out split; the fixed seed makes reruns reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    features, target, test_size=0.3, stratify=target, random_state=42)

# Arguments that only restated scikit-learn defaults were dropped.
model = LogisticRegression(penalty='l2', C=0.001, class_weight='balanced',
                           solver='newton-cg', max_iter=1000, tol=1e-05)

model.fit(X_train, y_train)
y_pred = model.predict(X_val)

matrix = confusion_matrix(y_val, y_pred)
creport = classification_report(y_val, y_pred)
# ROC AUC needs a continuous score: use the predicted probability of the
# positive class (second predict_proba column), not the hard 0/1 labels.
ras = roc_auc_score(y_val, model.predict_proba(X_val)[:, 1])
print(creport)
print(ras)

ок, попробуем другой подход:
- изменим только заполнение education - не самым частым значением, а ближайшим следующим после пропуска (bfill)

In [182]:
# Preprocessing, variant 2: missing education is back-filled, every feature is kept.
data = data_back.copy()

# Parse the application date in one vectorised call and derive calendar features.
data['app_date'] = pd.to_datetime(data['app_date'], format='%d%b%Y')
data['day_of_week'] = data['app_date'].dt.day_of_week
data['day_of_month'] = data['app_date'].dt.day
data['day_of_year'] = data['app_date'].dt.day_of_year
data['month'] = data['app_date'].dt.month

data = data.drop(columns=['client_id', 'app_date'])

# bfill copies the next valid value that follows a gap. The result is assigned back
# because `data.education.fillna(..., inplace=True)` acts on a temporary Series under
# pandas Copy-on-Write, and fillna(method=...) is deprecated.
# NOTE(review): a gap at the very end of the frame would stay NaN - confirm none exists.
data['education'] = data['education'].bfill()

bin_cols = ['sex', 'car', 'car_type', 'good_work', 'foreign_passport']
cat_cols = ['education', 'region_rating', 'home_address', 'work_address', 'sna', 'first_time', 'day_of_week', 'month']
num_cols = ['age', 'decline_app_cnt', 'score_bki', 'bki_request_cnt', 'income', 'day_of_month', 'day_of_year']

# Binary features -> 0/1 codes.
label_encoder = LabelEncoder()
for column in bin_cols:
    data[column] = label_encoder.fit_transform(data[column])

# Categorical features -> one-hot columns.
data = pd.get_dummies(data, columns=cat_cols)

# Log-transform age and income before scaling.
data[['age', 'income']] = np.log(data[['age', 'income']])

# Z-score scaling; mean and std are taken over the combined train+test frame.
data[num_cols] = (data[num_cols] - data[num_cols].mean()) / data[num_cols].std()

data.head()

In [183]:
# Separate the labelled rows (sample == 1) from the rows to be predicted (sample == 0).
train_data = data.query('sample == 1').drop(columns=['sample'])
test_data = data.query('sample == 0').drop(columns=['sample', 'default'])

# Select the target by name rather than by a hard-coded column position (was
# iloc[:, 49]), which silently points at another column when the dummy set changes.
features = train_data.drop(columns=['default'])
target = train_data['default'].values

# Stratified 70/30 hold-out split; the fixed seed makes reruns reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    features, target, test_size=0.3, stratify=target, random_state=42)

# Arguments that only restated scikit-learn defaults were dropped.
model = LogisticRegression(penalty='l2', C=0.001, class_weight='balanced',
                           solver='newton-cg', max_iter=1000, tol=1e-05)

model.fit(X_train, y_train)
y_pred = model.predict(X_val)

matrix = confusion_matrix(y_val, y_pred)
creport = classification_report(y_val, y_pred)
# ROC AUC needs a continuous score: use the predicted probability of the
# positive class (second predict_proba column), not the hard 0/1 labels.
ras = roc_auc_score(y_val, model.predict_proba(X_val)[:, 1])
print(creport)
print(ras)

Я перепроверил примерно 5 раз. Такое впечатление, что второй вариант чуть-чуть лучше.

Теперь попробуем отбросить "лишние" значения в decline_app_cnt и bki_request_cnt

In [184]:
# Preprocessing, variant 3: as variant 2, plus removal of extreme
# decline_app_cnt / bki_request_cnt values.
data = data_back.copy()

# Parse the application date in one vectorised call and derive calendar features.
data['app_date'] = pd.to_datetime(data['app_date'], format='%d%b%Y')
data['day_of_week'] = data['app_date'].dt.day_of_week
data['day_of_month'] = data['app_date'].dt.day
data['day_of_year'] = data['app_date'].dt.day_of_year
data['month'] = data['app_date'].dt.month

data = data.drop(columns=['client_id', 'app_date'])

# Drop the extreme rows from the training part only. The previous unconditional
# drop also deleted test rows, so test_data no longer contained every test client.
is_train = data['sample'] == 1
is_extreme = (data['decline_app_cnt'] > 10) | (data['bki_request_cnt'] > 12)
data = data[~(is_train & is_extreme)]

# bfill copies the next valid value that follows a gap; assigned back because a
# chained inplace fillna acts on a temporary Series under pandas Copy-on-Write.
data['education'] = data['education'].bfill()

bin_cols = ['sex', 'car', 'car_type', 'good_work', 'foreign_passport']
cat_cols = ['education', 'region_rating', 'home_address', 'work_address', 'sna', 'first_time', 'day_of_week', 'month']
num_cols = ['age', 'decline_app_cnt', 'score_bki', 'bki_request_cnt', 'income', 'day_of_month', 'day_of_year']

# Binary features -> 0/1 codes.
label_encoder = LabelEncoder()
for column in bin_cols:
    data[column] = label_encoder.fit_transform(data[column])

# Categorical features -> one-hot columns.
data = pd.get_dummies(data, columns=cat_cols)

# Log-transform age and income before scaling.
data[['age', 'income']] = np.log(data[['age', 'income']])

# Z-score scaling; mean and std are taken over the combined train+test frame.
data[num_cols] = (data[num_cols] - data[num_cols].mean()) / data[num_cols].std()

data.head()

In [185]:
# Separate the labelled rows (sample == 1) from the rows to be predicted (sample == 0).
train_data = data.query('sample == 1').drop(columns=['sample'])
test_data = data.query('sample == 0').drop(columns=['sample', 'default'])

# Select the target by name rather than by a hard-coded column position (was
# iloc[:, 49]), which silently points at another column when the dummy set changes.
features = train_data.drop(columns=['default'])
target = train_data['default'].values

# Stratified 70/30 hold-out split; the fixed seed makes reruns reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    features, target, test_size=0.3, stratify=target, random_state=42)

# Arguments that only restated scikit-learn defaults were dropped.
model = LogisticRegression(penalty='l2', C=0.001, class_weight='balanced',
                           solver='newton-cg', max_iter=1000, tol=1e-05)

model.fit(X_train, y_train)
y_pred = model.predict(X_val)

matrix = confusion_matrix(y_val, y_pred)
creport = classification_report(y_val, y_pred)
# ROC AUC needs a continuous score: use the predicted probability of the
# positive class (second predict_proba column), not the hard 0/1 labels.
ras = roc_auc_score(y_val, model.predict_proba(X_val)[:, 1])
print(creport)
print(ras)

Результат стал чуть похуже, не будем отбрасывать значения в этих колонках

Теперь попробуем совсем отбросить признаки в несколько этапов

Сначала day_of_month, age, day_of_week, sex

In [217]:
# Preprocessing, variant 4: back-filled education, without the weakest features
# day_of_month, age, day_of_week and sex.
data = data_back.copy()

# Parse the application date in one vectorised call and derive calendar features.
data['app_date'] = pd.to_datetime(data['app_date'], format='%d%b%Y')
data['day_of_week'] = data['app_date'].dt.day_of_week
data['day_of_month'] = data['app_date'].dt.day
data['day_of_year'] = data['app_date'].dt.day_of_year
data['month'] = data['app_date'].dt.month

data = data.drop(columns=['client_id', 'app_date'])
data = data.drop(columns=['day_of_month', 'age', 'day_of_week', 'sex'])

# bfill copies the next valid value that follows a gap; assigned back because a
# chained inplace fillna acts on a temporary Series under pandas Copy-on-Write.
data['education'] = data['education'].bfill()

bin_cols = ['car', 'car_type', 'good_work', 'foreign_passport']
cat_cols = ['education', 'region_rating', 'home_address', 'work_address', 'sna', 'first_time', 'month']
num_cols = ['decline_app_cnt', 'score_bki', 'bki_request_cnt', 'income', 'day_of_year']

# Binary features -> 0/1 codes.
label_encoder = LabelEncoder()
for column in bin_cols:
    data[column] = label_encoder.fit_transform(data[column])

# Categorical features -> one-hot columns.
data = pd.get_dummies(data, columns=cat_cols)

# Log-transform income before scaling (age is not part of this variant).
data['income'] = np.log(data['income'])

# Z-score scaling; mean and std are taken over the combined train+test frame.
data[num_cols] = (data[num_cols] - data[num_cols].mean()) / data[num_cols].std()

data.head()

In [218]:
# Separate the labelled rows (sample == 1) from the rows to be predicted (sample == 0).
train_data = data.query('sample == 1').drop(columns=['sample'])
test_data = data.query('sample == 0').drop(columns=['sample', 'default'])

# Select the target by name rather than by a hard-coded column position (was
# iloc[:, 39]), which silently points at another column when the dummy set changes.
features = train_data.drop(columns=['default'])
target = train_data['default'].values

# Stratified 70/30 hold-out split; the fixed seed makes reruns reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    features, target, test_size=0.3, stratify=target, random_state=42)

# Arguments that only restated scikit-learn defaults were dropped.
model = LogisticRegression(penalty='l2', C=0.001, class_weight='balanced',
                           solver='newton-cg', max_iter=1000, tol=1e-05)

model.fit(X_train, y_train)
y_pred = model.predict(X_val)

matrix = confusion_matrix(y_val, y_pred)
creport = classification_report(y_val, y_pred)
# ROC AUC needs a continuous score: use the predicted probability of the
# positive class (second predict_proba column), not the hard 0/1 labels.
ras = roc_auc_score(y_val, model.predict_proba(X_val)[:, 1])
print(creport)
print(ras)

Лучше не стало, но мы продолжим

Отбросим ещё day_of_year, income, month, car, good_work

In [233]:
# Preprocessing, variant 5: as variant 4, additionally without day_of_year,
# income, month, car and good_work.
data = data_back.copy()

# Parse the application date in one vectorised call and derive calendar features.
data['app_date'] = pd.to_datetime(data['app_date'], format='%d%b%Y')
data['day_of_week'] = data['app_date'].dt.day_of_week
data['day_of_month'] = data['app_date'].dt.day
data['day_of_year'] = data['app_date'].dt.day_of_year
data['month'] = data['app_date'].dt.month

data = data.drop(columns=['client_id', 'app_date'])
data = data.drop(columns=['day_of_month', 'age', 'day_of_week', 'sex'])
data = data.drop(columns=['day_of_year', 'income', 'month', 'car', 'good_work'])

# bfill copies the next valid value that follows a gap; assigned back because a
# chained inplace fillna acts on a temporary Series under pandas Copy-on-Write.
data['education'] = data['education'].bfill()

bin_cols = ['car_type', 'foreign_passport']
cat_cols = ['education', 'region_rating', 'home_address', 'work_address', 'sna', 'first_time']
num_cols = ['decline_app_cnt', 'score_bki', 'bki_request_cnt']

# Binary features -> 0/1 codes.
label_encoder = LabelEncoder()
for column in bin_cols:
    data[column] = label_encoder.fit_transform(data[column])

# Categorical features -> one-hot columns.
data = pd.get_dummies(data, columns=cat_cols)

# Z-score scaling; mean and std are taken over the combined train+test frame.
data[num_cols] = (data[num_cols] - data[num_cols].mean()) / data[num_cols].std()

data.head()

In [234]:
# Separate the labelled rows (sample == 1) from the rows to be predicted (sample == 0).
train_data = data.query('sample == 1').drop(columns=['sample'])
test_data = data.query('sample == 0').drop(columns=['sample', 'default'])

# Select the target by name rather than by a hard-coded column position (was
# iloc[:, 31]), which silently points at another column when the dummy set changes.
features = train_data.drop(columns=['default'])
target = train_data['default'].values

# Stratified 70/30 hold-out split; the fixed seed makes reruns reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    features, target, test_size=0.3, stratify=target, random_state=42)

# Arguments that only restated scikit-learn defaults were dropped.
model = LogisticRegression(penalty='l2', C=0.001, class_weight='balanced',
                           solver='newton-cg', max_iter=1000, tol=1e-05)

model.fit(X_train, y_train)
y_pred = model.predict(X_val)

matrix = confusion_matrix(y_val, y_pred)
creport = classification_report(y_val, y_pred)
# ROC AUC needs a continuous score: use the predicted probability of the
# positive class (second predict_proba column), not the hard 0/1 labels.
ras = roc_auc_score(y_val, model.predict_proba(X_val)[:, 1])
print(creport)
print(ras)

Стало ещё хуже, но мы в последний раз отбросим признаки: bki_request_cnt, car_type, foreign_passport

In [251]:
# Preprocessing, variant 6: as variant 5, additionally without bki_request_cnt,
# car_type and foreign_passport (no binary feature is left to encode).
data = data_back.copy()

# Parse the application date in one vectorised call and derive calendar features.
data['app_date'] = pd.to_datetime(data['app_date'], format='%d%b%Y')
data['day_of_week'] = data['app_date'].dt.day_of_week
data['day_of_month'] = data['app_date'].dt.day
data['day_of_year'] = data['app_date'].dt.day_of_year
data['month'] = data['app_date'].dt.month

data = data.drop(columns=['client_id', 'app_date'])
data = data.drop(columns=['day_of_month', 'age', 'day_of_week', 'sex'])
data = data.drop(columns=['day_of_year', 'income', 'month', 'car', 'good_work'])
data = data.drop(columns=['bki_request_cnt', 'car_type', 'foreign_passport'])

# bfill copies the next valid value that follows a gap; assigned back because a
# chained inplace fillna acts on a temporary Series under pandas Copy-on-Write.
data['education'] = data['education'].bfill()

cat_cols = ['education', 'region_rating', 'home_address', 'work_address', 'sna', 'first_time']
num_cols = ['decline_app_cnt', 'score_bki']

# Categorical features -> one-hot columns.
data = pd.get_dummies(data, columns=cat_cols)

# Z-score scaling; mean and std are taken over the combined train+test frame.
data[num_cols] = (data[num_cols] - data[num_cols].mean()) / data[num_cols].std()

data.head()

In [252]:
# Separate the labelled rows (sample == 1) from the rows to be predicted (sample == 0).
train_data = data.query('sample == 1').drop(columns=['sample'])
test_data = data.query('sample == 0').drop(columns=['sample', 'default'])

# Select the target by name rather than by a hard-coded column position (was
# iloc[:, 28]), which silently points at another column when the dummy set changes.
features = train_data.drop(columns=['default'])
target = train_data['default'].values

# Stratified 70/30 hold-out split; the fixed seed makes reruns reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    features, target, test_size=0.3, stratify=target, random_state=42)

# Arguments that only restated scikit-learn defaults were dropped.
model = LogisticRegression(penalty='l2', C=0.001, class_weight='balanced',
                           solver='newton-cg', max_iter=1000, tol=1e-05)

model.fit(X_train, y_train)
y_pred = model.predict(X_val)

matrix = confusion_matrix(y_val, y_pred)
creport = classification_report(y_val, y_pred)
# ROC AUC needs a continuous score: use the predicted probability of the
# positive class (second predict_proba column), not the hard 0/1 labels.
ras = roc_auc_score(y_val, model.predict_proba(X_val)[:, 1])
print(creport)
print(ras)

Вывод такой: ничего отбрасывать не будем, education заполним на основании соседних данных

**Выбор модели**

In [313]:
# The preprocessing chosen above: back-filled education, every feature is kept.
data = data_back.copy()

# Parse the application date in one vectorised call and derive calendar features.
data['app_date'] = pd.to_datetime(data['app_date'], format='%d%b%Y')
data['day_of_week'] = data['app_date'].dt.day_of_week
data['day_of_month'] = data['app_date'].dt.day
data['day_of_year'] = data['app_date'].dt.day_of_year
data['month'] = data['app_date'].dt.month

data = data.drop(columns=['client_id', 'app_date'])

# bfill copies the next valid value that follows a gap. The result is assigned back
# because `data.education.fillna(..., inplace=True)` acts on a temporary Series under
# pandas Copy-on-Write, and fillna(method=...) is deprecated.
# NOTE(review): a gap at the very end of the frame would stay NaN - confirm none exists.
data['education'] = data['education'].bfill()

bin_cols = ['sex', 'car', 'car_type', 'good_work', 'foreign_passport']
cat_cols = ['education', 'region_rating', 'home_address', 'work_address', 'sna', 'first_time', 'day_of_week', 'month']
num_cols = ['age', 'decline_app_cnt', 'score_bki', 'bki_request_cnt', 'income', 'day_of_month', 'day_of_year']

# Binary features -> 0/1 codes.
label_encoder = LabelEncoder()
for column in bin_cols:
    data[column] = label_encoder.fit_transform(data[column])

# Categorical features -> one-hot columns.
data = pd.get_dummies(data, columns=cat_cols)

# Log-transform age and income before scaling.
data[['age', 'income']] = np.log(data[['age', 'income']])

# Z-score scaling; mean and std are taken over the combined train+test frame.
data[num_cols] = (data[num_cols] - data[num_cols].mean()) / data[num_cols].std()

data.head()

In [314]:
# Separate the labelled rows (sample == 1) from the rows to be predicted (sample == 0).
train_data = data.query('sample == 1').drop(columns=['sample'])
test_data = data.query('sample == 0').drop(columns=['sample', 'default'])

# Select the target by name. The previous iloc[:, 28] was copied from the
# reduced-feature experiment; with the full feature set restored, position 28 is
# a feature column, so the split was stratified on the wrong variable.
features = train_data.drop(columns=['default'])
target = train_data['default'].values

# Stratified 70/30 hold-out split; the fixed seed makes reruns reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    features, target, test_size=0.3, stratify=target, random_state=42)

In [281]:
# Hyper-parameter grid for LogisticRegression: one dict per penalty, each listing
# only the solvers tried with that penalty.
# `multi_class` is intentionally not searched: per the scikit-learn docs 'auto'
# resolves to 'ovr' for a binary target (the search below scores with binary 'f1'),
# so listing both values only doubled the number of fits.
# NOTE(review): newer scikit-learn releases expect penalty=None instead of the
# string 'none' - adjust to the installed version.
c_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid = [
        {'penalty': ['l1'],
         'C': c_values,
         'solver': ['liblinear'],
         'class_weight': [None, 'balanced'],
         'max_iter': [500],
         'tol': [1e-3]},
        {'penalty': ['l2'],
         'C': c_values,
         'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
         'class_weight': [None, 'balanced'],
         'max_iter': [500],
         'tol': [1e-3]},
        {'penalty': ['none'],
         'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
         'class_weight': [None, 'balanced'],
         'max_iter': [500],
         'tol': [1e-3]},
    ]

In [282]:
# Model selection with GridSearchCV (5-fold CV, F1 score). A fresh estimator is
# passed in, so the cell no longer depends on whatever `model` an earlier cell left behind.
gridsearch = GridSearchCV(LogisticRegression(), param_grid, scoring='f1', n_jobs=-1, cv=5)
gridsearch.fit(X_train, y_train)
model = gridsearch.best_estimator_

# Print the parameters of the selected estimator.
best_parameters = model.get_params()
for param_name in sorted(best_parameters):
    print('\t%s: %r' % (param_name, best_parameters[param_name]))

Ок, выбрана модель на основе newton-cg. Проведём подбор ещё раз, на этот раз только по C, с более строгим допуском сходимости (tol=1e-5) и увеличенным max_iter=1000

In [284]:
# Refined grid: only C varies; solver, penalty, class weighting, iteration cap
# and tolerance are pinned to single values.
param_grid = [
    dict(
        penalty=['l2'],
        C=[0.001, 0.01, 0.1, 1, 10, 100, 1000],
        solver=['newton-cg'],
        class_weight=['balanced'],
        multi_class=['auto'],
        max_iter=[1000],
        tol=[1e-5],
    ),
]

In [285]:
# Second GridSearchCV pass over the refined grid (5-fold CV, F1 score). A fresh
# estimator is used as the base instead of the `model` left over from the previous search.
gridsearch = GridSearchCV(LogisticRegression(), param_grid, scoring='f1', n_jobs=-1, cv=5)
gridsearch.fit(X_train, y_train)
model = gridsearch.best_estimator_

# Print the parameters of the selected estimator.
best_parameters = model.get_params()
for param_name in sorted(best_parameters):
    print('\t%s: %r' % (param_name, best_parameters[param_name]))

In [315]:
# Refit the selected model on the training split and evaluate it on the hold-out split.
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

matrix = confusion_matrix(y_val, y_pred)
creport = classification_report(y_val, y_pred)
# ROC AUC needs a continuous score: use the predicted probability of the
# positive class (second predict_proba column), not the hard 0/1 labels.
ras = roc_auc_score(y_val, model.predict_proba(X_val)[:, 1])
print(creport)
print(ras)

In [316]:
# Feature matrix and target vector built from all labelled rows.
X = train_data.drop(columns=['default'])
y = train_data['default'].to_numpy()

In [317]:
# If the validation quality is acceptable, train the final model on all labelled data.
model.fit(X, y)

In [318]:
# Predict class labels for the test rows.
# NOTE(review): this yields hard 0/1 labels; if the competition is scored on ROC AUC,
# model.predict_proba(test_data)[:, 1] is presumably what should be submitted - confirm.
predict_submission = model.predict(test_data)

In [319]:
# Put the predictions into the sample-submission frame and write it to
# 'submission.csv' in the working directory (without the index column).
sample_submission['default'] = predict_submission
sample_submission.to_csv('submission.csv', index=False)
sample_submission.head(10)

In [320]:
# Preview the first rows of the submission frame.
sample_submission.head()

In [3]:
!kaggle competitions submit -c sf-scoring -f ssubmission.csv -m "Aleksandr Sokolkin"
# !kaggle competitions submit your-competition-name -f submission.csv -m 'My submission message'