In [1]:
# !pip install xgboost lightgbm catboost seaborn kaggle

In [2]:
# !pip freeze

In [3]:
# !dir

In [27]:
# !kaggle competitions download -c credit-default-prediction-ai-big-data
# !ls

In [4]:
# Single seed reused by every randomized step in this notebook (splits, resampling, models).
GLOBAL_RANDOM_STATE = 322 # Hi, Solo

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.externals import joblib
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, recall_score, precision_score, fbeta_score, make_scorer, classification_report
from sklearn.model_selection import KFold, cross_val_score, train_test_split, ShuffleSplit, learning_curve
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
import xgboost as xgb, lightgbm as lgbm, catboost as catb

# pd.set_option('display.max_rows', 500)
# Raise pandas display limits: up to 500 columns and 1000 characters per line.
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# NOTE(review): this silences every warning for the whole session,
# including deprecation notices from the libraries imported above.
import warnings
warnings.simplefilter('ignore')



In [6]:
def get_classification_report(y_train_true, y_train_pred, y_test_true, y_test_pred):
    """Print per-split classification reports followed by the test confusion matrix.

    Args:
        y_train_true: True labels of the training split.
        y_train_pred: Predicted labels for the training split.
        y_test_true: True labels of the test/validation split.
        y_test_pred: Predicted labels for the test/validation split.

    Returns:
        None. Everything is written to standard output.
    """
    splits = (
        ('TRAIN', y_train_true, y_train_pred),
        ('TEST', y_test_true, y_test_pred),
    )
    for title, y_true, y_pred in splits:
        print(title + '\n\n' + classification_report(y_true, y_pred))

    # Rows are the true classes, columns are the predicted classes.
    confusion = pd.crosstab(y_test_true, y_test_pred)
    print('CONFUSION MATRIX\n')
    print(confusion)

In [7]:
def balance_df_by_target(df, target_name, random_state=None):
    """Oversample the least frequent class of ``target_name`` and shuffle the result.

    The number of oversampling rounds is ``int(major_count / minor_count) - 1``.
    Each round draws ``minor_count`` rows (without replacement) from the rows of
    the minority class currently in the frame -- which includes copies added by
    earlier rounds -- and appends them.

    Args:
        df: Input frame containing the target column. It is not modified.
        target_name: Name of the target column to balance by.
        random_state: Seed used for sampling and for the final shuffle.
            Defaults to the notebook-wide ``GLOBAL_RANDOM_STATE``.

    Returns:
        A new, row-shuffled DataFrame. When rows were appended the index is
        renumbered before shuffling; an empty input is returned as a copy.
    """
    if random_state is None:
        random_state = GLOBAL_RANDOM_STATE

    target_counts = df[target_name].value_counts()
    if target_counts.empty:
        # Nothing to balance (no rows, or the target is entirely missing).
        return df.copy()

    # idxmax/idxmin return the class *labels*. Series.argmax/argmin return
    # integer positions in current pandas, which only coincide with the labels
    # when the majority class happens to be 0 and the minority class 1.
    major_class_name = target_counts.idxmax()
    minor_class_name = target_counts.idxmin()
    major_class_size = int(target_counts.loc[major_class_name])
    minor_class_size = int(target_counts.loc[minor_class_name])

    disbalance_coeff = int(major_class_size / minor_class_size) - 1

    for _ in range(disbalance_coeff):
        sample = df[df[target_name] == minor_class_name].sample(minor_class_size, random_state=random_state)
        # DataFrame.append was removed in pandas 2.0; concat is the supported equivalent.
        df = pd.concat([df, sample], ignore_index=True)

    return df.sample(frac=1, random_state=random_state)

In [8]:
# Load the min-max scaled train and test CSVs from the working directory.
train_data_minmax_scaled, test_data_minmax_scaled = map(
    pd.read_csv,
    (
        "course_project_train_FIXED_MINMAX_SCALED.csv",
        "course_project_test_FIXED_MINMAX_SCALED.csv",
    ),
)

In [9]:
# Load the standard-scaled train and test CSVs from the working directory.
train_data_std_scaled, test_data_std_scaled = map(
    pd.read_csv,
    (
        "course_project_train_FIXED_STD_SCALED.csv",
        "course_project_test_FIXED_STD_SCALED.csv",
    ),
)

In [10]:
TARGET_NAME = "Credit Default"

# Model features: every training column except the target and the row identifier.
FEATURE_NAMES_SELECTED = (
    train_data_minmax_scaled.columns
    .drop([TARGET_NAME, "Id"])
    .tolist()
)

# NOTE(review): joblib.load unpickles the file, so only load artifacts you produced yourself.
FEATURES_TO_NORMALIZE = joblib.load("scaler.columns")

In [11]:
# Restore the previously saved scaler objects.
# NOTE(review): joblib.load unpickles the files, so only load artifacts you produced yourself.
minmax_scaler, std_scaler = map(joblib.load, ("minmax_scaler.save", "std_scaler.save"))

In [12]:
def _restore_original_scale(scaled_frame):
    """Return a copy of ``scaled_frame`` with min-max scaling undone on FEATURES_TO_NORMALIZE."""
    restored = scaled_frame.copy()
    restored[FEATURES_TO_NORMALIZE] = minmax_scaler.inverse_transform(scaled_frame[FEATURES_TO_NORMALIZE])
    return restored


# Unscaled versions of both data sets, rebuilt from the min-max scaled frames.
train_data = _restore_original_scale(train_data_minmax_scaled)
test_data = _restore_original_scale(test_data_minmax_scaled)

In [13]:
data_split_shuffle = True
data_split_test_size = 0.25


def _split_features_and_target(frame):
    """Split ``frame`` into (X_train, X_val, y_train, y_val) using the shared split settings."""
    return train_test_split(
        frame[FEATURE_NAMES_SELECTED],
        frame[TARGET_NAME],
        shuffle=data_split_shuffle,
        test_size=data_split_test_size,
        random_state=GLOBAL_RANDOM_STATE,
    )


# The same settings and seed are applied to all three variants of the training data.
train_data_X, val_data_X, train_data_y, val_data_y = _split_features_and_target(train_data)

(train_data_X_minmax_scaled,
 val_data_X_minmax_scaled,
 train_data_y_minmax_scaled,
 val_data_y_minmax_scaled) = _split_features_and_target(train_data_minmax_scaled)

(train_data_X_std_scaled,
 val_data_X_std_scaled,
 train_data_y_std_scaled,
 val_data_y_std_scaled) = _split_features_and_target(train_data_std_scaled)

In [14]:
# Rebalance the unscaled training split only; the validation split is left as is.
tmp_df_balanced = balance_df_by_target(
    pd.concat([train_data_X, train_data_y], axis=1),
    TARGET_NAME,
)
train_data_y_balanced = tmp_df_balanced[TARGET_NAME]
train_data_X_balanced = tmp_df_balanced.drop(columns=[TARGET_NAME])

In [15]:
# Rebalance the min-max scaled training split only; the validation split is left as is.
tmp_df_balanced = balance_df_by_target(
    pd.concat([train_data_X_minmax_scaled, train_data_y_minmax_scaled], axis=1),
    TARGET_NAME,
)
train_data_y_minmax_scaled_balanced = tmp_df_balanced[TARGET_NAME]
train_data_X_minmax_scaled_balanced = tmp_df_balanced.drop(columns=TARGET_NAME)

In [16]:
# Rebalance the standard-scaled training split only; the validation split is left as is.
tmp_df_balanced = balance_df_by_target(
    pd.concat([train_data_X_std_scaled, train_data_y_std_scaled], axis=1),
    TARGET_NAME,
)
train_data_y_std_scaled_balanced = tmp_df_balanced[TARGET_NAME]
train_data_X_std_scaled_balanced = tmp_df_balanced.drop(columns=TARGET_NAME)

In [17]:
# Attempt 1
parameters = {
    "border_count": 32,
    "depth": 3,
    "learning_rate": 0.5,
    "l2_leaf_reg": 1,
    "iterations": 250
}

# Attempt 2
parameters = {
    "border_count": 32,
    "depth": 3,
    "learning_rate": 0.5,
    "l2_leaf_reg": 2,
    "iterations": 1000
}

# Attempt 3
parameters = {
    "border_count": 32,
    "depth": 6,
    "learning_rate": 0.2,
    "l2_leaf_reg": 300,
    "iterations": 500
}

# Attempt 4
parameters = {
    "border_count": 32,
    "depth": 6,
    "learning_rate": 0.3,
    "l2_leaf_reg": 300,
    "iterations": 1000
}

# Attempt 5
parameters = {
    "border_count": 32,
    "depth": 6,
    "learning_rate": 0.001,
    "l2_leaf_reg": 300,
    "iterations": 10000
}

In [18]:
# Fit on the rebalanced standard-scaled training split; evaluate on the untouched validation split.
X_train, y_train = train_data_X_std_scaled_balanced, train_data_y_std_scaled_balanced
X_test, y_test = val_data_X_std_scaled, val_data_y_std_scaled

In [19]:
# CatBoost classifier for the standard-scaled data; F1 is tracked on the eval_set passed to fit().
# NOTE(review): the same validation split is used for use_best_model selection and for the
# report below, so the TEST figures are likely optimistic.
gpu_model_std_scaled = catb.CatBoostClassifier(
    silent=True,
    random_state=GLOBAL_RANDOM_STATE,
    task_type="GPU",
    devices='0',
    use_best_model=True,
    eval_metric="F1",
    **parameters,
)
gpu_model_std_scaled.fit(X_train, y_train, eval_set=(X_test, y_test))

# Predict the training split first, then the validation split.
y_train_pred, y_test_pred = (gpu_model_std_scaled.predict(features) for features in (X_train, X_test))
get_classification_report(y_train, y_train_pred, y_test, y_test_pred)

TRAIN

              precision    recall  f1-score   support

           0       0.72      0.84      0.78      4045
           1       0.74      0.58      0.65      3160

    accuracy                           0.73      7205
   macro avg       0.73      0.71      0.71      7205
weighted avg       0.73      0.73      0.72      7205

TEST

              precision    recall  f1-score   support

           0       0.82      0.83      0.82      1342
           1       0.56      0.54      0.55       533

    accuracy                           0.75      1875
   macro avg       0.69      0.68      0.69      1875
weighted avg       0.74      0.75      0.75      1875

CONFUSION MATRIX

col_0            0.0  1.0
Credit Default           
0               1113  229
1                246  287


In [20]:
# Score the competition test set using only the feature columns the model was fitted on,
# rather than the whole frame (which also carries non-feature columns such as "Id").
y_pred_gpu_std_scaled = gpu_model_std_scaled.predict(test_data_std_scaled[FEATURE_NAMES_SELECTED])

In [21]:
# Fit on the rebalanced min-max scaled training split; evaluate on the untouched validation split.
X_train, y_train = train_data_X_minmax_scaled_balanced, train_data_y_minmax_scaled_balanced
X_test, y_test = val_data_X_minmax_scaled, val_data_y_minmax_scaled

In [22]:
# CatBoost classifier for the min-max scaled data; F1 is tracked on the eval_set passed to fit().
# NOTE(review): the same validation split is used for use_best_model selection and for the
# report below, so the TEST figures are likely optimistic.
gpu_model_minmax_scaled = catb.CatBoostClassifier(
    silent=True,
    random_state=GLOBAL_RANDOM_STATE,
    task_type="GPU",
    devices='0',
    use_best_model=True,
    eval_metric="F1",
    **parameters,
)
gpu_model_minmax_scaled.fit(X_train, y_train, eval_set=(X_test, y_test))

# Predict the training split first, then the validation split.
y_train_pred, y_test_pred = (gpu_model_minmax_scaled.predict(features) for features in (X_train, X_test))
get_classification_report(y_train, y_train_pred, y_test, y_test_pred)

TRAIN

              precision    recall  f1-score   support

           0       0.72      0.84      0.78      4045
           1       0.74      0.58      0.65      3160

    accuracy                           0.73      7205
   macro avg       0.73      0.71      0.71      7205
weighted avg       0.73      0.73      0.72      7205

TEST

              precision    recall  f1-score   support

           0       0.82      0.83      0.82      1342
           1       0.56      0.54      0.55       533

    accuracy                           0.75      1875
   macro avg       0.69      0.68      0.69      1875
weighted avg       0.74      0.75      0.75      1875

CONFUSION MATRIX

col_0            0.0  1.0
Credit Default           
0               1113  229
1                246  287


In [23]:
# The min-max model must be scored on the min-max scaled test set (it was previously fed the
# standard-scaled one). Only the feature columns the model was fitted on are passed.
y_pred_gpu_minmax_scaled = gpu_model_minmax_scaled.predict(test_data_minmax_scaled[FEATURE_NAMES_SELECTED])

In [24]:
# Check whether the two models produced identical predictions for the test set.
np.array_equal(y_pred_gpu_std_scaled, y_pred_gpu_minmax_scaled)

False

In [25]:
# Peek at the sample submission to see the expected file layout.
# NOTE(review): absolute, machine-specific path; it will not resolve on other machines.
!head /tf/notebooks/GeekBrains_AI_Python_for_data_science_2/course_project/credit-default-prediction-ai-big-data/sampleSubmission.csv

Id,Credit Default
7500,0
7501,0
7502,0
7503,0
7504,0
7505,0
7506,0
7507,0
7508,0


In [26]:
# Attach the standard-scaled model's predictions (cast to int) as the target column.
# NOTE(review): the values are presumably a plain array assigned by position, so this assumes
# both test CSVs list the same rows in the same order; confirm.
# NOTE(review): this mutates the loaded test frame in place; later cells read the new column.
test_data_minmax_scaled['Credit Default'] = y_pred_gpu_std_scaled.astype(int)
test_data_minmax_scaled

Unnamed: 0,Id,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Bankruptcies,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,HomeOwnership_Home Mortgage,HomeOwnership_Rent,was_credit_problem,delinquent_0.5,delinquent_1,delinquent_2,delinquent_3,delinquent_5,delinquent,was_bankrupt,Purpose_business loan,Purpose_buy a car,Purpose_debt consolidation,Purpose_home improvements,Purpose_other,very_bad_credit_score,Credit Default
0,7500,0.100532,0.4,0.0,0.250000,0.184783,0.165792,0.0,0.0,0,0.194433,0.052944,0.085163,0.351807,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
1,7501,0.006734,0.1,0.0,0.142857,0.623913,0.031479,0.0,0.0,0,0.086214,0.023009,0.028975,0.274699,0,1,0,0,1,1,1,1,1,0,0,0,0,0,1,0,1
2,7502,0.098945,0.3,0.0,0.285714,0.210870,0.152470,0.0,0.0,0,0.242915,0.073236,0.234113,1.000000,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1
3,7503,0.105733,1.0,0.0,0.500000,0.282609,0.357334,0.0,0.0,0,0.265034,0.106591,0.344488,0.371084,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
4,7504,0.217968,0.6,0.0,0.321429,0.426087,0.968557,0.0,0.0,1,0.985348,0.212688,0.532563,0.291566,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,9995,0.085676,1.0,0.0,0.428571,0.545652,0.441044,1.0,1.0,0,0.327318,0.081359,0.188075,0.385542,1,0,1,0,0,0,0,0,1,1,0,0,1,0,0,0,0
2496,9996,0.100532,0.2,0.0,0.464286,0.282609,1.000000,0.0,0.0,0,0.587911,0.719632,0.412450,0.351807,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
2497,9997,0.100875,0.2,0.0,1.000000,0.191304,1.000000,0.0,0.0,0,0.539034,0.338213,0.461400,0.265060,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2498,9998,0.055978,1.0,0.0,0.428571,0.539130,0.755726,0.0,0.0,0,0.316966,0.195616,0.168825,0.383133,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [28]:
# Preview the two columns that make up the submission.
test_data_minmax_scaled[['Id', 'Credit Default']]

Unnamed: 0,Id,Credit Default
0,7500,0
1,7501,1
2,7502,1
3,7503,0
4,7504,0
...,...,...
2495,9995,0
2496,9996,0
2497,9997,0
2498,9998,0


In [29]:
# Write the submission file with exactly the "Id" and "Credit Default" columns.
# index=False is the documented way to omit the row index (the parameter is a bool).
test_data_minmax_scaled.loc[:, ['Id', 'Credit Default']].to_csv('NPomazan_predictions.csv', index=False)

In [32]:
# !kaggle competitions submit -c credit-default-prediction-ai-big-data -m "Submission from Jupyter Notebook" -f NPomazan_predictions.csv

100%|██████████████████████████████████████| 17.1k/17.1k [00:08<00:00, 2.08kB/s]
Successfully submitted to Loan Default Prediction