# TRAINING

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
import lightgbm as lgb
import pickle

In [3]:
# Load the training table; the `id` column becomes the row index.
df = pd.read_csv('train.csv', sep=',', index_col=['id'])

# Integer codes for the three string-valued columns.
va = {'> 2 Years': 2, '1-2 Year': 1, '< 1 Year': 0}
gen = {'Male': 0, 'Female': 1}
vg = {'Yes': 1, 'No': 0}

# Swap each string column for its integer code in a single step.
# `Series.map` yields NaN for any value that is absent from the mapping.
df = df.assign(
    Vehicle_Age=df['Vehicle_Age'].map(va),
    Gender=df['Gender'].map(gen),
    Vehicle_Damage=df['Vehicle_Damage'].map(vg),
)

# Columns that are min-max scaled before training.
num_feat = ['Age', 'Vintage']

# Columns that are passed to the model without scaling.
cat_feat = [
    'Gender',
    'Previously_Insured',
    'Vehicle_Age',
    'Vehicle_Damage',
    'Driving_License',
    'Policy_Sales_Channel',
    'Region_Code',
]



In [4]:
# Rescale the numeric columns to [0, 1]; the scaler is fitted on the full
# training frame.
# NOTE(review): `scl` is not persisted in the visible cells — anything that
# scores new data with the saved model would need the same fitted scaler.
scl = MinMaxScaler()

# Build the scaled frame with the original row index and column names
# attached directly in the constructor.
num_scl = pd.DataFrame(
    scl.fit_transform(df[num_feat]),
    index=df.index,
    columns=num_feat,
)

# Final feature matrix: scaled numeric columns first, then the remaining
# feature columns unchanged.
X_ = pd.concat([num_scl, df[cat_feat]], axis=1)
X_.head()

Unnamed: 0_level_0,Age,Vintage,Gender,Previously_Insured,Vehicle_Age,Vehicle_Damage,Driving_License,Policy_Sales_Channel,Region_Code
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0.369231,0.716263,0,0,2,1,1,26.0,28.0
2,0.861538,0.598616,0,0,1,0,1,26.0,3.0
3,0.415385,0.058824,0,0,2,1,1,26.0,28.0
4,0.015385,0.66782,0,1,0,0,1,152.0,11.0
5,0.138462,0.100346,1,1,0,0,1,152.0,41.0


In [5]:
# Target vector: the `Response` column of the training frame.
y = df['Response']

# Sanity check: features and target must have the same number of rows.
X_.shape, y.shape

((381109, 9), (381109,))

In [6]:
# LightGBM training parameters, grouped by purpose.
params = {
    # Task, booster type and evaluation metric.
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'auc',
    'verbosity': -1,

    # Tree shape.
    # NOTE(review): a tree of depth 6 has at most 2**6 = 64 leaves, so
    # num_leaves=80 is not the binding limit here — confirm this is intended.
    'num_leaves': 80,
    'max_depth': 6,
    'min_child_weight': 1,

    # L1 / L2 regularisation.
    'reg_alpha': 0.9,
    'reg_lambda': 1.8,

    # Column and row subsampling.
    # NOTE(review): LightGBM documents that bagging_fraction only takes effect
    # when bagging_freq is non-zero; bagging_freq is not set in this dict —
    # confirm whether row bagging was meant to be active.
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_seed': 23,

    # Boosting schedule: step size and number of boosting rounds.
    'learning_rate': 0.12,
    'num_iterations': 90,
}

In [7]:
# Cross-validation setup.
n_folds = 5

# Pass n_folds to KFold explicitly so the number of splits and the divisor
# used for the mean score can never drift apart, and fix random_state so the
# shuffled folds (and therefore the reported AUCs) are reproducible.
# 23 is the same value used for `bagging_seed` in the LightGBM params.
fold = KFold(n_splits=n_folds, shuffle=True, random_state=23)

# Materialise the (train_index, valid_index) pairs: a bare generator from
# `fold.split` is exhausted after one pass, which would make a re-run of the
# training cell silently iterate over nothing.
splits = list(fold.split(X_, y))

columns = X_.columns

# Out-of-fold prediction buffers (one slot per training row) and the running
# mean-AUC accumulator.
oof = np.zeros(X_.shape[0])
score = 0
y_oof = np.zeros(X_.shape[0])

# One row per feature; the training loop adds one importance column per fold.
feature_importances = pd.DataFrame({'feature': columns})

In [8]:
# Train LightGBM on each fold. Pay attention to the arguments of lgb.train
# and lgb.Dataset.

# Reset the accumulators so that re-running this cell starts from scratch
# instead of adding to the results of a previous run.
score = 0
y_oof = np.zeros(X_.shape[0])
fold_scores = []

# `num_iterations` inside params takes precedence over the round count passed
# to lgb.train (LightGBM warns and ignores the argument), so resolve the value
# once and pass it explicitly; 80 is only the fallback when params has no
# `num_iterations` entry.
train_params = dict(params)
num_boost_round = train_params.pop('num_iterations', 80)

# Ask the splitter for fresh index pairs here so the loop does not depend on
# a one-shot generator created in another cell.
for fold_n, (train_index, valid_index) in enumerate(fold.split(X_, y)):
    X_train, X_valid = X_.iloc[train_index], X_.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_valid, label=y_valid)

    # `verbose_eval` was removed from lgb.train in LightGBM 4.0; the
    # log_evaluation callback (available since 3.3) is its replacement.
    clf = lgb.train(
        train_params,
        dtrain,
        num_boost_round=num_boost_round,
        valid_sets=[dtrain, dvalid],
        callbacks=[lgb.log_evaluation(period=100)],
    )

    feature_importances[f'fold_{fold_n + 1}'] = clf.feature_importance()

    # Store this fold's validation predictions in their original row slots.
    y_pred_valid = clf.predict(X_valid)
    y_oof[valid_index] = y_pred_valid

    # Score the fold once and reuse the value for both the log line and the mean.
    fold_auc = roc_auc_score(y_valid, y_pred_valid)
    fold_scores.append(fold_auc)
    print(f"Fold {fold_n + 1} | AUC: {fold_auc}")

# Average over the folds that actually ran, not over a separately kept count.
score = sum(fold_scores) / len(fold_scores)

print(f"\nMean AUC = {score}")
print(f"Out of folds AUC = {roc_auc_score(y, y_oof)}")

Fold 1 | AUC: 0.8578106419717229
Fold 2 | AUC: 0.8588193518587356
Fold 3 | AUC: 0.8555661883073996
Fold 4 | AUC: 0.8580452057000232
Fold 5 | AUC: 0.8581888861949065

Mean AUC = 0.8576860548065576
Out of folds AUC = 0.8576623133465945


In [9]:
# Persist the trained model to disk. The `with` block guarantees the file
# handle is closed even if pickling raises.
# NOTE(review): `clf` is whatever the training loop assigned last, i.e. the
# model from the final fold only — confirm that is the model meant to be saved.
with open('clf.pkl', 'wb') as pickle_out:
    pickle.dump(clf, pickle_out)