In [67]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import catboost
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold

In [326]:
# Load the main table and the per-incident table, then join them on the
# shared 'incident' column (pandas' default inner join).
model = pd.read_csv('model.csv')
data = pd.read_csv('data.csv').merge(model, on='incident')

In [327]:
# Split APPLICATION_MONTH into two integer columns by character position:
# characters 0-3 become 'y', everything from position 5 onward becomes 'm'.
# NOTE(review): presumably the raw value looks like 'YYYY-MM' — confirm.
month_text = data.APPLICATION_MONTH.apply(str)
data['y'] = month_text.apply(lambda text: int(text[:4]))
data['m'] = month_text.apply(lambda text: int(text[5:]))

In [329]:
# Positional split of the merged frame: the first n_train rows form the
# training set, the remainder the test set.
# NOTE(review): assumes the merge kept the original row order and count — verify.
n_train = 81617
train = data.iloc[:n_train, :]
test = data.iloc[n_train:, :]

In [330]:
# Column 'f' of the spreadsheet lists the columns to exclude from the
# feature matrices built later in the notebook.
temp = pd.read_excel('drop_cols.xlsx')
drop_cols = temp['f'].values

### LightGBM

In [332]:
# Shuffled 5-fold splitter for the LightGBM run; the fixed seed keeps the
# fold assignment reproducible across re-runs.
folds = KFold(
    n_splits=5,
    random_state=235,
    shuffle=True,
)

In [333]:
# LightGBM training parameters: gradient-boosted trees, regression
# objective, MAE as the evaluation metric, library logging silenced.
param = dict(
    boosting="gbdt",
    objective="regression",
    learning_rate=0.1,
    metric="mae",
    # max_depth=7,        (disabled)
    verbosity=-1,
    # lambda_l1=0.7,      (disabled)
    bagging_seed=11,
)

In [334]:
# K-fold LightGBM training.
#   mvalid - out-of-fold predictions for every training row
#   mfull  - test-set predictions averaged over the folds
mvalid = np.zeros(len(train))
mfull  = np.zeros(len(test))
y = train['PD']

# The feature frames and the round budget are the same for every fold, so
# build them once instead of re-dropping the columns several times per fold.
train_features = train.drop(drop_cols, axis=1)
test_features = test.drop(drop_cols, axis=1)
num_round = 2500

# KFold only needs the number of samples; it ignores any target passed to split().
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_features)):
    print('----')
    print("fold n°{}".format(fold_))

    # KFold yields *positional* indices, so rows and labels are both selected
    # with .iloc. Plain y[idx] is label-based and is only correct while the
    # index happens to be 0..n-1.
    x0, y0 = train_features.iloc[trn_idx], y.iloc[trn_idx]
    x1, y1 = train_features.iloc[val_idx], y.iloc[val_idx]

    trn_data = lgb.Dataset(x0, label=y0)
    val_data = lgb.Dataset(x1, label=y1)

    # Stop once the validation metric has not improved for 250 rounds; log every 100.
    # NOTE(review): newer LightGBM releases expect callbacks instead of the
    # verbose_eval / early_stopping_rounds arguments — confirm the installed version.
    clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data],
                    verbose_eval=100, early_stopping_rounds=250)

    # Predict with the best iteration found by early stopping.
    mvalid[val_idx] = clf.predict(x1, num_iteration=clf.best_iteration)
    mfull += clf.predict(test_features, num_iteration=clf.best_iteration) / folds.n_splits

# Out-of-fold MAE over the whole training set (y_true first, y_pred second).
mean_absolute_error(y, mvalid)

----
fold n°0
Training until validation scores don't improve for 250 rounds
[100]	training's l1: 0.0243842	valid_1's l1: 0.0274686
[200]	training's l1: 0.0225486	valid_1's l1: 0.0273326
[300]	training's l1: 0.0212734	valid_1's l1: 0.0274529
[400]	training's l1: 0.020276	valid_1's l1: 0.0275817
Early stopping, best iteration is:
[177]	training's l1: 0.0228886	valid_1's l1: 0.0273171
----
fold n°1
Training until validation scores don't improve for 250 rounds
[100]	training's l1: 0.0245503	valid_1's l1: 0.0277597
[200]	training's l1: 0.0226309	valid_1's l1: 0.0276102
[300]	training's l1: 0.0213102	valid_1's l1: 0.0276828
[400]	training's l1: 0.0202912	valid_1's l1: 0.0277801
Early stopping, best iteration is:
[191]	training's l1: 0.0227613	valid_1's l1: 0.0275973
----
fold n°2
Training until validation scores don't improve for 250 rounds
[100]	training's l1: 0.0246701	valid_1's l1: 0.0268242
[200]	training's l1: 0.0227629	valid_1's l1: 0.026729
[300]	training's l1: 0.0215107	valid_1's l1:

0.02732639832739961

In [335]:
#pd.DataFrame(mfull).to_csv('sub38.csv', header=None, index=None)

### CatBoost

In [336]:
# Number of folds and the shuffled, seeded splitter used for the CatBoost run.
K = 5
kf = KFold(n_splits=K, shuffle=True, random_state=1)

In [337]:
# CatBoost regressor configuration: MAE loss, depth-8 trees, a small learning
# rate with a large iteration budget, silent training, GPU task type.
# Note: this rebinds `model`, which earlier held the frame read from model.csv.
catboost_params = dict(
    learning_rate=0.01,
    depth=8,
    # l2_leaf_reg=14,     (disabled)
    iterations=9000,
    verbose=False,
    loss_function='MAE',
    task_type="GPU",
)
model = catboost.CatBoostRegressor(**catboost_params)

In [338]:
# K-fold CatBoost training. Each fold refits `model` on the training part,
# reports the validation MAE, and adds its test-set prediction to
# y_test_pred, which is averaged over the folds at the end.
X = train.drop(drop_cols, axis=1).values
y = train['PD'].values
# The test features are identical for every fold, so build the frame once
# rather than re-dropping the columns inside the loop.
X_test = test.drop(drop_cols, axis=1)
y_test_pred  = np.zeros(len(test))

# `test_index` holds the validation rows of the current fold (not the test set).
for i, (train_index, test_index) in enumerate(kf.split(X)):
    y_train, y_valid = y[train_index], y[test_index]
    X_train, X_valid = X[train_index,:], X[test_index,:]
    print( "\nFold ", i)

    # The validation part drives early stopping; use_best_model keeps the
    # best iteration on eval_set.
    fit_model = model.fit( X_train, y_train,
                           eval_set=(X_valid, y_valid),
                           use_best_model=True,
                           early_stopping_rounds=300
                         )
    print( "  N trees = ", model.tree_count_ )

    pred = fit_model.predict(X_valid)
    print( "  MAE = ", mean_absolute_error(y_valid, pred) )
    y_test_pred += fit_model.predict(X_test)

# Average over the number of folds the splitter actually produced, so the
# divisor cannot drift from `kf` if only one of the two cells is edited.
y_test_pred /= kf.n_splits


Fold  0
  N trees =  5329
  MAE =  0.02357395312553761

Fold  1
  N trees =  4242
  MAE =  0.024242401063470972

Fold  2
  N trees =  5098
  MAE =  0.024219867315368283

Fold  3
  N trees =  6632
  MAE =  0.0243295441923367

Fold  4
  N trees =  4206
  MAE =  0.023805178504947


In [306]:
#pd.DataFrame(y_test_pred).to_csv('sub36.csv', header=None, index=None)

In [359]:
# Equal-weight average of the CatBoost and LightGBM test predictions,
# then scaled by a constant factor of 1.065.
# NOTE(review): the 1.065 multiplier is not derived anywhere in this notebook — confirm its origin.
blend = 0.5*y_test_pred + 0.5*mfull
pred = 1.065*blend

In [360]:
# Write the blended predictions as a single column, without header or index.
submission = pd.DataFrame(pred)
submission.to_csv('sub.csv', header=None, index=None)