#### Import useful libraries 

In [11]:
# Load the packages
import warnings
warnings.filterwarnings('ignore')
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import LabelEncoder
import datetime
import lightgbm as lgb
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from scipy import sparse as ssp

In [12]:
#### Define the evaluation metric. 

In [13]:
# Evaluation metric function
def eval_gini(y_true, y_prob):
    """Normalized Gini coefficient of a score vector against binary labels.

    The labels are ordered by descending score. For each positive label the
    number of negatives ranked above it is counted (the discordant pairs),
    and the total is normalised by ``n_pos * n_neg``:

        gini = 1 - 2 * discordant / (n_pos * n_neg)

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels (0/1). Converted to float64 so that counts are
        accumulated exactly even when the caller passes float32 labels.
    y_prob : array-like
        Predicted scores; only their ordering matters.

    Returns
    -------
    float
        The Gini score. NaN when the input is empty or contains a single
        class, because the normaliser ``n_pos * n_neg`` is zero.
    """
    labels = np.asarray(y_true, dtype=np.float64)
    n = labels.shape[0]
    if n == 0:
        return float('nan')

    # Highest score first, mirroring the original back-to-front traversal.
    ranked = labels[np.argsort(y_prob)][::-1]

    ntrue = ranked.sum()
    normaliser = ntrue * (n - ntrue)
    if normaliser == 0:
        # Only one class present: the score is undefined.
        return float('nan')

    # Exclusive running total of (1 - label): negatives ranked strictly above.
    negatives = 1.0 - ranked
    negatives_above = np.cumsum(negatives) - negatives
    discordant = np.dot(ranked, negatives_above)

    return 1.0 - 2.0 * discordant / normaliser

#Evaluation error function
def eval_error(preds, dtrain):
    """Custom evaluation callback returning the Gini score of ``preds``.

    Reads the labels from ``dtrain`` via ``get_label()``, scores ``preds``
    against them with ``eval_gini`` and returns a one-element list holding
    the tuple ``('gini', score, True)``. The callback is passed as ``feval``
    to ``lgb.train`` in this notebook; the trailing ``True`` is presumably
    the "higher is better" flag of that API.
    """
    return [('gini', eval_gini(dtrain.get_label(), preds), True)]

In [14]:
# Run-mode switches. None of them is read by the cells visible here, so their
# exact effect is an assumption from the names -- confirm before relying on them.
cv_only, save_cv, full_train = True, False, False



In [124]:
# Read the raw train / test tables from disk.
train = pd.read_csv('../../data/train.csv')
test = pd.read_csv('../../data/test.csv')

# Keep the identifiers and the target around as separate series; the modelling
# function further down reads them as globals.
train_id, test_id = train['id'], test['id']
Y_train = train['target']

In [132]:
# Stratified, shuffled splitter with a fixed seed.
n_folds = 5
kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=218)

y = train['target'].values
X = train.drop(['id', 'target'], axis=1)
col_names = X.columns.tolist()

# Group the feature columns by substrings of their names.
num_cols = [col for col in col_names if 'cat' not in col and 'calc' not in col]
# The 'count' exclusion keeps derived '<col>_count' columns out of cat_cols
# when this cell is re-run after those columns have been added.
cat_cols = [col for col in col_names if 'cat' in col and 'count' not in col]
ind_cols = [col for col in col_names if 'ind' in col]

# Per-row number of -1 entries, added as a feature. This must run before the
# label encoding below, which remaps the raw category values.
train['missing'] = (train == -1).sum(axis=1).astype(float)
test['missing'] = (test == -1).sum(axis=1).astype(float)
# Fixed: the original appended to the undefined name `num_features`.
# Guarded so that re-running the cell does not add the column twice.
if 'missing' not in num_cols:
    num_cols.append('missing')

# Re-encode each categorical column as integers using the train vocabulary.
# NOTE(review): transform() raises if test contains a value absent from train.
for col in cat_cols:
    le = LabelEncoder()
    le.fit(train[col])
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

# One-hot encode the categorical block (fitted on train only).
# Fixed: the original indexed with the undefined name `cat_features`.
enc = OneHotEncoder()
enc.fit(train[cat_cols])
X_train_cat = enc.transform(train[cat_cols])
X_test_cat = enc.transform(test[cat_cols])


In [133]:
# Build 'new_ind': the string form of every 'ind' column joined together,
# each value followed by '_'. Nothing is created when ind_cols is empty.
for frame in (train, test):
    for position, col in enumerate(ind_cols):
        piece = frame[col].astype(str) + '_'
        if position == 0:
            frame['new_ind'] = piece
        else:
            frame['new_ind'] = frame['new_ind'] + piece

# The original cell leaves this module-level flag behind (1 once the first
# column has been processed, otherwise 0); kept so later cells see the same state.
count = 1 if ind_cols else 0

In [134]:
# Frequency-encode every categorical column plus the combined 'new_ind' key:
# each value is replaced by how often it occurs in train and test together.
cat_count_cols = []
for col in cat_cols + ['new_ind']:
    d = pd.concat([train[col], test[col]]).value_counts().to_dict()
    count_col = '%s_count' % col
    # Values missing from the lookup get 0, matching the original d.get(x, 0).
    train[count_col] = train[col].map(d).fillna(0).astype('int64')
    test[count_col] = test[col].map(d).fillna(0).astype('int64')
    # Fixed: the original appended to the undefined `cat_count_features`
    # and formatted the name with the undefined variable `c`.
    cat_count_cols.append(count_col)

# Dense numeric + count features next to the sparse one-hot block.
# Fixed: the original referenced `X_cat` / `X_t_cat`, which are never defined;
# the one-hot matrices are created as `X_train_cat` / `X_test_cat`.
train_list = [train[num_cols + cat_count_cols].values, X_train_cat]
test_list = [test[num_cols + cat_count_cols].values, X_test_cat]

X = ssp.hstack(train_list).tocsr()
X_test = ssp.hstack(test_list).tocsr()


In [109]:
# Display the first rows of the test frame; the captured output below shows the
# derived '*_count' columns alongside the raw ones.
test.head()

Unnamed: 0,id,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,...,ps_car_03_cat_count,ps_car_04_cat_count,ps_car_05_cat_count,ps_car_06_cat_count,ps_car_07_cat_count,ps_car_08_cat_count,ps_car_09_cat_count,ps_car_10_cat_count,ps_car_11_cat_count,new_ind_count
0,0,0,1,8,2,1,0,1,0,0,...,1028142,1241334,666910,295574,1383070,1238365,883326,1475460,27800,164
1,1,4,2,5,2,1,0,0,0,1,...,1028142,1241334,389558,329890,1383070,1238365,486510,1475460,61062,8
2,2,5,1,3,1,1,0,0,0,1,...,1028142,1241334,666910,147714,1383070,1238365,883326,1475460,10957,57
3,3,0,1,6,1,1,1,0,0,0,...,1028142,1241334,666910,295574,1383070,1238365,883326,1475460,12391,477
4,4,5,1,7,1,1,0,0,0,1,...,1028142,1241334,666910,329890,1383070,1238365,883326,1475460,18416,45


In [137]:
def model_lgb(x_matrix, X_test, cv_folds=5, learning_rate=0.1, num_leaves=15,
              min_data_in_leaf=2000, verbose_eval=100, num_boost_round=10000,
              feature_fraction=0.6, early_stopping_rounds=100, n_seeds=16):
    """Seed-bagged, stratified K-fold LightGBM training with out-of-fold scoring.

    For each of ``n_seeds`` seeds a model is trained on every fold. Its
    validation predictions fill the out-of-fold vector, and its test
    predictions are averaged over the folds. Both are then averaged over the
    seeds and written to ``lgbm3_cv_avg.csv`` and ``lgbm3_pred_avg.csv``.

    Reads the notebook globals ``Y_train``, ``train_id`` and ``test_id`` and
    uses ``eval_error`` / ``eval_gini`` for scoring.

    Parameters
    ----------
    x_matrix : row-indexable matrix
        Training features, indexed as ``x_matrix[rows, :]``.
    X_test : matrix
        Test features passed to ``Booster.predict``.
    cv_folds : int
        Number of stratified folds.
    learning_rate, num_leaves, feature_fraction :
        Forwarded into the LightGBM parameter dict.
    min_data_in_leaf : int
        NOTE(review): accepted but never forwarded to LightGBM (the parameter
        dict hard-codes ``min_child_samples``). Left as-is because wiring it
        in would change the trained models -- confirm the intent.
    verbose_eval, num_boost_round, early_stopping_rounds :
        Forwarded to ``lgb.train``.
    n_seeds : int
        Number of seeds to bag over (previously hard-coded to 16).

    Returns
    -------
    None. Results are printed and written to the two CSV files.

    Raises
    ------
    ValueError
        If ``n_seeds`` is smaller than 1.
    """
    if n_seeds < 1:
        raise ValueError("n_seeds must be at least 1")

    # Positional label array: avoids label-based Series indexing surprises.
    labels = np.asarray(Y_train)

    # Fixed: the original called the *global* `kfold` and kept the lazy
    # generator, which is exhausted after the first seed. As a result the
    # remaining seeds trained nothing while the outputs were still divided
    # by 16. Materialise the splits once and reuse them for every seed.
    k_fold = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=218)
    folds = list(k_fold.split(x_matrix, labels))

    x_score = []
    final_cv_train = np.zeros(len(labels))
    final_cv_pred = np.zeros(len(test_id))

    for seed in range(n_seeds):
        # Time the whole seed, not just its last fold.
        start_time = datetime.datetime.now()
        cv_train = np.zeros(len(labels))
        cv_pred = np.zeros(len(test_id))
        params = {"objective": "binary", "boosting_type": "gbdt", "learning_rate": learning_rate,
                  "num_leaves": num_leaves, "max_bin": 256, "feature_fraction": feature_fraction,
                  "verbosity": 0, "drop_rate": 0.1, "is_unbalance": False, "max_drop": 50,
                  "min_child_samples": 10, "min_child_weight": 150, "min_split_gain": 0,
                  "subsample": 0.9}
        params['seed'] = seed

        best_trees = []
        fold_scores = []
        # Fixed: the fold loop no longer reuses the seed loop's variable, so
        # the running-average divisor below is the real number of seeds done.
        for train_fold, val in folds:
            x_train, x_val = x_matrix[train_fold, :], x_matrix[val, :]
            y_train, y_val = labels[train_fold], labels[val]
            dtrain = lgb.Dataset(x_train, y_train)
            dvalid = lgb.Dataset(x_val, y_val, reference=dtrain)
            bst = lgb.train(params, dtrain, num_boost_round, valid_sets=dvalid, feval=eval_error,
                            verbose_eval=verbose_eval, early_stopping_rounds=early_stopping_rounds)
            best_trees.append(bst.best_iteration)
            cv_pred += bst.predict(X_test, num_iteration=bst.best_iteration)
            # Use the same iteration as for the test predictions.
            cv_train[val] += bst.predict(x_val, num_iteration=bst.best_iteration)
            score = eval_gini(y_val, cv_train[val])
            print(score)
            fold_scores.append(score)

        cv_pred /= cv_folds
        final_cv_train += cv_train
        final_cv_pred += cv_pred

        seed_score = eval_gini(labels, cv_train)
        print("cv score:")
        print(seed_score)
        print("current score:", eval_gini(labels, final_cv_train / (seed + 1.)), seed + 1)
        print(fold_scores)
        print(best_trees, np.mean(best_trees))

        x_score.append(seed_score)
        end_time = datetime.datetime.now()
        print('Training Done..., Time Cost: %d' % ((end_time - start_time).seconds))

    print(x_score)
    pd.DataFrame({'id': test_id, 'target': final_cv_pred / float(n_seeds)}).to_csv('lgbm3_pred_avg.csv', index=False)
    pd.DataFrame({'id': train_id, 'target': final_cv_train / float(n_seeds)}).to_csv('lgbm3_cv_avg.csv', index=False)
        
    
    

In [138]:
# Train and score on the assembled matrices, overriding the default learning
# rate with 0.05; the other keyword values repeat the function's defaults.
model_lgb(
    X,
    X_test,
    cv_folds=5,
    learning_rate=0.05,
    num_leaves=15,
    min_data_in_leaf=2000,
    verbose_eval=100,
    num_boost_round=10000,
    feature_fraction=0.6,
    early_stopping_rounds=100,
)

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.151697	valid_0's gini: 0.287718
[200]	valid_0's binary_logloss: 0.151397	valid_0's gini: 0.294515
[300]	valid_0's binary_logloss: 0.151331	valid_0's gini: 0.295828
[400]	valid_0's binary_logloss: 0.151298	valid_0's gini: 0.296787
[500]	valid_0's binary_logloss: 0.151293	valid_0's gini: 0.296915
Early stopping, best iteration is:
[489]	valid_0's binary_logloss: 0.151281	valid_0's gini: 0.29719
0.29719018091870875
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.152373	valid_0's gini: 0.26567
[200]	valid_0's binary_logloss: 0.152188	valid_0's gini: 0.27124
[300]	valid_0's binary_logloss: 0.152172	valid_0's gini: 0.272277
Early stopping, best iteration is:
[252]	valid_0's binary_logloss: 0.152152	valid_0's gini: 0.272391
0.27239088469274586
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.152235	val