In [1]:
import numpy as np
import pandas as pd
import time
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
%%time
df_train = pd.read_csv('data/train_clean.csv')
df_test = pd.read_csv('data/test_clean.csv')

CPU times: user 7.1 s, sys: 607 ms, total: 7.7 s
Wall time: 7.79 s


In [3]:
df_train = df_train[df_train['outliers'] == 0]
target = df_train['target']
del df_train['target']
features = [c for c in df_train.columns if c not in ['card_id', 'first_active_month','outliers']]
categorical_feats = [c for c in features if 'feature_' in c]

In [4]:
param = {'objective':'regression',
         'num_leaves': 31,
         'min_data_in_leaf': 25,
         'max_depth': 7,
         'learning_rate': 0.01,
         'lambda_l1':0.13,
         "boosting": "gbdt",
         "feature_fraction":0.85,
         'bagging_freq':8,
         "bagging_fraction": 0.9 ,
         "metric": 'rmse',
         "verbosity": -1,
         "random_state": 2333}

In [5]:
%%time
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2333)
oof = np.zeros(len(df_train))
predictions = np.zeros(len(df_test))
feature_importance_df = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train,df_train['outliers'].values)):
    print("fold {}".format(fold_))
    trn_data = lgb.Dataset(df_train.iloc[trn_idx][features], label=target.iloc[trn_idx])#, categorical_feature=categorical_feats)
    val_data = lgb.Dataset(df_train.iloc[val_idx][features], label=target.iloc[val_idx])#, categorical_feature=categorical_feats)

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval= 100, early_stopping_rounds = 200)
    oof[val_idx] = clf.predict(df_train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    predictions += clf.predict(df_test[features], num_iteration=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.5f}".format(mean_squared_error(oof, target)**0.5))

fold 0
Training until validation scores don't improve for 200 rounds.
[100]	training's rmse: 1.60687	valid_1's rmse: 1.61781
[200]	training's rmse: 1.57609	valid_1's rmse: 1.59108
[300]	training's rmse: 1.56139	valid_1's rmse: 1.58041
[400]	training's rmse: 1.55194	valid_1's rmse: 1.57473
[500]	training's rmse: 1.5448	valid_1's rmse: 1.57109
[600]	training's rmse: 1.53907	valid_1's rmse: 1.56879
[700]	training's rmse: 1.53402	valid_1's rmse: 1.56736
[800]	training's rmse: 1.52968	valid_1's rmse: 1.56635
[900]	training's rmse: 1.52575	valid_1's rmse: 1.56572
[1000]	training's rmse: 1.52214	valid_1's rmse: 1.56506
[1100]	training's rmse: 1.51854	valid_1's rmse: 1.56466
[1200]	training's rmse: 1.51517	valid_1's rmse: 1.56431
[1300]	training's rmse: 1.51177	valid_1's rmse: 1.56397
[1400]	training's rmse: 1.50867	valid_1's rmse: 1.56369
[1500]	training's rmse: 1.50515	valid_1's rmse: 1.56342
[1600]	training's rmse: 1.50207	valid_1's rmse: 1.56345
[1700]	training's rmse: 1.49904	valid_1's rm

In [6]:
model_without_outliers = pd.DataFrame({"card_id":df_test["card_id"].values})
model_without_outliers["target"] = predictions

In [7]:
%%time
df_train = pd.read_csv('data/train_clean.csv')
df_test = pd.read_csv('data/test_clean.csv')

CPU times: user 7.13 s, sys: 527 ms, total: 7.66 s
Wall time: 7.73 s


In [8]:
target = df_train['outliers']
del df_train['outliers']
del df_train['target']

In [9]:
features = [c for c in df_train.columns if c not in ['card_id', 'first_active_month']]
categorical_feats = [c for c in features if 'feature_' in c]

In [10]:
param = {'num_leaves': 31,
         'min_data_in_leaf': 30, 
         'objective':'binary',
         'max_depth': 6,
         'learning_rate': 0.01,
         "boosting": "rf",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'binary_logloss',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "random_state": 2333}

In [11]:
%%time
folds = KFold(n_splits=5, shuffle=True, random_state=15)
oof = np.zeros(len(df_train))
predictions = np.zeros(len(df_test))
feature_importance_df = pd.DataFrame()

start = time.time()


for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train.values, target.values)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(df_train.iloc[trn_idx][features], label=target.iloc[trn_idx], categorical_feature=categorical_feats)
    val_data = lgb.Dataset(df_train.iloc[val_idx][features], label=target.iloc[val_idx], categorical_feature=categorical_feats)

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds = 200)
    oof[val_idx] = clf.predict(df_train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    predictions += clf.predict(df_test[features], num_iteration=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.5f}".format(log_loss(target, oof)))

fold n°0




Training until validation scores don't improve for 200 rounds.
[100]	training's binary_logloss: 0.0451366	valid_1's binary_logloss: 0.0477163
[200]	training's binary_logloss: 0.045025	valid_1's binary_logloss: 0.047604
[300]	training's binary_logloss: 0.0449807	valid_1's binary_logloss: 0.0475286
[400]	training's binary_logloss: 0.0449997	valid_1's binary_logloss: 0.0475235
[500]	training's binary_logloss: 0.0450205	valid_1's binary_logloss: 0.0475456
Early stopping, best iteration is:
[309]	training's binary_logloss: 0.0449743	valid_1's binary_logloss: 0.0475139
fold n°1
Training until validation scores don't improve for 200 rounds.
[100]	training's binary_logloss: 0.045428	valid_1's binary_logloss: 0.046152
[200]	training's binary_logloss: 0.0453471	valid_1's binary_logloss: 0.0461251
Early stopping, best iteration is:
[63]	training's binary_logloss: 0.0453754	valid_1's binary_logloss: 0.0460797
fold n°2
Training until validation scores don't improve for 200 rounds.
[100]	training's 

In [12]:
### 'target' is the probability of whether an observation is an outlier
df_outlier_prob = pd.DataFrame({"card_id":df_test["card_id"].values})
df_outlier_prob["target"] = predictions
df_outlier_prob.head()

Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,0.045358
1,C_ID_130fd0cbdd,0.001709
2,C_ID_b709037bc5,0.006204
3,C_ID_d27d835a9f,0.001709
4,C_ID_2b5e3df5c2,0.001709


In [13]:
# In case missing some predictable outlier, we choose top 25000 with highest outliers likelyhood.
outlier_id = pd.DataFrame(df_outlier_prob.sort_values(by='target',ascending = False).head(25000)['card_id'])

In [19]:
best_submission = pd.read_csv('data/blend/elo-blending/3.695.csv')

In [20]:
most_likely_liers = best_submission.merge(outlier_id,how='right')
most_likely_liers.head()

Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,-2.502326
1,C_ID_6d8dba8475,-0.893964
2,C_ID_7f1041e8e1,-4.872942
3,C_ID_22e4a47c72,0.393946
4,C_ID_b54cfad8b2,-0.656266


In [21]:
%%time
for card_id in most_likely_liers['card_id']:
    model_without_outliers.loc[model_without_outliers['card_id']==card_id,'target']\
    = most_likely_liers.loc[most_likely_liers['card_id']==card_id,'target'].values

CPU times: user 6min 55s, sys: 4.9 s, total: 7min
Wall time: 46min 15s


In [22]:
model_without_outliers.to_csv("combining_submission.csv", index=False)