In [1]:
# 1. fastai
# 2. EMB + LightGBM
# 3. Logistic regression
# 4. DeepFM
# 5. CatBoost
# 6. H2O

In [2]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import time
import lightgbm as lgb
from bayes_opt import BayesianOptimization
from sklearn.preprocessing import LabelEncoder
import gc

In [3]:
# Notebook-level configuration.
# Name of the label column. NOTE(review): the cells visible below spell out
# 'target' literally instead of using this constant.
TARGET = 'target'
# Seed; passed to LightGBM as params['seed'].
random_state = 0 
# NOTE(review): n_estimators and learning_rate are not referenced by the cells
# visible below -- the params dict sets its own num_iterations and
# learning_rate. Confirm whether they are still needed.
n_estimators = 10000
learning_rate=0.01

In [4]:
# Read the train and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [5]:
# Full-data feature matrix and label (id and label columns removed).
X = train.drop(['id','target'], axis = 1)
y = train['target']

# Every non-numeric column is treated as categorical.
# Iterate over (column, dtype) pairs: X.dtypes is indexed by column *label*,
# so looking a dtype up by integer position (dtypes[c]) relied on a
# positional fallback that pandas has deprecated.
categorical_features = [col for col, dtype in X.dtypes.items()
                        if not np.issubdtype(dtype, np.number)]
print( len(categorical_features), X.shape, y.shape, y.mean()  )

# Cast all categorical columns in one step; the result is a new frame, so
# re-running the cell always starts again from `train`.
X = X.astype({col: 'category' for col in categorical_features})

17 (600000, 23) (600000,) 0.187205


In [6]:
# Build the train / validation frames used for early stopping.
from sklearn.model_selection import train_test_split

# 80% train / 20% validation. shuffle=False keeps the row order of `train`,
# so the validation fold is the trailing 20% of the rows.
train_df, val_df = train_test_split(train, test_size = 0.2, shuffle=False)

# Labels
train_y = train_df.target
val_y = val_df.target

# Features (id and label columns removed)
train_x = train_df.drop(['id','target'], axis = 1)
val_x = val_df.drop(['id','target'], axis = 1)

# Every non-numeric column is treated as categorical.
# Iterate over (column, dtype) pairs: dtypes is indexed by column *label*, so
# looking a dtype up by integer position (dtypes[c]) relied on a positional
# fallback that pandas has deprecated.
categorical_features = [col for col, dtype in train_x.dtypes.items()
                        if not np.issubdtype(dtype, np.number)]

# Cast the categorical columns of both frames to pandas 'category'.
as_category = {col: 'category' for col in categorical_features}
train_x = train_x.astype(as_category)
val_x = val_x.astype(as_category)


In [7]:
# LightGBM datasets for training and validation.
# The feature list is taken from train_x itself rather than from the
# exploratory frame X of an earlier cell, so this cell depends only on the
# split cell. train_x is built by dropping the same two columns, so the list
# is unchanged.
# NOTE(review): *all* feature columns, including the numeric-typed ones, are
# declared categorical here, while `categorical_features` holds only the
# non-numeric ones -- confirm this is intended.
all_features = train_x.columns.tolist()
train_data = lgb.Dataset(data=train_x, label=train_y,
                         categorical_feature=all_features, free_raw_data=False)
# reference=train_data ties the validation set to the training set, as the
# LightGBM Dataset documentation asks for validation data.
val_data = lgb.Dataset(data=val_x, label=val_y, reference=train_data,
                       categorical_feature=all_features, free_raw_data=False)

In [8]:
# LightGBM hyper-parameters for the binary classifier, scored by AUC.
params = dict(
    objective='binary',
    num_iterations=15000,
    early_stopping_round=100,
    metric='auc',
    # int() truncates, so this is 24. (A round() applied to an already
    # truncated int is a no-op.)
    num_leaves=int(24.23),
    learning_rate=0.02407,
    feature_fraction=0.102,
    # NOTE(review): no bagging_freq is set; the LightGBM docs state that
    # bagging is only enabled when bagging_freq is non-zero -- confirm.
    bagging_fraction=0.8264,
    # int() truncates, so this is 5.
    # NOTE(review): rounding 5.608 would give 6 -- confirm which was intended.
    max_depth=int(5.608),
    lambda_l1=4.948,
    lambda_l2=0.06873,
    min_split_gain=0.08549,
    min_child_weight=47.9,
    seed=random_state,
)

In [9]:
# Train with early stopping on the validation AUC, logging every 200 rounds.
#
# The boosting budget (num_iterations=15000) and the early-stopping patience
# (early_stopping_round=100) come from `params`. lgb.train gives those params
# entries priority over its num_boost_round / early_stopping_rounds arguments,
# so the previous num_boost_round=40000 was never in effect and the patience
# was specified twice. Both arguments are dropped to leave a single source of
# truth.
#
# NOTE: verbose_eval was removed in LightGBM 4.0; on those versions pass
# callbacks=[lgb.log_evaluation(200)] instead.
lgbm = lgb.train(params,
                 train_data,
                 valid_sets=[val_data],
                 verbose_eval=200,
                 )

Training until validation scores don't improve for 100 rounds.
[200]	valid_0's auc: 0.77852
[400]	valid_0's auc: 0.782627
[600]	valid_0's auc: 0.783946
[800]	valid_0's auc: 0.784686
[1000]	valid_0's auc: 0.784868
Early stopping, best iteration is:
[977]	valid_0's auc: 0.784938


In [10]:
# Score the validation rows, using only the trees up to the best
# (early-stopped) iteration.
y_val_pred = lgbm.predict(val_x, num_iteration=lgbm.best_iteration)

In [11]:
# Display the validation predictions.
y_val_pred

array([0.14577832, 0.06454615, 0.01973111, ..., 0.12721572, 0.19804599,
       0.1152337 ])

In [12]:
# Sanity check: one prediction per validation row.
y_val_pred.shape

(120000,)

In [20]:
# Test features: drop the id column, then cast the columns listed in
# `categorical_features` to pandas 'category'.
X_test = test.drop(columns=['id']).astype(
    {col: 'category' for col in categorical_features}
)

In [21]:
# Score the test rows, using only the trees up to the best (early-stopped)
# iteration.
y_pred = lgbm.predict(X_test, num_iteration=lgbm.best_iteration)

In [17]:
# Write the validation predictions to CSV as a single 'val' column, without
# the index.
val_pred = pd.DataFrame({'val': y_val_pred})
val_pred.to_csv(path_or_buf='2_val.csv', index=False)

In [26]:
# Write the test predictions to CSV as a single 'test' column, without the
# index.
test_pred = pd.DataFrame({'test': y_pred})
test_pred.to_csv(path_or_buf='2_pred.csv', index=False)