In [1]:
# --- Standard library ---
import os
from warnings import filterwarnings

# --- Numerics / data frames ---
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# --- scikit-learn building blocks ---
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_predict
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler

# --- Model and categorical encoding ---
# NOTE: OneHotEncoder comes from category_encoders, not from sklearn.preprocessing.
from lightgbm import LGBMClassifier
from category_encoders import OneHotEncoder

# Suppress every warning emitted from this point on.
filterwarnings('ignore')

# List the files available in the input directory.
print(os.listdir("../input"))

['train.csv', 'sample_submission.csv', 'test.csv']


In [2]:
# Read both competition files first, then report their (rows, columns) shapes.
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")

print("train shape", train.shape)
print("test shape", test.shape)

train shape (400665, 36)
test shape (399335, 35)


In [3]:
# Names of the label column and the row-identifier column; neither is a feature.
target_column = "target"
id_column = "id"

# Split the columns of `test` by dtype.
#
# The previous implementation compared dtypes against `np.object`, `np.float`
# and `np.int`. Those were deprecated aliases of the Python builtins and have
# been removed from NumPy (AttributeError on current releases), so the dtype
# split is done with `DataFrame.select_dtypes` instead. Column order is
# preserved.
#
# Categorical features: every object-dtype column.
categorical_cols = test.select_dtypes(include="object").columns.tolist()

# Numerical features: every numeric column except the id/target columns.
numerical_cols = [
    c
    for c in test.select_dtypes(include="number").columns
    if c not in (target_column, id_column)
]

print("Number of features", len(categorical_cols)+len(numerical_cols))

Number of features 34


Recent versions of scikit-learn introduced `ColumnTransformer`, which offers a very compact way to define an end-to-end solution.

In [4]:
# End-to-end model: per-column preprocessing followed by a LightGBM classifier.
#   * "num": standardise the numerical feature columns.
#   * "cat": one-hot encode the categorical feature columns
#            (OneHotEncoder here is the category_encoders implementation).
# Only the columns listed in the two transformers are passed on to the model.
classifier = make_pipeline(
    ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),
            ('cat', OneHotEncoder(), categorical_cols),
        ]
    ),
    LGBMClassifier(n_jobs=-1),
)

In [5]:
%%time
# Out-of-fold class probabilities from 5-fold cross-validation.
# The whole `train` frame is passed as X; the pipeline's ColumnTransformer
# picks out the feature columns it was configured with.
oof_pred = cross_val_predict(
    classifier,
    X=train,
    y=train[target_column],
    cv=5,
    method="predict_proba",
)

CPU times: user 1min 37s, sys: 7.16 s, total: 1min 44s
Wall time: 41.9 s


In [6]:
# ROC AUC of the out-of-fold probabilities against the training labels.
# Column 1 of the predict_proba output is used (presumably the positive class).
cv_auc = roc_auc_score(train[target_column], oof_pred[:, 1])
print("Cross validation AUC {:.4f}".format(cv_auc))

Cross validation AUC 0.6221


In [7]:
# Load the sample submission file; it serves as the template for the output.
sub = pd.read_csv("../input/sample_submission.csv")

# Preview the first five rows.
sub.head(n=5)

Unnamed: 0,id,target
0,2,0
1,4,0
2,6,0
3,7,0
4,9,0


In [8]:
%%time
# Refit the pipeline on the full training frame, then take column 1 of the
# predicted class probabilities for the test frame.
test_preds = (
    classifier
    .fit(train, train[target_column])
    .predict_proba(test)[:, 1]
)

# NOTE(review): this is a positional assignment - it assumes the rows of the
# sample submission are in the same order as the rows of test.csv; confirm.
sub[target_column] = test_preds

# Write the submission file without the DataFrame index.
sub.to_csv("submission.csv", index=False)

CPU times: user 30.4 s, sys: 2.52 s, total: 32.9 s
Wall time: 14.8 s
