In [2]:
!pip install lightgbm

Collecting lightgbm
  Downloading https://files.pythonhosted.org/packages/00/37/a392e669a83fef72b916009c438a924d2a9d70bc8aea62662b207105ed98/lightgbm-2.2.3-py2.py3-none-win_amd64.whl (515kB)
Installing collected packages: lightgbm
Successfully installed lightgbm-2.2.3


You are using pip version 18.1, however version 19.1.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [1]:
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn import model_selection
from sklearn.metrics import roc_auc_score

In [12]:
# Load the Titanic passenger table; PassengerId becomes the row index so that
# later label-based lookups (.loc / .join) are keyed by passenger.
data = pd.read_csv(
    "../data/titanic.csv",
    index_col="PassengerId",
)

In [13]:
# Re-type every string (object) column as pandas `category`, one column at a
# time, so the frame can be handed to LightGBM without manual encoding.
for column_name in data.select_dtypes(include="object").columns:
    data[column_name] = data[column_name].astype("category")

In [14]:
# Sanity check: display the per-column dtypes after the category conversion.
data.dtypes

Survived       int64
Pclass         int64
Name        category
Sex         category
Age          float64
SibSp          int64
Parch          int64
Ticket      category
Fare         float64
Cabin       category
Embarked    category
dtype: object

In [15]:
# Hold out 10% of the passengers as a final test set; the fixed random_state
# keeps the split identical across runs.
target = "Survived"
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    data.drop(columns=target),
    data[target],
    test_size=0.1,
    random_state=0,
)

In [16]:
# 5-fold cross-validation over the training portion. KFold yields positional
# indices; they are translated to index labels here so each fold can be
# selected with .loc further down.
kf = model_selection.KFold(n_splits=5)
folds = [
    (y_train.index[fit_positions], y_train.index[holdout_positions])
    for fit_positions, holdout_positions in kf.split(X_train)
]

In [17]:
# Cross-validated training: fit one LightGBM model per fold, collect its
# predictions on the fold's hold-out rows and on the test set.
# NOTE: `learner` and `Xt` intentionally keep their names; after the loop they
# refer to the last fold and are read by the feature-importance cell.
valid_probs = []
test_probs = []
for i, (train_idx, valid_idx) in enumerate(folds):
    # Label-based selection of this fold's fitting and hold-out rows.
    Xt, yt = X_train.loc[train_idx], y_train.loc[train_idx]
    Xv, yv = X_train.loc[valid_idx], y_train.loc[valid_idx]

    # n_estimators is only an upper bound: training stops once the monitored
    # scores have not improved for 10 consecutive rounds.
    learner = LGBMClassifier(n_estimators=10000)
    learner.fit(
        Xt,
        yt,
        early_stopping_rounds=10,
        eval_metric="auc",
        eval_set=[(Xt, yt), (Xv, yv)],
    )

    # Last predict_proba column, indexed by passenger, for the hold-out rows.
    probs = pd.Series(learner.predict_proba(Xv)[:, -1], index=Xv.index)
    valid_probs.append(probs)

    # Same for the test set; one named column per fold.
    probs = pd.Series(
        learner.predict_proba(X_test)[:, -1],
        index=X_test.index,
        name="fold_{}".format(i),
    )
    test_probs.append(probs)

# Stack the per-fold hold-out predictions into a single Series.
valid_probs = pd.concat(valid_probs)
# Average the per-fold test predictions row-wise into one score per passenger.
test_probs = pd.concat(test_probs, axis=1).mean(axis=1)

[1]	training's auc: 0.90246	training's binary_logloss: 0.619434	valid_1's auc: 0.858047	valid_1's binary_logloss: 0.611604
Training until validation scores don't improve for 10 rounds.
[2]	training's auc: 0.90653	training's binary_logloss: 0.581155	valid_1's auc: 0.855938	valid_1's binary_logloss: 0.580171
[3]	training's auc: 0.9085	training's binary_logloss: 0.549875	valid_1's auc: 0.856781	valid_1's binary_logloss: 0.555068
[4]	training's auc: 0.908537	training's binary_logloss: 0.52376	valid_1's auc: 0.857456	valid_1's binary_logloss: 0.533216
[5]	training's auc: 0.910033	training's binary_logloss: 0.501282	valid_1's auc: 0.861758	valid_1's binary_logloss: 0.51395
[6]	training's auc: 0.910739	training's binary_logloss: 0.48247	valid_1's auc: 0.863276	valid_1's binary_logloss: 0.498756
[7]	training's auc: 0.914242	training's binary_logloss: 0.465714	valid_1's auc: 0.863023	valid_1's binary_logloss: 0.485219
[8]	training's auc: 0.914366	training's binary_logloss: 0.450718	valid_1's au

[1]	training's auc: 0.899489	training's binary_logloss: 0.620561	valid_1's auc: 0.876288	valid_1's binary_logloss: 0.605054
Training until validation scores don't improve for 10 rounds.
[2]	training's auc: 0.902998	training's binary_logloss: 0.582995	valid_1's auc: 0.881954	valid_1's binary_logloss: 0.570798
[3]	training's auc: 0.908321	training's binary_logloss: 0.551672	valid_1's auc: 0.87397	valid_1's binary_logloss: 0.542879
[4]	training's auc: 0.912406	training's binary_logloss: 0.525698	valid_1's auc: 0.875343	valid_1's binary_logloss: 0.519634
[5]	training's auc: 0.913495	training's binary_logloss: 0.502846	valid_1's auc: 0.874313	valid_1's binary_logloss: 0.501434
[6]	training's auc: 0.913274	training's binary_logloss: 0.483969	valid_1's auc: 0.874828	valid_1's binary_logloss: 0.486133
[7]	training's auc: 0.916968	training's binary_logloss: 0.466163	valid_1's auc: 0.874141	valid_1's binary_logloss: 0.471631
[8]	training's auc: 0.919486	training's binary_logloss: 0.450522	valid_

In [18]:
# ROC AUC of the fold-averaged probabilities on the held-out test set.
roc_auc_score(y_true=y_test, y_score=test_probs)

0.8758169934640523

In [9]:
# Pair each test passenger's averaged probability with the true label, then
# count outcomes within probability quintiles.
testeo = pd.DataFrame({"prob": test_probs}).join(data[target])
pd.crosstab(pd.qcut(testeo["prob"], q=5), testeo[target])

Survived,0,1
prob,Unnamed: 1_level_1,Unnamed: 2_level_1
"(0.143, 0.202]",17,1
"(0.202, 0.273]",17,1
"(0.273, 0.382]",13,5
"(0.382, 0.722]",3,15
"(0.722, 0.784]",1,17


In [10]:
# Feature importances of `learner`, labelled with the columns of `Xt`, in
# ascending order. Both names are whatever the training loop left bound last,
# so this reflects a single fold's model only.
pd.Series(
    data=learner.feature_importances_,
    index=Xt.columns,
).sort_values(ascending=True)

Parch       0
Age_nul     0
SibSp       1
Q           2
S           2
C           7
Cabin       8
Sex         9
Pclass     12
NumFam     17
Ticket     39
Age        41
Fare       78
dtype: int32