In [1]:
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

# Single location of the competition files; change this one constant if the
# data is mounted somewhere else.
DATA_DIR = '/kaggle/input/porto-seguro-safe-driver-prediction/'

# All three CSV files are read with their `id` column as the index.
train = pd.read_csv(DATA_DIR + 'train.csv', index_col='id')
test = pd.read_csv(DATA_DIR + 'test.csv', index_col='id')
submission = pd.read_csv(DATA_DIR + 'sample_submission.csv', index_col='id')

In [2]:
# Stack the train rows followed by the test rows under a fresh integer index,
# then remove the target column from the combined frame.
all_data = pd.concat([train, test], ignore_index=True).drop(columns='target')

In [3]:
# Names of every column in the combined frame, as a plain Python list.
all_features = list(all_data.columns)

In [4]:
from sklearn.preprocessing import OneHotEncoder

# Columns whose name contains 'cat' are the ones that get one-hot encoded.
cat_features = [name for name in all_features if 'cat' in name]

# Create the encoder, then fit it on (and encode) the selected columns of the
# combined train+test frame in a single call.
onehot_encoder = OneHotEncoder()
encoded_cat_matrix = onehot_encoder.fit_transform(all_data.loc[:, cat_features])

# Show a summary of the encoded matrix as the cell output.
encoded_cat_matrix

<1488028x184 sparse matrix of type '<class 'numpy.float64'>'
	with 20832392 stored elements in Compressed Sparse Row format>

In [5]:
from scipy import sparse

# Keep every column whose name contains neither 'cat' nor 'calc'.
remaining_features = [
    name for name in all_features
    if not ('cat' in name or 'calc' in name)
]

# Place the kept columns first and the one-hot encoded columns after them,
# producing a single CSR matrix.
all_data_sprs = sparse.hstack(
    (sparse.csr_matrix(all_data[remaining_features]), encoded_cat_matrix),
    format='csr',
)

In [6]:
# Number of training rows; the combined matrix holds these rows first.
num_train = len(train)

# Split the combined matrix back into its training and test parts.
X, X_test = all_data_sprs[:num_train], all_data_sprs[num_train:]

# Target values of the training rows.
y = train['target'].values

In [7]:
import numpy as np

def eval_gini(y_true, y_pred):
    """Compute the normalized Gini coefficient of predictions against labels.

    The labels are ranked once by the predictions and once by themselves
    (both from largest to smallest). For each ranking, the summed gap between
    an evenly spaced reference line and the cumulative label share is taken;
    the result is the prediction gap divided by the label gap.

    Parameters
    ----------
    y_true : numpy.ndarray
        Ground-truth values.
    y_pred : numpy.ndarray
        Predicted scores; must have the same shape as ``y_true``.

    Returns
    -------
    float
        ``G_pred / G_true``.

    Raises
    ------
    AssertionError
        If the two arrays do not have the same shape.
    """
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]

    # Two-column table: column 0 holds the labels, column 1 the predictions.
    pairs = np.array([y_true, y_pred]).T
    labels = pairs[:, 0]

    def summed_gap(sort_key):
        # Labels reordered so that `sort_key` runs from largest to smallest.
        ranked = labels[np.argsort(sort_key)][::-1]
        # Cumulative share of the label total along that ordering.
        lorenz = np.cumsum(ranked) * 1. / np.sum(ranked)
        # Evenly spaced reference line from 1/n up to 1.
        diagonal = np.linspace(1 / n_samples, 1, n_samples)
        return np.sum(diagonal - lorenz)

    G_true = summed_gap(labels)
    G_pred = summed_gap(pairs[:, 1])

    # Normalize by the value obtained when ranking by the labels themselves.
    return G_pred * 1. / G_true

In [8]:
def gini(preds, dtrain):
    """Custom evaluation function wrapping ``eval_gini``.

    Parameters
    ----------
    preds : array-like
        Predictions to be scored.
    dtrain : object exposing ``get_label()``
        Dataset whose labels the predictions are scored against.

    Returns
    -------
    tuple
        ``('gini', score, True)`` - the metric name, its value, and a flag
        set to ``True`` (presumably "higher is better" for LightGBM's
        ``feval`` protocol - confirm against the LightGBM docs).
    """
    metric_value = eval_gini(dtrain.get_label(), preds)
    return 'gini', metric_value, True

In [9]:
import lightgbm as lgbm
from sklearn.model_selection import StratifiedKFold

# Run-control switches; none of them is referenced by the other visible cells.
save_cv, full_train = True, False
seed_rounds = 3

# Stratified, shuffled cross-validation with a fixed seed.
NFOLDS = 5
kfold = StratifiedKFold(n_splits=NFOLDS, random_state=218, shuffle=True)

# Defined here, but the visible training call passes its own literal
# (num_boost_round=1400) instead of this value.
num_boost_round = 10000
# params = {"objective": "binary",
#           "boosting_type": "gbdt",
#           'metric': 'auc',
#           "learning_rate": 0.01,
#           "num_leaves": 31,
#            "max_bin": 256,
#           "feature_fraction": 0.7,
#           "verbosity": 0,
#           "drop_rate": 0.1,
#           "is_unbalance": False,
#           "max_drop": 50,
#           "bagging_freq": 1,
#           "bagging_fraction": 0.7,
#           "lambda_l1": 1,
#           "lambda_l2": 1,
#           "min_child_samples": 10,
#           "min_child_weight": 150,
#           "min_split_gain": 0,
#           "subsample": 0.9,
#           "random_state": 218,
#           }

# LightGBM training parameters: binary objective, GBDT boosting, AUC as the
# built-in metric, a 0.01 learning rate, and row-wise histogram building.
params = dict(
    objective="binary",
    boosting_type="gbdt",
    metric='auc',
    learning_rate=0.01,
    force_row_wise=True,
)

In [10]:
# Overall CV scores collected across runs of this cell.
x_score = []

# Out-of-fold predictions for the training rows and the running sum of the
# per-fold test predictions (averaged after the loop).
cv_train = np.zeros(X.shape[0])
cv_pred = np.zeros(X_test.shape[0])

best_iters = []
fold_scores = []

for train_idx, valid_idx in kfold.split(X, y):
    X_train, y_train = X[train_idx], y[train_idx]
    X_valid, y_valid = X[valid_idx], y[valid_idx]

    dtrain = lgbm.Dataset(X_train, y_train)
    dvalid = lgbm.Dataset(X_valid, y_valid)

    # Early stopping and periodic logging are passed as callbacks: the
    # `early_stopping_rounds` / `verbose_eval` keyword arguments were removed
    # from `lgbm.train` in LightGBM 4.0.
    lgb_model = lgbm.train(params, dtrain,
                           valid_sets=[dvalid],
                           feval=gini,
                           num_boost_round=1400,
                           callbacks=[lgbm.early_stopping(stopping_rounds=150),
                                      lgbm.log_evaluation(period=100)])
    best_iter = lgb_model.best_iteration
    best_iters.append(best_iter)

    # TODO (original author's note): try predicting with best_iteration + 10.
    # Use the same iteration count for both the test and validation predictions.
    cv_pred += lgb_model.predict(X_test, num_iteration=best_iter)
    # Each training row belongs to exactly one validation fold, so a plain
    # assignment fills its out-of-fold prediction.
    cv_train[valid_idx] = lgb_model.predict(X_valid, num_iteration=best_iter)

    score = eval_gini(y_valid, cv_train[valid_idx])
    print(score)
    fold_scores.append(score)

# Average the accumulated test predictions over the folds.
cv_pred /= NFOLDS

# Compute the overall out-of-fold Gini once and reuse it below.
cv_score = eval_gini(y, cv_train)

print("cv score:")
print(cv_score)
print("current score:", cv_score)
print(fold_scores)
print(best_iters, np.mean(best_iters))

x_score.append(cv_score)

[LightGBM] [Info] Number of positive: 17355, number of negative: 458814
[LightGBM] [Info] Total Bins 1360
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 206
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036447 -> initscore=-3.274764
[LightGBM] [Info] Start training from score -3.274764
Training until validation scores don't improve for 150 rounds
[100]	valid_0's auc: 0.631578	valid_0's gini: 0.263155
[200]	valid_0's auc: 0.638582	valid_0's gini: 0.277164
[300]	valid_0's auc: 0.641891	valid_0's gini: 0.283783
[400]	valid_0's auc: 0.643605	valid_0's gini: 0.28721
[500]	valid_0's auc: 0.644678	valid_0's gini: 0.289356
[600]	valid_0's auc: 0.64516	valid_0's gini: 0.290319
[700]	valid_0's auc: 0.645452	valid_0's gini: 0.290903
[800]	valid_0's auc: 0.645482	valid_0's gini: 0.290964
[900]	valid_0's auc: 0.645538	valid_0's gini: 0.291075
Early stopping, best iteration is:
[839]	valid_0's auc: 0.645571	valid_0's gini: 0.291143
0.2911428953390811
[

In [11]:
# Print the list of overall out-of-fold Gini scores collected by the training cell.
print(x_score)

[0.2813381669133965]


- Run 1 (out-of-fold normalized Gini): [0.2813381669133965]


In [12]:
# Replace the sample target column with the fold-averaged test predictions,
# then write the frame (index included) to submission.csv.
submission = submission.assign(target=cv_pred)
submission.to_csv('submission.csv')

In [13]:
# Display the final submission frame as the cell output.
submission

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,0.027479
1,0.024734
2,0.028488
3,0.015355
4,0.037178
...,...
1488022,0.080100
1488023,0.042489
1488024,0.037148
1488025,0.024706
