# import包

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings('ignore')

# 读取数据

In [2]:
train = pd.read_csv('./train.csv')
test  = pd.read_csv('./test.csv')

In [3]:
train.head()

Unnamed: 0,id,user_id,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,label
0,0,6,0.745418,1.133039,0.874877,0.260652,-0.097008,-0.282033,1
1,1,6,0.840123,1.474252,-1.099343,0.433683,-0.141592,1.104046,1
2,2,3,0.074803,1.938907,-0.161942,1.320775,-0.308221,0.722852,1
3,3,7,1.344972,0.930487,-1.76547,-0.319775,0.007126,0.124472,1
4,4,5,-0.481917,0.589874,-0.633376,0.769459,-0.153542,-0.71821,1


In [4]:
test.head()

Unnamed: 0,id,user_id,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6
0,0,9,-0.911395,-1.005623,1.648673,-0.049366,0.056513,0.139963
1,1,8,1.344412,0.599836,-1.650746,-0.553898,0.061133,1.297092
2,2,6,-0.108709,1.941218,-1.410857,1.456096,-0.330334,-0.831727
3,3,8,0.848378,1.206676,-0.643029,0.237879,-0.096856,-0.20236
4,4,8,2.444736,0.246238,-0.824452,-1.606255,0.249277,-2.291472


In [5]:
target = 'label'
used_features = [x for x in train.columns if x not in ['id', target]]

# KFold

In [6]:
sub = test[['id']]
sub[target] = 0
AUCs = []

n_fold = 5
folds = KFold(n_splits = n_fold)

for train_index, valid_index in folds.split(train[used_features]):
    
    trn_x, trn_y = train[used_features].iloc[train_index], train[target].iloc[train_index]
    val_x, val_y = train[used_features].iloc[valid_index], train[target].iloc[valid_index]
    
    model = LogisticRegression()
    model.fit(trn_x, trn_y)
    
    val_pred = model.predict(val_x)
    
    pred = model.predict(test[used_features])
    sub[target] = sub[target] + pred / n_fold
    
    auc_score = roc_auc_score(val_y, val_pred)
    AUCs.append(auc_score)

print(f'mean auc: {np.mean(AUCs)}')

mean auc: 0.9761302541544478


# StratifiedKFold

In [7]:
sub = test[['id']]
sub[target] = 0
AUCs = []

n_fold = 5
skf = StratifiedKFold(n_splits = n_fold)

for train_index, valid_index in skf.split(train[used_features], train[target]):
    
    trn_x, trn_y = train[used_features].iloc[train_index], train[target].iloc[train_index]
    val_x, val_y = train[used_features].iloc[valid_index], train[target].iloc[valid_index]
    
    model = LogisticRegression()
    model.fit(trn_x, trn_y)
    
    val_pred = model.predict(val_x)
    
    pred = model.predict(test[used_features])
    sub[target] = sub[target] + pred / n_fold
    
    auc_score = roc_auc_score(val_y, val_pred)
    AUCs.append(auc_score)

print(f'mean auc: {np.mean(AUCs)}')

mean auc: 0.9760330296198303


# GroupKFold

In [8]:
sub = test[['id']]
sub[target] = 0
AUCs = []

n_fold = 5
group_kfold = GroupKFold(n_splits = n_fold)

for train_index, valid_index in group_kfold.split(train[used_features], train[target], train['user_id']):
    
    trn_x, trn_y = train[used_features].iloc[train_index], train[target].iloc[train_index]
    val_x, val_y = train[used_features].iloc[valid_index], train[target].iloc[valid_index]
    
    model = LogisticRegression()
    model.fit(trn_x, trn_y)
    
    val_pred = model.predict(val_x)
    
    pred = model.predict(test[used_features])
    sub[target] = sub[target] + pred / n_fold
    
    auc_score = roc_auc_score(val_y, val_pred)
    AUCs.append(auc_score)

print(f'mean auc: {np.mean(AUCs)}')

mean auc: 0.973618705855548
