In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import re
import warnings

import lightgbm as lgb
from unidecode import unidecode
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn import metrics
from itertools import combinations
from datetime import datetime

warnings.filterwarnings('ignore')
pd.options.display.max_columns = 50
plt.style.use('ggplot')
import os
%matplotlib inline

In [2]:
data_version = "09-10-2020-full"

In [3]:
df_train = pd.read_csv("../../data/kalapa/train.csv")
df_test = pd.read_csv("../../data/kalapa/test.csv")
df_all = df_train.drop(['label'], 1).append(df_test)
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73411 entries, 0 to 20380
Columns: 194 entries, id to Field_82
dtypes: float64(133), int64(2), object(59)
memory usage: 109.2+ MB


In [4]:
# Process date/datetime fields
DATE = ["Field_{}".format(i) for i in [5, 6, 7, 8, 9, 11, 15, 25, 32, 33, 34, 35, 40]]
DATETIME = ["Field_{}".format(i) for i in [1, 2, 43, 44]]

df_all[DATE + DATETIME + ["Field_34", "ngaySinh"]].sample(10)

Unnamed: 0,Field_5,Field_6,Field_7,Field_8,Field_9,Field_11,Field_15,Field_25,Field_32,Field_33,Field_34,Field_35,Field_40,Field_1,Field_2,Field_43,Field_44,Field_34.1,ngaySinh
15279,9/1/2015,9/16/2015,9/1/2015,12/31/2015,9/16/2015,,9/17/2015,9/16/2015,9/16/2015,,20150902.0,,,2017-03-17T13:14:12.293Z,2017-03-17T13:14:12.293Z,2017-03-17T11:04:44.597Z,2019-05-29T02:59:43.104Z,20150902.0,19910621.0
16997,,,,,,,,,,,,,,,,,,,
5113,,,,,,,,,,,,,,,,,,,
50757,2019-12-26,2019-12-26,2020-01-01,2020-12-31,2019-12-26,,2019-12-27,2019-12-26,2019-12-27,2015-01-01,20190202.0,,2020-01-01,2019-12-26T10:39:10.224Z,2019-12-27T03:07:12Z,2017-01-03T16:02:20.307Z,2019-10-08T19:38:02Z,20190202.0,19790806.0
49282,,,,,,,,,,,,,,,,,,,
11146,,,,,,,,,,,,,,,,,,,
51671,,,,,,,,,,,,,,,,,,,
9358,,,,,,,,,,,,,,,,,,,
2755,3/22/2019,3/22/2019,3/1/2019,9/30/2019,3/22/2019,9/30/2019,3/22/2019,3/22/2019,3/22/2019,2/1/2024,20190302.0,9/30/2019,3/1/2019,2019-10-02T06:27:57.138Z,2019-10-02T06:27:57.138Z,2017-03-03T15:16:30.327Z,2018-11-05T03:29:34.615Z,20190302.0,19960218.0
1538,,,,,,,,,,,,,,,,,,,


In [5]:
def correct_34_ngaysinh(s):
    if s != s:
        return np.nan
    try:
        s = int(s)
    except ValueError:
        s = s.split(" ")[0]
        
    return datetime.strptime(str(s)[:6], "%Y%m")

def datetime_normalize(s):
    if s != s:
        return np.nan
    
    s = s.split(".")[0]
    if s[-1] == "Z":
        s = s[:-1]
        
    date, time = s.split("T")
    datetime_obj = datetime.strptime(s, "%Y-%m-%dT%H:%M:%S")
    return datetime_obj

def date_normalize(s):
    if s != s:
        return np.nan
    
    try:
        datetime_obj = datetime.strptime(s, "%m/%d/%Y")
    except:
        datetime_obj = datetime.strptime(s, "%Y-%m-%d")
        
    return datetime_obj

def process_datetime_cols(df):
    cat_cols = []
    for col in DATETIME:
        df[col] = df[col].apply(datetime_normalize)
        
    for col in DATE:
        if col == "Field_34":
            continue
        df[col] = df[col].apply(date_normalize)

    df["Field_34"] = df["Field_34"].apply(correct_34_ngaysinh)
    df["ngaySinh"] = df["ngaySinh"].apply(correct_34_ngaysinh)

    cat_cols += DATE + DATETIME
    for col in DATE + DATETIME:
        df[col] = df[col].dt.strftime('%m-%Y')
    
    for cat in ['F', 'E', 'C', 'G', 'A']:
        df[f'{cat}_startDate'] = pd.to_datetime(df[f"{cat}_startDate"], infer_datetime_format=True)
        df[f'{cat}_endDate'] = pd.to_datetime(df[f"{cat}_endDate"], infer_datetime_format=True)
        
        df[f'{cat}_startDate'] = df[f'{cat}_startDate'].dt.strftime('%m-%Y')
        df[f'{cat}_endDate'] = df[f'{cat}_endDate'].dt.strftime('%m-%Y')
        
        cat_cols.append(f'{cat}_startDate')
        cat_cols.append(f'{cat}_endDate')
    
    for col in cat_cols:
        df[col] = df[col].astype("category")
        
    return df

In [6]:
def str_normalize(s):
    s = str(s).strip().lower()
    s = re.sub(' +', " ", s)
    return s

def process_location(df):
    for col in ["currentLocationLocationId", "homeTownLocationId", "currentLocationLatitude", "currentLocationLongitude", 
                   "homeTownLatitude", "homeTownLongitude"]:
        df[col].replace(0, np.nan, inplace=True)

    df["currentLocationLocationId"] = df["currentLocationLocationId"].apply(str_normalize).astype("category")
    df["homeTownLocationId"] = df["homeTownLocationId"].apply(str_normalize).astype("category")

    return df

In [7]:
def job_category(x):
    if type(x) == str:
        if "công nhân" in x or "cnv" in x or "cn" in x or "may công nghiệp" in x or "lao động" in x\
        or "thợ" in x or "coõng nhaõn trửùc tieỏp maựy may coõng nghieọp" in x or "c.n" in x or "lđ" in x:
            return "CN"
        elif "giáo viên" in x or "gv" in x or "gíao viên" in x:
            return "GV"
        elif "nhân viên" in x or "kế toán" in x or "cán bộ" in x or "nv" in x or "cb" in x or "nhõn viờn" in x:
            return "NV"
        elif "tài xế" in x or "lái" in x or "tài xê" in x:
            return "TX"
        elif "quản lý" in x or "phó phòng" in x or "hiệu phó" in x:
            return "QL"
        elif "undefined" in x:
            return "missing"
        elif "giám đốc" in x or "hiệu trưởng" in x:
            return "GĐ"
        elif "phục vụ" in x:
            return "PV"
        elif "chuyên viên" in x:
            return  "CV"
        elif "bác sĩ" in x or "dược sĩ" in x or "y sĩ" in x or "y sỹ" in x:
            return "BS"
        elif "y tá" in x:
            return "YT"
        elif "hộ sinh" in x:
            return "HS"
        elif "chủ tịch" in x:
            return "CT"
        elif "bếp" in x:
            return "ĐB"
        elif "sư" in x:
            return "KS"
        elif "dưỡng" in x:
            return "ĐD"
        elif "kỹ thuật" in x or "kĩ thuật" in x:
            return "KTV"
        elif "diễn viên" in x:
            return "DV"
        else:
            return "missing"
    else:
        return x    
    
def process_diaChi_maCv(df):
    df["maCv"] = df["maCv"].apply(str_normalize).apply(job_category).astype("category")
    return df

In [8]:
def combine_gender(s):
    x, y = s
    
    if x != x and y != y:
        return "nan"
    
    if x != x:
        return y.lower()
    
    return x.lower()

def process_gender(df):
    df["gender"] = df[["gioiTinh", "info_social_sex"]].apply(combine_gender, axis=1).astype("category")
    return df

In [9]:
def process_misc(df):        
    df["subscriberCount"].replace(0, np.nan, inplace=True)
    df["friendCount"].replace(0, np.nan, inplace=True)
    
    df["Field_13"] = df["Field_13"].apply(lambda x: 1 if x == x else 0)
    df["Field_38"] = df["Field_38"].map({0: 0.0, 1: 1.0, "DN": np.nan, "TN": np.nan, "GD": np.nan})
    df["Field_62"] = df["Field_62"].map({"I": 1, "II": 2, "III": 3, "IV": 4, "V": 5, "Ngoài quốc doanh Quận 7": np.nan})
    df["Field_47"] = df["Field_47"].map({"Zezo": 0, "One": 1, "Two": 2, "Three": 3, "Four": 4})
    
    df["Field_27"] = df["Field_27"].replace({0.0: np.nan})
    df["Field_28"] = df["Field_28"].replace({0.0: np.nan})
        
    for col in df.columns:
        if df[col].dtype.name == "object":
            df[col] = df[col].apply(str_normalize).astype("category")
            
    return df

In [10]:
# drop some fields we do not need (homeTown is optionally)
DROP = ["gioiTinh", "info_social_sex", "ngaySinh", "namSinh"] + \
        [f"Field_{c}" for c in [14, 16, 17, 24, 26, 30, 31, 37, 52, 57]]

def transform(df):
    df = process_datetime_cols(df)
    df = process_gender(df)
    df = process_location(df)
    df = process_diaChi_maCv(df)
    df = process_misc(df)
    return df.drop(DROP, 1)

In [11]:
df_all_fe = transform(df_all.copy())
df_all_fe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73411 entries, 0 to 20380
Columns: 181 entries, id to gender
dtypes: category(57), float64(121), int64(3)
memory usage: 81.5 MB


In [12]:
df_fe = df_all_fe.copy()
df_fe.replace([np.inf, -np.inf], 999, inplace=True)

for col in df_fe.columns:
    if df_fe[col].dtype.name == "category":
        if df_fe[col].isnull().sum() > 0:
            df_fe[col] = df_fe[col].cat.add_categories(f'missing_{col}')
            df_fe[col].fillna(f'missing_{col}', inplace=True)
    else:
        df_fe[col].fillna(-1, inplace=True)

y_label = df_train["label"]
train_fe = df_fe[df_fe["id"] < df_train.shape[0]]
test_fe = df_fe[df_fe["id"] >= df_train.shape[0]]

In [13]:
train_dev = pd.concat([df_train.id, df_train.label,train_fe.iloc[:,1:]], axis = 1)
test = test_fe

In [14]:
train_dev.iloc[:,2:]

Unnamed: 0,Field_1,Field_2,Field_3,Field_4,Field_5,Field_6,Field_7,Field_8,Field_9,Field_10,Field_11,Field_12,Field_13,Field_15,Field_18,Field_19,Field_20,Field_21,Field_22,Field_23,Field_25,Field_27,Field_28,Field_29,Field_32,...,partner4_D,partner4_E,partner4_F,partner4_G,partner4_H,partner4_K,partner4_L,partner5_A,partner5_B,partner5_C,partner5_D,partner5_E,partner5_F,partner5_G,partner5_H,partner5_K,partner5_L,brief,num_of_phone,Field_78,Field_79,Field_80,Field_81,Field_82,gender
0,07-2019,07-2019,1.0,gh,12-2018,12-2018,01-2019,07-2019,12-2018,1.0,07-2019,g8,1,missing_Field_15,trung tâm kinh doanh tiền mặt,0.0,4258600.0,4.5,1.0,16.0,12-2018,-1.0,-1.0,0.0,12-2018,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,cb1,1.0,-1.0,-1.000000,-1.000000,-1.0,1,male
1,missing_Field_1,missing_Field_2,-1.0,,missing_Field_5,missing_Field_6,missing_Field_7,missing_Field_8,missing_Field_9,-1.0,missing_Field_11,,0,missing_Field_15,,-1.0,-1.0,-1.0,-1.0,-1.0,missing_Field_25,-1.0,-1.0,-1.0,missing_Field_32,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,1.0,-1.0,-1.000000,-1.000000,-1.0,1,
2,01-2019,01-2019,2.0,t1,01-2019,01-2019,01-2019,12-2019,01-2019,1.0,missing_Field_11,,1,01-2019,,0.0,5000000.0,4.5,1.0,10.0,01-2019,-1.0,-1.0,0.0,01-2019,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1.0,33.0,10.769445,6.466667,0.0,2,female
3,missing_Field_1,missing_Field_2,-1.0,,missing_Field_5,missing_Field_6,missing_Field_7,missing_Field_8,missing_Field_9,-1.0,missing_Field_11,,0,missing_Field_15,,-1.0,-1.0,-1.0,-1.0,-1.0,missing_Field_25,-1.0,-1.0,-1.0,missing_Field_32,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1,1.0,33.0,10.769445,6.466667,0.0,3,
4,missing_Field_1,missing_Field_2,-1.0,,missing_Field_5,missing_Field_6,missing_Field_7,missing_Field_8,missing_Field_9,-1.0,missing_Field_11,,0,missing_Field_15,,-1.0,-1.0,-1.0,-1.0,-1.0,missing_Field_25,-1.0,-1.0,-1.0,missing_Field_32,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1,1.0,33.0,10.769445,6.466667,0.0,3,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53025,12-2019,12-2019,1.0,gh,12-2019,12-2019,01-2020,12-2020,12-2019,1.0,missing_Field_11,,1,missing_Field_15,,1.0,1490000.0,4.5,1.0,5.0,12-2019,-1.0,-1.0,12.0,12-2019,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1.0,202.0,157.837469,14.000000,-183.0,1,female
53026,missing_Field_1,missing_Field_2,-1.0,,missing_Field_5,missing_Field_6,missing_Field_7,missing_Field_8,missing_Field_9,-1.0,missing_Field_11,,0,missing_Field_15,,-1.0,-1.0,-1.0,-1.0,-1.0,missing_Field_25,-1.0,-1.0,-1.0,missing_Field_32,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,notfound,1.0,26.0,10.246951,12.500000,2.0,1,female
53027,11-2017,11-2017,2.0,t1,07-2017,07-2017,07-2017,11-2017,07-2017,1.0,11-2017,g8,1,07-2017,36493assyd,0.0,4015000.0,4.5,1.0,0.0,07-2017,-1.0,-1.0,0.0,07-2017,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,notfound,1.0,-1.0,-1.000000,-1.000000,-1.0,1,female
53028,missing_Field_1,missing_Field_2,-1.0,,missing_Field_5,missing_Field_6,missing_Field_7,missing_Field_8,missing_Field_9,-1.0,missing_Field_11,,0,missing_Field_15,,-1.0,-1.0,-1.0,-1.0,-1.0,missing_Field_25,-1.0,-1.0,-1.0,missing_Field_32,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,notfound,1.0,-1.0,-1.000000,-1.000000,-1.0,4,


In [15]:
test.sample(10)

Unnamed: 0,id,Field_1,Field_2,Field_3,Field_4,Field_5,Field_6,Field_7,Field_8,Field_9,Field_10,Field_11,Field_12,Field_13,Field_15,Field_18,Field_19,Field_20,Field_21,Field_22,Field_23,Field_25,Field_27,Field_28,Field_29,...,partner4_D,partner4_E,partner4_F,partner4_G,partner4_H,partner4_K,partner4_L,partner5_A,partner5_B,partner5_C,partner5_D,partner5_E,partner5_F,partner5_G,partner5_H,partner5_K,partner5_L,brief,num_of_phone,Field_78,Field_79,Field_80,Field_81,Field_82,gender
5003,58033,12-2018,06-2019,1.0,gh,12-2018,12-2018,01-2019,12-2019,12-2018,1.0,missing_Field_11,,1,12-2018,,0.0,3780000.0,4.5,1.0,16.0,12-2018,-1.0,-1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,1.0,198.0,97.338511,13.25,-164.0,2,male
3341,56371,missing_Field_1,missing_Field_2,-1.0,,missing_Field_5,missing_Field_6,missing_Field_7,missing_Field_8,missing_Field_9,-1.0,missing_Field_11,,0,missing_Field_15,,-1.0,-1.0,-1.0,-1.0,-1.0,missing_Field_25,-1.0,-1.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,-1.0,-1.0,-1.0,-1.0,4,male
18063,71093,missing_Field_1,missing_Field_2,-1.0,,missing_Field_5,missing_Field_6,missing_Field_7,missing_Field_8,missing_Field_9,-1.0,missing_Field_11,,0,missing_Field_15,,-1.0,-1.0,-1.0,-1.0,-1.0,missing_Field_25,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0,1.0,35.0,10.252494,3.75,0.0,3,
9236,62266,missing_Field_1,missing_Field_2,-1.0,,missing_Field_5,missing_Field_6,missing_Field_7,missing_Field_8,missing_Field_9,-1.0,missing_Field_11,,0,missing_Field_15,,-1.0,-1.0,-1.0,-1.0,-1.0,missing_Field_25,-1.0,-1.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,1.0,-1.0,-1.0,-1.0,-1.0,1,
17249,70279,missing_Field_1,missing_Field_2,-1.0,,missing_Field_5,missing_Field_6,missing_Field_7,missing_Field_8,missing_Field_9,-1.0,missing_Field_11,,0,missing_Field_15,,-1.0,-1.0,-1.0,-1.0,-1.0,missing_Field_25,-1.0,-1.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,notfound,1.0,2.0,6.823977,-2.166667,-16.0,2,
8481,61511,missing_Field_1,missing_Field_2,-1.0,,missing_Field_5,missing_Field_6,missing_Field_7,missing_Field_8,missing_Field_9,-1.0,missing_Field_11,,0,missing_Field_15,,-1.0,-1.0,-1.0,-1.0,-1.0,missing_Field_25,-1.0,-1.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,1.0,-126.0,-1.0,-126.0,-126.0,1,
2341,55371,09-2019,09-2019,2.0,t1,09-2019,09-2019,09-2019,12-2019,09-2019,1.0,missing_Field_11,,1,09-2019,.,0.0,4473000.0,4.5,1.0,12.0,09-2019,-1.0,-1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1cy,1.0,9.0,82.883117,-25.111111,-246.0,1,female
8002,61032,missing_Field_1,missing_Field_2,-1.0,,missing_Field_5,missing_Field_6,missing_Field_7,missing_Field_8,missing_Field_9,-1.0,missing_Field_11,,0,missing_Field_15,,-1.0,-1.0,-1.0,-1.0,-1.0,missing_Field_25,-1.0,-1.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,1,
380,53410,12-2016,12-2016,2.0,t1,08-2012,08-2012,08-2012,12-2012,08-2012,1.0,missing_Field_11,,1,08-2013,,1.8,2682000.0,4.5,1.0,0.0,08-2013,-1.0,-1.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2cy,1.0,13.0,9.154234,-1.0,-26.0,4,male
11840,64870,09-2019,09-2019,2.0,t1,09-2019,09-2019,09-2019,09-2020,09-2019,1.0,missing_Field_11,,1,missing_Field_15,,0.0,1043000.0,4.5,1.0,12.0,09-2019,-1.0,-1.0,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,-1.0,-1.0,-1.0,-1.0,4,female


In [16]:
def ginicof(y, preds):
    fpr, tpr, thresholds = metrics.roc_curve(y, preds, pos_label=1)
    auc = metrics.auc(fpr, tpr)
    ginicof = 2 * auc - 1
    return ginicof

In [17]:
random_state = 42
np.random.seed(random_state)

lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'verbose': 1,
    'learning_rate': 0.05,
    'num_leaves': 31,
    'feature_fraction': 0.7,
    'min_data_in_leaf': 200,
    'bagging_fraction': 0.8,
    'bagging_freq': 20,
    'min_hessian': 0.01,
    'feature_fraction_seed': 2,
    'bagging_seed': 3,
    "seed": random_state
}

In [18]:
iter_predicttion = []
for i in range(2):
    pred_stack = []
    kf = KFold(n_splits = 5, shuffle=True)
    fold = kf.split(train_dev)
    for train_index, dev_index in fold:
        train1 = train_dev.iloc[train_index,2:]
        dev1 = train_dev.iloc[dev_index,2:]

        d_train = lgb.Dataset(train1, label = train_dev.loc[train_index]["label"])
        clf = lgb.train(lgb_params,d_train, 2500)
        predictions_test = clf.predict(test.iloc[:,1:])
        predictions_dev = clf.predict(dev1)
        print(ginicof(train_dev.loc[dev_index]["label"], predictions_dev)) 
        pred_stack.append(predictions_test)
    predictions = np.asarray(pred_stack)
    predictions = np.mean(predictions, axis = 0)

    iter_predicttion.append(predictions)

0.45246033389614637
0.4255479699357436
0.4407548835087822
0.4586667011960528
0.4375780834548102
0.4435549595202537
0.4250258001366034
0.44313392742922186
0.45952249641053977
0.42937401585428336


In [19]:
predictions = np.asarray(iter_predicttion)
predictions = np.mean(predictions, axis = 0)
predictions = list(predictions)

for i in range(len(predictions)):
    if predictions[i] >= 0.5:
        predictions[i] = 1
    else:
        predictions[i] = 0
        
new_label = pd.DataFrame(predictions, columns = ["label"])
new_data =  pd.concat([df_test.id, new_label, test.iloc[:,1:]], axis = 1)

In [29]:
train = pd.concat([train_dev,new_data ], axis = 0)

In [30]:
clf = lgb.train(lgb_params,d_train, 2500)
d_train = lgb.Dataset(train.iloc[:,2:], label = train["label"])
predictions_test = clf.predict(test.iloc[:,1:])

In [34]:
test["label"] = predictions_test
test[["id", "label"]].to_csv("test_preds.csv", index=False)

In [22]:

os.makedirs(f"../../data/kalapa/{data_version}/", exist_ok=True)
train_dev.to_csv(f"../../data/kalapa/{data_version}/train.csv", index = False)
test.to_csv(f"../../data/kalapa/{data_version}/test.csv", index = False)
new_data.to_csv(f"../../data/kalapa/{data_version}/new_data.csv", index = False)

In [None]:
"""
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
oof = train_fe[['id', 'label']]
oof['predict'] = 0
val_aucs = []
"""

In [None]:
"""
features = [c for c in train_fe.columns if c not in ['id', 'label']]

len_train = len(train_fe)
print(len_train)
train_fe['label'] = df_train.label
train_fe = train_fe.append(test_fe).reset_index(drop = True)
print(len_train)
train_fe['label'] = train_fe['label'].fillna(0)
"""

In [None]:
for fold, (trn_idx, val_idx) in enumerate(skf.split(train_fe, train_fe['label'])):
    X_train, y_train = train_fe.iloc[trn_idx][features], train_fe.iloc[trn_idx]['label']
    X_valid, y_valid = train_fe.iloc[val_idx][features], train_fe.iloc[val_idx]['label']
    trn_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_valid, label=y_valid)
    evals_result = {}
    lgb_clf = lgb.train(lgb_params,
                        trn_data,
                        7500,
                        valid_sets=[val_data],
                        early_stopping_rounds=100,
                        verbose_eval=50,
                        evals_result=evals_result)

    p_valid = lgb_clf.predict(X_valid[features], num_iteration=lgb_clf.best_iteration)

    oof['predict'][val_idx] = p_valid
    val_score = roc_auc_score(y_valid, p_valid)
    print(f"gini {ginicof(y_valid, p_valid)}")
    val_aucs.append(val_score)

In [None]:
"""
params = {}
params['learning_rate'] = 0.01
params['boosting_type'] = 'gbdt'
params['objective'] = 'binary'
params['metric'] = 'binary_logloss'
params['sub_feature'] = 0.1
params['num_leaves'] = 10
params['min_data'] = 50
params['max_depth'] = 10
"""

In [None]:
"""
predictions = np.asarray(iter_predicttion)
predictions = np.mean(predictions, axis = 0)
predictions = list(predictions)

for i in range(len(predictions)):
    if predictions[i] >= 0.5:
        predictions[i] = 1
    else:
        predictions[i] = 0
        
new_label = pd.DataFrame(predictions, columns = ["label"])

new_data_id = []
for i in range(len(new_label)):
    idx = len(df_train) + i
    new_data_id.append(idx)
new_data_id = pd.DataFrame(new_data_id, columns = ["id"])


"""