<a href="https://colab.research.google.com/github/DONDAJIN/Kaggle_amex/blob/master/Untitled6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import gc
import os
import joblib
import random
import warnings
import itertools
import scipy as sp
import numpy as np
import pandas as pd
from tqdm import tqdm
import lightgbm as lgb
from itertools import combinations
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
from sklearn.preprocessing import LabelEncoder
import warnings; warnings.filterwarnings('ignore')
from sklearn.model_selection import StratifiedKFold, train_test_split

In [2]:
class Config:
    seed = 58
    n_folds = 5
    target = 'target'
    DIR='/content/drive/MyDrive/Kaggle'
    INPUT= os.path.join(DIR,'Input')
    OUTPUT=os.path.join(DIR,'Output')
    EXP=os.path.join(DIR,'exp/exp003')
    MODEL=os.path.join(EXP,'Model')
    LOG=os.path.join(EXP,'Log')
    PRED=os.path.join(EXP,'pred')
    os.makedirs(EXP,exist_ok=True)
    for i in ['Log','Model','pred']:
      os.makedirs(os.path.join(EXP,i),exist_ok=True)


def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

def read_data():
    train = pd.read_parquet(os.path.join(Config.INPUT,'train_fe_plus_plus.parquet'))
    test = pd.read_parquet(os.path.join(Config.INPUT,'test_fe_plus_plus.parquet'))
    return train, test

def amex_metric(y_true, y_pred):
    labels = np.transpose(np.array([y_true, y_pred]))
    labels = labels[labels[:, 1].argsort()[::-1]]
    weights = np.where(labels[:,0]==0, 20, 1)
    cut_vals = labels[np.cumsum(weights) <= int(0.04 * np.sum(weights))]
    top_four = np.sum(cut_vals[:,0]) / np.sum(labels[:,0])
    gini = [0,0]
    for i in [1,0]:
        labels = np.transpose(np.array([y_true, y_pred]))
        labels = labels[labels[:, i].argsort()[::-1]]
        weight = np.where(labels[:,0]==0, 20, 1)
        weight_random = np.cumsum(weight / np.sum(weight))
        total_pos = np.sum(labels[:, 0] *  weight)
        cum_pos_found = np.cumsum(labels[:, 0] * weight)
        lorentz = cum_pos_found / total_pos
        gini[i] = np.sum((lorentz - weight_random) * weight)
    return 0.5 * (gini[1]/gini[0] + top_four)

def amex_metric_np(preds, target):
    indices = np.argsort(preds)[::-1]
    preds, target = preds[indices], target[indices]
    weight = 20.0 - target * 19.0
    cum_norm_weight = (weight / weight.sum()).cumsum()
    four_pct_mask = cum_norm_weight <= 0.04
    d = np.sum(target[four_pct_mask]) / np.sum(target)
    weighted_target = target * weight
    lorentz = (weighted_target / weighted_target.sum()).cumsum()
    gini = ((lorentz - cum_norm_weight) * weight).sum()
    n_pos = np.sum(target)
    n_neg = target.shape[0] - n_pos
    gini_max = 10 * n_neg * (n_pos + 20 * n_neg - 19) / (n_pos + 20 * n_neg)
    g = gini / gini_max
    return 0.5 * (g + d)

In [3]:
def get_difference(data,num_features):
  df1=[]
  customer_ids=[]
  for customer_id,df in tqdm(data.groupby(['customer_ID'])): #groupbyオブジェクトfor文で回せる
    diff_df1=df[num_features].diff(1).iloc[[-1]].values.astype('float32') #各customerの各列の差の最後の行
    df1.append(diff_df1)
    customer_ids.append(customer_id)
  df1=np.concatenate(df1,axis=0)
  df1=pd.DataFrame(df1,columns=[col+'_diff1' for col in df[num_features].columns])
  df1['customer_ID']=customer_ids
  return df1

In [4]:
def read_preprocess_data():
  test=pd.read_parquet(os.path.join(Config.INPUT,'test_rounded.parquet'))
  features=test.drop(['customer_ID','S_2'],axis=1).columns.to_list() #inplace=False
  cat_features=[
      'B_30',
      'B_38',
      'D_114',
      'D_116',
      'D_117',
      'D_120',
      'D_126',
      'D_63',
      'D_64',
      'D_66',
      'D_68',
  ]
  num_features=[col for col in features if col not in cat_features]

  #Test feature engineering
  test=pd.read_parquet(os.path.join(Config.INPUT,'test_rounded.parquet'))
  print('Start test feature engineering------------')
  test_num_agg=test.groupby('customer_ID')[num_features].agg(['first','mean','std','min','max','last'])
  test_num_agg.columns=['_'.join(x) for x in test_num_agg.columns]
  test_num_agg.reset_index(inplace=True)

  #Lag features
  for col in test_num_agg:
    if 'last' in col and col.replace('last','first') in test_num_agg:
      test_num_agg[col+'_lag_sub']=test_num_agg[col]-test_num_agg[col.replace('last','first')]
      test_num_agg[col+'_lag_div']=test_num_agg[col]/test_num_agg[col.replace('last','first')]

  test_cat_agg=test.groupby('customer_ID')[cat_features].agg(['count','first','last','nunique'])
  test_cat_agg.columns=['_'.join(x) for x in test_cat_agg.columns]
  test_cat_agg.reset_index(inplace=True)

  cols=list(test_num_agg.dtypes[test_num_agg.dtypes=='float64'].index) #dtypesではindexが列名
  for col in tqdm(cols):
    test_num_agg[col]=test_num_agg[col].astype(np.float32)

  cols=list(test_cat_agg.dtypes[test_cat_agg.dtypes=='int64'].index)
  for col in tqdm(cols):
    test_cat_agg[col]=test_cat_agg[col].astype(np.int32)
  
  test_diff=get_difference(test,num_features)
  test=test_num_agg.merge(test_cat_agg,how='inner',on='customer_ID').merge(test_diff,how='inner',on='customer_ID')
  del test_num_agg,test_cat_agg,test_diff
  gc.collect()
  test.to_parquet(os.path.join(Config.INPUT,'test_fe_plus_plus.parquet'))

In [5]:
def lgb_amex_metric(y_pred, y_true):
    y_true = y_true.get_label()
    return 'amex_metric', amex_metric(y_true, y_pred), True

In [6]:
def train_and_evaluate(train,test):
  # Label encoding
  cat_features=[
      'B_30',
      'B_38',
      'D_114',
      'D_116',
      'D_117',
      'D_120',
      'D_126',
      'D_63',
      'D_64',
      'D_66',
      'D_68'
      ]
  cat_features=[f'{cf}_last' for cf in cat_features] #Preprocess後の'B_30_last'などにLabel encodingする
  for cat_col in cat_features:
    encoder=LabelEncoder()
    train[cat_col]=encoder.fit_transform(train[cat_col])
    test[cat_col]=encoder.transform(test[cat_col])

  #Round last float features to 2 decimal place
  num_cols=list(train.dtypes[(train.dtypes=='float32') | (train.dtypes=='float64')].index)
  num_cols=[col for col in num_cols if 'last' in col] #lastが付いている列だけ
  for col in num_cols:
    train[col+'_round2']=train[col].round(2)
    test[col+'_round2']=test[col].round(2)
  
  #Get the difference between last and mean
  num_cols=[col for col in train.columns if 'last' in col] #lastが付いている列だけを取り出して
  num_cols=[col[:-5] for col in num_cols if 'round' not in col] # 'B_2_last'から'B_2'
  for col in num_cols:
    try:
      train[f'{col}_last_mean_diff']=train[f'{col}_last']-train[f'{col}_mean']
      test[f'{col}_last_mean_diff']=test[f'{col}_last']-test[f'{col}_mean']
    except:
      pass
  
  #Transform float 64 and float 32 to float 16
  num_cols=list(train.dtypes[(train.dtypes=='foat32') | (train.dtypes=='float16')].index)
  for col in tqdm(num_cols):
    train[col]=train[col].astype(np.float16)
    test[col]=test[col].astype(np.float16)
  #Get feature list
  features=[col for col in train.columns if col not in ['customer_ID',Config.target]]
  params = {
        'objective': 'binary',
        'metric': "None",
        'boosting': 'dart',
        'seed': Config.seed,
        'num_leaves': 100,
        'learning_rate': 0.01,
        'feature_fraction': 0.20,
        'bagging_freq': 10,
        'bagging_fraction': 0.50,
        'n_jobs': -1,
        'lambda_l2': 2,
        'min_data_in_leaf': 40
        }
  #Creat a numpy array to store test,oof predictions
  test_predictions=np.zeros(len(test))
  oof_predictions=np.zeros(len(train))

  kfold=StratifiedKFold(n_splits=Config.n_folds,shuffle=True,random_state=Config.seed)
  for fold,(tr_idx,val_idx) in enumerate(kfold.split(train,train[Config.target])):
    print(' ')
    print('-'*50)
    print(f'Training fold {fold} with {len(features)} features...')
    X_tr,X_val=train[features].iloc[tr_idx],train[features].iloc[val_idx]
    y_tr,y_val=train[Config.target].iloc[tr_idx],train[Config.target].iloc[val_idx]

    lgb_train=lgb.Dataset(X_tr,y_tr,categorical_feature=cat_features)
    lgb_valid=lgb.Dataset(X_val,y_val,categorical_feature=cat_features)
    model=lgb.train(
        params=params,
        train_set=lgb_train,
        num_boost_round=9500,
        valid_sets=[lgb_train,lgb_valid],
        early_stopping_rounds=100,
        verbose_eval=500,
        feval=lgb_amex_metric,
        fobj=None
        )
    #Save best model
    joblib.dump(model,os.path.join(Config.MODEL,f'lgbm_fold{fold}_seed{Config.seed}.pkl'))

    #Predict validation
    val_pred=model.predict(X_val)

    #Add to oof arrray
    oof_predictions[val_idx]=val_pred

    
    #Compute fold metric
    score=amex_metric(y_val,val_pred)
    print(f'Our Fold {fold} CV score is {score} ')
    del X_tr,X_val,y_tr,y_val,lgb_train,lgb_valid
    gc.collect()
  
  #Compute oof metric
  score=amex_metric(train[Config.target],oof_predictions)
  print(f'Our out of folds CV Score is {score}')

  #Create a dataframe to store out of folds predictions
  oof_df=pd.DataFrame({'customer_ID':train['customer_ID'],'target':train[Config.target],'Prediction':oof_predictions})
  oof_df.to_csv(os.path.join(Config.PRED,'oof_lgbm_baseline_{Config.n_folds}fold_seed{Config.seed}.csv'),index=False)

  

In [7]:
#seed_everything(Config.seed)
#read_preprocess_data()
train,test=read_data()
train_and_evaluate(train,test)

0it [00:00, ?it/s]


 
--------------------------------------------------
Training fold 0 with 2177 features...
[500]	training's amex_metric: 0.781145	valid_1's amex_metric: 0.769182
[1000]	training's amex_metric: 0.793568	valid_1's amex_metric: 0.778786
[1500]	training's amex_metric: 0.806847	valid_1's amex_metric: 0.785046
[2000]	training's amex_metric: 0.818955	valid_1's amex_metric: 0.789775
[2500]	training's amex_metric: 0.831714	valid_1's amex_metric: 0.793729
[3000]	training's amex_metric: 0.842054	valid_1's amex_metric: 0.796304
[3500]	training's amex_metric: 0.850504	valid_1's amex_metric: 0.797093
[4000]	training's amex_metric: 0.85941	valid_1's amex_metric: 0.797707
[4500]	training's amex_metric: 0.869816	valid_1's amex_metric: 0.798946
[5000]	training's amex_metric: 0.878342	valid_1's amex_metric: 0.799765
[5500]	training's amex_metric: 0.887616	valid_1's amex_metric: 0.800506
[6000]	training's amex_metric: 0.895494	valid_1's amex_metric: 0.800645
[6500]	training's amex_metric: 0.902432	valid_1

In [8]:
#CV:0.7975

In [7]:
models=[]
for i in range(5):
  model=joblib.load(os.path.join(Config.MODEL,f'lgbm_fold{i}_seed{Config.seed}.pkl'))
  models.append(model)

In [8]:
#Predict the test set
test=pd.read_parquet(os.path.join(Config.INPUT,'test_fe_plus_plus.parquet'))
features=[col for col in test.columns if col not in ['customer_ID',Config.target]]
test_predictions=np.zeros(len(test))
for model in models:
  test_pred=model.predict(test[features])
  test_predictions+=test_pred/Config.n_folds
#Create a dataframe to store test predictions
test_df=pd.DataFrame({'customer_ID':test['customer_ID'],'prediction':test_predictions})
test_df.to_csv(os.path.join(Config.PRED,f'test_lgbm_baseline_{Config.n_folds}fold_seed{Config.seed}.csv'),index=False)

In [None]:
#LB: 0.787