In [1]:
from lightgbm import LGBMClassifier

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

from sklearn.model_selection import GridSearchCV
import optuna
from optuna.trial import Trial
from optuna.samplers import TPESampler
import warnings
warnings.filterwarnings('ignore')

from typing import List, Any, Tuple
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Data Loading
X_model = pd.read_csv('X_model.csv')
Y_model = pd.read_csv('Y_model.csv')
scaler = MinMaxScaler(feature_range=(0,1))

In [3]:
# Filter outliers by "entire"
# Ref: https://machinelearningmastery.com/how-to-use-statistics-to-identify-outliers-in-data/
print("Filtering outliers...")
df_base = pd.concat([X_model, Y_model], axis=1)
df_processed = pd.DataFrame(data=df_base, columns=['business'])
df_processed['cEntire'] = df_base.filter(regex="c" + r"2022[0-9]*", axis=1).fillna(0).sum(axis=1)
df_processed['tEntire'] = df_base.filter(regex="t" + r"2022[0-9]*", axis=1).fillna(0).sum(axis=1)
df_processed['sEntire'] = df_base.filter(regex="s" + r"2022[0-9]*", axis=1).fillna(0).sum(axis=1)

outliers = []

def collect_outliers(business: int, key: str):
    df_target = df_processed[df_processed['business'] == business]
    q1, q3 = df_target[key].quantile([0.25, 0.75])
    iqr = q3 - q1
    cutoff = iqr * 2.0
    lower, upper = q1 - cutoff, q3 + cutoff
    _outliers = df_target[(df_target[key] < lower) | (df_target[key] > upper)].index.tolist()
    print(len(_outliers))
    outliers.extend(_outliers)

collect_outliers(0, 'cEntire')
collect_outliers(1, 'cEntire')
collect_outliers(0, 'tEntire')
collect_outliers(1, 'tEntire')
collect_outliers(0, 'sEntire')
collect_outliers(1, 'sEntire')

outliers = list(set(outliers))

# Filter outliers from df
def filter_outliers_from_df(df: pd.DataFrame, outliers):
    return df.drop(outliers)

X_model = filter_outliers_from_df(X_model, outliers)
Y_model = filter_outliers_from_df(Y_model, outliers)

# Get columns of "c" prefix of last 5 days of month
last_5_days = [
    *[f"c202201{i + 27}" for i in range(5)],
    *[f"c202202{i + 24}" for i in range(5)],
    *[f"c202203{i + 27}" for i in range(5)],
    *[f"c202204{i + 26}" for i in range(5)],
    *[f"c202205{i + 27}" for i in range(5)],
    *[f"c202206{i + 26}" for i in range(5)],
    *[f"c202207{i + 27}" for i in range(5)],
]
last_5_days_sum = X_model.filter(last_5_days, axis=1).fillna(0).sum(axis=1)
# print(last_5_days_sum.head())
last_5_days_sum = last_5_days_sum.sort_values(ascending=False)
# print(last_5_days_sum.index)

# Define scaler
# print("Defining scaler...")
# scaler = MinMaxScaler(feature_range=(0,1))

# Define preprocessors
print("Defining preprocessors...")
def column(colnames: List[str]):
    def _column(X: pd.DataFrame):
        X = X.fillna(0)
        return [
            [colname, X[colname].values] for colname in colnames
        ]
    return _column

def rangesum(
    name:str, 
    regex: str, 
    prefixes: str, 
    dist: np.ndarray
):
    def _rangesum(X: pd.DataFrame):
        X = X.fillna(0)
        return [
            [
                prefix + name, 
                X.filter(regex=(prefix + regex), axis=1).values.dot(dist)
            ] for prefix in prefixes
        ]
    return _rangesum

def rangesum_from_list(
    name: str, 
    namelist: List[str], 
    prefix: str,
    dist: np.ndarray,    
):
    def _rangesum_from_list(X: pd.DataFrame):
        X = X.fillna(0)
        return [
            [
                prefix + name, 
                X[namelist].values.dot(dist)
            ]
        ]
    return _rangesum_from_list

def _fillna(X: np.ndarray) -> np.ndarray:
    return np.nan_to_num(X, copy=True, nan=0)

def array_divide(
    numerator: List[Tuple[str, np.ndarray]], 
    denominator: List[Tuple[str, np.ndarray]]
) -> List[Any]:
    assert len(numerator) == len(denominator)
    return [
        [
            "r" + numerator_colname, 
            _fillna(np.divide(numerator_col, denominator_col))
        ] for [numerator_colname, numerator_col], [_, denominator_col] in zip(numerator, denominator)
    ]

def one_hot_encode(column: str) -> pd.DataFrame:
    def _one_hot_encode(X: pd.DataFrame):
        X = X.fillna(0)
        df_dummies = pd.get_dummies(X[column], prefix=column)
        return [
            [colname, df_dummies[colname].values] for colname in df_dummies.columns
        ]
    return _one_hot_encode

def preprocess(X: pd.DataFrame, processors: List[Any]) -> pd.DataFrame:
    X_new = pd.DataFrame()

    for processor in processors:
        for colname, col in processor if type(processor) == type([]) else processor(X):
            X_new[colname] = col

    X_new = X_new.fillna(0)

#     X_new = pd.DataFrame(scaler.fit_transform(X_new), columns=X_new.columns)

    return X_new

def equal_dist(length: int) -> np.ndarray:
    return np.ones(length)

def linear_dist(length: int) -> np.ndarray:
    return np.arange(start=0, stop=1, step=1/length)

def triangle_dist(length: int) -> np.ndarray:
    return np.concatenate(
        [
            np.arange(start=0, stop=1, step=1/length),
            np.arange(start=1, stop=0, step=-1/length)
        ]
    )
# 62317
# 3595
# 85519
# 3461
# 63063
# 3548

Filtering outliers...
44655
2432
65185
2333
45549
2469
Defining preprocessors...


In [7]:
# Get columns of "c" prefix of last 5 days of month
last_5_days = [
    *[f"c202201{i + 27}" for i in range(5)],
    *[f"c202202{i + 24}" for i in range(5)],
    *[f"c202203{i + 27}" for i in range(5)],
    *[f"c202204{i + 26}" for i in range(5)],
    *[f"c202205{i + 27}" for i in range(5)],
    *[f"c202206{i + 26}" for i in range(5)],
    *[f"c202207{i + 27}" for i in range(5)],
]
last_5_days_sum = X_model.filter(last_5_days, axis=1).fillna(0).sum(axis=1)
last5_code = np.array(last_5_days_sum)
last_5_days_sum_list =[]
last_5_days_sum_list.append(['last_5_days_sum',last5_code])

In [8]:
print("Data preprocessing...")
dist_GIT = rangesum(
    'GIT', 
    r"202205[0-9]{2}", 
    "cts", 
    equal_dist(31)
)(X_model)
dist_VAT = rangesum(
    'VAT', 
    r"20220[17](?:[01][0-9]|2[0-5])", 
    "ts", 
    np.concatenate((equal_dist(25), equal_dist(25)))
)(X_model)
entire_days = 31 + 29 + 31 + 30 + 31 + 30 + 31 + 25
entire = rangesum(
    'Entire', 
    r"2022[0-9]{4}", 
    "cts", 
    equal_dist(entire_days)
)(X_model)

age_code = np.array(X_model['age_code'])
gender_code = np.array(X_model['gender'])
region_code = np.array(X_model['region_code'])
cat_Featrues = []
cat_Featrues.append(['gender',age_code])
cat_Featrues.append(['age_code',age_code])
cat_Featrues.append(['region_code',region_code])

X_processed = preprocess(
    X_model, 
    [
        cat_Featrues,
        dist_GIT,
        dist_VAT,
        entire,
        last_5_days_sum_list
        # array_divide(dist_GIT, entire), # rel_GIT
        # array_divide(dist_VAT, entire[1:]), # rel_VAT
    ]
)
X_processed.head(3)

Data preprocessing...


Unnamed: 0,gender,age_code,region_code,cGIT,tGIT,sGIT,tVAT,sVAT,cEntire,tEntire,sEntire,last_5_days_sum
0,13,13,7,0.0,0.0,0.0,0.0,0.0,1.0,1.0,93.0,0.0
1,5,5,1,2.0,0.0,17.0,0.0,185.0,39.0,0.0,790.0,4.0
2,6,6,2,6.0,3.0,2253.0,0.0,1712.0,26.0,3.0,5119.0,0.0


In [9]:
print("Preparing for hyperparameter tuning...")
def _construct_and_cross_validate(**kwargs):

    lgbm = LGBMClassifier(
        task = "train",
        objective = "binary", #cross-entropy
        metric = "auc",
        tree_learner = "data",
        random_state=100,
        categorical_feature = [0,1,2],
        class_weight={0: 1, 1: 14.291397},
        n_estimators=kwargs['n_estimators'],
        # to deal with overfitting, very important param
        max_depth=kwargs['max_depth'],
        learning_rate=kwargs['learning_rate'],
        num_leaves=kwargs['num_leaves'],
        min_data_in_leaf=kwargs['min_data_in_leaf'],
        #if max_bin becomes small, the accuracy goes up
        max_bin=kwargs['max_bin'],
        lambda_l1=kwargs['lambda_l1'],
        lambda_l2=kwargs['lambda_l2'],
        # to deal with overfitting
        min_child_weight=kwargs['min_child_weight'],
        #for bagging imbalanced
        bagging_fraction=kwargs['bagging_fraction'],
        pos_bagging_fraction=kwargs['pos_bagging_fraction'],
        neg_bagging_fraction=kwargs['neg_bagging_fraction'],
    )
    #cross validation K=5
    scores = cross_val_score(
        lgbm, 
        X_processed, 
        Y_model, 
        cv=StratifiedKFold(n_splits=5, shuffle=True),
        scoring="roc_auc"
    )
    return scores

# Task: Hyperparameter tuning with Optuna
def objective(trial: Trial):
    # Construct a DecisionTreeClassifier object
    scores = _construct_and_cross_validate(
        n_estimators=trial.suggest_int('n_estimators',100,500),
        # to deal with overfitting, very important param
        max_depth = trial.suggest_int('max_depth',10,20),
        learning_rate = trial.suggest_float('learning_rate',0.02,0.1),
        num_leaves = trial.suggest_int('num_leaves',500,1000),
        min_data_in_leaf = trial.suggest_int('min_data_in_leaf',100,1000),
        #if max_bin becomes small, the accuracy goes up
        max_bin = trial.suggest_int('max_bin',255,350),
        lambda_l1 = trial.suggest_loguniform('lambda_l1', 1e-3, 10.0),
        lambda_l2 = trial.suggest_loguniform('lambda_l2', 1e-3, 10.0),
        # to deal with overfitting
        min_child_weight = trial.suggest_int('min_child_weight', 1, 10),
        #for bagging imbalanced
        bagging_fraction = trial.suggest_float('bagging_fraction', 0,1),
        pos_bagging_fraction = trial.suggest_float('pos_bagging_fraction', 0,1),
        neg_bagging_fraction = trial.suggest_float('neg_bagging_fraction', 0,1),
    )

    return scores.mean()

print("Hyperparameter tuning started...")
optuna.logging.set_verbosity(optuna.logging.WARNING)
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)

# Print the best parameters
print("Best params")
print(study.best_params)

Preparing for hyperparameter tuning...
Hyperparameter tuning started...






Best params
{'n_estimators': 222, 'max_depth': 10, 'learning_rate': 0.041435226343818604, 'num_leaves': 932, 'min_data_in_leaf': 522, 'max_bin': 317, 'lambda_l1': 0.010815978738787315, 'lambda_l2': 2.522863566413219, 'min_child_weight': 10, 'bagging_fraction': 0.23684876647852016, 'pos_bagging_fraction': 0.23130127933103428, 'neg_bagging_fraction': 0.9451474530443466}


In [10]:
print("Finalizing model...")
scores = _construct_and_cross_validate(
    n_estimators=study.best_params['n_estimators'],
    # to deal with overfitting, very important param
    max_depth=study.best_params['max_depth'],
    learning_rate=study.best_params['learning_rate'],
    num_leaves=study.best_params['num_leaves'],
    min_data_in_leaf=study.best_params['min_data_in_leaf'],
    #if max_bin becomes small, the accuracy goes up
    max_bin=study.best_params['max_bin'],
    lambda_l1=study.best_params['lambda_l1'],
    lambda_l2=study.best_params['lambda_l2'],
    # to deal with overfitting
    min_child_weight=study.best_params['min_child_weight'],
    #for bagging imbalanced
    bagging_fraction=study.best_params['bagging_fraction'],
    pos_bagging_fraction=study.best_params['pos_bagging_fraction'],
    neg_bagging_fraction=study.best_params['neg_bagging_fraction'],
)

print("Average ROC AUC Score", np.mean(scores))
print("Standard Deviation of ROC AUC Score", np.std(scores))


Finalizing model...
Average ROC AUC Score 0.9282811241682133
Standard Deviation of ROC AUC Score 0.0014156586964495792


In [11]:
optimized_LGBM = LGBMClassifier(
    task = "predict",
    objective = "binary",
    metric = "auc",
    tree_learner = "data",
    categorical_feature = [0,1,2],
    class_weight={0: 1, 1: 14.291397},
    n_estimators=study.best_params['n_estimators'],
    
    # to deal with overfitting, very important param
    max_depth=study.best_params['max_depth'],
    learning_rate=study.best_params['learning_rate'],
    num_leaves=study.best_params['num_leaves'],
    min_data_in_leaf=study.best_params['min_data_in_leaf'],
    
    #if max_bin becomes small, the accuracy goes up
    max_bin=study.best_params['max_bin'],
    lambda_l1=study.best_params['lambda_l1'],
    lambda_l2=study.best_params['lambda_l2'],
    
    # to deal with overfitting
    min_child_weight=study.best_params['min_child_weight'],
    #for bagging imbalanced
    bagging_fraction=study.best_params['bagging_fraction'],
    pos_bagging_fraction=study.best_params['pos_bagging_fraction'],
    neg_bagging_fraction=study.best_params['neg_bagging_fraction'],
)

In [10]:
print("Executing...")
Y_exam = np.zeros(X_exam_processed.shape[0])
k = 5
kf = StratifiedKFold(n_splits=k)
for tr_index, val_index in kf.split(X_processed,Y_model):
    X_tr,Y_tr = X_processed.iloc[tr_index],Y_model.iloc[tr_index]
    X_val, Y_val = X_processed.iloc[val_index],Y_model.iloc[val_index]
    
    optimized_LGBM.fit(X_tr,Y_tr,eval_metric='auc')
    proba = optimized_LGBM.predict_proba(X_exam_processed)[:,1]
    Y_exam = Y_exam + proba
Y_exam = Y_exam/k
thresholds = np.array([0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9])
# the ratio of high prob with different thresholds
for num in thresholds: 
    filtered = Y_exam[np.where(Y_exam>=num)]
    print("the number of probability more than %.2f is %d:" %(num,len(filtered)))
    print("the ratio of probability more than %.2f is : %.4f"%(num, float(len(filtered))/len(Y_exam)))
    print('---------------------------------------------------\n')
# res = pd.DataFrame({'business prob':Y_exam})
# res.to_csv("./part1.csv")


Executing...
[1]	training's auc: 0.921668	valid_1's auc: 0.920071
[2]	training's auc: 0.921888	valid_1's auc: 0.920536
[3]	training's auc: 0.922567	valid_1's auc: 0.921094
[4]	training's auc: 0.922614	valid_1's auc: 0.921126
[5]	training's auc: 0.922796	valid_1's auc: 0.92131
[6]	training's auc: 0.923726	valid_1's auc: 0.922375
[7]	training's auc: 0.924201	valid_1's auc: 0.922671
[8]	training's auc: 0.924369	valid_1's auc: 0.923
[9]	training's auc: 0.924404	valid_1's auc: 0.923025
[10]	training's auc: 0.924763	valid_1's auc: 0.923265
[11]	training's auc: 0.924955	valid_1's auc: 0.9234
[12]	training's auc: 0.925009	valid_1's auc: 0.923374
[13]	training's auc: 0.925087	valid_1's auc: 0.923423
[14]	training's auc: 0.925072	valid_1's auc: 0.923343
[15]	training's auc: 0.925155	valid_1's auc: 0.923318
[16]	training's auc: 0.925186	valid_1's auc: 0.923392
[17]	training's auc: 0.92516	valid_1's auc: 0.923369
[18]	training's auc: 0.92526	valid_1's auc: 0.923424
[19]	training's auc: 0.92546	val

[143]	training's auc: 0.93198	valid_1's auc: 0.927778
[144]	training's auc: 0.932009	valid_1's auc: 0.927781
[145]	training's auc: 0.932026	valid_1's auc: 0.927791
[146]	training's auc: 0.932047	valid_1's auc: 0.927787
[147]	training's auc: 0.932078	valid_1's auc: 0.927803
[148]	training's auc: 0.932102	valid_1's auc: 0.927798
[149]	training's auc: 0.932123	valid_1's auc: 0.927796
[150]	training's auc: 0.93214	valid_1's auc: 0.927801
[151]	training's auc: 0.932167	valid_1's auc: 0.927815
[152]	training's auc: 0.932211	valid_1's auc: 0.927815
[153]	training's auc: 0.932234	valid_1's auc: 0.927814
[154]	training's auc: 0.932275	valid_1's auc: 0.927819
[155]	training's auc: 0.932311	valid_1's auc: 0.927833
[156]	training's auc: 0.932356	valid_1's auc: 0.927853
[157]	training's auc: 0.932391	valid_1's auc: 0.927855
[158]	training's auc: 0.932445	valid_1's auc: 0.927868
[159]	training's auc: 0.932481	valid_1's auc: 0.927891
[160]	training's auc: 0.932528	valid_1's auc: 0.927892
[161]	traini

[23]	training's auc: 0.926529	valid_1's auc: 0.924118
[24]	training's auc: 0.926636	valid_1's auc: 0.9242
[25]	training's auc: 0.926729	valid_1's auc: 0.924281
[26]	training's auc: 0.926788	valid_1's auc: 0.924368
[27]	training's auc: 0.926828	valid_1's auc: 0.924373
[28]	training's auc: 0.926872	valid_1's auc: 0.92437
[29]	training's auc: 0.926908	valid_1's auc: 0.924373
[30]	training's auc: 0.926989	valid_1's auc: 0.924445
[31]	training's auc: 0.927046	valid_1's auc: 0.924433
[32]	training's auc: 0.927093	valid_1's auc: 0.924467
[33]	training's auc: 0.927122	valid_1's auc: 0.924468
[34]	training's auc: 0.927248	valid_1's auc: 0.92454
[35]	training's auc: 0.927279	valid_1's auc: 0.924562
[36]	training's auc: 0.927306	valid_1's auc: 0.924582
[37]	training's auc: 0.92731	valid_1's auc: 0.924574
[38]	training's auc: 0.927381	valid_1's auc: 0.924614
[39]	training's auc: 0.927411	valid_1's auc: 0.924605
[40]	training's auc: 0.927424	valid_1's auc: 0.9246
[41]	training's auc: 0.927457	valid

[175]	training's auc: 0.933324	valid_1's auc: 0.927425
[176]	training's auc: 0.933356	valid_1's auc: 0.927438
[177]	training's auc: 0.933401	valid_1's auc: 0.927445
[178]	training's auc: 0.933465	valid_1's auc: 0.927467
[179]	training's auc: 0.933529	valid_1's auc: 0.927459
[180]	training's auc: 0.93357	valid_1's auc: 0.927465
[181]	training's auc: 0.933642	valid_1's auc: 0.927476
[182]	training's auc: 0.933724	valid_1's auc: 0.927478
[183]	training's auc: 0.933831	valid_1's auc: 0.927495
[184]	training's auc: 0.93389	valid_1's auc: 0.927494
[185]	training's auc: 0.933933	valid_1's auc: 0.927503
[186]	training's auc: 0.934019	valid_1's auc: 0.927486
[187]	training's auc: 0.934053	valid_1's auc: 0.927486
[188]	training's auc: 0.934081	valid_1's auc: 0.92749
[189]	training's auc: 0.934212	valid_1's auc: 0.927486
[190]	training's auc: 0.934248	valid_1's auc: 0.927495
[191]	training's auc: 0.934297	valid_1's auc: 0.927501
[192]	training's auc: 0.934336	valid_1's auc: 0.927514
[193]	trainin

[55]	training's auc: 0.928132	valid_1's auc: 0.92565
[56]	training's auc: 0.928164	valid_1's auc: 0.925676
[57]	training's auc: 0.92822	valid_1's auc: 0.925748
[58]	training's auc: 0.928263	valid_1's auc: 0.925786
[59]	training's auc: 0.928368	valid_1's auc: 0.92588
[60]	training's auc: 0.928391	valid_1's auc: 0.925892
[61]	training's auc: 0.928426	valid_1's auc: 0.925914
[62]	training's auc: 0.928461	valid_1's auc: 0.925945
[63]	training's auc: 0.928538	valid_1's auc: 0.926
[64]	training's auc: 0.928568	valid_1's auc: 0.926025
[65]	training's auc: 0.928652	valid_1's auc: 0.92608
[66]	training's auc: 0.928673	valid_1's auc: 0.926088
[67]	training's auc: 0.928723	valid_1's auc: 0.926122
[68]	training's auc: 0.928784	valid_1's auc: 0.926175
[69]	training's auc: 0.928887	valid_1's auc: 0.926234
[70]	training's auc: 0.928911	valid_1's auc: 0.926243
[71]	training's auc: 0.928965	valid_1's auc: 0.926277
[72]	training's auc: 0.929025	valid_1's auc: 0.926307
[73]	training's auc: 0.929071	valid

[207]	training's auc: 0.93468	valid_1's auc: 0.92742
[208]	training's auc: 0.934783	valid_1's auc: 0.927424
[209]	training's auc: 0.93488	valid_1's auc: 0.927429
[210]	training's auc: 0.934905	valid_1's auc: 0.927433
[211]	training's auc: 0.934923	valid_1's auc: 0.927429
[212]	training's auc: 0.935006	valid_1's auc: 0.927444
[213]	training's auc: 0.935075	valid_1's auc: 0.92744
[214]	training's auc: 0.935093	valid_1's auc: 0.92743
[215]	training's auc: 0.9351	valid_1's auc: 0.927433
[216]	training's auc: 0.935205	valid_1's auc: 0.927432
[217]	training's auc: 0.935246	valid_1's auc: 0.927427
[218]	training's auc: 0.935263	valid_1's auc: 0.92743
[219]	training's auc: 0.935329	valid_1's auc: 0.927417
[220]	training's auc: 0.935434	valid_1's auc: 0.927421
[221]	training's auc: 0.935453	valid_1's auc: 0.927412
[222]	training's auc: 0.935512	valid_1's auc: 0.927408
[223]	training's auc: 0.935563	valid_1's auc: 0.927411
[224]	training's auc: 0.935583	valid_1's auc: 0.927411
[225]	training's a

[88]	training's auc: 0.929967	valid_1's auc: 0.9264
[89]	training's auc: 0.930007	valid_1's auc: 0.926416
[90]	training's auc: 0.930046	valid_1's auc: 0.926431
[91]	training's auc: 0.930086	valid_1's auc: 0.926432
[92]	training's auc: 0.930157	valid_1's auc: 0.926472
[93]	training's auc: 0.930218	valid_1's auc: 0.926482
[94]	training's auc: 0.930258	valid_1's auc: 0.926504
[95]	training's auc: 0.930308	valid_1's auc: 0.926524
[96]	training's auc: 0.930338	valid_1's auc: 0.926542
[97]	training's auc: 0.930401	valid_1's auc: 0.926568
[98]	training's auc: 0.930505	valid_1's auc: 0.926625
[99]	training's auc: 0.930538	valid_1's auc: 0.926655
[100]	training's auc: 0.930614	valid_1's auc: 0.926691
[101]	training's auc: 0.930669	valid_1's auc: 0.926711
[102]	training's auc: 0.930731	valid_1's auc: 0.92676
[103]	training's auc: 0.930785	valid_1's auc: 0.926763
[104]	training's auc: 0.930796	valid_1's auc: 0.926765
[105]	training's auc: 0.930808	valid_1's auc: 0.926767
[106]	training's auc: 0.9

[238]	training's auc: 0.9366	valid_1's auc: 0.927404
[239]	training's auc: 0.936623	valid_1's auc: 0.927404
[240]	training's auc: 0.936664	valid_1's auc: 0.927404
[241]	training's auc: 0.936718	valid_1's auc: 0.927404
[242]	training's auc: 0.936744	valid_1's auc: 0.927396
[243]	training's auc: 0.936826	valid_1's auc: 0.927407
[244]	training's auc: 0.936847	valid_1's auc: 0.927399
[245]	training's auc: 0.936947	valid_1's auc: 0.927381
[246]	training's auc: 0.936974	valid_1's auc: 0.927369
[247]	training's auc: 0.937001	valid_1's auc: 0.927364
[248]	training's auc: 0.937013	valid_1's auc: 0.92736
[249]	training's auc: 0.937036	valid_1's auc: 0.927352
[250]	training's auc: 0.937054	valid_1's auc: 0.927352
[251]	training's auc: 0.937094	valid_1's auc: 0.927342
[252]	training's auc: 0.937155	valid_1's auc: 0.927356
[253]	training's auc: 0.937199	valid_1's auc: 0.927342
[254]	training's auc: 0.937223	valid_1's auc: 0.927343
[255]	training's auc: 0.937259	valid_1's auc: 0.92734
[256]	training

[119]	training's auc: 0.930676	valid_1's auc: 0.929628
[120]	training's auc: 0.930706	valid_1's auc: 0.929633
[121]	training's auc: 0.93076	valid_1's auc: 0.92964
[122]	training's auc: 0.930805	valid_1's auc: 0.929655
[123]	training's auc: 0.930846	valid_1's auc: 0.929671
[124]	training's auc: 0.930917	valid_1's auc: 0.929694
[125]	training's auc: 0.930987	valid_1's auc: 0.929725
[126]	training's auc: 0.931029	valid_1's auc: 0.929729
[127]	training's auc: 0.931072	valid_1's auc: 0.929718
[128]	training's auc: 0.931115	valid_1's auc: 0.929733
[129]	training's auc: 0.931136	valid_1's auc: 0.929744
[130]	training's auc: 0.931146	valid_1's auc: 0.92975
[131]	training's auc: 0.931193	valid_1's auc: 0.929788
[132]	training's auc: 0.931197	valid_1's auc: 0.929788
[133]	training's auc: 0.931238	valid_1's auc: 0.929809
[134]	training's auc: 0.931247	valid_1's auc: 0.929816
[135]	training's auc: 0.931312	valid_1's auc: 0.929839
[136]	training's auc: 0.931348	valid_1's auc: 0.929845
[137]	trainin

In [13]:
from sklearn.metrics import accuracy_score
# Y_exam = np.zeros(X_exam_processed.shape[0])

accuracy_Arr = np.array([])
popup_Arr = np.array([])
survey_Arr = np.array([])

k = 5
kf = StratifiedKFold(n_splits=k)
thresholds = [0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.92,0.93,0.94,0.95]
for th in thresholds:
    popup_P = 0
    survey_P = 0
    accuracy = 0
    for tr_index, val_index in kf.split(X_processed,Y_model):
        X_tr,Y_tr = X_processed.iloc[tr_index],Y_model.iloc[tr_index]
        X_val, Y_val = X_processed.iloc[val_index],Y_model.iloc[val_index]

        optimized_LGBM.fit(X_tr,Y_tr,eval_metric='auc')
        # Generate the columns
        proba = optimized_LGBM.predict_proba(X_val)[:,1]
        pred = pd.DataFrame({'business prob_pred':proba})
        popup = pd.DataFrame({'popup':np.zeros(X_val.shape[0])})
        survey = pd.DataFrame({'survey':np.zeros(X_val.shape[0])})
#         login = pd.DataFrame({'login':np.zeros(X_val.shape[0])})
        
        # Merge
        res = pd.concat([pred,popup],axis=1)
        res = pd.concat([res,survey],axis=1)
#         res = pd.concat([res,login],axis=1)
        
        # If predicted proba is more than th, put his popup as 1
        res.loc[res['business prob_pred'] >= th, 'popup'] = 1
        res.loc[ (res['business prob_pred'] >= th)&(X_val['last_5_days_sum']>0) , 'survey'] = 1
        
        accuracy += accuracy_score(Y_val, res['popup'])
        res = pd.concat([res,Y_val.reset_index(drop=True)],axis=1) # Merge res with Y_val
        popup_P += res.loc[res['popup']==1,'business'].sum()*(500000*0.01) - res['popup'].sum()*400
        survey_P += res.loc[res['survey']==1,'business'].sum()*(500000*0.036) - res['survey'].sum()*5000*0.18
        
    accuracy_Arr = np.append(accuracy_Arr,accuracy/k)
    popup_Arr = np.append(popup_Arr,popup_P/k)
    survey_Arr = np.append(survey_Arr,survey_P/k)
    

#     print('---------------------------------')
#     print(accuracy_score(Y_val, res['popup']))
#     print('---------------------------------')
#     print('net profit:')
#     profit = res.loc[res['popup']==1,'business'].sum()*(500000*0.01) - res['popup'].sum()*400
#     print(profit)
#     print('---------------------------------')










In [14]:
df = pd.DataFrame({'threshold':thresholds, 'avg_Accuracy':accuracy_Arr, 'avg_Popup_Profit':popup_Arr, 'avg_Survey_Porfit':survey_Arr})
df.set_index('threshold')
print(df)
#    threshold  avg_Accuracy  avg_Popup_Profit  avg_Survey_Porfit
# 0        0.0      0.068291        -8240960.0         46341540.0
# 1        0.1      0.640494        23696040.0        117750240.0
# 2        0.2      0.682034        25660280.0        121567680.0
# 3        0.3      0.713838        27021680.0        123940980.0
# 4        0.4      0.747154        28083480.0        125021880.0
# 5        0.5      0.782841        28742040.0        124332840.0
# 6        0.6      0.821378        28614400.0        120353400.0
# 7        0.7      0.882643        25932680.0        104465880.0
# 8        0.8      0.950920        18802240.0         70784640.0
# 9        0.9      0.957690        17100240.0         63607140.0

    threshold  avg_Accuracy  avg_Popup_Profit  avg_Survey_Porfit
0        0.30      0.712170        26961360.0         24250860.0
1        0.40      0.746003        28053960.0         24463440.0
2        0.50      0.782204        28767520.0         24416820.0
3        0.60      0.819901        28647200.0         23722740.0
4        0.70      0.881643        26022520.0         20329560.0
5        0.80      0.952078        18627200.0         13962420.0
6        0.90      0.957921        17053560.0         12749220.0
7        0.92      0.957950        17013240.0         12725100.0
8        0.93      0.957959        17004480.0         12718800.0
9        0.94      0.957946        16991160.0         12715560.0
10       0.95      0.957945        16986880.0         12712140.0


In [15]:
X_exam = pd.read_csv('X_exam.csv')
print("Data preprocessing...")
dist_GIT_exam = rangesum(
    'GIT', 
    r"202205[0-9]{2}", 
    "cts", 
    equal_dist(31)
)(X_exam)
dist_VAT_exam = rangesum(
    'VAT', 
    r"20220[17](?:[01][0-9]|2[0-5])", 
    "ts", 
    np.concatenate((equal_dist(25), equal_dist(25)))
)(X_exam)
entire_days = 31 + 29 + 31 + 30 + 31 + 30 + 31 + 25
entire_exam = rangesum(
    'Entire', 
    r"2022[0-9]{4}", 
    "cts", 
    equal_dist(entire_days)
)(X_exam)

age_code_exam = np.array(X_exam['age_code'])
gender_code_exam = np.array(X_exam['gender'])
region_code_exam = np.array(X_exam['region_code'])
cat_Featrues_exam = []
cat_Featrues_exam.append(['gender',age_code_exam])
cat_Featrues_exam.append(['age_code',age_code_exam])
cat_Featrues_exam.append(['region_code',region_code_exam])

# Get columns of "c" prefix of last 5 days of month
last_5_days = [
    *[f"c202201{i + 27}" for i in range(5)],
    *[f"c202202{i + 24}" for i in range(5)],
    *[f"c202203{i + 27}" for i in range(5)],
    *[f"c202204{i + 26}" for i in range(5)],
    *[f"c202205{i + 27}" for i in range(5)],
    *[f"c202206{i + 26}" for i in range(5)],
    *[f"c202207{i + 27}" for i in range(5)],
]
last_5_days_sum_exam = X_exam.filter(last_5_days, axis=1).fillna(0).sum(axis=1)
last_5_days_sum_exam = X_exam.filter(last_5_days, axis=1).fillna(0).sum(axis=1)
last5_code_exam = np.array(last_5_days_sum_exam)
last_5_days_sum_list_exam =[]
last_5_days_sum_list_exam.append(['last_5_days_sum',last5_code_exam])

X_exam_processed = preprocess(
    X_exam, 
    [
        cat_Featrues_exam,
        dist_GIT_exam,
        dist_VAT_exam,
        entire_exam,
        last_5_days_sum_list_exam
    ]
)

Data preprocessing...


In [16]:
optimized_LGBM.fit(X_processed,Y_model,eval_metric='auc')
proba = optimized_LGBM.predict_proba(X_exam_processed)[:,1]



In [22]:
final = pd.DataFrame({'business_prob':proba})
P = pd.DataFrame({'popup':np.zeros(X_exam_processed.shape[0],dtype=int)})
S = pd.DataFrame({'survey':np.zeros(X_exam_processed.shape[0],dtype=int)})

final = pd.concat([final,P],axis=1)
final = pd.concat([final,S],axis=1)

final.loc[final['business_prob'] >= 0.5, 'popup'] = int(1)
final.loc[(final['business_prob'] >= 0.7) & (X_exam_processed['last_5_days_sum']>0), 'survey'] = int(1)
print(final.head())
final.to_csv("./submission.csv",index=False)
#    business_prob  popup  survey
# 0       0.440112      0       0
# 1       0.002921      0       0
# 2       0.485339      0       0
# 3       0.004235      0       0
# 4       0.037652      0       0

   business_prob  popup  survey
0       0.440112      0       0
1       0.002921      0       0
2       0.485339      0       0
3       0.004235      0       0
4       0.037652      0       0


In [23]:
final['survey'].sum()

48479