# 1. Library & Input data

In [1]:
# Mount Google Drive at /content/drive so the project files stored there can be
# reached from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd /content/drive/My Drive/kalapa

/content/drive/My Drive/kalapa


In [3]:
!pip install utils

Collecting utils
  Downloading https://files.pythonhosted.org/packages/55/e6/c2d2b2703e7debc8b501caae0e6f7ead148fd0faa3c8131292a599930029/utils-1.0.1-py2.py3-none-any.whl
Installing collected packages: utils
Successfully installed utils-1.0.1


In [4]:
import os, sys, re
import numpy as np
import pandas as pd
import matplotlib
import pickle
import tarfile
from datetime import datetime
from subprocess import call, Popen
from scipy import interp
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import auc
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import plot_roc_curve
import matplotlib.pyplot as plt
from utils import *

In [5]:
# Select matplotlib's "pdf" backend; the ROC figure later in the notebook is
# written to disk with savefig.
# NOTE(review): pyplot has already been imported in the previous cell, so the
# backend is switched after the fact — confirm this is intended.
matplotlib.use("pdf")

# 2. Feature Engineering

In [6]:
# Working directory (relative to the project folder) for intermediate artefacts:
# per-column WOE files, fold splits and the engineered feature tables.
INPUT_DIR = 'data'
# Location of the raw train/test CSV files on the mounted drive.
DATASET_DIR = '/content/drive/My Drive/kalapa/dataset'

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns and the
# DtypeWarning that read_csv otherwise emits for them.
train_df = pd.read_csv(os.path.join(DATASET_DIR, 'train.csv'), low_memory=False)
test_df = pd.read_csv(os.path.join(DATASET_DIR, 'test.csv'), low_memory=False)
print("Train: ", train_df.shape, " | Test: ", test_df.shape)

print("-"*60)

  interactivity=interactivity, compiler=compiler, result=result)


Train:  (53030, 195)  | Test:  (20381, 194)
------------------------------------------------------------


  interactivity=interactivity, compiler=compiler, result=result)


### 2.1. Datetime columns

In [7]:
# Stage banners; this cell applies no transformation to the data frames.
print("\n@Data Pre-Processing", "\n+ Feature Engineering", sep="\n")

# Optional feature helpers, currently switched off:
#train_df = age_group(train_df)
#test_df = age_group(test_df)

#train_df = field7_count(train_df)
#test_df = field7_count(test_df)

# Sanity check: shapes should match the ones printed right after loading.
print("train_df.shape = ", train_df.shape, " | test_df.shape = ", test_df.shape)


@Data Pre-Processing

+ Feature Engineering
train_df.shape =  (53030, 195)  | test_df.shape =  (20381, 194)


### 2.2. Categorical columns

In [8]:
# WOE Binning
# Parameters forwarded verbatim to woe.r on the command line.
bin_num_limit = 8
stop_limit = 0.05
count_distr_limit = 0.05

woe_cols = [
    'Field_1', 'Field_10', 'Field_11', 'Field_12', 'Field_13', 'Field_14',
    'Field_15', 'Field_16', 'Field_17', 'Field_18', 'Field_19', 'Field_2',
    'Field_20', 'Field_21', 'Field_22', 'Field_23', 'Field_24', 'Field_25',
    'Field_26', 'Field_27', 'Field_28', 'Field_29', 'Field_3', 'Field_30',
    'Field_31', 'Field_33', 'Field_34', 'Field_35', 'Field_36', 'Field_37',
    'Field_38', 'Field_39', 'Field_4', 'Field_40', 'Field_41', 'Field_42',
    'Field_43', 'Field_44', 'Field_45', 'Field_46', 'Field_47', 'Field_48',
    'Field_49', 'Field_5', 'Field_50', 'Field_51', 'Field_52', 'Field_53',
    'Field_54', 'Field_55', 'Field_56', 'Field_57', 'Field_6',
    'Field_8', 'Field_9'
]

# 1) Export one (id, label, column) train file and one (id, column) test file
#    per feature, and queue the matching Rscript invocation.
woe_dir = os.path.join(INPUT_DIR, "woe")
commands = []
for col_name in woe_cols:
    col_dir = os.path.join(woe_dir, col_name)
    os.makedirs(col_dir, exist_ok=True)
    train_df[["id", "label", col_name]].to_csv(
        os.path.join(col_dir, "train.csv"), index=False, encoding="utf-8"
    )
    test_df[["id", col_name]].to_csv(
        os.path.join(col_dir, "test.csv"), index=False, encoding="utf-8"
    )

    print(f"Binning: {col_name}", f"bin_num_limit={bin_num_limit}", f"stop_limit={stop_limit}", f"count_distr_limit={count_distr_limit}")
    # Argument list instead of a split string: stays correct even if a path
    # contains spaces.
    commands.append([
        "Rscript", "woe.r", f"{INPUT_DIR}/woe", col_name,
        str(bin_num_limit), str(stop_limit), str(count_distr_limit),
    ])

# 2) Run the R jobs in bounded batches and check every exit code, so a failed
#    job is reported here instead of surfacing later as a missing *_woe.csv.
#    (Popen itself raises FileNotFoundError if Rscript is not installed.)
max_parallel = os.cpu_count() or 1
failed = []
for start in range(0, len(commands), max_parallel):
    batch = commands[start:start + max_parallel]
    procs = [Popen(cmd) for cmd in batch]
    for cmd, proc in zip(batch, procs):
        if proc.wait() != 0:
            failed.append(" ".join(cmd))
if failed:
    raise RuntimeError("WOE binning failed for:\n" + "\n".join(failed))

# 3) Merge the per-column outputs into one table each for train and test.
#    Columns with the same name in several files are overwritten by the last
#    file read, so rows are assumed to be in the same order in every file.
train_woe_df = {}
test_woe_df = {}
for col_name in woe_cols:
    col_dir = os.path.join(woe_dir, col_name)
    train_part = pd.read_csv(os.path.join(col_dir, "train_woe.csv"))
    test_part = pd.read_csv(os.path.join(col_dir, "test_woe.csv"))
    train_woe_df.update(train_part.items())
    test_woe_df.update(test_part.items())

train_fe_df = pd.DataFrame.from_dict(train_woe_df)
test_fe_df = pd.DataFrame.from_dict(test_woe_df)

# Message text kept as in the original script; no columns are dropped in this cell.
print(" "*4, "After dropping: ", train_fe_df.shape, test_fe_df.shape)
print("-"*40)

print("Done!")
print("-"*60)

train_fe_df.to_csv(os.path.join(INPUT_DIR, "train_fe.csv"), index=False, encoding="utf-8")
test_fe_df.to_csv(os.path.join(INPUT_DIR, "test_fe.csv"), index=False, encoding="utf-8")


Binning: Field_1 bin_num_limit=8 stop_limit=0.05 count_distr_limit=0.05
Binning: Field_10 bin_num_limit=8 stop_limit=0.05 count_distr_limit=0.05
Binning: Field_11 bin_num_limit=8 stop_limit=0.05 count_distr_limit=0.05
Binning: Field_12 bin_num_limit=8 stop_limit=0.05 count_distr_limit=0.05
Binning: Field_13 bin_num_limit=8 stop_limit=0.05 count_distr_limit=0.05
Binning: Field_14 bin_num_limit=8 stop_limit=0.05 count_distr_limit=0.05
Binning: Field_15 bin_num_limit=8 stop_limit=0.05 count_distr_limit=0.05
Binning: Field_16 bin_num_limit=8 stop_limit=0.05 count_distr_limit=0.05
Binning: Field_17 bin_num_limit=8 stop_limit=0.05 count_distr_limit=0.05
Binning: Field_18 bin_num_limit=8 stop_limit=0.05 count_distr_limit=0.05
Binning: Field_19 bin_num_limit=8 stop_limit=0.05 count_distr_limit=0.05
Binning: Field_2 bin_num_limit=8 stop_limit=0.05 count_distr_limit=0.05
Binning: Field_20 bin_num_limit=8 stop_limit=0.05 count_distr_limit=0.05
Binning: Field_21 bin_num_limit=8 stop_limit=0.05 cou

FileNotFoundError: ignored

### 2.3. Cross-validation split

In [None]:
# 4. Data Splitting
print("\n@Learning")
n_folds = 5
seed = 2020

print("+ Data Splitting")

# Encode the target as 0/1. The result is assigned back to the column: calling
# replace(..., inplace=True) on the attribute is a chained in-place update that
# does not reach the parent frame under pandas copy-on-write.
train_fe_df["label"] = train_fe_df["label"].replace({"Good": 0, "Bad": 1})
print(f"Stratified {n_folds}-fold, seed={seed}")
y = train_fe_df["label"].values
cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
for i, (train_idx, val_idx) in enumerate(cv.split(np.zeros(len(y)), y)):
    print("FOLD %d" % (i + 1))
    fold_dir = os.path.join(INPUT_DIR, "fold%d" % i)
    os.makedirs(fold_dir, exist_ok=True)
    # split() yields positional indices, hence iloc. Fold-local names keep the
    # raw train_df loaded earlier intact, so earlier cells can be re-run.
    fold_train_df = train_fe_df.iloc[train_idx]
    fold_val_df = train_fe_df.iloc[val_idx]
    # use all positive examples for training and evaluation
    # Both positive subsets are taken BEFORE concatenating; otherwise the
    # validation positives (already appended to the training part) would be
    # appended to the validation part a second time.
    # NOTE(review): sharing positives between the two files means the
    # validation score is not a clean hold-out estimate — confirm this is intended.
    train_pos = fold_train_df[fold_train_df.label == 1]
    val_pos = fold_val_df[fold_val_df.label == 1]
    fold_train_df = pd.concat([fold_train_df, val_pos])
    fold_val_df = pd.concat([fold_val_df, train_pos])
    fold_train_df.to_csv(os.path.join(fold_dir, "train.csv"), index=False)
    fold_val_df.to_csv(os.path.join(fold_dir, "val.csv"), index=False)

print("-"*50)

### 2.4. Model training and evaluation

In [None]:
# Random-forest hyper-parameters.
n_trees = 767
max_depth = 17
min_samples_split = 2
min_samples_leaf = 1
# 'sqrt' is what the deprecated 'auto' setting meant for classifiers.
max_features = 'sqrt'
class_weight = None
bootstrap = True
n_folds = 5

# NOTE(review): unpickling can execute arbitrary code — only load a trusted file.
embeddings = pd.read_pickle("./data/embeddings.pkl").to_numpy(dtype=np.float32)

# submission input
# Embedding rows are looked up by id — assumes every id is a valid row
# position in the embeddings matrix (TODO confirm).
submit_df = pd.read_csv(os.path.join(INPUT_DIR, "test_fe.csv"))
submit_id = submit_df.id.to_numpy(int)
submit_dict = {"id": submit_id}
X_submit = submit_df.drop(columns=["id"]).to_numpy(dtype=np.float32)
X_submit = np.concatenate([X_submit, embeddings[submit_id]], axis=1)
print(X_submit.shape)

# training and evaluation
tprs = []       # per-fold TPR interpolated onto mean_fpr
aucs = []       # per-fold area under the plotted ROC curve
fold_aucs = []  # per-fold roc_auc_score
mean_fpr = np.linspace(0, 1, 100)

fig, ax = plt.subplots()
for i in range(n_folds):
    print("FOLD %d" % (i + 1))

    fold_train_df = pd.read_csv(os.path.join(INPUT_DIR, "fold%d/train.csv" % i))
    fold_val_df = pd.read_csv(os.path.join(INPUT_DIR, "fold%d/val.csv" % i))
    train_id = fold_train_df.id.to_numpy(int)
    val_id = fold_val_df.id.to_numpy(int)

    y_train = fold_train_df["label"].to_numpy(dtype=np.float32)
    X_train = fold_train_df.drop(columns=["id", "label"]).to_numpy(dtype=np.float32)
    X_train = np.concatenate([X_train, embeddings[train_id]], axis=1)
    y_val = fold_val_df["label"].to_numpy(dtype=np.float32)
    X_val = fold_val_df.drop(columns=["id", "label"]).to_numpy(dtype=np.float32)
    X_val = np.concatenate([X_val, embeddings[val_id]], axis=1)
    print(X_train.shape, X_val.shape)

    # `seed` comes from the data-splitting cell.
    clf = RandomForestClassifier(
        n_estimators=n_trees,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=seed,
        class_weight=class_weight,
        bootstrap=bootstrap,  # honour the setting above instead of a hard-coded True
        n_jobs=1
    )

    clf.fit(X_train, y_train)

    # Score the validation set once and reuse the scores for both the AUC and
    # the ROC curve.
    val_scores = clf.predict_proba(X_val)[:, 1]
    auc_ = roc_auc_score(y_val, val_scores)
    fold_aucs.append(auc_)
    print(f"val AUC = {auc_:.4f}")

    y_submit = clf.predict_proba(X_submit)[:, 1]
    submit_dict["fold%d" % i] = y_submit

    # ROC curve drawn from roc_curve directly; plot_roc_curve is no longer
    # available in recent scikit-learn releases.
    fpr, tpr, _thresholds = roc_curve(y_val, val_scores)
    fold_roc_auc = auc(fpr, tpr)
    ax.plot(fpr, tpr, alpha=0.3, lw=1, label=f"ROC Fold {i} (AUC = {fold_roc_auc:0.2f})")
    # np.interp replaces the deprecated scipy.interp alias.
    interp_tpr = np.interp(mean_fpr, fpr, tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
    aucs.append(fold_roc_auc)

ax.plot([0, 1], [0, 1], linestyle="--", lw=2, color="r", label="Random", alpha=0.8)

# Mean ROC over folds, pinned to end at (1, 1).
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
ax.plot(
    mean_fpr,
    mean_tpr,
    color="b",
    label=r"Mean ROC (AUC = %0.3f $\pm$ %0.3f)" % (mean_auc, std_auc),
    lw=2,
    alpha=0.8,
)

# Shaded band: one standard deviation around the mean curve, clipped to [0, 1].
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
ax.fill_between(
    mean_fpr,
    tprs_lower,
    tprs_upper,
    color="grey",
    alpha=0.2,
    label=r"$\pm$ 1 std. dev.",
)

ax.set(
    xlim=[-0.05, 1.05],
    ylim=[-0.05, 1.05],
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="ROC Curves",
)
ax.legend(loc="lower right")
fig.savefig("roc.png")

print("Mean AUC = %0.4f, GINI %0.4f" % (mean_auc, 2 * mean_auc - 1.0))

print("\t".join(f"{x:.4f}" for x in fold_aucs))


# averaging for submission
res_df = pd.DataFrame(submit_dict)
fold_cols = ["fold%d" % i for i in range(n_folds)]
res_df["label"] = res_df[fold_cols].mean(axis=1)
res_df[["id", "label"]].to_csv("submission.csv", index=False)

# Per-fold submissions are written from a renamed copy, so res_df["label"]
# keeps the fold average instead of ending up as the last fold's scores.
for i in range(n_folds):
    fold_sub = res_df[["id", f"fold{i}"]].rename(columns={f"fold{i}": "label"})
    fold_sub.to_csv("submission_fold%d.csv" % i, index=False)


### 2.5. Rule-based post-processing

In [None]:
print("\n@Rules")

# NOTE(review): this reads the submission of a single fold, not the averaged
# submission.csv — confirm that is intended.
sub_df = pd.read_csv("submission_fold1.csv")

# Smoothing: replace every score by its rank, spread evenly over [0, 1].
# copy=True because the array is written to below; to_numpy() may otherwise
# return a view of the column (read-only under pandas copy-on-write).
y = sub_df.label.to_numpy(copy=True)
rank = np.argsort(y)
# max(..., 1) avoids a division by zero when there is a single row.
y_smooth = np.arange(len(rank)) * (1.0 / max(len(rank) - 1, 1))
y[rank] = y_smooth
sub_df["label"] = y

rule_df = pd.read_csv("./rules.csv", dtype=str, encoding="utf-8")
# NOTE(review): the raw test set was loaded from the dataset folder earlier in
# the notebook; here it is read from INPUT_DIR — confirm both files are the same.
test_df = pd.read_csv(os.path.join(INPUT_DIR, "test.csv"), dtype=str, encoding="utf-8")

# The mask is applied to the submission by row position, so both tables must
# have the same number of rows (and are assumed to be in the same order).
if len(test_df) != len(sub_df):
    raise ValueError(
        "test set has %d rows but the submission has %d" % (len(test_df), len(sub_df))
    )

# mask[r] becomes 0.0 as soon as any rule column matches row r, otherwise stays 1.0.
mask = np.ones(sub_df.shape[0])

for col in rule_df.columns:
    patterns = {str_normalize(v) for v in rule_df[col].unique()} - {'nan'}
    if not patterns:
        continue

    col_mask = test_df[col].apply(lambda x: 0. if str_normalize(x) in patterns else 1.)
    mask *= col_mask.to_numpy()

# Smoothing: re-rank only the rows that no rule matched, spread evenly over the
# open interval (0, 1); rows matched by a rule are forced to 0 below.
y = sub_df.label.to_numpy(copy=True)
org_idx = np.flatnonzero(mask)
y_masked = y[org_idx]
rank = np.argsort(y_masked)
y_smooth = np.arange(1, len(rank) + 1) * (1.0 / (len(rank) + 1))
y_masked[rank] = y_smooth
y[org_idx] = y_masked

sub_df["label"] = y * mask
sub_df.to_csv("final_submission.csv", index=False)

### 3.2. Feature importances

In [None]:
def display_importances(feature_importance_df_):
    """Plot importance per feature for the 20 features with the highest mean importance.

    Args:
        feature_importance_df_: DataFrame with at least a "feature" and an
            "importance" column. A feature may occur on several rows
            (presumably one per fold); the ranking uses the mean per feature.
    """
    # Rank features by their mean importance and keep the names of the top 20.
    mean_importance = feature_importance_df_[["feature", "importance"]].groupby("feature").mean()
    top_features = mean_importance.sort_values(by="importance", ascending=False).head(20).index

    # Keep every row that belongs to a top feature, highest importance first.
    is_top = feature_importance_df_.feature.isin(top_features)
    best_features = feature_importance_df_.loc[is_top]
    ranked = best_features.sort_values(by="importance", ascending=False)

    plt.figure(figsize=(12, 8))
    sns.barplot(x="importance", y="feature", data=ranked)
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()

# NOTE(review): feature_importance_df is not defined anywhere in the visible
# notebook — confirm where it is built before running this cell.
display_importances(feature_importance_df_=feature_importance_df)

# 4. Submission

In [None]:
# NOTE(review): `test` and `oof_preds` are not defined anywhere in the visible
# notebook — confirm where they come from before running this cell.
# Average the prediction columns row-wise into the label, then write id/label
# to a CSV named after the current timestamp (YYYYMMDD_HH_MM).
test['label'] = pd.DataFrame(oof_preds).mean(axis=1).values
name = pd.Timestamp.now().strftime('%Y%m%d_%H_%M')
test[['id', 'label']].to_csv(f'{name}.csv', index=False)