In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
cd /content/drive/My Drive/kalapa

/content/drive/My Drive/kalapa


In [4]:
!pip install feature_engine
!pip install unidecode
!pip install category_encoders
!pip install catboost

Collecting feature_engine
  Downloading https://files.pythonhosted.org/packages/14/ed/5680bf401855b788f79cadc1298c210c5860eb5d54c4008cfa234b752ef1/feature_engine-0.6.1-py2.py3-none-any.whl
Collecting statsmodels>=0.11.1
[?25l  Downloading https://files.pythonhosted.org/packages/00/93/1b6882f92d94e491a3e3be101fc83934551eada261281980f3957246432f/statsmodels-0.12.0-cp36-cp36m-manylinux1_x86_64.whl (9.5MB)
[K     |████████████████████████████████| 9.5MB 15.4MB/s 
Installing collected packages: statsmodels, feature-engine
  Found existing installation: statsmodels 0.10.2
    Uninstalling statsmodels-0.10.2:
      Successfully uninstalled statsmodels-0.10.2
Successfully installed feature-engine-0.6.1 statsmodels-0.12.0
Collecting unidecode
[?25l  Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)
[K     |████████████████████████████████| 245kB 6.7MB/s 
[?25hInstalling collect

In [9]:
import pandas as pd
import numpy as np

from datetime import datetime
from unidecode import unidecode
from itertools import combinations

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import roc_auc_score

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

import category_encoders as ce

import re
import warnings
warnings.filterwarnings('ignore')

In [12]:
train = pd.read_csv("/content/drive/My Drive/kalapa/dataset/train.csv")
test = pd.read_csv("/content/drive/My Drive/kalapa/dataset/test.csv")

In [13]:
# Drop some columns are duplicated, with correlation = NaN
ignore_columns = (["gioiTinh", "info_social_sex", "ngaySinh", "namSinh"] + 
        [f"Field_{c}" for c in [14, 16, 17, 24, 26, 30, 31, 37, 52, 57]] + 
        ['partner0_K', 'partner0_L', 
         'partner1_B', 'partner1_D', 'partner1_E', 'partner1_F', 'partner1_K', 'partner1_L',
         'partner2_B', 'partner2_G', 'partner2_K', 'partner2_L',
         'partner3_B', 'partner3_C', 'partner3_F', 'partner3_G', 'partner3_H', 'partner3_K', 'partner3_L',
         *['partner4_' + i for i in 'ABCDEFGHK'],
         'partner5_B', 'partner5_C', 'partner5_H', 'partner5_K', 'partner5_L'])

# Some auto columns could make new better columns
all_auto_columns = list(set([c for c in train.columns if train[c].dtype in [np.int64, np.float64]])
                    .difference(ignore_columns + ['currentLocationLocationId', 'homeTownLocationId', 'label', 'id']))

auto_columns_1 = [c for c in all_auto_columns if 'Field_' in c]
auto_columns_2 = [c for c in all_auto_columns if 'partner' in c]
auto_columns_3 = [c for c in all_auto_columns if 'num' in c]
auto_columns_4 = [c for c in all_auto_columns if c not in auto_columns_1 + auto_columns_2 + auto_columns_3]
print(len(auto_columns_1), len(auto_columns_2), len(auto_columns_3), len(auto_columns_4), len(all_auto_columns))


37 27 12 11 87


In [14]:
date_cols = ["Field_{}".format(i) for i in [5, 6, 7, 8, 9, 11, 15, 25, 32, 33, 35, 40]]
datetime_cols = ["Field_{}".format(i) for i in [1, 2, 43, 44]]
correct_dt_cols = ['Field_34', 'ngaySinh']
cat_cols = date_cols + datetime_cols + correct_dt_cols

# Normalize Field_34, ngaySinh
def ngaysinh_34_normalize(s):
    if s != s: return np.nan
    try: s = int(s)
    except ValueError: s = s.split(" ")[0]
    return datetime.strptime(str(s)[:6], "%Y%m")

# Normalize datetime data
def datetime_normalize(s):
    if s != s: return np.nan
    s = s.split(".")[0]
    if s[-1] == "Z": s = s[:-1]
    return datetime.strptime(s, "%Y-%m-%dT%H:%M:%S")

# Normalize date data
def date_normalize(s):
    if s != s: return np.nan
    try: t = datetime.strptime(s, "%m/%d/%Y")
    except: t = datetime.strptime(s, "%Y-%m-%d")
    return t

def process_datetime_cols(df):
    df[datetime_cols] = df[datetime_cols].applymap(datetime_normalize)  
    df[date_cols] = df[date_cols].applymap(date_normalize)
    df[correct_dt_cols] = df[correct_dt_cols].applymap(ngaysinh_34_normalize)

    # Some delta columns
    for i, j in zip('43 1 2'.split(), '1 2 44'.split()): df[f'DT_{j}_{i}'] = (df[f'Field_{j}'] - df[f'Field_{i}']).dt.seconds
    for i, j in zip('5 6 7 33 8 11 9 15 25 6 7 8 9 15 25 2'.split(), '6 34 33 40 11 35 15 25 32 7 8 9 15 25 32 8'.split()): 
        df[f'DT_{j}_{i}'] = (df[f'Field_{j}'] - df[f'Field_{i}']).dt.days
    
    # Age, month
    df['age'] = 2020 - pd.DatetimeIndex(df['ngaySinh']).year
    df['birth_month'] = pd.DatetimeIndex(df['ngaySinh']).month
    
    # Days from current time & isWeekday
    for col in cat_cols:
        name = col.split('_')[-1]
        df[f'is_WD_{name}'] = df[col].dt.dayofweek.isin(range(5))
        df[f'days_from_now_{name}'] = (datetime.now() - pd.DatetimeIndex(df[col])).days
        df[col] = df[col].dt.strftime('%m-%Y')
    
    # Delta for x_startDate and x_endDate
    for cat in ['F', 'E', 'C', 'G', 'A']:
        df[f'{cat}_startDate'] = pd.to_datetime(df[f"{cat}_startDate"], infer_datetime_format=True)
        df[f'{cat}_endDate'] = pd.to_datetime(df[f"{cat}_endDate"], infer_datetime_format=True)
        
        df[f'{cat}_start_end'] = (df[f'{cat}_endDate'] - df[f'{cat}_startDate']).dt.days
        
    for i, j in zip('F E C G'.split(), 'E C G A'.split()):
        df[f'{j}_{i}_startDate'] = (df[f'{j}_startDate'] - df[f'{i}_startDate']).dt.days
        df[f'{j}_{i}_endDate'] = (df[f'{j}_endDate'] - df[f'{i}_endDate']).dt.days
    
    temp_date = [f'{i}_startDate' for i in 'ACEFG'] + [f'{i}_endDate' for i in 'ACEFG']
    
    for col in temp_date:
        df[col] = df[col].dt.strftime('%m-%Y')
        
    for col in cat_cols + temp_date:
        df[col] = df[col]
        
    return df

In [15]:
unicode_cols = ['Field_18', 'maCv', 'diaChi', 'Field_46', 'Field_48', 'Field_49', 'Field_56', 'Field_61', 'homeTownCity', 
                'homeTownName', 'currentLocationCity', 'currentLocationName', 'currentLocationState', 'homeTownState']
object_cols = (unicode_cols + 
               [f'Field_{str(i)}' for i in '4 12 36 38 47 62 45 54 55 65 66 68'.split()] +
               ['data.basic_info.locale', 'currentLocationCountry', 'homeTownCountry', 'brief'])

def str_normalize(s):
    s = str(s).strip().lower()
    s = re.sub(' +', " ", s)
    return s

def combine_gender(s):
    x, y = s 
    if x != x and y != y: return "nan"
    if x != x: return y.lower()
    return x.lower()

def process_categorical_cols(df):
    df['diaChi'] = df['diaChi'].str.split(',').str[-1]
    df[unicode_cols] = df[unicode_cols].applymap(str_normalize).applymap(lambda x: unidecode(x) if x==x else x)
    
    # Normalize some columns
    df["Field_38"] = df["Field_38"].map({0: 0.0, 1: 1.0, "DN": np.nan, "TN": np.nan, "GD": np.nan})
    df["Field_62"] = df["Field_62"].map({"I": 1, "II": 2, "III": 3, "IV": 4, "V": 5, "Ngoài quốc doanh Quận 7": np.nan})
    df["Field_47"] = df["Field_47"].map({"Zezo": 0, "One": 1, "Two": 2, "Three": 3, "Four": 4})
    
    # Make some new features
    df['Field_45_Q'] = df['Field_45'].str[:-3].astype('category')
    df['Field_45_TP_55'] = df['Field_45'].str[:2] == df['Field_55']
    df['is_homeTown_diaChi'] = df['homeTownCity'] == df['diaChi']
    df['is_homeTown_current_city'] = df['homeTownCity'] == df['currentLocationCity']
    df['is_homeTown_current_state'] = df['homeTownState'] == df['currentLocationState']
    df['F48_49'] = df['Field_48'] == df['Field_49']
    
    df["gender"] = df[["gioiTinh", "info_social_sex"]].apply(combine_gender, axis=1).astype("category")
    
    df[["currentLocationLocationId", "homeTownLocationId", "currentLocationLatitude", "currentLocationLongitude", 
        "homeTownLatitude", "homeTownLongitude"]].replace(0, np.nan, inplace=True) # value == 0: noisy

    df[["currentLocationLocationId", "homeTownLocationId"]] = (df[["currentLocationLocationId", "homeTownLocationId"]]
                                                             .applymap(str_normalize).astype("category"))
    df[object_cols] = df[object_cols].astype('category')
    
    return df

In [16]:
# New feature from columns 63, 64
def process_63_64(z):
    x, y = z
    if x != x and y != y:
        return np.nan
    if (x, y) in [(1.0, 2.0), (2.0, 3.0), (3.0, 4.0), (4.0, 8.0), (7.0, 5.0), (5.0, 6.0), (9.0, 43.0), (8.0, 9.0)]: return True
    else: return False
    
def process_others(df):        
    df[["Field_27", "Field_28"]].replace(0.0, np.nan, inplace=True)
    df['F18_isnumeric'] = df['Field_18'].str.isnumeric()
    df['F18_isalpha'] = df['Field_18'].str.isalpha()
    
    # Delta from some pairs of columns
    for i, j in [(20, 27), (28, 27), (39, 41), (41, 42), (50, 51), (51, 53)]:
        df[f'F{str(i)}_{str(j)}_delta'] = df[f'Field_{str(j)}'] - df[f'Field_{str(i)}']
    df['F_59_60'] = df['Field_59'] - df['Field_60'] - 2
    df['F_63_64'] = df[['Field_63', 'Field_64']].apply(process_63_64, axis=1).astype('category')
    
    # Mean, std from partnerX columns
    for i in '1 2 3 4 5'.split():
        col = [c for c in df.columns if f'partner{i}' in c]
        df[f'partner{i}_mean'] = df[col].mean(axis=1)
        df[f'partner{i}_std'] = df[col].std(axis=1)

    # Reference columns
    columns = set(df.columns).difference(ignore_columns)
    df['cnt_NaN'] = df[columns].isna().sum(axis=1)
    df['cnt_True'] = df[columns].applymap(lambda x: isinstance(x, bool) and x).sum(axis=1)
    df['cnt_False'] = df[columns].applymap(lambda x: isinstance(x, bool) and not x).sum(axis=1)

    # Combinations of auto columns
    lst_combination = (list(combinations(auto_columns_2, 2)) + list(combinations(auto_columns_3, 2)) + list(combinations(auto_columns_4, 2)))
    for l, r in lst_combination:
        for func in 'add subtract divide multiply'.split():
            df[f'auto_{func}_{l}_{r}'] = getattr(np, func)(df[l], df[r])
            
    return df


In [17]:
def transform(df):
    df = process_datetime_cols(df)
    df = process_categorical_cols(df)
    df = process_others(df)
    return df.drop(ignore_columns, axis=1)

train = transform(train)
test = transform(test)

In [18]:
def split_dates(df):
    dates = [f"{c}_startDate" for c in ['F','E','C','G','A']] + [f"{c}_endDate" for c in ['F','E','C','G','A']]
    for date in dates:
        df[date+'_day'] = df[date].dt.day
        df[date+'_month'] = df[date].dt.month
        df[date+'_year'] = df[date].dt.year
        df[date+'_week'] = df[date].dt.week
        df[date+'_dayofweek'] = df[date].dt.dayofweek
    return df

In [19]:
def days_between_startEnd(df):
    start_dates = [f"{c}_startDate" for c in ['F','E','C','G','A']]
    end_dates = [f"{c}_endDate" for c in ['F','E','C','G','A']]
    col = ['F','E','C','G','A']
    for i in range(5):
        df[col[i]+'_delta'] = (df[end_dates[i]]-df[start_dates[i]]).dt.total_seconds()/(60*60*24)
    return df

In [20]:
def to_datetime(df):
    dates = [f"{c}_startDate" for c in ['F','E','C','G','A']] + [f"{c}_endDate" for c in ['F','E','C','G','A']]
    for col in dates:
        df[col] = pd.to_datetime(df[col], errors='coerce')
    return df

In [21]:
dates = [f"{c}_startDate" for c in ['F','E','C','G','A']] + [f"{c}_endDate" for c in ['F','E','C','G','A']]
dates_columns = ['F_delta','E_delta','C_delta','G_delta','A_delta']
for d in dates:
    dates_columns.append(d+'_day')
    dates_columns.append(d+'_month')
    dates_columns.append(d+'_year')
    dates_columns.append(d+'_week')
    dates_columns.append(d+'_dayofweek')
dates_columns

['F_delta',
 'E_delta',
 'C_delta',
 'G_delta',
 'A_delta',
 'F_startDate_day',
 'F_startDate_month',
 'F_startDate_year',
 'F_startDate_week',
 'F_startDate_dayofweek',
 'E_startDate_day',
 'E_startDate_month',
 'E_startDate_year',
 'E_startDate_week',
 'E_startDate_dayofweek',
 'C_startDate_day',
 'C_startDate_month',
 'C_startDate_year',
 'C_startDate_week',
 'C_startDate_dayofweek',
 'G_startDate_day',
 'G_startDate_month',
 'G_startDate_year',
 'G_startDate_week',
 'G_startDate_dayofweek',
 'A_startDate_day',
 'A_startDate_month',
 'A_startDate_year',
 'A_startDate_week',
 'A_startDate_dayofweek',
 'F_endDate_day',
 'F_endDate_month',
 'F_endDate_year',
 'F_endDate_week',
 'F_endDate_dayofweek',
 'E_endDate_day',
 'E_endDate_month',
 'E_endDate_year',
 'E_endDate_week',
 'E_endDate_dayofweek',
 'C_endDate_day',
 'C_endDate_month',
 'C_endDate_year',
 'C_endDate_week',
 'C_endDate_dayofweek',
 'G_endDate_day',
 'G_endDate_month',
 'G_endDate_year',
 'G_endDate_week',
 'G_endDate_da

In [22]:
def impute_df(df):    
    for col in dates_columns:
        df[col] = df[col].fillna(df[col].mean())
    return df

In [23]:
train = to_datetime(train)
test = to_datetime(test)
train = split_dates(train)
test = split_dates(test)
train = days_between_startEnd(train)
test = days_between_startEnd(test)

In [24]:
train = impute_df(train)
test = impute_df(test)

In [25]:
#Support catboost modelling
cat_features = [c for c in train.columns if (train[c].dtype not in [np.float64, np.int64])]
train[cat_features] = train[cat_features].astype(str)
test[cat_features] = test[cat_features].astype(str)

In [27]:
# Create the encoder
t = pd.concat([train, test]).reset_index(drop=True)
count_enc = ce.CountEncoder().fit_transform(t[cat_features])
tt = t.join(count_enc.add_suffix("_count"))

f2_train = tt.loc[tt.index < train.shape[0]]
f2_test = tt.loc[tt.index >= train.shape[0]]

columns = sorted(set(f2_train.columns).intersection(f2_test.columns))
print(len(columns))


2275


In [28]:
# Some classifiers from Scikit-learn:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold

In [29]:
# LightGBM, Catboost and XGBoost:
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [40]:
# Initative estimators:
ran = RandomForestClassifier(random_state=9)
gbm = LGBMClassifier()
#log = LogisticRegression()
#gbc = GradientBoostingClassifier()
#xgb = XGBClassifier()
#ext = ExtraTreesClassifier()
#ada = AdaBoostClassifier()
#gnb = GaussianNB()
#gpc = GaussianProcessClassifier()
#bag = BaggingClassifier()
#nnn = MLPClassifier()
cat = CatBoostClassifier()

In [41]:
# List of classifiers:
models = [ran, gbm, cat]

In [47]:
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
import os
print("\n@Learning")
n_folds = 5
seed = 2020
cd1 = '/content/drive/My Drive/kalapa'    
print("+ Data Splitting")

f2_train.label.replace("Good", 0, inplace=True)
f2_train.label.replace("Bad", 1, inplace=True)
print(f"Stratified {n_folds}-fold, seed={seed}")
y = f2_train["label"].values
#cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed) ##Update
cv = StratifiedShuffleSplit(n_splits = n_folds, test_size = 0.2) ##Updated
for i, (train, val) in enumerate(cv.split(np.zeros(len(y)), y)):
    print("FOLD %d" % (i + 1))
    os.makedirs(os.path.join(cd1, "fold%d" % i), exist_ok=True)
    train_df, val_df = f2_train.loc[train], f2_train.loc[val]
    # use all positive examples for training and evaluation
    train_df = pd.concat([train_df, val_df[val_df.label == 1]])
    val_df = pd.concat([val_df, train_df[train_df.label == 1]])
    train_df.to_csv(os.path.join(cd1, "fold%d/train.csv" % i), index=False)
    val_df.to_csv(os.path.join(cd1, "fold%d/val.csv" % i), index=False)

print("-"*50)


@Learning
+ Data Splitting
Stratified 5-fold, seed=2020
FOLD 1
FOLD 2
FOLD 3
FOLD 4
FOLD 5
--------------------------------------------------


In [48]:
i = 1
train_df = pd.read_csv(os.path.join(cd1, "fold%d/train.csv" % i))
y_train = train_df["label"].to_numpy(dtype=np.float32)
X_train = train_df.drop(columns=["label"]).to_numpy(dtype=np.float32)

ValueError: ignored

In [43]:
import numpy as np

recall_mean = []
recall_sd = []
auc_mean = []
auc_sd = []

for mod in models:
    acc = cross_val_score(mod, X_train, y_train, scoring="recall", cv=cv, verbose=False, n_jobs=-1)
    auc = cross_val_score(mod, X_train, y_train, scoring="roc_auc", cv=cv, verbose=False, n_jobs=-1)
    # Recall metric:
    recall_mean.append(acc.mean())
    recall_sd.append(np.std(acc))
    # AUC metric:
    auc_mean.append(auc.mean())
    auc_sd.append(np.std(auc))

TerminatedWorkerError: ignored

In [44]:
# Convert results in form of pandas frame:
df_results = pd.DataFrame({"Model": [j.__class__.__name__ for j in models],
                           "Recall_mean": recall_mean,
                           "Recall_sd": recall_sd,
                           "AUC_mean": auc_mean,
                           "AUC_sd": auc_sd})

df_results = df_results.sort_values(by="Recall_mean", ascending=False).reset_index(drop=True)

ValueError: ignored

In [None]:
# Show results:
print(df_results)

In [None]:
# Train Random Forest, GaussianNB and CatBoostClassifier:
ran.fit(X_train, y_train)
gnb.fit(X_train, y_train)
cat.fit(X_train, y_train)

In [None]:
# Probability:
pd_ran = ran.predict_proba(X_test)[:, 1]
pd_gnb = gnb.predict_proba(X_test)[:, 1]
pd_cat = cat.predict_proba(X_test)[:, 1]

In [None]:
# AUC by RandomForest and GaussianNB:
from sklearn.metrics import roc_auc_score

print(roc_auc_score(y_test, pd_ran))
print(roc_auc_score(y_test, pd_gnb))
print(roc_auc_score(y_test, pd_cat))

In [None]:
# Function calculates profit with given cutoff when interest rate of 10%:

def profit_by_cutoff(cutoff, pred_prob):
    rate = 0.1
    pred_bg = (pred_prob >= cutoff).astype(int)
    gg = X_test[(y_test == 0) & (pred_bg == 0)]
    bg = X_test[(y_test == 1) & (pred_bg == 0)]
    profit = np.sum(rate * gg["AMOUNT"]) - np.sum(bg["AMOUNT"])
    return profit

In [None]:
def profit(cutoff):
    pro_ran = profit_by_cutoff(cutoff=cutoff, pred_prob=pd_ran)
    pro_gnb = profit_by_cutoff(cutoff=cutoff, pred_prob=pd_gnb)
    pro_cat = profit_by_cutoff(cutoff=cutoff, pred_prob=pd_cat)
    df_pro = pd.DataFrame({"Profit_RAN": [pro_ran],
                           "Profit_GNB": [pro_gnb],
                           "Profit_CAT": [pro_cat],
                           "Cutoff": [cutoff]})

    return df_pro

In [None]:
# If cutoff = 0.02:
profit_002 = profit(cutoff=0.02)
print(profit_002)

# Profit for the two models by a range of cutoff:
df_profit = pd.DataFrame()

for j in np.arange(0.01, 0.3, 0.005):
    df_j = profit(j)
    df_profit = df_profit.append(df_j)

In [None]:
import matplotlib.pyplot as plt

plt.style.use('fivethirtyeight')

plt.plot("Cutoff", "Profit_RAN", data=df_profit, label="RandomForest", lw=2)
plt.plot("Cutoff", "Profit_GNB", data=df_profit, label="GaussianNB", lw=2)
plt.plot("Cutoff", "Profit_CAT", data=df_profit, label="CatBoost", lw=2)
plt.title("Profit by Cutoff and classifier")
plt.xlabel("Cutoff")
plt.ylabel("Profit")
plt.legend()
plt.show()

0.5072735159107776