In [None]:
import pandas as pd
import numpy as np
# Let pandas display up to 500 columns when rendering a DataFrame.
pd.options.display.max_columns = 500

In [None]:
# Read merged.csv (relative to the working directory) into a DataFrame.
# Option: add nrows=10000 to load only the first rows for a quick experiment.
merged = pd.read_csv(filepath_or_buffer='merged.csv')

In [None]:
# Remove the 'Unnamed: 0' column that came in with the CSV.
# NOTE(review): presumably a row index written out by an earlier to_csv call — confirm.
# The membership check keeps this cell idempotent: re-running it (or loading a CSV that
# was saved without that column) no longer raises KeyError.
if 'Unnamed: 0' in merged.columns:
    del merged['Unnamed: 0']

In [None]:
# Cast the columns below to str so that they end up with object dtype; object-dtype
# columns are the ones later handed to CatBoost as categorical features.
categorical_columns = [
    'srno',
    'YYYYMM',
    'c_gender',
    'c_zip',
    'c_edu',
    'c_mry',
    'c_job',
    'c_occp',
    'a_incm_flg',
    'x_flg_house',
    'CAR_FLG',
]
for column in categorical_columns:
    merged[column] = merged[column].astype(str)

In [None]:
from utility_functions import *

In [None]:
# Remove py_total and as_total from the frame so they never reach the feature matrix.
# NOTE(review): the reason for excluding them is not stated in this notebook — confirm.
merged.drop(columns=['py_total', 'as_total'], inplace=True)

In [None]:
# Time-based split: rows whose YYYYMM is '201812' form the test set,
# every other row goes to the training set.
is_test_month = merged['YYYYMM'] == '201812'
train = merged[~is_test_month]
test = merged[is_test_month]

In [None]:
# YYYYMM is not used as a training feature: treated as a categorical variable,
# the test month can essentially never match a month seen in the training data.
for frame in (train, test):
    del frame['YYYYMM']

In [None]:
# Features: every training column except the two label columns y1 and y2.
# Target: y1 only, kept as a single-column DataFrame.
label_columns = ['y1', 'y2']
X = train.drop(columns=label_columns)
y = train.loc[:, ['y1']]

In [None]:
# Print the dtype of every feature column (object dtype marks the categorical ones).
column_dtypes = X.dtypes
print(column_dtypes)

In [None]:
# Positional indices of the object-dtype columns of X; these are passed to CatBoost
# as the categorical feature indices.
is_object_column = (X.dtypes == object).to_numpy()
categorical_features_indices = np.nonzero(is_object_column)[0]

In [None]:
# Display the positional indices of the columns that will be treated as categorical.
categorical_features_indices

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the training rows for validation. The split is stratified on y
# and uses a fixed random_state so it is reproducible.
split_parts = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=1337,
    stratify=y,
)
X_train, X_validation, y_train, y_validation = split_parts


In [None]:
from catboost import CatBoostClassifier, Pool, cv
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [None]:
# Base CatBoost settings shared by the models below.
# Notes kept from experimentation:
#   - loss_function: CrossEntropy was considered as an alternative.
#   - 'scale_pos_weight' : 10   #(1410132-4786)/4786
#   - 'use_best_model' : False
params = dict(
    iterations=200,
    learning_rate=0.1,
    loss_function='Logloss',
    eval_metric='F1',
    random_seed=1337,
    logging_level='Silent',
    class_weights=[1, 100],
)

In [None]:
# Wrap the training and validation splits in CatBoost Pools, telling CatBoost
# which column positions hold categorical features.
train_pool = Pool(
    data=X_train,
    label=y_train,
    cat_features=categorical_features_indices,
)
validate_pool = Pool(
    data=X_validation,
    label=y_validation,
    cat_features=categorical_features_indices,
)

In [None]:
# Simple model: built from the base params, evaluated on the validation pool while
# training, with CatBoost's interactive training plot enabled.
fit_options = {'eval_set': validate_pool, 'plot': True}
model = CatBoostClassifier(**params)
model.fit(train_pool, **fit_options)

In [None]:
%%time
# Early-stop model: starts from the base params, then raises the iteration budget to
# 10000, enables the 'Iter' overfitting detector with od_wait=300, and overrides
# class_weights with None.
earlystop_params = {
    **params,
    'iterations': 10000,
    'od_type': 'Iter',
    # 'loss_function':'CrossEntropy', #CrossEntropy?
    'od_wait': 300,
    'class_weights': None,
}
earlystop_model = CatBoostClassifier(**earlystop_params)
earlystop_model.fit(train_pool, eval_set=validate_pool, plot=True);

# 看simple model之feature importance

In [None]:
# List the simple model's feature importances (computed on the training pool),
# highest score first, one "name: score" line per feature.
feature_names = X_train.columns
feature_importances = model.get_feature_importance(train_pool)
ranked_features = sorted(zip(feature_importances, feature_names), reverse=True)
for score, name in ranked_features:
    line = '{}: {}'.format(name, score)
    print(line)

# 看early stop model之feature importance

In [None]:
# List the early-stop model's feature importances (computed on the training pool),
# highest score first, one "name: score" line per feature.
feature_names = X_train.columns
feature_importances = earlystop_model.get_feature_importance(train_pool)
ranked_features = sorted(zip(feature_importances, feature_names), reverse=True)
for score, name in ranked_features:
    line = '{}: {}'.format(name, score)
    print(line)

# Predict proba from simple model

In [None]:
# Test features: every test column except the two label columns y1 and y2.
# Test target: y1 only, kept as a single-column DataFrame.
label_columns = ['y1', 'y2']
X_test = test.drop(columns=label_columns)
y_test = test.loc[:, ['y1']]

In [None]:
# Score the test set with the simple model: class probabilities first,
# then the model's own class predictions.
predictions_probs = model.predict_proba(X_test)
predictions = model.predict(X_test)

In [None]:
# Decision threshold: a row is labelled 1 when the probability in the second column
# of predictions_probs is at least this value, otherwise 0.
threshold = 0.85
# predictions_probs[:, 1] is a view into predictions_probs, so thresholding it in place
# would overwrite the probabilities with 0/1 and make a re-run with a different
# threshold silently return the same labels. Build a new integer label array instead;
# predictions_probs stays untouched and this cell can be re-run freely.
classes = (predictions_probs[:, 1] >= threshold).astype(int)

In [None]:
# Show which labels the thresholded predictions contain and how often each occurs.
uniqueValues, occurCount = np.unique(classes, return_counts=True)

for caption, values in (
    ("Unique Values : ", uniqueValues),
    ("Occurrence Count : ", occurCount),
):
    print(caption, values)

In [None]:
# F1 on the test set for the simple model's own class predictions.
f1_score(y_true=y_test, y_pred=predictions)

In [None]:
# F1 on the test set for the labels obtained with the custom probability threshold.
f1_score(y_true=y_test, y_pred=classes)

In [None]:
# Syntax for writing the thresholded predictions out to a CSV file (kept for reference)
# np.savetxt("pred_proba(threshold=0.52,model(scr == int),scale pos weight = 10,800 iters,f1 = 0.1258).csv", classes, delimiter=",")

 # Early stop model predict

In [None]:
# Score the test set with the early-stop model: class probabilities first,
# then the model's own class predictions. Both names are reused from the
# simple-model section and are overwritten here.
predictions_probs = earlystop_model.predict_proba(X_test)
predictions = earlystop_model.predict(X_test)

In [None]:
# Decision threshold: a row is labelled 1 when the probability in the second column
# of predictions_probs is at least this value, otherwise 0.
threshold = 0.6
# predictions_probs[:, 1] is a view into predictions_probs, so thresholding it in place
# would overwrite the probabilities with 0/1 and make a re-run with a different
# threshold silently return the same labels. Build a new integer label array instead;
# predictions_probs stays untouched and this cell can be re-run freely.
classes = (predictions_probs[:, 1] >= threshold).astype(int)

In [None]:
# Show which labels the thresholded predictions contain and how often each occurs.
uniqueValues, occurCount = np.unique(classes, return_counts=True)

for caption, values in (
    ("Unique Values : ", uniqueValues),
    ("Occurrence Count : ", occurCount),
):
    print(caption, values)

In [None]:
# F1 on the test set for the early-stop model's own class predictions.
f1_score(y_true=y_test, y_pred=predictions)

In [None]:
# F1 on the test set for the labels obtained with the custom probability threshold.
f1_score(y_true=y_test, y_pred=classes)

In [None]:
# Syntax for writing the thresholded predictions out to a CSV file (kept for reference)
# np.savetxt("pred_proba(threshold=0.52,model(scr == int),scale pos weight = 10,800 iters,f1 = 0.1258).csv", classes, delimiter=",")