In [1]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from tqdm import tqdm
from scipy.stats import chi2_contingency
from scipy.stats.contingency import association

from sklearn.preprocessing import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix
from catboost import CatBoostClassifier, Pool

from imblearn.under_sampling import RandomUnderSampler

import optuna
import random
random.seed(42)
random_seed = 42

import warnings
warnings.filterwarnings('ignore')

plt.rcParams['font.family'] = 'Malgun Gothic'

In [2]:
df = pd.read_csv('./train.csv', encoding='UTF-8')

## Catboost를 통한 분류 모델

In [3]:
# 전처리 결과 '소관경찰서','범죄발생지','사건발생거리' 변수만 TARGET과 유의미한 상관관계를 가짐
df_ = df[['소관경찰서', '범죄발생지','사건발생거리']].copy()
encoder = LabelEncoder()
df_['범죄발생지'] = encoder.fit_transform(df_['범죄발생지'])
target = df['TARGET'].copy()
trainx, testx, trainy, testy = train_test_split(df_, target, test_size=.2, random_state=42)

In [4]:
model = CatBoostClassifier(verbose=False)
model.fit(trainx, trainy)
pred = model.predict(testx)
print(confusion_matrix(testy,pred))
print(f"f1 score: {f1_score(testy, pred, average='macro')}")
print(pd.DataFrame(model.get_feature_importance(), index=df_.columns))

[[5352  899 1040]
 [1963 2310  862]
 [2021  710 1725]]
f1 score: 0.5269552018349899
                0
소관경찰서   34.880432
범죄발생지   49.479933
사건발생거리  15.639635


## 하이퍼파라미터 최적화

In [5]:
def objective(trial):
    model = CatBoostClassifier(
        iterations=trial.suggest_int("iterations", 500, 2000),
        learning_rate=trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
        depth=trial.suggest_int("depth", 4, 10),
        l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1e-8, 100.0, log=True),
        bootstrap_type=trial.suggest_categorical("bootstrap_type", ["Bayesian"]),
        random_strength=trial.suggest_float("random_strength", 1e-8, 10.0, log=True),
        bagging_temperature=trial.suggest_float("bagging_temperature", 0.0, 10.0),
        od_type=trial.suggest_categorical("od_type", ["IncToDec", "Iter"]),
        od_wait=trial.suggest_int("od_wait", 10, 50),
        random_seed=42,
        verbose=False
    )
    model.fit(trainx, trainy)
    pred = model.predict(testx)
    return f1_score(testy, pred, average='macro')

In [6]:
optuna.logging.set_verbosity(optuna.logging.WARNING)

sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(study_name="catboost", direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=100)

In [7]:
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials:  100
Best trial:
  Value:  0.5307711548292552
  Params: 
    iterations: 1447
    learning_rate: 0.03906611024058157
    depth: 7
    l2_leaf_reg: 3.127390407286506e-07
    bootstrap_type: Bayesian
    random_strength: 0.04166945670743934
    bagging_temperature: 0.001916137823568892
    od_type: Iter
    od_wait: 15


In [8]:
model = CatBoostClassifier(**trial.params, verbose=False)
model.fit(trainx, trainy)
pred = model.predict(testx)
print(confusion_matrix(testy,pred))
print(f"f1 score: {f1_score(testy, pred, average='macro')}")
print(pd.DataFrame(model.get_feature_importance(), index=df_.columns))

[[5344  878 1069]
 [1931 2335  869]
 [2008  696 1752]]
f1 score: 0.530459886394124
                0
소관경찰서   35.685160
범죄발생지   43.329848
사건발생거리  20.984992


In [11]:
testdf = pd.read_csv('./test.csv', encoding='UTF-8')[['소관경찰서','범죄발생지','사건발생거리']]
answersheet = pd.read_csv('./sample_submission.csv', encoding='UTF-8')

testdf['범죄발생지'] = encoder.transform(testdf['범죄발생지'])

answersheet['TARGET'] = model.predict(testdf)
answersheet.to_csv('./CatBoost_with_optuna.csv', encoding='UTF-8', index=False)