In [48]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Import  
파이썬에서 다른 모듈이나 패키지에서 함수, 클래스 등을 가져옵니다.

In [49]:
import pandas as pd
import numpy as np
import random
import os

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier

# 시드(seed) 고정  
매번 고정된 결과를 얻기 위해서 사용합니다.  
시드를 고정하지 않는다면 같은 코드라도 매번 다른 결과가 나올 수 있습니다.

In [50]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

# 데이터 읽어오기 및 데이터 확인

In [51]:
# pd.read_csv() 함수를 사용해서 데이터를 읽어오는 코드입니다.
train = pd.read_csv('/content/drive/MyDrive/dataset/crime_pred/train.csv')
test = pd.read_csv('/content/drive/MyDrive/dataset/crime_pred/test.csv')

# 데이터를 확인하기 위해 head() 함수를 사용합니다.
train.head(5)

Unnamed: 0,ID,월,요일,시간,소관경찰서,소관지역,사건발생거리,강수량(mm),강설량(mm),적설량(cm),풍향,안개,짙은안개,번개,진눈깨비,서리,연기/연무,눈날림,범죄발생지,TARGET
0,TRAIN_00000,9,화요일,10,137,8.0,2.611124,0.0,0.0,0.0,245.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,차도,2
1,TRAIN_00001,11,화요일,6,438,13.0,3.209093,0.0,0.0,0.0,200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,차도,0
2,TRAIN_00002,8,일요일,6,1729,47.0,1.619597,0.0,0.0,0.0,40.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,인도,1
3,TRAIN_00003,5,월요일,6,2337,53.0,1.921615,11.375,0.0,0.0,225.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,주거지,1
4,TRAIN_00004,9,일요일,11,1439,41.0,1.789721,0.0,0.0,0.0,255.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,주유소,2


In [52]:
train.describe()

Unnamed: 0,월,시간,소관경찰서,소관지역,사건발생거리,강수량(mm),강설량(mm),적설량(cm),풍향,안개,짙은안개,번개,진눈깨비,서리,연기/연무,눈날림,TARGET
count,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0
mean,6.430195,6.769507,1060.027581,26.881726,1.912424,24.608776,2.284407,23.430503,186.926107,0.385423,0.017842,0.144042,0.02033,0.01026,0.210755,0.008921,0.835355
std,3.108302,3.56639,698.380485,13.870968,0.958556,62.711211,15.852881,85.199896,98.299485,0.486698,0.132379,0.351134,0.141128,0.100771,0.407847,0.09403,0.819762
min,1.0,1.0,26.0,5.0,0.012269,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,4.0,526.0,13.0,1.209985,0.0,0.0,0.0,95.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,7.0,7.0,937.0,27.0,1.822279,0.625,0.0,0.0,205.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,9.0,10.0,1638.0,38.0,2.476528,18.571429,0.0,0.0,260.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
max,12.0,12.0,2450.0,54.0,4.998936,614.875,295.0,649.8,360.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0


# 독립변수(x_train), 종속변수(y_train)로 분리하기  

우리는 독립변수를 사용하여 종속변수를 예측하여야 합니다.  
우리가 예측해야하는 종속변수는 'TARGET' 컬럼입니다.  
따라서 종속변수를 y_train 변수에 할당하고 독립변수는 x_train 변수에 할당합니다.  

ID는 모델이 종속변수를 예측할 때 도움이 되지 않으므로 해당 컬럼을 제거합니다.  

In [53]:
x_train = train.drop(['ID', 'TARGET'], axis = 1)
y_train = train['TARGET']

x_test = test.drop('ID', axis = 1)

# 라벨인코딩(Label Encoding) 
라벨인코딩은 범주형(categorical) 변수를 수치형(numerical) 변수로 변환합니다.    
대부분의 머신러닝 알고리즘은 수치형 데이터를 입력으로 받기 때문에,   
범주형 데이터를 수치형 데이터로 변환하여 모델에 입력할 필요가 있습니다.   

In [54]:
ordinal_features = ['요일', '소관경찰서', '소관지역', '범죄발생지']

for feature in ordinal_features:
    oe = OrdinalEncoder(handle_unknown='use_encoded_value',
                         unknown_value=-2)
    oe = oe.fit(x_train[feature].to_numpy().reshape(-1, 1))
    x_train[feature] = oe.transform(x_train[feature].to_numpy().reshape(-1, 1))
    x_test[feature] = oe.transform(x_test[feature].to_numpy().reshape(-1, 1))

In [55]:
x_train.describe()

Unnamed: 0,월,요일,시간,소관경찰서,소관지역,사건발생거리,강수량(mm),강설량(mm),적설량(cm),풍향,안개,짙은안개,번개,진눈깨비,서리,연기/연무,눈날림,범죄발생지
count,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0,84406.0
mean,6.430195,3.039902,6.769507,126.886702,21.881726,1.912424,24.608776,2.284407,23.430503,186.926107,0.385423,0.017842,0.144042,0.02033,0.01026,0.210755,0.008921,7.944862
std,3.108302,1.99812,3.56639,77.369365,13.870968,0.958556,62.711211,15.852881,85.199896,98.299485,0.486698,0.132379,0.351134,0.141128,0.100771,0.407847,0.09403,2.261395
min,1.0,0.0,1.0,0.0,0.0,0.012269,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,1.0,4.0,59.0,8.0,1.209985,0.0,0.0,0.0,95.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0
50%,7.0,3.0,7.0,121.0,22.0,1.822279,0.625,0.0,0.0,205.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0
75%,9.0,5.0,10.0,195.0,33.0,2.476528,18.571429,0.0,0.0,260.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0
max,12.0,6.0,12.0,268.0,49.0,4.998936,614.875,295.0,649.8,360.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,13.0


# 모델 정의

In [56]:
# Colab에서 기본 제공하지 않는 라이브러리라 별도 설치가 필요합니다.
!pip install catboost # CatBoost
!pip install optuna # 하이퍼파라미터 최적화 프레임워크

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [57]:
import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from catboost import CatBoostClassifier, Pool

In [71]:
x_train, x_validation, y_train, y_validation = train_test_split(x_train, y_train, train_size=0.7, random_state=42) # train test split

In [85]:
# catboost, optuna 파라미터, 함수 정의
from sklearn.metrics import f1_score

sampler = TPESampler(seed=113)

# define function
def objective(trial):

    # Catboost parameter [출처: https://catboost.ai/en/docs/concepts/python-reference_catboostclassifier ]
    cbrm_param = {
        # "objective": "MultiClass",
        'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 1),
        'iterations':trial.suggest_int("iterations", 10, 100),
        'random_state': 42,
        'use_best_model': True,
    }

    # 모델 학습에 사용할 feature
    FEATURE = ['월', '요일', '시간', '소관경찰서', '소관지역', '사건발생거리', '강수량(mm)', '강설량(mm)', '적설량(cm)', '풍향',
               '안개', '짙은안개', '번개', '진눈깨비', '서리', '연기/연무', '눈날림', '범죄발생지']

    # train, validation X, y를 Pool로 합칩니다
    train_pool = Pool(data=x_train.astype('int'), label=y_train, cat_features=FEATURE)
    valid_pool = Pool(data=x_validation.astype('int'), label=y_validation, cat_features=FEATURE)

    # model 생성, 학습
    model_cbrm = CatBoostClassifier(**cbrm_param)
    model_cbrm = model_cbrm.fit(train_pool, eval_set=valid_pool, 
                           verbose=0)

    # validation data의 f1_score 계산                
    f1 = f1_score(y_validation, model_cbrm.predict(x_validation.astype('int')), average='macro')
    return f1

# optun를 이용해 파라미터 최적화를 진행합니다. (50회)
optuna_cbrm = optuna.create_study(direction='maximize', sampler=sampler)
optuna_cbrm.optimize(objective, n_trials=100, timeout = 1800)

[32m[I 2023-05-25 05:21:52,268][0m A new study created in memory with name: no-name-9aba5307-8e74-427d-93f7-c47d414e9838[0m
  'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 1),
[32m[I 2023-05-25 05:21:54,076][0m Trial 0 finished with value: 0.5110830814617148 and parameters: {'learning_rate': 0.5057908697718689, 'iterations': 16}. Best is trial 0 with value: 0.5110830814617148.[0m
  'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 1),
[32m[I 2023-05-25 05:21:59,889][0m Trial 1 finished with value: 0.5154456305170995 and parameters: {'learning_rate': 0.6164012707848365, 'iterations': 49}. Best is trial 1 with value: 0.5154456305170995.[0m
  'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 1),
[32m[I 2023-05-25 05:22:05,621][0m Trial 2 finished with value: 0.5058802711528313 and parameters: {'learning_rate': 0.018003438872135874, 'iterations': 62}. Best is trial 1 with value: 0.5154456305170995.[0m
  'learning_rate' : trial

In [75]:
# 가장 최적의 결과를 선택합니다.
cbrm_trial = optuna_cbrm.best_trial
cbrm_trial_params = cbrm_trial.params
print('Best Trial: score {},\nparams {}'.format(cbrm_trial.value, cbrm_trial_params))

cbrm_trial_params

Best Trial: score 0.5172321677149251,
params {'learning_rate': 0.4155556518804766, 'iterations': 41}


{'learning_rate': 0.4155556518804766, 'iterations': 41}

In [76]:
# 앞의 파라미터 최적화로 학습된 최적 파라미터를 params로 옮깁니다
params = cbrm_trial_params

In [77]:
cat_cols = ['월', '요일', '시간', '소관경찰서', '소관지역', '사건발생거리', '강수량(mm)', '강설량(mm)', '적설량(cm)', 
               '풍향', '안개', '짙은안개', '번개', '진눈깨비', '서리', '연기/연무', '눈날림', '범죄발생지']

In [78]:
train_pool = Pool(data=x_train.astype('int'), label=y_train, cat_features=cat_cols)
eval_pool = Pool(data=x_validation.astype('int'), label=y_validation, cat_features=cat_cols)
test_pool = Pool(data=x_test.astype('int'), cat_features=cat_cols)

In [79]:
# 최적의 파라미터를 사용하는 학습 모델을 생성합니다
from catboost import CatBoostClassifier

cbrm_model = CatBoostClassifier(**params)
cbrm_model.fit(train_pool, eval_set=eval_pool,use_best_model=True)

0:	learn: 1.0314623	test: 1.0328032	best: 1.0328032 (0)	total: 96.7ms	remaining: 3.87s
1:	learn: 0.9996976	test: 1.0010277	best: 1.0010277 (1)	total: 195ms	remaining: 3.8s
2:	learn: 0.9851405	test: 0.9864399	best: 0.9864399 (2)	total: 285ms	remaining: 3.61s
3:	learn: 0.9795785	test: 0.9810699	best: 0.9810699 (3)	total: 381ms	remaining: 3.53s
4:	learn: 0.9758125	test: 0.9772044	best: 0.9772044 (4)	total: 469ms	remaining: 3.38s
5:	learn: 0.9742162	test: 0.9753337	best: 0.9753337 (5)	total: 552ms	remaining: 3.22s
6:	learn: 0.9715371	test: 0.9727023	best: 0.9727023 (6)	total: 638ms	remaining: 3.1s
7:	learn: 0.9696651	test: 0.9709582	best: 0.9709582 (7)	total: 728ms	remaining: 3s
8:	learn: 0.9685292	test: 0.9703154	best: 0.9703154 (8)	total: 812ms	remaining: 2.88s
9:	learn: 0.9680398	test: 0.9700055	best: 0.9700055 (9)	total: 915ms	remaining: 2.83s
10:	learn: 0.9667193	test: 0.9695776	best: 0.9695776 (10)	total: 998ms	remaining: 2.72s
11:	learn: 0.9661841	test: 0.9692668	best: 0.9692668 (11

<catboost.core.CatBoostClassifier at 0x7f351cbeb370>

In [80]:
val_pred = cbrm_model.predict(eval_pool)

from sklearn.metrics import classification_report
print(classification_report(y_validation,val_pred,digits=5))

              precision    recall  f1-score   support

           0    0.56172   0.72534   0.63313      7704
           1    0.58799   0.42316   0.49214      5251
           2    0.46812   0.39237   0.42691      4771

    accuracy                        0.54620     17726
   macro avg    0.53927   0.51362   0.51739     17726
weighted avg    0.54431   0.54620   0.53586     17726



# 예측

In [81]:
# predict() 함수는 독립변수(테스트데이터)를 입력받았을 때 종속변수를 예측합니다.
pred = cbrm_model.predict(x_test.astype('int'))

# 파일 저장

In [82]:
# 제출 파일을 읽어옵니다.
submit = pd.read_csv('/content/drive/MyDrive/dataset/crime_pred/sample_submission.csv')

In [83]:
# 예측한 값을 TARGET 컬럼에 할당합니다.
submit['TARGET'] = pred
submit.head()

Unnamed: 0,ID,TARGET
0,TEST_00000,2
1,TEST_00001,0
2,TEST_00002,0
3,TEST_00003,0
4,TEST_00004,0


In [84]:
# 예측한 결과를 파일로 저장합니다. index 인자의 값을 False로 설정하지 않으면 제출이 정상적으로 진행되지 않습니다.
submit.to_csv('/content/drive/MyDrive/dataset/crime_pred/submit.csv', index = False)