In [730]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

In [731]:
!pip install catboost



In [732]:
# Read the competition train and test CSV files from the mounted Drive folder.
train_csv = '/content/drive/MyDrive/kaggle_3/train.csv'
test_csv = '/content/drive/MyDrive/kaggle_3/test.csv'

data = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

In [733]:
# Preview the first five rows of the raw training frame.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,timestamp,A_x,A_y,A_z,B_x,B_y,B_z,label
0,0,2019-01-12 00:45:54.450,-0.25913,-0.834869,-0.485499,0.196409,,0.384934,8
1,1,2000-01-01 01:37:06.440,0.37049,0.175042,0.122625,-0.338242,0.358245,0.126491,2
2,2,2019-01-12 00:45:33.900,-0.257837,-0.881947,-0.391895,0.196027,0.894537,0.411221,8
3,3,2000-01-01 00:46:22.680,-0.937753,-0.055961,0.362041,-0.929881,0.087673,0.134609,11
4,4,2000-01-01 00:49:56.620,-0.98832,-0.19039,0.157909,-0.954669,-0.02481,-0.38842,6


In [734]:
# Parse the timestamp strings, then expand them into one numeric column per
# calendar/clock component (column order: year, month, day, hour, minute, second).
data['timestamp'] = pd.to_datetime(data['timestamp'])

for field in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    data[field] = getattr(data['timestamp'].dt, field)

In [735]:
# Apply the same timestamp expansion to the test frame so that it ends up with
# the same derived columns as the training frame.
test['timestamp'] = pd.to_datetime(test['timestamp'])

for field in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    test[field] = getattr(test['timestamp'].dt, field)

In [736]:
# The raw timestamp has been expanded into numeric parts, so remove the
# original column from both frames (in place, on the same objects).
for frame in (data, test):
    frame.drop(columns=['timestamp'], inplace=True)

In [737]:
# List the training columns that contain at least one missing value.
has_nan = data.isna().any()
missing_columns = has_nan[has_nan].index.tolist()
print(missing_columns)

['A_x', 'A_y', 'A_z', 'B_x', 'B_y', 'B_z']


In [738]:
# Same check for the test frame; an empty list means nothing needs imputing there.
has_nan = test.isna().any()
missing_columns = has_nan[has_nan].index.tolist()
print(missing_columns)

[]


In [739]:
# Fill gaps in the six A_*/B_* columns by linear interpolation down the rows
# (pandas' default method). A missing value at the very start of a column has no
# earlier neighbour and is left as NaN by this step.
#
# The result is assigned back to the frame instead of calling
# interpolate(inplace=True) on a column selection: that in-place form is a
# chained assignment, which raises a FutureWarning on pandas 2.x and no longer
# updates `data` once Copy-on-Write is enabled.
#
# NOTE(review): interpolation uses the neighbouring rows by position, and the
# head() preview shows rows that are not in timestamp order -- confirm that
# interpolating between adjacent rows is intended.
sensor_columns = ['A_x', 'A_y', 'A_z', 'B_x', 'B_y', 'B_z']
data[sensor_columns] = data[sensor_columns].interpolate()

In [740]:
# The missing value at the very start of the data is not removed by interpolation,
# so back-fill the B_y column only.
# Series.bfill() replaces the deprecated fillna(method='bfill'), and assigning the
# result back avoids an in-place call on a column selection (chained assignment),
# which stops updating `data` once Copy-on-Write is enabled.
data['B_y'] = data['B_y'].bfill()

In [741]:
# Re-run the missing-value check after imputation; an empty list means the
# training frame has no NaN left.
has_nan = data.isna().any()
missing_columns = has_nan[has_nan].index.tolist()
print(missing_columns)

[]


In [743]:
# Preview the first five rows after feature extraction and imputation.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,A_x,A_y,A_z,B_x,B_y,B_z,label,year,month,day,hour,minute,second
0,0,-0.25913,-0.834869,-0.485499,0.196409,0.358245,0.384934,8,2019,1,12,0,45,54
1,1,0.37049,0.175042,0.122625,-0.338242,0.358245,0.126491,2,2000,1,1,1,37,6
2,2,-0.257837,-0.881947,-0.391895,0.196027,0.894537,0.411221,8,2019,1,12,0,45,33
3,3,-0.937753,-0.055961,0.362041,-0.929881,0.087673,0.134609,11,2000,1,1,0,46,22
4,4,-0.98832,-0.19039,0.157909,-0.954669,-0.02481,-0.38842,6,2000,1,1,0,49,56


In [744]:
# Separate the training frame into the target vector and the feature matrix.
# NOTE(review): x keeps the 'Unnamed: 0.1' and 'Unnamed: 0' columns visible in
# the head() preview -- presumably leftover row indices; confirm they are meant
# to be model features.
target = 'label'
y = data[target]
x = data.drop(columns=[target])

In [745]:
#x_test = test

In [746]:
# Hold out 20% of the labelled rows for validation, with a fixed seed so the
# split is repeatable. Note that x_test / y_test here are this hold-out split,
# not the competition test file loaded into `test`.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [747]:
from catboost import CatBoostClassifier, Pool
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [748]:
# Hyper-parameters for the multi-class CatBoost classifier.
catboost_params = dict(
    iterations=10000,            # maximum number of boosting iterations
    learning_rate=0.03,          # learning rate
    depth=7,                     # tree depth
    l2_leaf_reg=3,               # L2 regularization
    loss_function='MultiClass',  # loss function
    eval_metric='MultiClass',    # evaluation metric
    early_stopping_rounds=150,   # early stopping
    random_seed=40,              # seed for reproducibility
    # feature_border_type='MaxLogSum',  (left disabled)
    verbose=100,                 # logging interval (iterations)
    task_type="GPU",             # train on the GPU
)

model = CatBoostClassifier(**catboost_params)

In [749]:
# Wrap the training split and the hold-out split in CatBoost Pool objects.
train_pool = Pool(data=x_train, label=y_train)
test_pool = Pool(data=x_test, label=y_test)

In [750]:
# Train on the training pool while monitoring the hold-out pool.
# use_best_model=True keeps the model from the iteration that scored best on the
# validation set.
model.fit(train_pool, eval_set=test_pool, use_best_model=True)


0:	learn: 2.2396437	test: 2.2395129	best: 2.2395129 (0)	total: 10.3ms	remaining: 1m 43s
100:	learn: 0.5138244	test: 0.5222444	best: 0.5222444 (100)	total: 917ms	remaining: 1m 29s
200:	learn: 0.3061711	test: 0.3154865	best: 0.3154865 (200)	total: 1.73s	remaining: 1m 24s
300:	learn: 0.2218124	test: 0.2314029	best: 0.2314029 (300)	total: 2.51s	remaining: 1m 20s
400:	learn: 0.1754540	test: 0.1855294	best: 0.1855294 (400)	total: 3.32s	remaining: 1m 19s
500:	learn: 0.1454249	test: 0.1558897	best: 0.1558897 (500)	total: 4.12s	remaining: 1m 18s
600:	learn: 0.1259236	test: 0.1368441	best: 0.1368441 (600)	total: 4.94s	remaining: 1m 17s
700:	learn: 0.1111223	test: 0.1226572	best: 0.1226572 (700)	total: 5.74s	remaining: 1m 16s
800:	learn: 0.1000338	test: 0.1121313	best: 0.1121313 (800)	total: 6.56s	remaining: 1m 15s
900:	learn: 0.0911789	test: 0.1041803	best: 0.1041803 (900)	total: 7.36s	remaining: 1m 14s
1000:	learn: 0.0838592	test: 0.0974399	best: 0.0974399 (1000)	total: 8.13s	remaining: 1m 13s


<catboost.core.CatBoostClassifier at 0x7c1c49bff760>

In [751]:
# Predict class labels for the hold-out split.
predictions = model.predict(x_test, prediction_type='Class')

# Evaluate accuracy against the hold-out labels.
# NOTE(review): this same split was passed as eval_set to fit() (early stopping
# and best-model selection), so this score is likely optimistic.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.9826


In [752]:
# Alias (not a copy) of the competition test frame, used as the prediction input.
# Note: X_test (capital X) is distinct from x_test, the hold-out split created by
# train_test_split.
X_test = test

In [753]:
# Predict class labels for the competition test frame. This overwrites the
# hold-out predictions stored under the same name. The result is printed as an
# (n, 1) column of labels.
predictions = model.predict(X_test, prediction_type='Class')
print(predictions)

[[ 6]
 [ 4]
 [10]
 ...
 [ 4]
 [ 1]
 [ 7]]


In [754]:
# Use the test frame's row index as the submission ID.
# NOTE(review): this is the DataFrame index, not an ID column read from the file --
# confirm it matches the ID the competition expects.
test_ids = test.index

In [755]:
# model.predict returned one column of labels, so collapse it to a 1-D array
# before pairing each test row ID with its predicted label.
predictions_flatten = predictions.flatten()
submission = pd.DataFrame(
    {
        'ID': test_ids,
        'label': predictions_flatten,
    }
)


In [756]:
# Write the submission file without the DataFrame index, so it contains only
# the ID and label columns.
submission.to_csv(path_or_buf='submission9.csv', index=False)

In [757]:
# Sanity-check the first five rows of the submission table.
print(submission.head(n=5))

   ID  label
0   0      6
1   1      4
2   2     10
3   3      9
4   4      7
