In [22]:
import pandas as pd
import numpy as np
import random
import os

from tqdm import tqdm
import warnings

import pandas_ta as ta
from xgboost import XGBRegressor
from xgboost import XGBClassifier
warnings.filterwarnings("ignore")

from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from xgboost import XGBClassifier
from tqdm import tqdm
from joblib import Parallel, delayed

In [25]:
train = pd.read_pickle("./data/train_완료.pkl")
test = pd.read_pickle("./data/prediction_완료.pkl")

In [41]:
# XGBoost 하이퍼파라미터의 탐색 공간 정의
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.05, 0.1, 0.2],
    'min_child_weight': [1, 3],
    'subsample': [0.7, 1],
    'colsample_bytree': [0.7, 1]
}

X = train.drop(['date', 'ticker', 'target', 'pct_change', 'class_target'], axis=1) 
y = train['class_target'] # class_target이 종속 변수 Y (분류 모델)

test_final_day = test[test['date'] == "2023-05-30"] # 상승/보합/하락 분류 모델이므로 마지막 날만 사용하면 됨.
X_pred = test_final_day.drop(['date', 'ticker', 'target'], axis=1)

model = XGBClassifier(tree_method='gpu_hist', n_jobs=-1)

# GridSearchCV를 사용하여 최적의 하이퍼파라미터 탐색
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1)
grid_search.fit(X, y)

# 최적의 파라미터와 최고 정확도 출력
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# 최적의 파라미터로 모델을 다시 학습
best_model = grid_search.best_estimator_
best_model.fit(X, y)

predictions = best_model.predict(X_pred)

In [39]:
result = pd.DataFrame(test_final_day['ticker'].unique(), predictions, columns=["종목코드"])
result = result.reset_index()
result["예상"] = result['index']
result = result.drop("index", axis=1)
result

Unnamed: 0,종목코드,예상
0,A000020,0
1,A000040,0
2,A000050,0
3,A000070,0
4,A000080,0
...,...,...
1952,A375500,0
1953,A378850,2
1954,A383220,0
1955,A383310,2


In [None]:
import matplotlib.pyplot as plt
# 레이블 분포 확인
label_counts = result['예상'].value_counts()

# 시각화
plt.figure(figsize=(8, 6))
plt.bar(label_counts.index, label_counts.values, color='skyblue')
plt.xticks(label_counts.index, ['매도', '관망', '매수'])
plt.xlabel('예상 레이블')
plt.ylabel('종목 수')
plt.title('레이블 분포')
plt.show()

In [17]:
sample = pd.read_csv("./sub/sample_submission.csv")
sample = sample.rename(columns = {'종목코드' : 'ticker', '순위' : 'rating'})

# sample의 'ticker' 컬럼에서 answer의 'ticker' 컬럼에 없는 값을 선택
missing_tickers = sample[~sample['ticker'].isin(answer['ticker'])]
missing_tickers = missing_tickers.reset_index(drop=True)
missing_tickers = missing_tickers.drop('rating', axis=1)
missing_tickers['return'] = 0
missing_tickers.head()

submission = pd.merge(answer, missing_tickers, how='outer')
submission = submission.sort_values('return', ascending=False)
submission = submission.reset_index()
submission = submission.reset_index()
submission = submission.drop('index', axis=1)
submission = submission.sort_values('ticker')
submission['순위2'] = submission['level_0']
submission = submission.drop(['level_0', 'return'], axis=1)
submission = submission.reset_index(drop=True)
submission['순위'] = submission['순위2'] + 1
submission = submission.rename(columns = {'ticker' : '종목코드'})
submission = submission.drop('순위2', axis=1)

submission.to_csv("XGBoost5.csv")

In [19]:
submission = pd.read_csv("./sub/XGBoost5.csv")
len(submission['순위'].unique())

1957