<a href="https://colab.research.google.com/github/2024-MJU-Capstone-Design/coinmerge-ml/blob/feature%2FlearnAI/Upbit_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyupbit



In [2]:
!pip install lightgbm==3.3.0



# 라이브러리 로드

In [3]:
import warnings
warnings.filterwarnings(action='ignore')
import datetime
from tqdm import tqdm
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error
from mlxtend.preprocessing import minmax_scaling
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
plt.rc('font', family='NanumBarunGothic')
import lightgbm as lgb
from xgboost import XGBRegressor
import pyupbit
import logging
from google.colab import drive
import os

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 2. 데이터 전처리 & 변수 생성

### RSI 구하는 방법
주어진 기간의 모든 가격에 대해서

- 가격이 전일 가격보다 상승한 날의 상승분은 U(up)이라고 하고,
- 가격이 전일 가격보다 하락한 날의 하락분은 D(down) 값이라 한다.
- U 값과 D 값의 평균값을 구하여 그것을 각각 AU(average ups)와 AD(average downs)라고 한다.
- AU를 AD값으로 나눈 것을 RS(relative strength) 값이라고 한다. RS 값이 크다는 것은 일정 기간 하락한 폭보다 상승한 폭이 크다는 것을 의미한다.

다음 계산에 의하여 RSI 값을 구한다.

- RSI 계산공식: RSI = RS / (1 + RS)

$$
\textit{RSI} = \textit{RS} \diagup \textit{(1+RS)}
$$

In [5]:
path = "/content/drive/My Drive/Colab Notebooks/Upbit_test_Folder/"

In [6]:
## 코인 거래 데이터 다운로드를 위한 함수 설정

def get_df(coin):
  df = pyupbit.get_ohlcv(coin, count=365) # 1년치 데이터 활용
  df_4h = pyupbit.get_ohlcv(coin, interval = 'minute240', count=365*6) # 240분 단위 데이터 활용

  ## 변수 생성
  ## RSI 지표 생성
  df_4h['U'] = df_4h['close'] - df_4h['open']
  df_4h['D'] = df_4h['close'] - df_4h['open']
  df_4h['U'] = np.where(df_4h['U'] > 0, df_4h['U'], 0)
  df_4h['D'] = np.where(df_4h['D'] < 0, df_4h['D'], 0)

  df_4h.index = df_4h.index.strftime("%Y%m%d")
  df.index = df.index.strftime("%Y%m%d")

  df_A = df_4h.reset_index().groupby('index').mean()[['U','D']].reset_index()
  df_A.columns = ['index','AU','AD']

  df = pd.merge(df.reset_index(), df_A, on='index', how='left')
  df['price'] = df['close']*100 / df['open']

  df['RSI'] = df['AU'] / (df['AU'] + df['AD'])
  df.drop(['open','high','low','volume','AU','AD'], axis=1, inplace=True)

  df['RSI'] = df['RSI'].apply(lambda x: df[df['RSI'] != np.inf].max()['RSI'] if x == np.inf else x)
  df['RSI'].describe()

  ## 각 변수의 과거 시점 값을 변수로 생성
  df = pd.concat([df[['index','close','price']],minmax_scaling(df, columns=['value','RSI'])], axis=1)
  lag = np.arange(1,30).tolist()

  close_cols = ['close_' + str(a) for a in lag]
  value_cols = ['value_' + str(a) for a in lag]
  RSI_cols = ['RSI_' + str(a) for a in lag]

  data = pd.DataFrame()
  data['date'] = pd.to_datetime(df['index'])
  data['name'] = coin.split('-')[1]

  data['close'] = df['close']
  data['value'] = df['value']
  data['RSI'] = df['RSI']

  for a,b,c,d in zip(lag, close_cols, value_cols, RSI_cols):
    data[b] = df['close'].shift(int(a))
    data[c] = df['value'].shift(int(a))
    data[d] = df['RSI'].shift(int(a))

  data['date'] = pd.to_datetime(data['date'])  # date 컬럼을 datetime 타입으로 변환

  time_now = datetime.datetime.now() + datetime.timedelta(days=-1)
  time_now = time_now.strftime("%Y%m%d")

  # 2일 후 종가를 예측 변수로 설정 (1일 후 종가를 예측하기 위해)
  data['target'] = df['close'].shift(-2)
  data['target'] = data['target'] * 100 / df['close'].shift(-1)  # 1일 후 종가 대비 퍼센트로 변환

  data['target'] = np.where(data['date'] <= time_now, data['target'], 100)

  data = data.dropna(axis=0)
  data.reset_index(drop=True, inplace=True)

  return data

In [7]:
# 데이터 로드 or 다운로드
coins = pyupbit.get_tickers(fiat="KRW")
data_path = path + 'coin_data.csv'

try:
  df = pd.read_csv(data_path, parse_dates=['date'])  # date 컬럼을 datetime으로 파싱
  print('데이터를 불러왔습니다')
except:
  print('데이터를 불러오지 못했습니다. 데이터를 생성합니다')
  df = pd.DataFrame()
  for coin in tqdm(coins):
    tmp = get_df(coin)
    df = pd.concat([df,tmp])
  df.reset_index(drop = True, inplace=True)
  df.to_csv(data_path,index=False)

데이터를 불러왔습니다


In [8]:
df.head()

Unnamed: 0,date,name,close,value,RSI,close_1,value_1,RSI_1,close_2,value_2,...,close_27,value_27,RSI_27,close_28,value_28,RSI_28,close_29,value_29,RSI_29,target
0,2023-05-16,BTC,36421000.0,0.025693,0.543783,36632000.0,0.036414,0.550844,36415000.0,0.017552,...,38300000.0,0.10487,0.544465,39630000.0,0.08378,0.549077,38700000.0,0.091683,0.544857,100.766042
1,2023-05-17,BTC,36700000.0,0.045357,0.543413,36421000.0,0.025693,0.543783,36632000.0,0.036414,...,37672000.0,0.090946,0.544652,38300000.0,0.10487,0.544465,39630000.0,0.08378,0.549077,98.746594
2,2023-05-18,BTC,36240000.0,0.048976,0.55081,36700000.0,0.045357,0.543413,36421000.0,0.025693,...,36337000.0,0.095838,0.543752,37672000.0,0.090946,0.544652,38300000.0,0.10487,0.544465,99.768212
3,2023-05-19,BTC,36156000.0,0.034127,0.543432,36240000.0,0.048976,0.55081,36700000.0,0.045357,...,37182000.0,0.054829,0.542248,36337000.0,0.095838,0.543752,37672000.0,0.090946,0.544652,100.633367
4,2023-05-20,BTC,36385000.0,0.009158,0.553846,36156000.0,0.034127,0.543432,36240000.0,0.048976,...,36802000.0,0.023325,0.535961,37182000.0,0.054829,0.542248,36337000.0,0.095838,0.543752,99.082039


In [9]:
## 학습, 예측 셋 구분 (5일 전 데이터까지를 테스트 셋으로)
time_test = datetime.datetime.now() + datetime.timedelta(days=-5)
time_test = pd.to_datetime(time_test.strftime("%Y%m%d"))
time_now = datetime.datetime.now() + datetime.timedelta(days=-1)
time_now = pd.to_datetime(time_now.strftime("%Y%m%d"))

Train = df[df['date'] < time_test]
Test = df[(df['date'] >= time_test) & (df['date'] <= time_now)]  # 테스트 셋은 5일 전부터 어제까지의 데이터

Train.reset_index(drop=True, inplace=True)
Test.reset_index(drop=True, inplace=True)


In [10]:
## 활용 변수 셋팅
cols = df.columns.tolist()
cols.remove('target')
cols.remove('name')
cols.remove('date')

## LGBM의 K-Fold 적용
train_set, valid_set = train_test_split(Train,train_size=0.85,random_state=42)
X_train = train_set[cols]
y_train = train_set['target']
X_valid = valid_set[cols]
y_valid = valid_set['target']

## 5 Fold Cross Validation
y_cat = pd.cut(y_train, 5, labels=range(5))
skf = KFold(5)

## 파라미터 셋팅
params = {
            'learning_rate' : 0.05,
            'boosting_type': 'gbdt',
            'objective': 'tweedie',
            'tweedie_variance_power': 1.1,
            'metric': 'mae',
            'sub_row' : 0.75,
            'lambda_l2' : 0.1
        }

preds = []
preds_test = []

i = 1

In [11]:
## 실제 학습 진행
for tr_id, val_id in skf.split(X_train, y_cat) :
    X_tr = X_train[cols].iloc[tr_id]
    y_tr = y_train.iloc[tr_id]

    train_x, valid_x, train_y, valid_y = train_test_split(X_tr, y_tr, train_size=0.8,random_state=42)
    train_ds = lgb.Dataset(train_x, label=train_y)
    val_ds = lgb.Dataset(valid_x, label=valid_y)

    print('{}번째 학습'.format(i))
    model = lgb.train(params,
                  train_ds,
                  2000,
                  val_ds,
                  verbose_eval = 100,
                  early_stopping_rounds = 100
                 )
    pred = model.predict(X_valid[cols])
    preds.append(pred)
    i += 1

    if not Test.empty:  # Test 데이터프레임이 비어있지 않은 경우에만 예측
        pred_test = model.predict(Test[cols])
        preds_test.append(pred_test)
    else:
        print("Test 데이터프레임이 비어있어 예측을 건너뜁니다.")

1번째 학습
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22950
[LightGBM] [Info] Number of data points in the train set: 20105, number of used features: 90
[LightGBM] [Info] Start training from score 4.608267
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 2.91823
Early stopping, best iteration is:
[10]	valid_0's l1: 2.9046
2번째 학습
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22950
[LightGBM] [Info] Number of data points in the train set: 20105, number of used features: 90
[LightGBM] [Info] Start training from score 4.608168
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 2.83614
Early stopping, best iteration is:
[56]	valid_0's l1: 2.83008
3번째 학습
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22950
[LightGBM] [Info] Number of data points in the train set: 20105, number of used features: 90
[LightGBM] [

In [14]:
## 예측값 생성 및 점수 확인
model_pred = np.mean(preds, axis = 0)
print(mean_absolute_error(y_valid, model_pred))

# 최종 예측된 데이터셋 생성
final_pred = np.mean(preds_test, axis=0)

final = pd.DataFrame()
final['coin'] = Test['name']
final['date'] = Test['date'] + pd.Timedelta(days=1)  # 1일 후의 날짜로 조정
final['preds'] = final_pred

# 결과 저장
time_now = datetime.datetime.now()
time_now = time_now.strftime("%Y%m%d")
result_path = path+ time_now + 'result.csv'
final.to_csv(result_path, index=False)
print(f'예측 결과가 {result_path}에 저장되었습니다')

# 최종 예측된 데이터셋 생성
final_pred = np.mean(preds_test, axis=0)

final = pd.DataFrame()
final['coin'] = Test['name']
final['date'] = Test['date'] + pd.Timedelta(days=2)  # 내일의 날짜로 조정
final['preds'] = final_pred

# 내일의 종가 예측 결과 저장
time_now = datetime.datetime.now()
time_now = time_now.strftime("%Y%m%d")
result_path = path + time_now + '_predict.csv'  # 파일명을 '_predict.csv'로 변경
final.to_csv(result_path, index=False)
print(f'내일 종가 예측 결과가 {result_path}에 저장되었습니다')

2.8518814711221196
예측 결과가 /content/drive/My Drive/Colab Notebooks/Upbit_test_Folder/20240415result.csv에 저장되었습니다
내일 종가 예측 결과가 /content/drive/My Drive/Colab Notebooks/Upbit_test_Folder/20240415_predict.csv에 저장되었습니다
