In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import os
import random

import warnings
warnings.filterwarnings("ignore")

In [4]:
np.random.seed(2022)
random.seed(2022)
os.environ['PYTHONHASHSEED'] = '0'

In [5]:
data = pd.read_csv('./bitstampUSD.csv')

In [6]:
data.head()
# Weighted_Price값이 예측할 값

Unnamed: 0,Timestamp,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price
0,1325317920,4.39,4.39,4.39,4.39,0.455581,2.0,4.39
1,1325317980,,,,,,,
2,1325318040,,,,,,,
3,1325318100,,,,,,,
4,1325318160,,,,,,,


In [7]:
data.tail()

Unnamed: 0,Timestamp,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price
4857372,1617148560,58714.31,58714.31,58686.0,58686.0,1.384487,81259.372187,58692.753339
4857373,1617148620,58683.97,58693.43,58683.97,58685.81,7.294848,428158.14664,58693.226508
4857374,1617148680,58693.43,58723.84,58693.43,58723.84,1.705682,100117.07037,58696.198496
4857375,1617148740,58742.18,58770.38,58742.18,58760.59,0.720415,42332.958633,58761.866202
4857376,1617148800,58767.75,58778.18,58755.97,58778.18,2.712831,159417.751,58764.349363


In [8]:
data.shape

(4857377, 8)

In [9]:
data.describe()

Unnamed: 0,Timestamp,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price
count,4857377.0,3613769.0,3613769.0,3613769.0,3613769.0,3613769.0,3613769.0,3613769.0
mean,1471301000.0,6009.024,6013.357,6004.488,6009.014,9.323249,41762.84,6008.935
std,84280190.0,8996.247,9003.521,8988.778,8996.36,30.54989,151824.8,8995.992
min,1325318000.0,3.8,3.8,1.5,1.5,0.0,0.0,3.8
25%,1398179000.0,443.86,444.0,443.52,443.86,0.4097759,452.1422,443.8306
50%,1471428000.0,3596.97,3598.19,3595.62,3597.0,1.979811,3810.124,3596.804
75%,1544288000.0,8627.27,8632.98,8621.09,8627.16,7.278216,25698.21,8627.637
max,1617149000.0,61763.56,61781.83,61673.55,61781.8,5853.852,13900670.0,61716.21


In [10]:
# Data Preprocess
data.isnull().sum()

Timestamp                  0
Open                 1243608
High                 1243608
Low                  1243608
Close                1243608
Volume_(BTC)         1243608
Volume_(Currency)    1243608
Weighted_Price       1243608
dtype: int64

In [11]:
# 데이터를 초단위로 나눔
data.index = pd.to_datetime(data['Timestamp'], unit='s')
data.head()

Unnamed: 0_level_0,Timestamp,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2011-12-31 07:52:00,1325317920,4.39,4.39,4.39,4.39,0.455581,2.0,4.39
2011-12-31 07:53:00,1325317980,,,,,,,
2011-12-31 07:54:00,1325318040,,,,,,,
2011-12-31 07:55:00,1325318100,,,,,,,
2011-12-31 07:56:00,1325318160,,,,,,,


In [13]:
# Timestamp 제거
data = data.drop(['Timestamp'], axis=1)
data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2011-12-31 07:52:00,4.39,4.39,4.39,4.39,0.455581,2.0,4.39
2011-12-31 07:53:00,,,,,,,
2011-12-31 07:54:00,,,,,,,
2011-12-31 07:55:00,,,,,,,
2011-12-31 07:56:00,,,,,,,


In [14]:
# 데이터를 일단위로 나눔
data = data.resample('D').mean()
data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2011-12-31,4.465,4.4825,4.465,4.4825,23.82947,106.330084,4.471603
2012-01-01,4.806667,4.806667,4.806667,4.806667,7.200667,35.25972,4.806667
2012-01-02,5.0,5.0,5.0,5.0,19.048,95.24,5.0
2012-01-03,5.2525,5.2525,5.2525,5.2525,11.00466,58.100651,5.2525
2012-01-04,5.2,5.223333,5.2,5.223333,11.914807,63.119577,5.208159


In [16]:
# 결측치 제거
data = data.dropna()
data.isnull().sum()

Open                 0
High                 0
Low                  0
Close                0
Volume_(BTC)         0
Volume_(Currency)    0
Weighted_Price       0
dtype: int64

In [18]:
# Scaling
from sklearn.preprocessing import RobustScaler

rob_scaler = RobustScaler()

data['open'] = rob_scaler.fit_transform(data['Open'].values.reshape(-1,1))
data['high'] = rob_scaler.fit_transform(data['High'].values.reshape(-1,1))
data['low'] = rob_scaler.fit_transform(data['Low'].values.reshape(-1,1))
data['close'] = rob_scaler.fit_transform(data['Close'].values.reshape(-1,1))
data['volume_(BTC)'] = rob_scaler.fit_transform(data['Volume_(BTC)'].values.reshape(-1,1))
data['volume_(Currency)'] = rob_scaler.fit_transform(data['Volume_(Currency)'].values.reshape(-1,1))
data['weighted_Price'] = rob_scaler.fit_transform(data['Weighted_Price'].values.reshape(-1,1))

In [19]:
data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price,open,high,low,close,volume_(BTC),volume_(Currency),weighted_Price
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2011-12-31,4.465,4.4825,4.465,4.4825,23.82947,106.330084,4.471603,-0.099101,-0.09906,-0.099124,-0.099099,1.919561,-0.196897,-0.099097
2012-01-01,4.806667,4.806667,4.806667,4.806667,7.200667,35.25972,4.806667,-0.099053,-0.099014,-0.099075,-0.099052,-0.049931,-0.198978,-0.099049
2012-01-02,5.0,5.0,5.0,5.0,19.048,95.24,5.0,-0.099025,-0.098987,-0.099047,-0.099025,1.35325,-0.197222,-0.099022
2012-01-03,5.2525,5.2525,5.2525,5.2525,11.00466,58.100651,5.2525,-0.098989,-0.09895,-0.099011,-0.098989,0.400609,-0.198309,-0.098986
2012-01-04,5.2,5.223333,5.2,5.223333,11.914807,63.119577,5.208159,-0.098996,-0.098955,-0.099019,-0.098993,0.508405,-0.198162,-0.098992


In [21]:
data.drop(['Open', 'High', 'Low', 'Close', 'Volume_(BTC)', 'Volume_(Currency)', 'Weighted_Price'], axis=1, inplace=True)

In [22]:
data.head()

Unnamed: 0_level_0,open,high,low,close,volume_(BTC),volume_(Currency),weighted_Price
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2011-12-31,-0.099101,-0.09906,-0.099124,-0.099099,1.919561,-0.196897,-0.099097
2012-01-01,-0.099053,-0.099014,-0.099075,-0.099052,-0.049931,-0.198978,-0.099049
2012-01-02,-0.099025,-0.098987,-0.099047,-0.099025,1.35325,-0.197222,-0.099022
2012-01-03,-0.098989,-0.09895,-0.099011,-0.098989,0.400609,-0.198309,-0.098986
2012-01-04,-0.098996,-0.098955,-0.099019,-0.098993,0.508405,-0.198162,-0.098992


In [23]:
data.tail()

Unnamed: 0_level_0,open,high,low,close,volume_(BTC),volume_(Currency),weighted_Price
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-03-27,7.787505,7.785233,7.789185,7.788185,-0.68675,2.753435,7.78735
2021-03-28,7.879015,7.876344,7.880941,7.879589,-0.731276,2.160531,7.878749
2021-03-29,8.033533,8.03177,8.034637,8.034019,-0.46065,6.057749,8.033228
2021-03-30,8.2383,8.235355,8.240368,8.238912,-0.622779,3.846771,8.237991
2021-03-31,8.298311,8.293361,8.302144,8.300195,-0.581464,4.467015,8.297642


In [24]:
# Train은 이전 시점, Test는 미래 시점 data
train = data.iloc[:-120]
test = data.iloc[-120:]

In [25]:
train = train.sample(frac=1)
test = test.sample(frac=1)

In [26]:
X_train = train.drop(['weighted_Price'], axis=1)
y_train = train['weighted_Price']

X_test = test.drop(['weighted_Price'], axis=1)
y_test = test['weighted_Price']

In [27]:
# Pandas to Numpy
X_train = X_train.values
y_train = y_train.values
X_test = X_test.values
y_test = y_test.values

In [33]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

In [34]:
from sklearn.metrics import mean_squared_error

In [35]:
random_forest = RandomForestRegressor(n_estimators=100, max_depth=10)
random_forest.fit(X_train, y_train)

RandomForestRegressor(max_depth=10)

In [36]:
y_pred = random_forest.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(rmse)

3.423366278484888


In [37]:
knn = KNeighborsRegressor(n_neighbors=2)
knn.fit(X_train, y_train)

KNeighborsRegressor(n_neighbors=2)

In [38]:
y_pred=knn.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test,y_pred))
print('rmse:',rmse)

rmse: 3.516800805123711


In [None]:
# Random Forest가 더 좋은 성능을 보임