In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense 
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, ReduceLROnPlateau
import requests
from bs4 import BeautifulSoup

In [None]:
# Samsung Electronics ticker code: 005930
# Scrape daily prices from `date` up to today.
def parse_to_df(code, date):
    """Crawl Naver Finance daily quotes for `code` from `date` up to today.

    Pages of the daily-price table are requested one at a time until a page
    whose last row is dated before `date` is reached. The stop condition
    assumes the site lists rows newest first.

    Args:
        code: Ticker code inserted into the request URL, e.g. '005930'.
        date: Start date as 'YYYYMMDD' (a str, or an int with the same digits).

    Returns:
        DataFrame with the rows dated on or after `date`, in crawl order, and
        columns Date, Close, YtT, Market Cap, High, Low, Volume.

    Raises:
        RuntimeError: If not a single price row could be retrieved.
    """
    from io import StringIO  # local import keeps this cell self-contained

    date = str(date)
    start_int = int(date)
    # The table stores dates as 'YYYY.MM.DD'; zero-padded strings in that
    # format compare correctly as plain strings.
    start_str = f'{date[:4]}.{date[4:6]}.{date[6:]}'
    # Browser-like User-Agent: some sites reject the default python-requests
    # one (TODO confirm whether this endpoint requires it).
    headers = {'User-Agent': 'Mozilla/5.0'}

    frames = []
    page = 1
    while True:
        url = f'https://finance.naver.com/item/sise_day.nhn?code={code}&page={page}'
        print(f'page {page}')
        try:
            req = requests.get(url, headers=headers, timeout=10)
            req.raise_for_status()
            # Parse the response text as HTML.
            bs = BeautifulSoup(req.text, "html.parser")
            # read_html returns a list of DataFrames, hence the [0].
            # Wrapping in StringIO avoids passing literal HTML, which recent
            # pandas versions deprecate.
            html = str(bs.find('table', {'class': "type2"}))
            table = pd.read_html(StringIO(html))[0].dropna()
        except Exception as e:
            if not frames:
                # Nothing collected yet: surface the real cause instead of
                # failing later on an undefined variable.
                raise RuntimeError(
                    f'could not read the price table for code {code!r}') from e
            print(e)
            print('stop: page could not be read; keeping rows collected so far')
            break

        # Stop when a page adds nothing new (empty, or identical to the
        # previous page) so a start date older than the available history
        # cannot loop forever.
        if table.empty or (frames and table.equals(frames[-1])):
            print('stop: no more data')
            break

        frames.append(table)

        # Breaking point: the last row of this page is already older than
        # the requested start date.
        if int(str(table.iloc[-1, 0]).replace('.', '')) < start_int:
            print('break')
            break
        page += 1

    if not frames:
        raise RuntimeError(f'no price rows found for code {code!r}')

    # Concatenate once instead of growing the frame page by page.
    df = pd.concat(frames)
    # Keep only the rows from `date` up to today.
    df = df[df['날짜'] >= start_str].copy()

    # Rename the columns to English.
    # NOTE(review): 'Market Cap' labels the 4th scraped column and is kept
    # because later cells select columns by these names; verify what the site
    # actually reports there (it may be the opening price).
    df.columns = ["Date", "Close", 'YtT', 'Market Cap', 'High', 'Low', 'Volume']
    return df

# Read "<start date> <ticker code>" from a single input line, then crawl.
user_fields = input('시작일) YYYYMMDD   code:').split()
date, code = user_fields
df = parse_to_df(code, date)

In [None]:
# Preview the last rows of the crawled frame.
df.tail()

In [None]:
# Persist the crawled frame as a CSV file named after the ticker code.
csv_path = f'./{code}_crawling.csv'
df.to_csv(csv_path, sep=',', index=False)

In [None]:
# Reload the saved CSV, flip the row order and renumber the index from 0.
# `data` and `df` refer to the same frame afterwards.
df = (
    pd.read_csv(f'./{code}_crawling.csv')
    .iloc[::-1]
    .reset_index(drop=True)
)
data = df

In [None]:
from sklearn.preprocessing import MinMaxScaler

scale_cols = ['Close', 'YtT', 'Market Cap', 'High', 'Low', 'Volume']
# Number of most recent rows held out as the test set.
TEST_SIZE = 200

if len(df) <= TEST_SIZE:
    raise ValueError(
        f'need more than {TEST_SIZE} rows to leave any training data, got {len(df)}')

# Fit the scaler on the training rows only, then apply it to every row.
# Fitting on the full frame would let the min/max of the held-out test rows
# leak into the training features.
scaler = MinMaxScaler()
scaler.fit(df[scale_cols].iloc[:-TEST_SIZE])

df_scaled = pd.DataFrame(scaler.transform(df[scale_cols]), columns=scale_cols)

# Show a preview rather than the whole frame.
print(df_scaled.head())

train = df_scaled[:-TEST_SIZE]
test = df_scaled[-TEST_SIZE:]

In [None]:
def make_dataset(data, label, window_size=20):
    """Slice `data` into sliding windows paired with the label that follows each.

    Window k covers rows k .. k + window_size - 1 of `data`; its target is
    row k + window_size of `label`.

    Args:
        data: pandas object holding the feature rows (indexed with `.iloc`).
        label: pandas object holding the targets (indexed with `.iloc`).
        window_size: Number of consecutive rows per window.

    Returns:
        Tuple `(features, labels)` of numpy arrays, one entry per window.
        Both are empty when `data` has no more than `window_size` rows.
    """
    n_windows = len(data) - window_size
    windows = [
        np.array(data.iloc[start:start + window_size])
        for start in range(n_windows)
    ]
    targets = [
        np.array(label.iloc[start + window_size])
        for start in range(n_windows)
    ]
    return np.array(windows), np.array(targets)

In [None]:
from sklearn.model_selection import train_test_split

feature_cols = ['YtT', 'Market Cap', 'High', 'Low', 'Volume']
label_cols = ['Close']
# Rows per input window; must match the timestep dimension of the model's
# input_shape.
WINDOW_SIZE = 20

# Train dataset: sliding windows over the training rows.
train_feature, train_label = make_dataset(
    train[feature_cols], train[label_cols], WINDOW_SIZE)

# Train / validation split.
# shuffle=False keeps the split chronological and reproducible. Consecutive
# windows overlap almost entirely, so a shuffled split would scatter
# near-duplicate samples across both sets and make val_loss optimistic.
x_train, x_valid, y_train, y_valid = train_test_split(
    train_feature, train_label, test_size=0.2, shuffle=False)

print(x_train.shape, y_train.shape, x_valid.shape, y_valid.shape)

# Test dataset: the same windowing applied to the held-out rows.
test_feature = test[feature_cols]
test_label = test[label_cols]

x_test, y_test = make_dataset(test_feature, test_label, WINDOW_SIZE)

print(x_test.shape, y_test.shape)

In [None]:
# Stacked LSTM regressor: each sample is a window of 20 timesteps x 5 features.
model = Sequential([
    # return_sequences=True emits an output at every timestep so the next
    # LSTM layer receives a full sequence.
    LSTM(128, return_sequences=True, input_shape=(20, 5)),
    Dropout(0.2),
    LSTM(64),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(16, activation='relu'),
    # Single linear unit: one regression output per window.
    Dense(1, activation='linear'),
])

model.compile(loss='mse', optimizer='adam')

model.summary()

In [None]:
import os
from datetime import datetime

# Timestamp used to name this run's log directory and checkpoint file.
start_time = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')

# Make sure the checkpoint directory exists before training starts; writing
# the .h5 file can fail if it is missing (older Keras versions do not create it).
os.makedirs('./models', exist_ok=True)

# Keep the History object so the loss curves can be inspected afterwards.
history = model.fit(x_train, y_train,
    validation_data=(x_valid, y_valid),
    batch_size=32,
    epochs=20,
    callbacks=[
        TensorBoard(log_dir='logs/%s' % (start_time)),
        # Checked every epoch, but weights are saved only when val_loss improves.
        ModelCheckpoint('./models/%s_eth.h5' % (start_time), monitor='val_loss', verbose=1, save_best_only=True, mode='auto'),
        # Note left from an earlier run:
        #   AttributeError: 'Sequential' object has no attribute '_get_distribution_strategy'
        # Multiply the learning rate by 0.2 once val_loss has not improved
        # for 5 epochs.
        ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, verbose=2, mode='auto')
])

In [None]:
# Loss of the trained model on the test windows (the model is compiled with
# loss='mse', and the labels are the min-max scaled Close column).
loss = model.evaluate(x_test,y_test)

In [None]:
# Predict on the test windows and compare against the true labels.
pred = model.predict(x_test)

fig, ax = plt.subplots(facecolor='white', figsize=(20, 10))
ax.plot(y_test, label='True')
ax.plot(pred, label='Prediction')
# Title and axis labels so the figure can be read on its own; both series
# are in the scaled units produced by the MinMaxScaler, not raw prices.
ax.set_title('Close: true vs. predicted (test set)')
ax.set_xlabel('Test window index')
ax.set_ylabel('Close (min-max scaled)')
ax.legend()
plt.show()