In [115]:
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

In [116]:
stock = pd.read_csv('399300.csv')

In [117]:
############ Processing Cell ############

stock['Date'] = pd.to_datetime(stock['Date'], format='%d/%m/%Y')
stock = stock.sort_values(by='Date')

stock['close_5'] = stock['Close'].rolling(window=5).mean()
stock['close_30'] = stock['Close'].rolling(window=30).mean()
#stock['close_365'] = stock['Close'].rolling(window=365).mean()

stock['volume_5'] = stock['Volume'].rolling(window=5).mean()
stock['volume_30'] = stock['Volume'].rolling(window=30).mean()
#stock['volume_365'] = stock['Volume'].rolling(window=365).mean()

stock['std_v5'] = stock['Volume'].rolling(window=5).std()
stock['std_v30'] = stock['Volume'].rolling(window=30).std()
#stock['std_v365'] = stock['Volume'].rolling(window=365).std()

stock['amp_5'] = stock['Range'].rolling(window=5).mean()
stock['amp_30'] = stock['Range'].rolling(window=30).mean()
#stock['amp_365'] = stock['Range'].rolling(window=365).mean()

stock['weekdays'] = stock['Date'].dt.dayofweek
stock['months'] = stock['Date'].dt.month
weekday_dummies = pd.get_dummies(stock['weekdays'], prefix='weekdays')
month_dummies = pd.get_dummies(stock['months'], prefix='months')
stock = pd.concat([stock, weekday_dummies, month_dummies], axis=1)
day_list = list(weekday_dummies.columns.values) + list(month_dummies.columns.values)

feature_cols = ['close_5', 'close_30',
               'volume_5', 'volume_30',
               'std_v5', 'std_v30',
               'amp_5', 'amp_30',]

# 保留一下错位前的数据，需要用这个的最后一行预测明天的价格
stock1 = stock
stock[feature_cols] = stock[feature_cols].shift(1)

stock = stock.dropna(axis=0)

train = stock[stock['Date'] < datetime(year=2015, month=1, day=1)]
test = stock[stock['Date'] >= datetime(year=2015, month=1, day=1)]
print(stock.shape)
print(day_list)

(3109, 36)
['weekdays_0.0', 'weekdays_1.0', 'weekdays_2.0', 'weekdays_3.0', 'weekdays_4.0', 'months_1.0', 'months_2.0', 'months_3.0', 'months_4.0', 'months_5.0', 'months_6.0', 'months_7.0', 'months_8.0', 'months_9.0', 'months_10.0', 'months_11.0', 'months_12.0']


In [118]:
lr = LinearRegression()

feature = ['close_5', 'close_30',
               'volume_5', 'volume_30',
               'std_v5', 'std_v30',
               'amp_5', 'amp_30',] + day_list
target = 'Close'

lr.fit(train[feature], train[target])
prediction = lr.predict(test[feature])
mae = mean_absolute_error(test[target], prediction)
error_in_perc = (mae / test[target].mean()) * 100

print('Mean Absolute Error for using closing price and volumes:', mae)
print('Error in percentage is: %f%s' % (error_in_perc,'%'))

Mean Absolute Error for using closing price and volumes: 48.9987566441
Error in percentage is: 1.361547%


In [124]:
lr1 = LinearRegression()

feature = ['close_5', 'close_30',
               'volume_5', 'volume_30',
               'std_v5', 'std_v30',
               'amp_5', 'amp_30',] + day_list
target = 'Close'

# 这里的最后一行就是从今日起往前五日的平均数据
# 按错位后数据喂养的模型使用这个最后一行预测出的是明日的价格
stock1 = stock1.dropna()
last_line = stock1.iloc[-1:]

lr1.fit(stock[feature], stock[target])
prediction = lr1.predict(last_line[feature])

print(prediction)

[ 4016.30665877]
