# Linear Model by Instrument

Start with a single stock and fit a linear model to several days of price activity.

In [None]:
import pandas as pd
import glob
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

import numpy as np

# Raise the display limits so frames with up to 999 rows/columns are shown
# in full rather than truncated.
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)

In [None]:
fdir = '../data/60_second/'
files = glob.glob(fdir + '*')


def strip_ticker(s, fdir):
    """Return the ticker symbol at the start of a data file's name.

    The ticker is everything in the file name that precedes the first
    underscore.

    Args:
        s: Path to a data file, e.g. ``'../data/60_second/PSX_x.csv'``.
        fdir: Directory the files were globbed from. Kept so existing calls
            keep working; the directory part is removed by splitting on path
            separators, so the value itself is not consulted.

    Returns:
        The ticker portion of the file name as a string.
    """
    # str.strip(fdir) would treat fdir as a *set of characters* to remove from
    # both ends, which eats leading ticker characters that also occur in the
    # directory name. Split on the last path separator instead, accepting
    # both '/' and '\\' because glob returns OS-native separators.
    file_name = s.replace('\\', '/').rsplit('/', 1)[-1]
    return file_name.split('_')[0]


# Group the file paths by ticker in a single pass. Sorting first makes the
# ticker order and the per-ticker file order reproducible between runs.
tickers = {}
for path in sorted(files):
    tickers.setdefault(strip_ticker(path, fdir), []).append(path)

In [None]:
# Fit one polynomial ridge model per ticker on the day-scaled closing price
# and record the in-sample mean squared error as (ticker, mse) tuples.
# Tickers that are skipped are recorded as (ticker, None).

# Minimum number of raw rows required before a ticker is modelled.
# NOTE(review): the original comment called this "less than 1 day of data";
# confirm that 1173 rows really corresponds to that.
MIN_ROWS = 1173
degree = 3

results = []
for key, items in tickers.items():
    print('Starting {}'.format(key))

    # Read every file, then concatenate once: calling pd.concat inside the
    # loop re-copies all previously loaded rows on each iteration.
    frames = [pd.read_csv(item) for item in items]
    d = pd.concat(frames) if frames else pd.DataFrame()
    if d.shape[0] < MIN_ROWS:
        print('No records for {}'.format(key))
        results.append((key, None))
        continue

    d = d.rename(columns={'Unnamed: 0': 'DateTime'}).reset_index(drop=True)
    d['DateTime'] = [pd.Timestamp(x) for x in d['DateTime']]
    d = d.drop_duplicates()
    d['Date'] = [ts.date() for ts in d['DateTime']]
    d['Time'] = [str(ts.time()) for ts in d['DateTime']]

    # Min-max scale the close within each day. transform() returns values
    # aligned with d's own index, so the result can be assigned directly.
    closes_by_day = d.groupby('Date')['Close']
    day_low = closes_by_day.transform('min')
    day_high = closes_by_day.transform('max')
    d['scale_close'] = (d['Close'] - day_low) / (day_high - day_low)

    # Number the distinct times of day in chronological order. The strings
    # are zero-padded 'HH:MM:SS', so a plain sort is chronological even when
    # the first file loaded is missing some minutes.
    times = sorted(d['Time'].unique())
    timeSeq = list(range(len(times)))
    timeDf = pd.DataFrame({'Time': times, 'timeSeq': timeSeq})
    d = d.merge(timeDf, on='Time', how='left')

    # A day whose close never moves has zero range and scales to NaN (0/0);
    # the regression cannot be fitted on NaN targets, so drop those rows.
    d = d.dropna(subset=['scale_close'])
    if d.empty:
        print('No records for {}'.format(key))
        results.append((key, None))
        continue

    model = make_pipeline(PolynomialFeatures(degree), Ridge())
    # Estimators take a 2-D (n_samples, n_features) array; predict on the
    # whole column at once instead of one scalar per row.
    time_index = d[['timeSeq']].to_numpy()
    model.fit(time_index, d['scale_close'])
    yhat = model.predict(time_index)
    actuals = d['scale_close']
    mse = mean_squared_error(actuals, yhat)
    results.append((key, mse))
    print('{} complete'.format(key))


In [None]:
# Tabulate the (ticker, mse) pairs and show them ordered by ascending MSE.
mse_by_ticker = pd.DataFrame(results, columns=['ticker', 'mse'])
mse_by_ticker.sort_values(by='mse')

In [None]:
# Select the data files for a single ticker. The `amd_` prefix on the
# variable names is historical: the ticker actually selected is whatever
# `tick` names (currently PSX).
tick = 'PSX_'
# Compare against the start of the file name only. A substring test on the
# whole path would also match directory names or any other ticker whose
# file name merely contains `tick`.
amd_file_paths = [
    x for x in files
    if x.replace('\\', '/').rsplit('/', 1)[-1].startswith(tick)
]

In [None]:
# Load every file listed in `amd_file_paths` into one dataframe.
amd_frames = [pd.read_csv(x) for x in amd_file_paths]
if not amd_frames:
    # Fail with a clear message instead of an AttributeError on the missing
    # DateTime column further down.
    raise FileNotFoundError('no data files matched the selected ticker')

# Concatenate once rather than growing the frame inside the loop, which
# re-copies all earlier rows each time; ignore_index gives a fresh 0..n-1
# index.
amd_df = pd.concat(amd_frames, ignore_index=True)
amd_df = amd_df.rename(columns={'Unnamed: 0': 'DateTime'})
# Parse the timestamps before de-duplicating so rows are compared on the
# parsed values.
amd_df['DateTime'] = [pd.Timestamp(x) for x in amd_df['DateTime']]
amd_df.drop_duplicates(inplace=True)
amd_df.head()

In [None]:
# (rows, columns) of the combined frame.
amd_df.shape

In [None]:
# Split each timestamp into a calendar-date feature and a time-of-day
# feature; the time of day is stored as its string form.
stamps = list(amd_df['DateTime'])
amd_df['Date'] = [stamp.date() for stamp in stamps]
amd_df['Time'] = [str(stamp.time()) for stamp in stamps]

In [None]:
# Min-max scale the closing price within each trading day:
# (close - day low) / (day high - day low).
# transform() returns values aligned with amd_df's own index, so the result
# can be assigned as a column directly; a groupby(...).apply result may be
# keyed by group and then no longer lines up with the frame.
closes_by_day = amd_df.groupby('Date')['Close']
day_low = closes_by_day.transform('min')
day_high = closes_by_day.transform('max')
# A day whose close never moves has zero range and comes out as NaN (0/0).
amd_df['scale_close'] = (amd_df['Close'] - day_low) / (day_high - day_low)

In [None]:
# Number the distinct times of day 0..n-1 so each bar gets an intraday
# position. unique() returns values in order of first appearance, which is
# only chronological if the first day loaded is complete; sorting the
# zero-padded 'HH:MM:SS' strings makes the sequence chronological regardless.
unique_times = sorted(amd_df.Time.unique())
timeSeq = list(range(len(unique_times)))
timeDf = pd.DataFrame({'Time': unique_times,
                       'timeSeq': timeSeq})

In [None]:
# Attach each row's intraday sequence number by joining on the time string.
amd_df = pd.merge(amd_df, timeDf, how='left', on='Time')

In [None]:
#amd_df.to_csv('../../amd_df.csv', index=None)

In [None]:

# Ridge regression on a degree-7 polynomial expansion of the intraday
# sequence number, fitted against the day-scaled close.
degree = 7
model = make_pipeline(PolynomialFeatures(degree=degree), Ridge())
# The estimator takes a 2-D feature matrix: one column, one row per bar.
time_index = amd_df[['timeSeq']].to_numpy()
model.fit(time_index, amd_df['scale_close'])

In [None]:
# Evaluate the fitted curve once at every step of the intraday sequence.
curve_steps = np.asarray(timeSeq).reshape(-1, 1)
yhat = model.predict(curve_steps)

In [None]:
# Scaled closes as small points, with the fitted curve overlaid in red.
plt.scatter(x=amd_df['timeSeq'], y=amd_df['scale_close'], s=1)
# No x values are passed, so yhat is drawn against its own positions.
plt.plot(yhat, color='red')
plt.show()

In [None]:
# Predict for every row in one call. The estimator expects a 2-D
# (n_samples, n_features) array, so the sequence numbers are reshaped into a
# single column rather than passed one scalar at a time. tolist() keeps yhat
# a plain list of floats, as before.
yhat = model.predict(amd_df.timeSeq.values.reshape(-1, 1)).tolist()

In [None]:
# Collect sequence number, actual scaled close and model prediction side by
# side, one row per bar. Note this rebinds the name `results`.
results = (
    amd_df[['timeSeq', 'scale_close']]
    .rename(columns={'scale_close': 'actuals'})
    .assign(predicted=yhat)
)

In [None]:

# In-sample mean squared error of the fit for the selected ticker.
mean_squared_error(y_true=results['actuals'], y_pred=results['predicted'])