In [35]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LassoCV,LassoLarsCV,Lasso
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report,mean_squared_error
from imblearn.over_sampling import SMOTE
from matplotlib import pyplot as plt
import warnings
from sklearn.exceptions import ConvergenceWarning
import matplotlib.pyplot as plt

import sys
from scipy.sparse import dia_matrix
import scipy.sparse.linalg
import math
from pca import pca
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from fancyimpute import KNN
from sklearn.impute import KNNImputer
warnings.filterwarnings("ignore",category=ConvergenceWarning)

In [36]:
# small test 
def knnpca(df, k=5, n_components = 3):
    features = list(df.columns[0:])
    # print(len(features))
    x = df.loc[:,features].values
    x = StandardScaler().fit_transform(x)
    imputer = KNN(k, verbose=False)
    x = imputer.fit_transform(x)
    pca = PCA(n_components)
    principalComponents = pca.fit_transform(x)
    principalDf = pd.DataFrame(data = principalComponents
                , columns = ['PC{}'.format(i) for i in range(1, n_components+1)])
    
        # Get the explained variance for each component
    explained_var = pca.explained_variance_ratio_

    # Calculate the cumulative explained variance
    cumulative_var = np.cumsum(explained_var)

    # Plot the cumulative explained variance
    # plt.plot(range(1, n_components+1), cumulative_var, '-o')
    # plt.xlabel('Number of components')
    # plt.ylabel('Cumulative explained variance')
    # plt.title('Cumulative Explained Variance')
    # plt.show()
    
    return principalDf

In [37]:
# test for AAPL 

def X_y(freq,stock):
    #freq = 'Daily'
    #stock = 'AAPL'
    price = pd.read_csv('../encode_price/'+freq+'/'+stock+'.csv')
    price = price.sort_values(by='Date').reset_index(drop=True)
    price = price.loc[price.Date>='2010-01-01']
    predictors = pd.read_csv('../predictors/Merged/'+freq+'/'+stock+'.csv',index_col='Date')
    NLP = pd.read_csv('../predictors/NLP/Daily/NYT_macro_SA.csv').set_index(['Date'])
    predictors = predictors.merge(NLP,left_index=True,right_index=True,how='left')
    predictors = predictors.loc[predictors.index<='2019-12-31',:]
    # predictors.fillna(0,inplace=True)
    predictors = knnpca(predictors, k=5, n_components = 30)

    X = predictors.values[:,]
    y = price.adjusted_close.shift(-1).values[:len(X)]

    return X,y

In [38]:

def mse(y_pred, y_true):
    n = len(y_pred)
    return np.sqrt(np.sum(np.square(y_pred - y_true)))/n

def find_split(X,y):
    scores = []
    for i in range(3,15):
        tscv = TimeSeriesSplit(n_splits=i)
        X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,shuffle=False)
        pipe = make_pipeline(MinMaxScaler(),Lasso())

        sfs = SequentialFeatureSelector(pipe,n_jobs = -1,n_features_to_select='auto',scoring='neg_root_mean_squared_error')
        sfs.fit(X_train,y_train)
        X_train = sfs.transform(X_train)
        pipmodel = make_pipeline(MinMaxScaler(),LassoCV(cv=tscv))
        pipmodel.fit(X_train,y_train)
        X_test =  sfs.transform(X_test)
        y_pred = pipmodel.predict(X_train)
        score = mean_squared_error(y_pred, y_train)
        print(score)
        scores.append(score)
    n_split = scores.index(min(scores))+2
    return n_split

In [39]:
def lasso(X,y,n_split,stock):
    tscv = TimeSeriesSplit(n_splits=n_split)
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,shuffle=False)
    pipe = make_pipeline(MinMaxScaler(),Lasso())
    tscv = TimeSeriesSplit(n_splits=10)
    sfs = SequentialFeatureSelector(pipe,n_jobs = -1,n_features_to_select=15,scoring='neg_root_mean_squared_error')
    sfs.fit(X_train,y_train)
    X_train = sfs.transform(X_train)
    pipmodel = make_pipeline(MinMaxScaler(),LassoCV(cv=tscv))
    pipmodel.fit(X_train,y_train)
    X_test =  sfs.transform(X_test)
    y_pred = pipmodel.predict(X_test)
    print(mse(y_pred, y_test), mse(y_train, pipmodel.predict(X_train)))
    plt.plot(y_pred,label="Predictions")
    plt.plot(y_test,label="Actual Data")
    plt.title(stock)
    plt.legend()

In [41]:
def main():
    stock_list = ['AAPL','MSFT','GOOG','AMZN','NVDA','BRK-B','TSLA','META','JNJ','V']
    freq_list = ['Daily','Weekly','Monthly']
    for stock in stock_list:
        for freq in freq_list:
            X,y = X_y(freq,stock)
            n_split = find_split(X,y)
            lasso(X,y,n_split,stock)
            plt.savefig('../plots/'+freq+'/'+stock+'.png')
            plt.clf()
            plt.close()

main()

10.309476219362908
9.899392169543614
6.370326856008958
10.076004518487911
6.370326856008958
10.076004518487911
9.827938469683549
6.507049760939347
7.054554544103045
6.121406433201723
6.88932253299395
6.61859389352443
0.4691850414751191 0.058722713271969235
8.983515377331482
5.18643693314332
5.203542141305582
9.378767812754916
5.18643693314332
5.9970874636911935
8.983515377331482
5.308629899397
9.167374451817562
5.196940021127629
5.628365010222687
5.18643693314332
0.9357221223154414 0.11666992558888252
2.7244396480279525
2.7244396480279525
3.7603546074736625
2.7244396480279525
2.7244396480279525
3.5242941356452726
2.7244396480279525
2.748530580791357
2.7244396480279525
2.7244396480279525
2.7244396480279525
2.7244396480279525
2.2871456798752634 0.1747547672467321
6.1444537415129625
6.256887325520296
6.175037474430267
6.1444537415129625
6.1444537415129625
6.169750016884463
6.196205130301815
6.1444537415129625
6.1444537415129625
6.1444537415129625
6.1444537415129625
6.1444537415129625
0.70

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


41.233211773011085
41.233211773011085
41.233211773011085
41.233211773011085
41.233211773011085
41.233211773011085
41.233211773011085
41.233211773011085
41.233211773011085
41.233211773011085
41.233211773011085
41.233211773011085
0.45692926111569016 0.14782150928687277


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


77.20058691281886
77.20058691281886
77.20058691281886
77.20058691281886
77.20058691281886
77.20058691281886
77.20058691281886
77.20058691281886
77.20058691281886
77.20058691281886
77.20058691281886
77.20058691281886
2.366488927989071 0.4449159728009118


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


29.945198708571862
28.78078457709191
28.78078457709191
28.78078457709191
28.78078457709191
29.656792778138904
28.78078457709191
28.78078457709191
28.90694717509202
28.78078457709191
28.78078457709191
28.78078457709191
1.8016573382647778 0.5654966801856764


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


1.12129129028683
1.12129129028683
1.12129129028683
1.12129129028683
1.12129129028683
1.12129129028683
1.12129129028683
1.12129129028683
1.12129129028683
1.12129129028683
1.12129129028683
1.12129129028683
0.07114643228601997 0.024993495424415278


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


1.2599004609711224
1.2599004609711224
1.2599004609711224
1.2599004609711224
1.2599004609711224
1.2599004609711224
1.2599004609711224
1.2599004609711224
1.2599004609711224
1.2599004609711224
1.2599004609711224
1.2599004609711224
0.2668020520479658 0.05827484977093573


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


1.6818935192532816
1.6818935192532816
1.6818935192532816
1.6818935192532816
1.6818935192532816
1.6818935192532816
1.6818935192532816
1.6818935192532816
1.6818935192532816
1.6818935192532816
1.6818935192532816
1.6818935192532816
1.0094266813718624 0.14066620979983208


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


16.836443689777155
16.93976740561832
17.459368588516433
16.998428339341856
16.836443689777155
17.034501842759134
17.119213377691572
16.895395159790553
16.895395159790553
17.034501842759134
16.998428339341856
16.836443689777155
1.160928917063857 0.10843158167809151


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


20.412540699096446
20.649792600516577
21.420047379910493
20.606583218294233
20.412540699096446
20.569002204735728
20.412540699096446
20.507888290385072
20.606583218294233
20.412540699096446
20.606583218294233
20.412540699096446
3.008461253055982 0.262773912182803


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


34.63689550998857
34.78292957301602
34.63689550998857
34.63689550998857
35.86939398200525
34.66050286825902
36.4399008995745
36.4399008995745
34.63689550998857
34.63689550998857
34.85012409471178
34.81418627035932
4.422084287739247 0.7320388620708452
5.692477984485646
5.692477984485646
5.692477984485646
5.692477984485646
5.692477984485646
5.692477984485646
5.692477984485646
5.692477984485646
5.692477984485646
5.692477984485646
5.692477984485646
5.692477984485646
0.3479962674123735 0.05492432518304383
6.668661437528693
6.668661437528693
6.668661437528693
6.668661437528693
6.668661437528693
6.668661437528693
6.668661437528693
6.668661437528693
6.668661437528693
6.668661437528693
6.668661437528693
6.668661437528693
0.7050693236044423 0.1307636489667492
8.027528750638302
8.027528750638302
8.06075745474195
8.027528750638302
8.027528750638302
8.027528750638302
8.027528750638302
8.027528750638302
8.027528750638302
8.027528750638302
8.027528750638302
8.03132611791893
2.068250788308497 0.298654

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


16.62522585064126
16.62522585064126
72.71085864135362
16.62522585064126
16.62522585064126
16.62522585064126
23.03275275774067
16.62522585064126
16.62522585064126
16.62522585064126
16.62522585064126
16.62522585064126
1.2551776999709878 0.09386373398058818


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


20.009682779536863
46.19641347975612
48.189903306082385
20.009682779536863
42.95457477865171
48.189903306082385
42.95457477865171
20.009682779536863
46.19641347975612
48.189903306082385
20.009682779536863
20.009682779536863
3.389519429146846 0.22651021814013028


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


15.96055394740024
15.936442798481474
28.195658127189937
15.936442798481474
15.936442798481474
27.166026971087952
15.936442798481474
15.936442798481474
28.195658127189937
15.936442798481474
15.936442798481474
15.936442798481474
8.138016177754706 0.4207987483938066
