# SVM

In [115]:
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit

In [116]:
# Candidate predictor column names, grouped by the kind of signal they carry.
# NOTE(review): 'STOCK_d' sits next to 'STOCH_k' and may be a typo for 'STOCH_d' —
# confirm against the predictor CSV headers before renaming.
tech_list = ['SMA','EMA','STOCH_k','STOCK_d','RSI','MFI','SAR','AD','MACD','MACD_Signal','MACD_Histo','VWAP','SPY','NDAQ','PC1','PC2']
funda_list = ['pcf','PEG_trailing','dpr','npm','gpm','roa','roe','capital_ratio','de_ratio','cash_ratio','curr_ratio','inv_turn','pay_turn','sale_nwc','rd_sale','accrual']
# One element per column name. The names used to be fused into a single
# comma-separated string, which made the list useless for column selection.
macro_list = ['gdpr1','gdpr2','cpi','bond20yr','bond30yr','fedfunds','cpir','wpir','unemp','employ']

In [133]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SequentialFeatureSelector


freq = 'Daily'
stock = 'AAPL'
# Last date (inclusive) kept in both the price and the predictor tables.
# Dates are compared as strings, which assumes ISO 'YYYY-MM-DD' text — TODO confirm.
cutoff_date = '2019-12-31'

price = pd.read_csv('../encode_price/'+freq+'/'+stock+'.csv')
# Keep rows up to the cutoff, then reverse the row order
# (presumably the file is stored newest-first — verify).
price = price.loc[price.Date <= cutoff_date, :][::-1]
# Target: the next row's adjusted close. The final row has no successor, so it is dropped.
y = price.adjusted_close.shift(-1).values[:-1]

predictors = pd.read_csv('../predictors/Merged/'+freq+'/'+stock+'.csv')
NLP = pd.read_csv('../predictors/NLP/Daily/NYT_macro_SA.csv')
# Build the feature table as one chain instead of mutating a .loc slice in place.
predictors = (
    pd.merge(predictors, NLP, how='left', on=['Date'])
    .loc[lambda frame: frame.Date <= cutoff_date, :]
    .set_index('Date')
    .fillna(0)  # missing predictor values (e.g. dates without an NLP row) become 0
)
# Drop the last row so X has one row per target value in y.
X = predictors.values[:-1]

# X and y are matched purely by position, so unequal lengths mean the two
# files do not cover the same rows and every later score would be meaningless.
# NOTE(review): equal lengths do not prove the dates line up — confirm both
# files share the same dates in the same order.
if len(X) != len(y):
    raise ValueError(
        'Feature/target length mismatch: X has %d rows but y has %d' % (len(X), len(y))
    )

## Sigmoid Kernel: feature-selection sweep with time-series CV

In [134]:
# Coarse sweep: for each number of selected features, run forward feature
# selection with a sigmoid-kernel SVR on every time-series fold and record
# the mean out-of-fold RMSE.
cv = 5
tscv = TimeSeriesSplit(n_splits=cv)
total_score = []
for i in range(5,50,4):
    scores = []
    for train_index, test_index in tscv.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        pipe = make_pipeline(MinMaxScaler(),SVR(kernel='sigmoid'))
        # NOTE(review): the selector is left on its default internal
        # cross-validation, which is not a time-ordered split; consider
        # passing cv=TimeSeriesSplit(...) here as well.
        sfs = SequentialFeatureSelector(pipe,n_jobs = -1,n_features_to_select=i,scoring='neg_root_mean_squared_error')
        sfs.fit(X_train,y_train)
        # Reduce both partitions to the columns chosen on the training fold only.
        X_train = sfs.transform(X_train)
        X_test = sfs.transform(X_test)
        pipe.fit(X_train,y_train)
        y_pred = pipe.predict(X_test)
        # RMSE via sqrt(MSE): passes (y_true, y_pred) in the documented order and
        # avoids `squared=False`, which newer scikit-learn releases no longer accept.
        scores.append(np.sqrt(mean_squared_error(y_test,y_pred)))
    average_score = np.mean(scores)
    print(average_score)
    total_score.append(average_score)
# The values are root-mean-squared errors, so label them as such.
print(stock,'/',freq,'\n','RMSE', total_score)

24.279186294048962
21.138815937001702
28.201315838991214
18.693168668137037
15.312345751859699
17.908822360119814
13.885620427231917
12.080196243788881
12.553856098820848
12.694523075707062
13.478445680504498
14.22016245953426
AAPL / Daily 
 MSE [24.279186294048962, 21.138815937001702, 28.201315838991214, 18.693168668137037, 15.312345751859699, 17.908822360119814, 13.885620427231917, 12.080196243788881, 12.553856098820848, 12.694523075707062, 13.478445680504498, 14.22016245953426]


In [135]:
# Print each feature count from the 5..49 (step 4) sweep next to the
# matching entry of total_score.
for position, n_selected in enumerate(range(5,50,4)):
    print(n_selected, total_score[position])

5 24.279186294048962
9 21.138815937001702
13 28.201315838991214
17 18.693168668137037
21 15.312345751859699
25 17.908822360119814
29 13.885620427231917
33 12.080196243788881
37 12.553856098820848
41 12.694523075707062
45 13.478445680504498
49 14.22016245953426


In [136]:
# Finer sweep over 25..39 selected features (step 2), using the same
# procedure: forward selection with a sigmoid-kernel SVR per time-series
# fold, then the mean out-of-fold RMSE.
cv = 5
tscv = TimeSeriesSplit(n_splits=cv)
total_score = []
for i in range(25,41,2):
    scores = []
    for train_index, test_index in tscv.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        pipe = make_pipeline(MinMaxScaler(),SVR(kernel='sigmoid'))
        # NOTE(review): the selector is left on its default internal
        # cross-validation, which is not a time-ordered split; consider
        # passing cv=TimeSeriesSplit(...) here as well.
        sfs = SequentialFeatureSelector(pipe,n_jobs = -1,n_features_to_select=i,scoring='neg_root_mean_squared_error')
        sfs.fit(X_train,y_train)
        # Reduce both partitions to the columns chosen on the training fold only.
        X_train = sfs.transform(X_train)
        X_test = sfs.transform(X_test)
        pipe.fit(X_train,y_train)
        y_pred = pipe.predict(X_test)
        # RMSE via sqrt(MSE): passes (y_true, y_pred) in the documented order and
        # avoids `squared=False`, which newer scikit-learn releases no longer accept.
        scores.append(np.sqrt(mean_squared_error(y_test,y_pred)))
    average_score = np.mean(scores)
    print(average_score)
    total_score.append(average_score)
# The values are root-mean-squared errors, so label them as such.
print(stock,'/',freq,'\n','RMSE', total_score)

17.908822360119814
15.024988216268628
13.885620427231917
12.764415344962032
12.080196243788881
12.327097193550049
12.553856098820848
12.437878046433374
AAPL / Daily 
 MSE [17.908822360119814, 15.024988216268628, 13.885620427231917, 12.764415344962032, 12.080196243788881, 12.327097193550049, 12.553856098820848, 12.437878046433374]


In [137]:
from sklearn.linear_model import Lasso

# Variant of the sweep in which the features are chosen by forward selection
# around a Lasso model, and the chosen columns are then fed to a separately
# fitted sigmoid-kernel SVR.
cv = 5
tscv = TimeSeriesSplit(n_splits=cv)
total_score = []
for i in range(5,50,4):
    scores = []
    for train_index, test_index in tscv.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # Selection model: scaled Lasso (used only to pick columns).
        pipe = make_pipeline(MinMaxScaler(),Lasso())
        # NOTE(review): the selector is left on its default internal
        # cross-validation, which is not a time-ordered split; consider
        # passing cv=TimeSeriesSplit(...) here as well.
        sfs = SequentialFeatureSelector(pipe,n_jobs = -1,n_features_to_select=i,scoring='neg_root_mean_squared_error')
        sfs.fit(X_train,y_train)
        # Reduce both partitions to the columns chosen on the training fold only.
        X_train = sfs.transform(X_train)
        X_test = sfs.transform(X_test)

        # Prediction model: scaled sigmoid-kernel SVR on the selected columns.
        pipmodel = make_pipeline(MinMaxScaler(),SVR(kernel='sigmoid'))
        pipmodel.fit(X_train,y_train)
        y_pred = pipmodel.predict(X_test)
        # RMSE via sqrt(MSE): passes (y_true, y_pred) in the documented order and
        # avoids `squared=False`, which newer scikit-learn releases no longer accept.
        scores.append(np.sqrt(mean_squared_error(y_test,y_pred)))
    average_score = np.mean(scores)
    print(average_score)
    total_score.append(average_score)
# The values are root-mean-squared errors, so label them as such.
print(stock,'/',freq,'\n','RMSE', total_score)

47.29698729314522
14.91884289207168
15.244273622675445
15.288818673250892
14.947045025254912
14.135154288491048
13.508278319300526
15.128049596282853
14.199183953297894
14.219249237105908
14.4037298160903
14.535381872538021
AAPL / Daily 
 MSE [47.29698729314522, 14.91884289207168, 15.244273622675445, 15.288818673250892, 14.947045025254912, 14.135154288491048, 13.508278319300526, 15.128049596282853, 14.199183953297894, 14.219249237105908, 14.4037298160903, 14.535381872538021]


## Linear Kernel

In [130]:
# Hold-out baseline with a linear-kernel SVR on all features.
# shuffle=False keeps the row order, so the last 20% of rows form the test set.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,shuffle=False)
pipe = make_pipeline(MinMaxScaler(),SVR(kernel='linear'))
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
# Hold-out RMSE. sqrt(MSE) passes (y_true, y_pred) in the documented order and
# avoids `squared=False`, which newer scikit-learn releases no longer accept.
np.sqrt(mean_squared_error(y_test,y_pred))

9330.110211252331

In [47]:
# NOTE(review): this cell looks like a leftover from an earlier session. Its
# execution count (47) is far below the neighbouring cells (130+), and neither
# X_res nor y_res is assigned anywhere in the visible notebook, so it would
# raise NameError after Restart & Run All.
# NOTE(review): the `pipe` built in the preceding cell wraps an SVR, and y holds
# prices, so classification_report would presumably reject these continuous
# values. Either restore the resampling/classification setup this cell was
# written for or remove the cell.
pipe.fit(X_res,y_res)
y_pred = pipe.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         0.0       0.46      0.47      0.47       229
         1.0       0.55      0.55      0.55       274

    accuracy                           0.51       503
   macro avg       0.51      0.51      0.51       503
weighted avg       0.51      0.51      0.51       503



## RBF Kernel

In [131]:
# Hold-out baseline with an RBF-kernel SVR on all features.
# shuffle=False keeps the row order, so the last 20% of rows form the test set.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,shuffle=False)
pipe = make_pipeline(MinMaxScaler(),SVR(kernel='rbf'))
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
# Hold-out RMSE. sqrt(MSE) passes (y_true, y_pred) in the documented order and
# avoids `squared=False`, which newer scikit-learn releases no longer accept.
np.sqrt(mean_squared_error(y_test,y_pred))

28.077697972057216

## Sigmoid Kernel: single hold-out baseline (all features)

In [132]:
# Hold-out baseline with a sigmoid-kernel SVR on all features.
# shuffle=False keeps the row order, so the last 20% of rows form the test set.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,shuffle=False)
pipe = make_pipeline(MinMaxScaler(),SVR(kernel= 'sigmoid'))
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
# Hold-out RMSE. sqrt(MSE) passes (y_true, y_pred) in the documented order and
# avoids `squared=False`, which newer scikit-learn releases no longer accept.
np.sqrt(mean_squared_error(y_test,y_pred))

29.229015322211435