# SVM

### Taking AAPL Daily stock data as an example

In [1]:
import pandas as pd
import numpy as np

# Load the price and predictor tables for one stock/frequency and build the
# feature matrix X and target vector y used by the cells that follow.
freq = 'Daily'
stock = 'AAPL'

# Keep only rows whose Date string lies in 2010-01-04 .. 2019-12-31.
# (The original applied the lower bound twice; one filter is sufficient.)
price = pd.read_csv('../encode_price/'+freq+'/'+stock+'.csv')
price = price.loc[(price.Date>='2010-01-04')&(price.Date<='2019-12-31'),:]

# Target: the following row's direction2 value. shift(-1) leaves a NaN in
# the final row, which the [:-1] slice drops.
y = price.direction2.shift(-1).values[:-1]

# Predictors: left-join the NLP table on Date, cap at 2019-12-31, index by
# Date and replace missing values with 0 (no in-place mutation, so the cell
# can be re-run safely).
predictors = pd.read_csv('../predictors/Merged/'+freq+'/'+stock+'.csv')
NLP = pd.read_csv('../predictors/NLP/'+freq+'/NYT_macro_SA.csv')
predictors = (
    pd.merge(predictors,NLP,how='left',on=['Date'])
    .loc[lambda frame: frame.Date <= '2019-12-31',:]
    .set_index('Date')
    .fillna(0)
)

# Drop the last row so X has one row per entry of y.
# NOTE(review): X and y are paired by row position, not by Date; this assumes
# the predictor file starts on the same date as the filtered price data -- confirm.
X = predictors.values[:-1]

In [18]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix,classification_report,f1_score
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
import seaborn as sns

## Linear Kernel

In [5]:
# Chronological 80/20 split: shuffle=False keeps the row order, so the last
# 20% of rows form the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=False, test_size=0.2
)

# Oversample the minority class of the training portion only.
sm = SMOTE(k_neighbors=5, random_state=42, sampling_strategy='minority')
X_res, y_res = sm.fit_resample(X_train, y_train)

# Standardise the features, then fit a linear-kernel SVM on the resampled data.
pipe = make_pipeline(StandardScaler(), SVC(kernel='linear'))
y_pred = pipe.fit(X_res, y_res).predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.87      0.04      0.07       337
         1.0       0.34      0.99      0.50       166

    accuracy                           0.35       503
   macro avg       0.60      0.51      0.29       503
weighted avg       0.69      0.35      0.22       503



## Polynomial Kernel

In [6]:
# Same chronological 80/20 split as in the linear-kernel cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=False, test_size=0.2
)

# Degree-4 polynomial SVM on standardised features.
# X_res / y_res are the SMOTE-resampled training arrays created in the
# linear-kernel cell, so that cell has to be executed first.
pipe = make_pipeline(StandardScaler(), SVC(degree=4, kernel='poly'))
y_pred = pipe.fit(X_res, y_res).predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.64      0.07      0.12       337
         1.0       0.33      0.92      0.48       166

    accuracy                           0.35       503
   macro avg       0.48      0.49      0.30       503
weighted avg       0.54      0.35      0.24       503



In [7]:
# Refit the pipeline currently bound to `pipe` on the original
# (non-resampled) training data and report on the same test set.
y_pred = pipe.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.69      0.05      0.10       337
         1.0       0.33      0.95      0.49       166

    accuracy                           0.35       503
   macro avg       0.51      0.50      0.30       503
weighted avg       0.57      0.35      0.23       503



## RBF Kernel

In [8]:
# Chronological 80/20 split (no shuffling).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=False, test_size=0.2
)

# RBF-kernel SVM on standardised features, trained on the raw training data.
pipe = make_pipeline(StandardScaler(), SVC(kernel='rbf'))
y_pred = pipe.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.67      0.99      0.80       337
         1.0       0.20      0.01      0.01       166

    accuracy                           0.66       503
   macro avg       0.43      0.50      0.40       503
weighted avg       0.51      0.66      0.54       503



## Sigmoid Kernel

In [9]:
# Chronological 80/20 split (no shuffling).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=False, test_size=0.2
)

# Sigmoid-kernel SVM on standardised features, trained on the raw training data.
pipe = make_pipeline(StandardScaler(), SVC(kernel='sigmoid'))
y_pred = pipe.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.59      0.37      0.46       337
         1.0       0.27      0.46      0.34       166

    accuracy                           0.40       503
   macro avg       0.43      0.42      0.40       503
weighted avg       0.48      0.40      0.42       503



Based on the above results, the RBF kernel outperforms the other kernels (highest accuracy and weighted F1-score). So, we will select the RBF kernel for further tuning.

# Feature Selection Task

From this part on, we will use the weighted F1-score to evaluate the classification results. It takes class imbalance into consideration and penalizes biased predictions.

In [12]:
# Column-name groups and the ticker universe.
# NOTE(review): group meanings are inferred from the variable names
# (technical / fundamental / macro / NLP) -- confirm against the predictor files.

# NOTE(review): 'STOCK_d' may be a typo for 'STOCH_d' (cf. 'STOCH_k'); it is
# kept as-is because it must match the actual column name -- confirm.
tech_list = [
    'SMA', 'EMA', 'STOCH_k', 'STOCK_d',
    'RSI', 'MFI', 'SAR', 'AD',
    'MACD', 'MACD_Signal', 'MACD_Histo', 'VWAP',
    'SPY', 'NDAQ', 'PC1', 'PC2',
]

funda_list = [
    'pcf', 'PEG_trailing', 'dpr', 'npm',
    'gpm', 'roa', 'roe', 'capital_ratio',
    'de_ratio', 'cash_ratio', 'curr_ratio', 'inv_turn',
    'pay_turn', 'sale_nwc', 'rd_sale', 'accrual',
]

macro_list = [
    'gdpr1', 'gdpr2', 'cpi', 'bond20yr', 'bond30yr',
    'fedfunds', 'cpir', 'wpir', 'unemp', 'employ',
]

# NOTE(review): there is no 'Pos_lag1' entry although 'Neg_lag1' and
# 'Neu_lag1' are present -- confirm whether that is intentional.
nlp_list = [
    'Pos_lag2', 'Pos_lag3',
    'Neg_lag1', 'Neg_lag2', 'Neg_lag3',
    'Neu_lag1', 'Neu_lag2', 'Neu_lag3',
]

# Tickers iterated over by the feature-selection loop.
stock_list = [
    'AAPL', 'AMZN', 'BRK-B', 'GOOG', 'JNJ',
    'META', 'MSFT', 'NVDA', 'TSLA', 'V',
]

In [24]:
# Forward sequential feature selection with an RBF-kernel SVM, repeated for
# every ticker in stock_list and for each target feature count in
# range(5, 49, 5), i.e. 5, 10, ..., 45.
#
# Result: stock_score maps ticker -> list of mean weighted F1-scores, one
# entry per feature count, each averaged over the TimeSeriesSplit folds.
freq = 'Daily'
cv = 3  # number of TimeSeriesSplit folds used for the outer evaluation

# The NLP table is the same file for every ticker, so read it once instead
# of once per stock. pd.merge does not modify its inputs.
NLP = pd.read_csv('../predictors/NLP/Daily/NYT_macro_SA.csv')

stock_score = dict()
for j in stock_list:
    stock = j

    # Price rows with Date in 2010-01-04 .. 2019-12-31 (string comparison).
    price = pd.read_csv('../encode_price/'+freq+'/'+stock+'.csv')
    price = price.loc[(price.Date<='2019-12-31')&(price.Date>='2010-01-04'),:]

    # Target: next row's direction2; the trailing NaN from shift(-1) is dropped.
    y = price.direction2.shift(-1).values[:-1]

    # Predictors joined with the NLP table, capped at 2019-12-31, NaN -> 0.
    predictors = pd.read_csv('../predictors/Merged/'+freq+'/'+stock+'.csv')
    predictors = pd.merge(predictors,NLP,how='left',on=['Date'])
    predictors = predictors.loc[predictors.Date <= '2019-12-31',:]
    predictors = predictors.set_index('Date').fillna(0)
    X = predictors.values[:-1]

    # X and y are paired by row position, so a length mismatch would silently
    # misalign features and labels (or fail later with an opaque IndexError).
    if len(X) != len(y):
        raise ValueError(
            'Feature/target length mismatch for %s: X has %d rows, y has %d'
            % (stock, len(X), len(y))
        )

    alpha_score = []
    for i in range(5,49,5):
        scores = []
        tscv = TimeSeriesSplit(n_splits=cv)
        for train_index, test_index in tscv.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            pipe = make_pipeline(MinMaxScaler(),SVC(kernel='rbf'))

            # Select i features using the training fold only.
            # NOTE(review): the selector's internal cross-validation uses its
            # default `cv`, which is not time-ordered; consider passing a
            # TimeSeriesSplit there as well -- confirm the intended protocol.
            sfs = SequentialFeatureSelector(pipe,n_jobs = -1,n_features_to_select=i,scoring='f1_weighted')
            sfs.fit(X_train,y_train)

            X_train_sel = sfs.transform(X_train)
            X_test_sel = sfs.transform(X_test)

            pipe.fit(X_train_sel,y_train)
            y_pred = pipe.predict(X_test_sel)

            # f1_score expects (y_true, y_pred); with average='weighted' the
            # per-class weights come from the first argument, so the order
            # matters.
            scores.append(f1_score(y_test,y_pred,average = 'weighted'))
        average_score = np.mean(scores)
        alpha_score.append(average_score)
    print(j,alpha_score)
    stock_score[j] = alpha_score