In [125]:
import yfinance as yf
import datetime

# Define start and end dates
start = datetime.datetime(2019, 1, 2)
end = datetime.datetime(2020, 12, 31)

# Define list of tickers
tickers = ['AAPL', 'HD', 'JNJ', 'JPM', 'MSFT', 'UNH', 'V', 'XOM']

# Use yfinance to download adjusted close prices for the tickers
prices = yf.download(tickers, start=start, end=end)['Adj Close']
dji = yf.download("^DJI", start = start, end = end)["Adj Close"]

[*********************100%***********************]  8 of 8 completed
[*********************100%***********************]  1 of 1 completed


In [126]:
prices

Unnamed: 0_level_0,AAPL,HD,JNJ,JPM,MSFT,UNH,V,XOM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-01-02,38.047043,155.546707,114.144211,86.554695,96.632652,228.546432,129.230560,55.378773
2019-01-03,34.257286,152.118393,112.330406,85.324615,93.077744,222.313934,124.573540,54.528488
2019-01-04,35.719700,156.638336,114.215683,88.470131,97.406715,224.913910,129.940292,56.538956
2019-01-07,35.640190,159.723816,113.483025,88.531639,97.530945,225.345673,132.283432,56.832970
2019-01-08,36.319611,160.490677,116.118843,88.364693,98.238113,228.358658,133.002884,57.246185
...,...,...,...,...,...,...,...,...
2020-12-23,129.209290,255.949387,143.379532,116.491623,216.605530,327.533112,202.118820,37.559818
2020-12-24,130.205795,257.002350,143.879684,115.979332,218.300949,330.637756,205.466110,37.406948
2020-12-28,134.862671,255.418137,144.559113,116.743095,220.466827,335.644043,209.335236,37.532845
2020-12-29,133.066986,252.515366,145.455582,116.435738,219.672974,337.002289,211.048279,37.110218


In [127]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Fill in missing values using previous day's value
prices = prices.fillna(method='ffill')

# Calculate the daily log returns
returns = pd.DataFrame()
for ticker in tickers:
    returns[ticker] = prices[ticker].apply(lambda x: np.log(x)).diff()

returns = returns.dropna()
# Calculate the next day's DJI return and define labels as sign of the return
dji_returns = dji.apply(lambda x: np.log(x)).diff()
dji_returns = dji_returns.dropna()

labels = dji_returns.apply(lambda x: 1 if x > 0 else -1)

# Split data into training and test datasets
split_date = datetime.datetime(2019, 1, 4) + pd.DateOffset(days=int(prices.shape[0]*0.8))
X_train = returns.loc[:split_date]
X_test = returns.loc[split_date+pd.DateOffset(days=1):]
y_train = labels.loc[:split_date]
y_test = labels.loc[split_date+pd.DateOffset(days=1):]

# Standardize the feature values
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [128]:
returns

Unnamed: 0_level_0,AAPL,HD,JNJ,JPM,MSFT,UNH,V,XOM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-01-03,-0.104924,-0.022287,-0.016018,-0.014314,-0.037482,-0.027649,-0.036702,-0.015473
2019-01-04,0.041803,0.029280,0.016644,0.036202,0.045460,0.011627,0.042179,0.036207
2019-01-07,-0.002228,0.019507,-0.006435,0.000695,0.001275,0.001918,0.017872,0.005187
2019-01-08,0.018884,0.004790,0.022961,-0.001888,0.007225,0.013282,0.005424,0.007244
2019-01-09,0.016839,0.010290,-0.007957,-0.001692,0.014198,0.001438,0.011700,0.005261
...,...,...,...,...,...,...,...,...
2020-12-23,-0.007000,-0.000852,-0.005121,0.027561,-0.013125,0.007672,-0.002627,0.012770
2020-12-24,0.007683,0.004106,0.003482,-0.004407,0.007797,0.009434,0.016425,-0.004078
2020-12-28,0.035141,-0.006183,0.004711,0.006564,0.009873,0.015028,0.018656,0.003360
2020-12-29,-0.013404,-0.011430,0.006182,-0.002636,-0.003607,0.004039,0.008150,-0.011324


In [129]:
'''
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Fill in missing values using previous day's value
prices = prices.fillna(method='ffill')

# Calculate the daily log returns
returns = pd.DataFrame()
for ticker in tickers:
    returns[ticker] = prices[ticker].apply(lambda x: np.log(x)).diff()
returns = returns.dropna()

# Calculate the next day's DJI return and define labels as sign of the return
dji_returns = dji.shift(1)
dji_returns = dji_returns[1:]
labels = dji_returns.apply(lambda x: 1 if x > 0 else -1)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(returns, labels, test_size=0.2, random_state=8990)

# Standardize the feature values
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
'''

"\nfrom sklearn.model_selection import train_test_split\nimport pandas as pd\nimport numpy as np\nfrom sklearn.preprocessing import StandardScaler\n\n# Fill in missing values using previous day's value\nprices = prices.fillna(method='ffill')\n\n# Calculate the daily log returns\nreturns = pd.DataFrame()\nfor ticker in tickers:\n    returns[ticker] = prices[ticker].apply(lambda x: np.log(x)).diff()\nreturns = returns.dropna()\n\n# Calculate the next day's DJI return and define labels as sign of the return\ndji_returns = dji.shift(1)\ndji_returns = dji_returns[1:]\nlabels = dji_returns.apply(lambda x: 1 if x > 0 else -1)\n\n# Split the data into training and test sets\nX_train, X_test, y_train, y_test = train_test_split(returns, labels, test_size=0.2, random_state=8990)\n\n# Standardize the feature values\nscaler = StandardScaler()\nX_train = scaler.fit_transform(X_train)\nX_test = scaler.transform(X_test)\n"

In [130]:
X_train

array([[-6.48478557, -2.09096053, -1.66294943, ..., -1.76514479,
        -3.30683349, -1.26780122],
       [ 2.36525601,  2.47922824,  1.581616  , ...,  0.68069348,
         3.51102532,  3.04645237],
       [-0.29056137,  1.61301755, -0.71102806, ...,  0.07606273,
         1.41010513,  0.45689277],
       ...,
       [-0.98162285, -0.68906716, -1.13858993, ..., -0.73710526,
        -0.26241481, -0.53107058],
       [ 0.12964949,  1.11188391, -0.09136643, ..., -0.51669178,
         1.23995551, -0.86244296],
       [-0.52116071,  0.26282523,  0.01323446, ...,  0.76707903,
        -0.99905233,  0.8137634 ]])

In [131]:
X_test

array([[ 1.25948041, -0.1341031 , -0.66171318, ...,  2.61659363,
         1.33614341,  1.03829074],
       [-0.58721347,  0.16624011, -0.73141538, ..., -0.30656894,
        -0.02617554, -0.44063725],
       [-0.14129649,  0.85525782, -0.04527883, ..., -0.75215067,
         1.06147001, -0.36061188],
       ...,
       [ 1.96341001, -0.66376413,  0.39622879, ...,  0.89245968,
         1.47787523,  0.30439126],
       [-0.96465259, -1.12874521,  0.54237089, ...,  0.20812372,
         0.56982321, -0.92143827],
       [-0.67265158, -0.42594809,  1.15160711, ..., -0.46791335,
         1.45934993,  0.68875088]])

In [132]:
y_train.value_counts()

 1    161
-1    118
Name: Adj Close, dtype: int64

In [133]:
y_test

Date
2020-02-12    1
2020-02-13   -1
2020-02-14   -1
2020-02-18   -1
2020-02-19    1
             ..
2020-12-23    1
2020-12-24    1
2020-12-28    1
2020-12-29   -1
2020-12-30    1
Name: Adj Close, Length: 224, dtype: int64

In [134]:
from sklearn.svm import SVC

# Fit SVM model with RBF kernel and C=1
svm = SVC(kernel='rbf', gamma=1, C=1)
svm.fit(X_train, y_train)

SVC(C=1, gamma=1)

In [135]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.svm import SVC

# Set up the SVM model with RBF kernel, gamma=1, and C=1
svm_model = SVC(kernel='rbf', gamma=1, C=1)

# Perform time series cross-validation with 5 splits
tscv = TimeSeriesSplit(n_splits=5)

# Compute the cross-validated scores for the trained model
cv_scores = cross_val_score(svm_model, X_train, y_train, cv=tscv)

# Compute the mean of the cross-validated scores
mean_cv_score = cv_scores.mean()

print("Mean cross-validated score: {:.4f}".format(mean_cv_score))


Mean cross-validated score: 0.7043


In [136]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 1, 5, 10]}
svm = SVC(kernel='rbf')
grid_search = GridSearchCV(svm, param_grid, cv=TimeSeriesSplit(n_splits=5))
grid_search.fit(X_train, y_train)

print("Best parameters: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)


Best parameters:  {'C': 1, 'gamma': 0.1}
Best cross-validation score:  0.8478260869565217


In [137]:
split_date

Timestamp('2020-02-11 00:00:00')

In [138]:
prices.iloc[279]

AAPL     78.344841
HD      223.943039
JNJ     139.630722
JPM     124.826881
MSFT    178.861328
UNH     277.559296
V       199.551620
XOM      51.127171
Name: 2020-02-11 00:00:00, dtype: float64

In [139]:
dji.iloc[279]

29276.33984375

In [140]:
len(prices["AAPL"]) == len(dji)

True

In [141]:
import numpy as np

# Fill in missing values using previous day's value
dji = np.nan_to_num(dji, nan=np.nan)
dji = np.where(np.isnan(dji), np.roll(dji, 1), dji)

# Calculate the daily log returns
dji_returns = np.log(dji[1:]) - np.log(dji[:-1])

# Train an SVM model with RBF kernel and optimized hyperparameters
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X_train = X_train[:, 1:]  # exclude the DJI returns
X_test = X_test[:, 1:]
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 1, 5, 10]}
svm = SVC(kernel='rbf')
grid = GridSearchCV(svm, param_grid=param_grid, cv=TimeSeriesSplit(n_splits=5), scoring='accuracy')
pipe = Pipeline([('scaler', StandardScaler()), ('svm', grid)])
pipe.fit(X_train, y_train)
print(f'Best hyperparameters: {pipe.named_steps["svm"].best_params_}')

# Simulate a trading strategy using the trained SVM model
capital = 10000  # initial capital amount
shares = 0  # number of shares held
returns = []  # daily returns
for i in range(len(X_test)):
    signal = pipe.predict(X_test[i].reshape(1, -1))[0]
    j = 279 + i + 1
    if signal == 1 and shares == 0:
        shares = capital / dji[j]
        capital = 0
    elif signal == -1 and shares > 0:
        capital = shares * dji[j]
        shares = 0
    returns.append((capital + shares * dji[j]) / 10000 - 1)
    
# Compute the final amount at market close of Dec 31st 2020
final_amount = (capital + shares * dji[-1]) / 10000 * 10000
print(f'Final amount at market close of Dec 31st 2020: ${final_amount:.2f}')


Best hyperparameters: {'C': 1, 'gamma': 0.1}
Final amount at market close of Dec 31st 2020: $10361.98


In [142]:
# Simple Buy and Hold Strategy Implementation
import numpy as np

# Number of shares bought on Oct 1st 2020 with initial capital
num_shares = 10000 / dji[0]

# Final amount at market close of Dec 31st 2020
final_amount = num_shares * dji[-1]

print(f"Final amount for buy-and-hold strategy: ${np.round(final_amount, 2)}")


Final amount for buy-and-hold strategy: $13025.46


I think we will make more money by the simple buy-and hold strategy as per our backtesting above in the given periods of time.