In [62]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LassoLarsCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
import xgboost as xgb
from sklearn.feature_selection import SelectFromModel
import getFamaFrenchFactors as gff


# Define the ticker symbol for Tesla
ticker = 'TSLA'

# Define the start and end dates for the data
start_date = '2016-01-01'
end_date = '2022-01-01'

# Download the data from Yahoo Finance
tsla = yf.download(ticker, start=start_date, end=end_date)

# Calculate OBV
tsla['daily_return'] = tsla['Adj Close'].pct_change()
tsla['direction'] = np.where(tsla['daily_return'] >= 0, 1, -1)
tsla['direction'][0] = 0
tsla['vol_adjusted'] = tsla['Volume'] * tsla['direction']
tsla['OBV'] = tsla['vol_adjusted'].cumsum()


sp500 = yf.download('^GSPC', start=start_date, end=end_date)
aapl = yf.download('AAPL', start=start_date, end=end_date)
amzn = yf.download('AMZN', start=start_date, end=end_date)
goog = yf.download('GOOG', start=start_date, end=end_date)
cma = yf.download('CMA', start=start_date, end=end_date)
btc = yf.download('BTC-USD', start=start_date, end=end_date)
eth = yf.download('ETH-USD', start=start_date, end=end_date)
xrp = yf.download('XRP-USD', start=start_date, end=end_date)
ltc = yf.download('LTC-USD', start=start_date, end=end_date)
ada = yf.download('ADA-USD', start=start_date, end=end_date)
vix = yf.download('^VIX', start=start_date, end=end_date)

# Calculate additional features
tsla['mom_5_20'] = (tsla['Close'] / tsla['Close'].shift(5)) - 1
tsla['mom_20_100'] = (tsla['Close'] / tsla['Close'].shift(20)) - 1
tsla['mom_60_200'] = (tsla['Close'] / tsla['Close'].shift(60)) - 1

# Calculate Fama French 3 factors
ff_data = gff.famaFrench3Factor(frequency='m') 
ff_data.rename(columns={"date_ff_factors": 'Date'}, inplace=True)
ff_data.set_index('Date',inplace=True)
ff_data = ff_data.resample('D').interpolate()



# Reset index and convert all dates to same timezones, so they become mergable
for x in [sp500,aapl,amzn,goog,tsla,cma,btc,eth,xrp,ltc,ada,ff_data,vix]:
    x.reset_index(inplace=True)
    x['Date'] =  pd.to_datetime(x['Date']).dt.date




df_regressor = pd.DataFrame({
    'SP_500_Adj_Close':sp500['Adj Close'].shift(1),
    'AAPL_Adj_Close':aapl['Adj Close'].shift(1),
    'AMZN_Adj_Close':amzn['Adj Close'].shift(1),
    'GOOG_Adj_Close':goog['Adj Close'].shift(1),
    'CMA_Adj_Close':cma['Adj Close'].shift(1),
    'BTC_Adj_Close':btc['Adj Close'].shift(1),
    'ETH_Adj_Close':eth['Adj Close'].shift(1),
    'XRP_Adj_Close':xrp['Adj Close'].shift(1),
    'LTC_Adj_Close':ltc['Adj Close'].shift(1),
    'ADA_Adj_Close':ada['Adj Close'].shift(1),
    'Fama_French_Mkt_RF':ff_data['Mkt-RF'].shift(1),
    'Fama_French_SMB' : ff_data['SMB'].shift(1),
    'Fama_French_HML' : ff_data['HML'].shift(1),
    'OBV': tsla['OBV'].shift(1),
    'mom_5_20':  tsla['mom_5_20'].shift(1),
    'mom_20_100':  tsla['mom_20_100'].shift(1),
    'mom_60_200':  tsla['mom_60_200'].shift(1),
     'TSLA_CLOSE': tsla['Adj Close'],
    'VIX_IDX':vix['Adj Close'].shift(1)
})



# Remove any rows with missing data
df_regressor.dropna(inplace=True)



# Split the data into features (X) and target (y)
y = df_regressor['TSLA_CLOSE']
df_regressor = df_regressor.drop(columns=['TSLA_CLOSE'],axis=1)
X = df_regressor
# Fit a decision tree model to the data
tree = DecisionTreeRegressor(random_state=0)
tree.fit(X, y)

# Select the top 10 most important features
selector = SelectFromModel(tree, prefit=True, threshold=-np.inf, max_features=10)
selected_features = X.columns[selector.get_support()]

# Print the top 10 most important features
print('Top 10 most important features through Decision Trees:')
for feature in selected_features:
    print(feature)


[*********************100%***********************]  1 of 1 completed


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tsla['direction'][0] = 0


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
    SP_500_Adj_Close  AAPL_Adj_Close  AMZN_Adj_Close  GOOG_Adj_Close  \
61       2059.739990       25.041708       29.681999       37.247501   
62       2072.780029       25.271469       29.924999       37.495499   
63       2066.129883     

In [63]:
ridge = Ridge(alpha=0.1)
ridge.fit(X, y)

# Select the top 10 most important features
selector = SelectFromModel(ridge, prefit=True, threshold=-np.inf, max_features=10)
selected_features = X.columns[selector.get_support()]

# Print the top 10 most important features
print('Top 10 most important features (Ridge Regression):')
for feature in selected_features:
      print('-', feature)



Top 10 most important features (Ridge Regression):
- AAPL_Adj_Close
- GOOG_Adj_Close
- XRP_Adj_Close
- ADA_Adj_Close
- Fama_French_Mkt_RF
- Fama_French_SMB
- Fama_French_HML
- mom_5_20
- mom_20_100
- mom_60_200


  return linalg.solve(A, Xy, sym_pos=True,


In [64]:
# Fit a LassoLarsCV model to the data
lars = LassoLarsCV(cv=5).fit(X, y)

# Select the top 10 most important features
selector_LARS = SelectFromModel(lars, prefit=True, threshold=-np.inf, max_features=10)
selected_features_LARS = X.columns[selector_LARS.get_support()]

# Print the top 10 most important features
print('Top 10 most important features (LARS):')
for feature in selected_features_LARS:
      print('-', feature)

Top 10 most important features (LARS):
- AAPL_Adj_Close
- CMA_Adj_Close
- LTC_Adj_Close
- ADA_Adj_Close
- Fama_French_Mkt_RF
- Fama_French_SMB
- Fama_French_HML
- mom_20_100
- mom_60_200
- VIX_IDX


In [72]:
# Fit a RandomForest model to the data
rf = RandomForestRegressor(random_state=0, n_estimators=100).fit(X, y)

# Select the top 10 most important features
selector = SelectFromModel(rf, prefit=True, threshold=-np.inf, max_features=10)
selected_features = X.columns[selector.get_support()]
# selector.estimator.columns
# selector.threshold_

# Print the top 10 most important features
print('Top 10 most important features (RandomForest):')
for feature in selected_features:
    print('-', feature)

Top 10 most important features (RandomForest):
- SP_500_Adj_Close
- AAPL_Adj_Close
- AMZN_Adj_Close
- XRP_Adj_Close
- ADA_Adj_Close
- Fama_French_HML
- OBV
- mom_5_20
- mom_20_100
- mom_60_200


In [66]:
# Fit an XGBoost model to the data
xgb_model = xgb.XGBRegressor(random_state=0, n_estimators=100).fit(X, y)

# Select the top 10 most important features
selector = SelectFromModel(xgb_model, prefit=True, threshold=-np.inf, max_features=10)
selected_features = X.columns[selector.get_support()]

# Print the top 10 most important features
print('Top 10 most important features (XGBoost):')
for feature in selected_features:
    print('-', feature)

Top 10 most important features (XGBoost):
- SP_500_Adj_Close
- AAPL_Adj_Close
- AMZN_Adj_Close
- GOOG_Adj_Close
- CMA_Adj_Close
- BTC_Adj_Close
- XRP_Adj_Close
- Fama_French_HML
- OBV
- mom_60_200


## Doubts
1. 