In [1]:

import yfinance as yf
import pandas as pd
import ta
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import warnings

In [2]:
warnings.filterwarnings('ignore')
pfe = yf.download("PFE", start="2010-01-01", end="2025-04-16")
print(pfe.head())

pfe['momentum_5'] = pfe['Close'].pct_change(periods=5)
pfe['momentum_20'] = pfe['Close'].pct_change(periods=20)
pfe['volatility_20'] = pfe['Close'].pct_change().rolling(window=20).std()
#print(type(pfe['Close']))

pfe['volume_avg_20'] = pfe['Volume'].rolling(window=20).mean()
pfe['volume_surprise'] = pfe['Volume'].squeeze() / pfe['volume_avg_20']
#pfe['Volume'] = pfe['Volume'].squeeze()

#print(type(pfe['Volume']))
#print(type(pfe['volume_avg_20']))


pfe['intraday_change'] = pfe['Close'] - pfe['Open']
gains = pfe['intraday_change'].apply(lambda x: x if x > 0 else 0)
losses = pfe['intraday_change'].apply(lambda x: -x if x < 0 else 0)
avg_gain = gains.rolling(window=14).mean()
avg_loss = losses.rolling(window=14).mean()
pfe['imi'] = 100 * avg_gain / (avg_gain + avg_loss)


pfe['rsi'] = ta.momentum.RSIIndicator(pfe['Close'].squeeze(), window=14).rsi()


macd = ta.trend.MACD(pfe['Close'].squeeze())
pfe['macd'] = macd.macd()
pfe['macd_signal'] = macd.macd_signal()


bollinger = ta.volatility.BollingerBands(pfe['Close'].squeeze())
pfe['bollinger_mavg'] = bollinger.bollinger_mavg()
pfe['bollinger_hband'] = bollinger.bollinger_hband()
pfe['bollinger_lband'] = bollinger.bollinger_lband()

pfe.dropna(inplace=True)

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed

Price          Close      High       Low      Open    Volume
Ticker           PFE       PFE       PFE       PFE       PFE
Date                                                        
2010-01-04  9.683945  9.689060  9.330965  9.346311  54898644
2010-01-05  9.545827  9.683950  9.489555  9.678834  45714931
2010-01-06  9.515130  9.622559  9.469090  9.545824  43640975
2010-01-07  9.479319  9.550938  9.443509  9.535591  41557112
2010-01-08  9.556055  9.571402  9.474205  9.525362  32049716





In [3]:

fundamental_df = pd.read_csv("fundamental_data.csv", index_col='Date', parse_dates=True)
fundamental_df = fundamental_df.drop(columns=['beta_'])


if isinstance(pfe.columns, pd.MultiIndex):
    pfe.columns = ['_'.join(map(str, col)).strip() for col in pfe.columns]


pfe = pd.merge(pfe, fundamental_df, how='left', left_on='Date', right_index=True)




print(pfe.head())


FileNotFoundError: [Errno 2] No such file or directory: 'fundamental_data.csv'

In [4]:
market = yf.download("^GSPC", start=pfe.index.min(), end=pfe.index.max())['Close']
rf = yf.download("^TNX", start=pfe.index.min(), end=pfe.index.max())['Close']  

#Daily returns
pfe_ret = pfe['Close_PFE'].pct_change()
market_ret = market.pct_change()
rf_daily = (rf / 100) / 252  

returns = pd.concat([pfe_ret, market_ret, rf_daily], axis=1)
returns.columns = ['Ri', 'Rm', 'Rf']
returns = returns.dropna()


X = (returns['Rm'] - returns['Rf']).values.reshape(-1, 1)
y = (returns['Ri'] - returns['Rf']).values
reg = LinearRegression().fit(X, y)
beta = reg.coef_[0]


returns['beta'] = beta
returns['alpha'] = returns['Ri'] - (returns['Rf'] + beta * (returns['Rm'] - returns['Rf']))
if isinstance(pfe.columns, pd.MultiIndex):
    pfe.columns = ['_'.join(map(str, col)).strip() for col in pfe.columns]
pfe = pfe.merge(returns[['alpha']], left_index=True, right_index=True, how='left')
pfe = pfe.merge(returns[['beta']], left_index=True, right_index=True, how='left')

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [5]:
pfe['alpha'] = pfe['alpha'].fillna(method='bfill')  
pfe['beta'] = pfe['beta'].fillna(method='bfill')
pfe['return_t-1'] = pfe['Close_PFE'].pct_change(1)
pfe['return_t-5'] = pfe['Close_PFE'].pct_change(5)

pfe['ma_5'] = pfe['Close_PFE'].rolling(5).mean()
pfe['ma_20'] = pfe['Close_PFE'].rolling(20).mean()
pfe['ma_diff'] = pfe['ma_5'] - pfe['ma_20']


pfe['vol_5'] = pfe['Close_PFE'].rolling(5).std()


In [6]:
jnj = yf.download("JNJ", start="2010-01-01", end="2025-04-16").fillna(0).astype(int)
mrk = yf.download("MRK", start="2010-01-01", end="2025-04-16").fillna(0).astype(int)
bmy = yf.download("BMY", start="2010-01-01", end="2025-04-16").fillna(0).astype(int)
abbv = yf.download("ABBV", start="2010-01-01", end="2025-04-16").fillna(0).astype(int)
amgn = yf.download("AMGN", start="2010-01-01", end="2025-04-16").fillna(0).astype(int)

jnj.columns = [f"JNJ_{col}" for col in jnj.columns.get_level_values(0)]
mrk.columns = [f"MRK_{col}" for col in mrk.columns.get_level_values(0)]
bmy.columns = [f"BMY_{col}" for col in bmy.columns.get_level_values(0)]
abbv.columns = [f"ABBV_{col}" for col in abbv.columns.get_level_values(0)]
amgn.columns = [f"AMGN_{col}" for col in amgn.columns.get_level_values(0)]

if isinstance(pfe.columns, pd.MultiIndex):
    pfe.columns = ['_'.join(map(str, col)).strip() for col in pfe.columns]


pfe = pfe.join(jnj, how='left')
pfe = pfe.join(mrk, how='left')
pfe = pfe.join(bmy, how='left')
pfe = pfe.join(abbv, how='left')
pfe['ABBV_Close'] = pfe['ABBV_Close'].fillna(0)
pfe['ABBV_High'] = pfe['ABBV_High'].fillna(0)
pfe['ABBV_Low'] = pfe['ABBV_Low'].fillna(0)
pfe['ABBV_Open'] = pfe['ABBV_Open'].fillna(0)
pfe['ABBV_Volume'] = pfe['ABBV_Volume'].fillna(0)


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [7]:
recall_flag_df = pd.read_csv("Recall_Flag_Table.csv", parse_dates=['Date'], index_col='Date')
recall_flag_df.index = recall_flag_df.index.normalize()
if isinstance(pfe.columns, pd.MultiIndex):
    pfe.columns = ['_'.join(map(str, col)).strip() for col in pfe.columns]

pfe = pfe.join(recall_flag_df, how='left')

pfe['recall_flag'] = pfe['recall_flag'].fillna(0).astype(int)

In [8]:
#MacroData
cpi = pd.read_csv("CPIAUCSL.csv", parse_dates=['observation_date'], index_col='observation_date')
dgs10 = pd.read_csv("DGS10.csv", parse_dates=['observation_date'], index_col='observation_date')
inflation = pd.read_csv("ExpectedInflation.csv", parse_dates=['observation_date'], index_col='observation_date')
fedfunds = pd.read_csv("FEDFUNDS.csv", parse_dates=['observation_date'], index_col='observation_date')
m2 = pd.read_csv("M2SL.csv", parse_dates=['observation_date'], index_col='observation_date')
m1 = pd.read_csv("MPRIME.csv", parse_dates=['observation_date'], index_col='observation_date')

cpi.columns = ['CPI']
dgs10.columns = ['10Y_Treasury_Yield']
inflation.columns = ['Expected_Inflation']
fedfunds.columns = ['Fed_Funds_Rate']
m2.columns = ['M2_Money_Supply']
m1.columns = ['M1_Money_Supply']

macro_df = cpi.join([dgs10, inflation, fedfunds, m2, m1], how='outer')
macro_df.index = macro_df.index.normalize()  

macro_df = macro_df.ffill()
if isinstance(pfe.columns, pd.MultiIndex):
    pfe.columns = ['_'.join(map(str, col)).strip() for col in pfe.columns]

pfe = pfe.join(macro_df, how='left')

pfe.to_csv("pfe_with_macro.csv")


In [9]:
event_df = pd.read_csv("Corrected_Pfizer_Event_Timeline__2010_2029_.csv", parse_dates=True, index_col='date')
event_df.index = event_df.index.normalize()
if isinstance(pfe.columns, pd.MultiIndex):
    pfe.columns = ['_'.join(map(str, col)).strip() for col in pfe.columns]

pfe = pfe.join(event_df, how='left')


In [10]:
sentiment_df = pd.read_csv("news_data_full.csv", parse_dates=['date'], index_col='date')
sentiment_df.index = sentiment_df.index.normalize()
pfe = pfe.merge(sentiment_df, how='left', left_index=True, right_index=True)
pfe = pfe.rename(columns={'sentiment': 'news_sentiment'})


In [11]:
sentiment_df = pd.read_csv("Complete_Sentiment_Table_with_Zeros.csv", parse_dates=['date'], index_col='date')
sentiment_df.index = sentiment_df.index.normalize()
sentiment_df = sentiment_df.rename(columns={'compound': 'twitter_sentiment'})
pfe = pfe.merge(sentiment_df, how='left', left_index=True, right_index=True)

In [12]:
pfe['target_return'] = pfe['Close_PFE'].shift(-1) / pfe['Close_PFE'] - 1

In [13]:
pfe['target_return_lag1'] = pfe['target_return'].shift(1)
pfe['target_return_lag2'] = pfe['target_return'].shift(2)
pfe['target_return_lag3'] = pfe['target_return'].shift(3)
pfe['target_roll_mean_5'] = pfe['target_return'].rolling(5).mean()
pfe['target_roll_std_5'] = pfe['target_return'].rolling(5).std()
features = ['return_t-1', 'return_t-5', 'ma_5', 'ma_20', 'ma_diff', 'vol_5', 'target_return_lag1','target_return_lag2','target_return_lag3','target_roll_mean_5','target_roll_std_5']
pfe = pfe.dropna(subset=features)

In [14]:
pfe.to_csv("pfe_with_macro.csv")