In [None]:
#Install packages
!pip install arch
!pip install praw
!pip install nltk
!pip install boto3

In [None]:
#PULL PRICE DATA FOR SELECTED STOCKS
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from arch import arch_model
import yfinance as yf

#Mounting google drive
from google.colab import drive
drive.mount('/content/drive')

# Tickers to download; the order of this list is the order the per-ticker
# frames are concatenated in later in the notebook.
tickers = [
    'PG', 'MCD', 'TSLA', 'JPM', 'GOOGL', 'AAPL', 'COST', 'KO', 'NFLX', 'PM', 'META',
    'NVDA', 'BRK-B', 'WMT', 'AMZN', 'HD', 'CRWD', 'UNH', 'GOOG', 'MSFT', 'PLTR', 'INTU',
    'ABT', 'JNJ', 'ORCL', 'ACHR', 'TMO', 'PEP', 'RTX', 'VZ', 'CVX', 'WFC', 'IBM', 'CRM',
    'ISRG', 'MRK', 'PGR', 'ACN',
]

# Download window. Per the yfinance documentation `end` is exclusive, so the
# last session requested is the one before `end_date`.
start_date = '2022-01-01'
end_date = '2025-04-11'
#marketDays = 504

# `auto_adjust` is pinned explicitly because its default differs between
# yfinance releases. With auto_adjust=True the 'Close' column is adjusted for
# splits and dividends and no separate 'Adj Close' column is returned, so the
# returns computed downstream mean the same thing on every installed version.
price_data = yf.download(
    tickers,
    start=start_date,
    end=end_date,
    auto_adjust=True,
)

In [None]:
# Check for missing tickers (Survivorship Bias)
# A ticker counts as missing when it has no column at all OR when its column
# exists but every 'Close' value is NaN. The second case matters because a
# failed download can still leave an all-NaN placeholder column, which a pure
# column-membership test would never report.
downloaded_tickers = set(price_data.columns.get_level_values(1))
close_prices = price_data['Close']
missing_tickers = [
    t for t in tickers
    if t not in downloaded_tickers or close_prices[t].isna().all()
]
if missing_tickers:
    print("Warning: The following stocks are missing (possibly delisted):", missing_tickers)
else:
    print("No missing stocks detected.")

In [None]:
#Reshape the wide download into a long DataFrame: one row per stock per day.
changed_df = []
for ticker in tickers:
    # Pull this ticker's price columns out of the column MultiIndex.
    ticker_df = price_data.xs(ticker, level="Ticker", axis=1).copy()
    # Carry the ticker symbol and the trading date as ordinary columns so the
    # date index can be discarded before stacking.
    ticker_df = ticker_df.assign(ticker=ticker, date=ticker_df.index)
    changed_df.append(ticker_df.reset_index(drop=True))

# Stack all tickers and renumber the rows 0..n-1.
combined_df = pd.concat(changed_df, ignore_index=True)
#print(combined_df.columns.tolist())
# Only the closing price is used downstream.
combined_df = combined_df.drop(columns=['Open', 'High', 'Low', 'Volume'])
# Remove the leftover name on the columns axis.
combined_df.columns.name = None

# Quick look at the result: column names, first rows, overall shape.
for label, value in (
    ("Columns: ", combined_df.columns.tolist()),
    ("Data head: ", combined_df.head()),
    ("Data Shape: ", combined_df.shape),
):
    print(label)
    print(value)


In [None]:
#Compare the observed trading dates against a plain business-day calendar.
combined_df['date'] = pd.to_datetime(combined_df['date'])
first_day = combined_df['date'].min()
last_day = combined_df['date'].max()
# freq='B' generates every Monday-Friday date; it does not know about market holidays.
full_dates = pd.date_range(start=first_day, end=last_day, freq='B')

# One column per ticker, one row per date, then expand the rows to the full calendar.
pivot_df = combined_df.pivot(index='date', columns='ticker', values='Close')
complete_df = pivot_df.reindex(full_dates)

# Number of calendar business days with no close, per ticker.
missing_counts = complete_df.isna().sum()
print("Missing days by ticker:")
print(missing_counts)

#Original author's observation: every ticker was missing the same number of days (19),
#because the "B" calendar includes holidays such as July 4th and Memorial Day.
#MSFT is listed below as a representative ticker.
msft_has_no_close = complete_df['MSFT'].isna()
missing_msft = complete_df.loc[msft_has_no_close]
print("Missing dates for MSFT:")
print(missing_msft.index.tolist())

#NOTE(review): the earlier sanity arithmetic (19 * 15 = 285 and 7815 - 285 = 7530)
#appears to assume 15 tickers; the ticker list now holds 38, so re-verify before relying on it.
#print(complete_df.size)
#complete_df = complete_df.dropna(how='all')
#print(complete_df.size)


In [None]:
# Save yahoo finance data as csv file
# The long-format frame built above is named `combined_df`; the previous name
# `df_combined` is not defined anywhere before this cell and raised NameError
# on a fresh kernel. Note that a later cell writes a trimmed dataset to this
# same file name and therefore overwrites this snapshot.
combined_df.to_csv('Final_yfinance.csv')

In [None]:
#DATA VALIDATION/CLEANING CONTINUED
# Rows that contain at least one null in any column.
row_has_null = combined_df.isnull().any(axis=1)
missing_rows = combined_df[row_has_null]
print(missing_rows) #original author's note: returned an empty dataframe

# Null count for each column.
null_counts = combined_df.isnull().sum()
print("Null Values per Column: ")
print(null_counts)

# Keep only the three columns the rest of the notebook works with, in this order.
kept_columns = ['date', 'ticker', 'Close']
combined_df = combined_df[kept_columns]
print(combined_df.head())
print(combined_df.shape)

In [None]:
# Stock Split Check (Detect Large Jumps in Close Price)
# A day-over-day move larger than 50% within one ticker is flagged as a
# possible unadjusted split.
price_change = combined_df.groupby('ticker')['Close'].pct_change()
split_suspects = combined_df[np.abs(price_change) > 0.5]  # More than 50% change
if not split_suspects.empty:
    print("Possible stock splits detected:")
    print(split_suspects[['date', 'ticker', 'Close']].head(10))
# Adjust for splits
# NOTE(review): no adjustment is applied in this cell; suspects are only printed.

# Check for Stocks with Excessive Missing Days
expected_days = combined_df['date'].nunique()
# Count only the dates on which a ticker actually has a closing price. Every
# ticker carries a row for every date in the long frame, so counting dates
# alone always equals `expected_days` and hides tickers whose prices are NaN.
# Masking the date to NaT where Close is null lets nunique() skip those rows,
# and a ticker with no prices at all is reported with 0 days.
priced_dates = combined_df['date'].where(combined_df['Close'].notna())
ticker_days = priced_dates.groupby(combined_df['ticker']).nunique()
missing_days = expected_days - ticker_days

print("Stocks with missing trading days:")
print(missing_days[missing_days > 0])

# Drop stocks with excessive missing data (>20% missing)
threshold = 0.2 * expected_days
dropped_stocks = missing_days[missing_days > threshold].index.tolist()
combined_df = combined_df[~combined_df['ticker'].isin(dropped_stocks)]
print(f"Dropped stocks due to missing data: {dropped_stocks}")

In [None]:
#CREATE REALIZED VOLATILITY, LAG RETURNS, OTHER COMPUTED FEATURES
def _shift_within_ticker(frame, column, periods):
    """Return `column` shifted by `periods` rows, separately inside each ticker."""
    return frame.groupby('ticker')[column].shift(periods)


def _trailing_3d_std(ticker_returns):
    """Std of the three returns ending on the previous row (3-row rolling std, shifted by one)."""
    return ticker_returns.rolling(window=3).std().shift(1)


# Order rows by date (then ticker); each ticker's rows are then chronological.
combined_df = combined_df.sort_values(by=['date', 'ticker'])

# Simple daily return per ticker and its first two lags.
combined_df['Return'] = combined_df.groupby('ticker')['Close'].pct_change()
for lag_name, lag in (('Return_lag1', 1), ('Return_lag2', 2)):
    combined_df[lag_name] = _shift_within_ticker(combined_df, 'Return', lag)

# Trailing three-day realized volatility and its first two lags.
combined_df['RealizedVol_3d'] = combined_df.groupby('ticker')['Return'].transform(_trailing_3d_std)
for lag_name, lag in (('Volatility_lag1', 1), ('Volatility_lag2', 2)):
    combined_df[lag_name] = _shift_within_ticker(combined_df, 'RealizedVol_3d', lag)

# Target is the next row's RealizedVol_3d for the same ticker.
combined_df['Target'] = _shift_within_ticker(combined_df, 'RealizedVol_3d', -1)

# Drop rows where any required column is undefined (series edges).
# Return_lag2 and Volatility_lag2 are not part of this filter.
required_columns = ['Return', 'RealizedVol_3d', 'Return_lag1', 'Volatility_lag1', 'Target']
combined_df = combined_df.dropna(subset=required_columns)

print(combined_df.shape)
print(combined_df[['date', 'ticker', 'RealizedVol_3d', 'Target']].head(10))

In [None]:
#FINANCE FOR 3 YEAR DATA
from google.colab import files
# Drop the price, return and lag columns; every other column is exported as-is.
columns_to_exclude = [
    'Return_lag1', 'Return', 'Close',
    'Return_lag2', 'Volatility_lag1', 'Volatility_lag2',
]
df_combined1 = combined_df.drop(columns=columns_to_exclude)

# Write the CSV (the DataFrame index is written as the first column) and
# download it through the Colab `files` helper.
df_combined1.to_csv('Final_yfinance.csv')
files.download('Final_yfinance.csv')

In [None]:
# Outlier Detection & Handling (Winsorizing Extreme Returns)
def _zscore(ticker_returns):
    """Standardize one ticker's returns by that ticker's own mean and standard deviation."""
    return (ticker_returns - ticker_returns.mean()) / ticker_returns.std()


# Report rows whose return is more than 5 standard deviations from the ticker mean.
z_scores = combined_df.groupby('ticker')['Return'].transform(_zscore)
is_extreme = np.abs(z_scores) > 5
outliers = combined_df[is_extreme]
print(f"Extreme return outliers found: {outliers.shape[0]} rows")

# Cap returns at ±10%. The cap is a fixed bound applied to every row; it is
# independent of the z-score report above. Columns derived from 'Return'
# earlier in the notebook are not recomputed here.
combined_df['Return'] = combined_df['Return'].clip(lower=-0.1, upper=0.1)

In [None]:
import pandas as pd
from arch import arch_model

#study period; both bounds are inclusive in the filter below
start_date = '2022-01-01'
end_date = '2025-04-11'

#minimum history: a date is only forecast once MORE than `window` rows exist for the ticker
#NOTE(review): despite the name, `window` does not bound the fitting sample. Each fit uses
#all rows up to the target date (an expanding window), and 4 rows is very little data for
#a GARCH(3,3) fit. Confirm this is intended.
window = 3

#make sure the date column is datetime-typed before comparing against the bounds
combined_df['date'] = pd.to_datetime(combined_df['date'])

#keep only rows inside the study period
on_or_after_start = combined_df['date'] >= start_date
on_or_before_end = combined_df['date'] <= end_date
df_filtered = combined_df[on_or_after_start & on_or_before_end]

#one dict per (ticker, date) forecast is collected here
garch_volatility = []

#tickers are processed in order of first appearance in df_filtered
for ticker in df_filtered['ticker'].unique():
    ticker_data = df_filtered.loc[df_filtered['ticker'] == ticker].copy()

    for target_date in ticker_data['date'].unique():
        #all of this ticker's rows dated on or before the target date
        history_mask = ticker_data['date'] <= target_date
        data_up_to_target = ticker_data[history_mask]

        if len(data_up_to_target) > window:
            #daily returns scaled to percent
            returns = data_up_to_target['Return'].dropna() * 100

            #refit GARCH(3,3) on the history available at this date
            model = arch_model(returns, vol='GARCH', p=3, q=3)
            model_fit = model.fit(disp='off')

            #one-step-ahead variance forecast made from the last observation
            forecast = model_fit.forecast(start=len(returns) - 1, horizon=1)
            next_variance = forecast.variance.iloc[-1, 0]
            volatility = next_variance ** 0.5  #variance -> standard deviation

            #the forecast is stored under the date of the last observation used
            forecast_row = {
                'ticker': ticker,
                'date': target_date,
                'Rolling_GARCH_volatility %': volatility
            }
            garch_volatility.append(forecast_row)
        else:
            print(f"Not enough data for {ticker} on {target_date}. Skipping.")

#collect all forecasts into a DataFrame
garch_volatility_df = pd.DataFrame(garch_volatility)

In [None]:
# Save the results to a CSV file
from google.colab import files

# Write the forecast table without the index column, then download it
# through the Colab `files` helper.
garch_volatility_df.to_csv(
    'garch_volatility_predictions.csv',
    index=False,
)
files.download('garch_volatility_predictions.csv')

# Confirmation message for the notebook output.
print("GARCH volatility predictions saved to 'garch_volatility_predictions.csv'.")