This is only to test branches. No worries.

In [None]:
#Dowloading the data from kaggle. I used this website:
#https://www.kaggle.com/datasets/borismarjanovic/price-volume-data-for-all-us-stocks-etfs?select=Stocks
import kagglehub

# Download latest version
path = kagglehub.dataset_download("borismarjanovic/price-volume-data-for-all-us-stocks-etfs")

print("Path to dataset files:", path)



Path to dataset files: C:\Users\caleb\.cache\kagglehub\datasets\borismarjanovic\price-volume-data-for-all-us-stocks-etfs\versions\3


In [None]:
#Combining all the data from the stock folder into one dataframe
import os
import pandas as pd
import glob

# Get path to 'stocks' folder
stocks_path = os.path.join(path, "stocks")
txt_files = glob.glob(os.path.join(stocks_path, "*.txt"))

dfs = []
for file in txt_files:
    ticker = os.path.basename(file).replace(".txt", "")
    
    try:
        with open(file, 'r') as f:
            header = f.readline().strip()
            if not header or ',' not in header:
                continue  # Skip files without a proper header

        df = pd.read_csv(file)
        if df.empty or len(df.columns) < 6:
            continue  # Skip empty or malformed data

        df['Ticker'] = ticker
        dfs.append(df)
        
    except Exception as e:
        print(f"Error loading {ticker}: {e}")

# Combine all successfully loaded files
stock_data = pd.concat(dfs, ignore_index=True)
stock_data['Date'] = pd.to_datetime(stock_data['Date'])

print(f"Loaded {len(dfs)} tickers into dataframe of shape: {stock_data.shape}")
print(stock_data.head())


Loaded 7163 tickers into dataframe of shape: (14887665, 8)
        Date    Open    High     Low   Close    Volume  OpenInt Ticker
0 1999-11-18  30.713  33.754  27.002  29.702  66277506        0   a.us
1 1999-11-19  28.986  29.027  26.872  27.257  16142920        0   a.us
2 1999-11-22  27.886  29.702  27.044  29.702   6970266        0   a.us
3 1999-11-23  28.688  29.446  27.002  27.002   6332082        0   a.us
4 1999-11-24  27.083  28.309  27.002  27.717   5132147        0   a.us


In [None]:
#Filtering stocks by 1. Too little data (<2 yrs). or 2. Low trading volume
# Count the number of rows per ticker
min_days = 500
ticker_counts = stock_data['Ticker'].value_counts()

# Filter to include only tickers with at least min_days of data
valid_tickers = ticker_counts[ticker_counts >= min_days].index
filtered_data = stock_data[stock_data['Ticker'].isin(valid_tickers)]

# Compute average volume per ticker
avg_volume = filtered_data.groupby('Ticker')['Volume'].mean()

# Keep only tickers above a threshold
min_volume = 50000
liquid_tickers = avg_volume[avg_volume >= min_volume].index
filtered_data = filtered_data[filtered_data['Ticker'].isin(liquid_tickers)]

print(f"Final dataset shape: {filtered_data.shape}")
print(f"Unique tickers remaining: {filtered_data['Ticker'].nunique()}")
print(filtered_data.head())


Final dataset shape: (11587794, 8)
Unique tickers remaining: 4097
        Date    Open    High     Low   Close    Volume  OpenInt Ticker
0 1999-11-18  30.713  33.754  27.002  29.702  66277506        0   a.us
1 1999-11-19  28.986  29.027  26.872  27.257  16142920        0   a.us
2 1999-11-22  27.886  29.702  27.044  29.702   6970266        0   a.us
3 1999-11-23  28.688  29.446  27.002  27.002   6332082        0   a.us
4 1999-11-24  27.083  28.309  27.002  27.717   5132147        0   a.us


In [None]:
import numpy as np
#Feature engineering
# Assume filtered_data is already sorted by Ticker and Date
filtered_data = filtered_data.sort_values(['Ticker', 'Date']).copy()

# Group by each stock ticker
grouped = filtered_data.groupby('Ticker', group_keys=False)

# Feature 1: Log Return
filtered_data['LogReturn'] = grouped['Close'].apply(lambda x: np.log(x / x.shift(1)))

# Feature 2: 10-Day Moving Average of Close
filtered_data['MA10'] = grouped['Close'].apply(lambda x: x.rolling(window=10).mean())

# Feature 3: 10-Day Volatility (Std Dev of Log Returns)
filtered_data['Volatility10'] = grouped['LogReturn'].apply(lambda x: x.rolling(window=10).std())
print(filtered_data.head)

In [33]:
#At this point you will see that some of the engineered features are NaN.
#Why? For a log return, it is because there is no prior close, so it can't calculate the percentage change from one day to the next
#For MA10, it's because there must be at least 10 samples to calculate the smoothed average over 10 days. Hence the first 9 entries are NaN
#For Volatility10, it needs 11 total entries to perfrom the calculation of degree of variation in returns. 

#Fix by deleting entries with NaN. Apparently it's 'standard practice' 
filtered_data = filtered_data.dropna(subset=['LogReturn', 'MA10', 'Volatility10'])
print("Yay! No more NaNs! \n")
print(filtered_data.head)


Yay! No more NaNs! 

<bound method NDFrame.head of                Date    Open     High      Low   Close   Volume  OpenInt  \
10       1999-12-03  30.336  30.8420  29.9090  30.039  3223074        0   
11       1999-12-06  30.547  31.3480  30.5050  30.883  2385046        0   
12       1999-12-07  30.883  31.0520  29.9090  30.547  2348161        0   
13       1999-12-08  30.547  30.7950  30.2490  30.505  2000481        0   
14       1999-12-09  30.547  31.0120  30.5470  30.924  2150096        0   
...             ...     ...      ...      ...     ...      ...      ...   
14887660 2017-11-06  10.420  11.5400  10.4200  11.190   977948        0   
14887661 2017-11-07  11.300  11.4200  10.6700  10.830   451210        0   
14887662 2017-11-08  10.700  11.0600  10.3500  10.900   336449        0   
14887663 2017-11-09  11.000  11.8563  10.9700  11.600   463067        0   
14887664 2017-11-10  11.680  13.1500  11.3043  12.460   885587        0   

           Ticker  LogReturn     MA10  Volatilit

In [37]:
#RSI feature for calculating if stock is overbought or oversold
def compute_rsi(series, period=14):
    delta = series.diff()
    gain = delta.clip(lower=0)
    loss = -delta.clip(upper=0)
    avg_gain = gain.rolling(window=period).mean()
    avg_loss = loss.rolling(window=period).mean()
    rs = avg_gain / avg_loss
    return 100 - (100 / (1 + rs))

filtered_data['RSI14'] = (
    filtered_data.groupby('Ticker')['Close'].transform(lambda x: compute_rsi(x, 14))
)
#Will come back to this and see if it affects the outcome
# #MACD feature to detect if the market is bear/bull market
# def compute_macd(series, fast=12, slow=26, signal=9):
#     ema_fast = series.ewm(span=fast, adjust=False).mean()
#     ema_slow = series.ewm(span=slow, adjust=False).mean()
#     macd = ema_fast - ema_slow
#     signal_line = macd.ewm(span=signal, adjust=False).mean()
#     hist = macd - signal_line
#     return pd.DataFrame({
#         'MACD': macd,
#         'MACD_Signal': signal_line,
#         'MACD_Hist': hist
#     }, index=series.index)

# # Apply per-ticker and assign result to the main DataFrame
# macd_df = filtered_data.groupby('Ticker')['Close'].apply(compute_macd).reset_index()

# # Merge back into the main DataFrame
# filtered_data = filtered_data.merge(macd_df, on=['Ticker', 'level_1'])  # 'level_1' is the original Date index
# filtered_data = filtered_data.rename(columns={'level_1': 'Date'})

In [39]:
#Now that we have features, it's time to create some juicy labels to say if the stock price went up that day. 
#This can be twofold a problem because we can either make it a classifier(should we sell or not) or a reggressor (how much do we sell + or -)
#Classifier to start

filtered_data = filtered_data.sort_values(['Ticker', 'Date'])

# Grouped shift to get next-day close price
filtered_data['NextClose'] = filtered_data.groupby('Ticker')['Close'].shift(-1)

# Compute next-day log return
filtered_data['NextLogReturn'] = np.log(filtered_data['NextClose'] / filtered_data['Close'])

# 1 if next day's return is positive, else 0
filtered_data['Target'] = (filtered_data['NextLogReturn'] > 0).astype(int)

filtered_data = filtered_data.dropna(subset=['NextLogReturn'])
