This is only to test branches. No worries.

In [1]:
#Dowloading the data from kaggle. I used this website:
#https://www.kaggle.com/datasets/borismarjanovic/price-volume-data-for-all-us-stocks-etfs?select=Stocks
import kagglehub

# Download latest version
path = kagglehub.dataset_download("borismarjanovic/price-volume-data-for-all-us-stocks-etfs")

print("Path to dataset files:", path)



  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: C:\Users\caleb\.cache\kagglehub\datasets\borismarjanovic\price-volume-data-for-all-us-stocks-etfs\versions\3


In [2]:
#Combining all the data from the stock folder into one dataframe
import os
import pandas as pd
import glob

# Get path to 'stocks' folder
stocks_path = os.path.join(path, "stocks")
txt_files = glob.glob(os.path.join(stocks_path, "*.txt"))

dfs = []
for file in txt_files:
    ticker = os.path.basename(file).replace(".txt", "")
    
    try:
        with open(file, 'r') as f:
            header = f.readline().strip()
            if not header or ',' not in header:
                continue  # Skip files without a proper header

        df = pd.read_csv(file)
        if df.empty or len(df.columns) < 6:
            continue  # Skip empty or malformed data

        df['Ticker'] = ticker
        dfs.append(df)
        
    except Exception as e:
        print(f"Error loading {ticker}: {e}")

# Combine all successfully loaded files
stock_data = pd.concat(dfs, ignore_index=True)
stock_data['Date'] = pd.to_datetime(stock_data['Date'])

print(f"Loaded {len(dfs)} tickers into dataframe of shape: {stock_data.shape}")
print(stock_data.head())


Loaded 7163 tickers into dataframe of shape: (14887665, 8)
        Date    Open    High     Low   Close    Volume  OpenInt Ticker
0 1999-11-18  30.713  33.754  27.002  29.702  66277506        0   a.us
1 1999-11-19  28.986  29.027  26.872  27.257  16142920        0   a.us
2 1999-11-22  27.886  29.702  27.044  29.702   6970266        0   a.us
3 1999-11-23  28.688  29.446  27.002  27.002   6332082        0   a.us
4 1999-11-24  27.083  28.309  27.002  27.717   5132147        0   a.us


In [3]:
#Filtering stocks by 1. Too little data (<2 yrs). or 2. Low trading volume
# Count the number of rows per ticker
min_days = 500
ticker_counts = stock_data['Ticker'].value_counts()

# Filter to include only tickers with at least min_days of data
valid_tickers = ticker_counts[ticker_counts >= min_days].index
filtered_data = stock_data[stock_data['Ticker'].isin(valid_tickers)]

# Compute average volume per ticker
avg_volume = filtered_data.groupby('Ticker')['Volume'].mean()

# Keep only tickers above a threshold
min_volume = 50000
liquid_tickers = avg_volume[avg_volume >= min_volume].index
filtered_data = filtered_data[filtered_data['Ticker'].isin(liquid_tickers)]

print(f"Final dataset shape: {filtered_data.shape}")
print(f"Unique tickers remaining: {filtered_data['Ticker'].nunique()}")
print(filtered_data.head())


Final dataset shape: (11587794, 8)
Unique tickers remaining: 4097
        Date    Open    High     Low   Close    Volume  OpenInt Ticker
0 1999-11-18  30.713  33.754  27.002  29.702  66277506        0   a.us
1 1999-11-19  28.986  29.027  26.872  27.257  16142920        0   a.us
2 1999-11-22  27.886  29.702  27.044  29.702   6970266        0   a.us
3 1999-11-23  28.688  29.446  27.002  27.002   6332082        0   a.us
4 1999-11-24  27.083  28.309  27.002  27.717   5132147        0   a.us


In [4]:
import numpy as np
#Feature engineering
# Assume filtered_data is already sorted by Ticker and Date
filtered_data = filtered_data.sort_values(['Ticker', 'Date']).copy()

# Group by each stock ticker
grouped = filtered_data.groupby('Ticker', group_keys=False)

# Feature 1: Log Return
filtered_data['LogReturn'] = grouped['Close'].apply(lambda x: np.log(x / x.shift(1)))

# Feature 2: 10-Day Moving Average of Close
filtered_data['MA10'] = grouped['Close'].apply(lambda x: x.rolling(window=10).mean())

# Feature 3: 10-Day Volatility (Std Dev of Log Returns)
filtered_data['Volatility10'] = grouped['LogReturn'].apply(lambda x: x.rolling(window=10).std())
print(filtered_data.head)

<bound method NDFrame.head of                Date    Open     High      Low   Close    Volume  OpenInt  \
0        1999-11-18  30.713  33.7540  27.0020  29.702  66277506        0   
1        1999-11-19  28.986  29.0270  26.8720  27.257  16142920        0   
2        1999-11-22  27.886  29.7020  27.0440  29.702   6970266        0   
3        1999-11-23  28.688  29.4460  27.0020  27.002   6332082        0   
4        1999-11-24  27.083  28.3090  27.0020  27.717   5132147        0   
...             ...     ...      ...      ...     ...       ...      ...   
14887660 2017-11-06  10.420  11.5400  10.4200  11.190    977948        0   
14887661 2017-11-07  11.300  11.4200  10.6700  10.830    451210        0   
14887662 2017-11-08  10.700  11.0600  10.3500  10.900    336449        0   
14887663 2017-11-09  11.000  11.8563  10.9700  11.600    463067        0   
14887664 2017-11-10  11.680  13.1500  11.3043  12.460    885587        0   

           Ticker  LogReturn    MA10  Volatility10  
0   

In [5]:
#At this point you will see that some of the engineered features are NaN.
#Why? For a log return, it is because there is no prior close, so it can't calculate the percentage change from one day to the next
#For MA10, it's because there must be at least 10 samples to calculate the smoothed average over 10 days. Hence the first 9 entries are NaN
#For Volatility10, it needs 11 total entries to perfrom the calculation of degree of variation in returns. 

#Fix by deleting entries with NaN. Apparently it's 'standard practice' 
filtered_data = filtered_data.dropna(subset=['LogReturn', 'MA10', 'Volatility10'])
print("Yay! No more NaNs! \n")
print(filtered_data.head)


Yay! No more NaNs! 

<bound method NDFrame.head of                Date    Open     High      Low   Close   Volume  OpenInt  \
10       1999-12-03  30.336  30.8420  29.9090  30.039  3223074        0   
11       1999-12-06  30.547  31.3480  30.5050  30.883  2385046        0   
12       1999-12-07  30.883  31.0520  29.9090  30.547  2348161        0   
13       1999-12-08  30.547  30.7950  30.2490  30.505  2000481        0   
14       1999-12-09  30.547  31.0120  30.5470  30.924  2150096        0   
...             ...     ...      ...      ...     ...      ...      ...   
14887660 2017-11-06  10.420  11.5400  10.4200  11.190   977948        0   
14887661 2017-11-07  11.300  11.4200  10.6700  10.830   451210        0   
14887662 2017-11-08  10.700  11.0600  10.3500  10.900   336449        0   
14887663 2017-11-09  11.000  11.8563  10.9700  11.600   463067        0   
14887664 2017-11-10  11.680  13.1500  11.3043  12.460   885587        0   

           Ticker  LogReturn     MA10  Volatilit

In [6]:
#RSI feature for calculating if stock is overbought or oversold
def compute_rsi(series, period=14):
    delta = series.diff()
    gain = delta.clip(lower=0)
    loss = -delta.clip(upper=0)
    avg_gain = gain.rolling(window=period).mean()
    avg_loss = loss.rolling(window=period).mean()
    rs = avg_gain / avg_loss
    return 100 - (100 / (1 + rs))

filtered_data['RSI14'] = (
    filtered_data.groupby('Ticker')['Close'].transform(lambda x: compute_rsi(x, 14))
)
#Will come back to this and see if it affects the outcome
#MACD feature to detect if the market is bear/bull market
# --- MACD Feature Function ---
def compute_macd(series, fast=12, slow=26, signal=9):
    ema_fast = series.ewm(span=fast, adjust=False).mean()
    ema_slow = series.ewm(span=slow, adjust=False).mean()
    macd = ema_fast - ema_slow
    signal_line = macd.ewm(span=signal, adjust=False).mean()
    hist = macd - signal_line
    return pd.DataFrame({
        'MACD': macd,
        'MACD_Signal': signal_line,
        'MACD_Hist': hist
    }, index=series.index)

# --- Apply MACD Per Ticker ---
# Ensure proper sorting
filtered_data = filtered_data.sort_values(['Ticker', 'Date']).reset_index(drop=True)

# Apply MACD and attach results to the original rows
def apply_macd(group):
    macd_result = compute_macd(group['Close']).reset_index(drop=True)
    return pd.concat([group.reset_index(drop=True), macd_result], axis=1)

filtered_data = filtered_data.groupby('Ticker', group_keys=False).apply(apply_macd)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['RSI14'] = (
  filtered_data = filtered_data.groupby('Ticker', group_keys=False).apply(apply_macd)


In [7]:
#Now that we have features, it's time to create some juicy labels to say if the stock price went up that day. 
#This can be twofold a problem because we can either make it a classifier(should we sell or not) or a reggressor (how much do we sell + or -)
#Classifier to start

filtered_data = filtered_data.sort_values(['Ticker', 'Date'])

# Grouped shift to get next-day close price
filtered_data['NextClose'] = filtered_data.groupby('Ticker')['Close'].shift(-1)

# Compute next-day log return
filtered_data['NextLogReturn'] = np.log(filtered_data['NextClose'] / filtered_data['Close'])

# 1 if next day's return is positive, else 0
filtered_data['Target'] = (filtered_data['NextLogReturn'] > 0).astype(int)

filtered_data = filtered_data.dropna(subset=['NextLogReturn'])


In [8]:
from sklearn.model_selection import train_test_split

# Sort data properly
filtered_data = filtered_data.sort_values(['Ticker', 'Date']).copy()

# Split function for a single ticker's time series
def time_split(group, train_ratio=0.8):
    n = len(group)
    split_idx = int(n * train_ratio)
    train = group.iloc[:split_idx]
    test = group.iloc[split_idx:]
    return train, test

# Apply per ticker
train_list = []
test_list = []

for _, group in filtered_data.groupby('Ticker'):
    train_group, test_group = time_split(group)
    train_list.append(train_group)
    test_list.append(test_group)

# Combine all
train_df = pd.concat(train_list).reset_index(drop=True)
test_df = pd.concat(test_list).reset_index(drop=True)

print(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")


Train shape: (9233213, 18), Test shape: (2309514, 18)


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

print(filtered_data.keys())
# Feature columns (use only numeric predictors)
#would use these if I got MACD calculation to work earlier on
features = ['LogReturn', 'MA10', 'Volatility10', 'RSI14', 'MACD', 'MACD_Signal', 'MACD_Hist']
#features = ['LogReturn', 'MA10', 'Volatility10', 'RSI14']
target = 'Target'

# Drop rows with missing values (if any remain)
train_df = train_df.dropna(subset=features + [target])
test_df = test_df.dropna(subset=features + [target])

# Create X and y
X_train = train_df[features]
y_train = train_df[target]

X_test = test_df[features]
y_test = test_df[target]


Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'OpenInt', 'Ticker',
       'LogReturn', 'MA10', 'Volatility10', 'RSI14', 'MACD', 'MACD_Signal',
       'MACD_Hist', 'NextClose', 'NextLogReturn', 'Target'],
      dtype='object')


In [10]:
from xgboost import XGBClassifier

# Basic model config — can be tuned later
model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=4,
    use_label_encoder=False,
    eval_metric='logloss'
)

# Fit to training data
model.fit(X_train, y_train)

#Serializing the model so it can be used in a web application
import pickle

# After training
with open("xgb_stock_model_classifier.pkl", "wb") as f:
    pickle.dump(model, f)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [11]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict and evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, digits=4))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0     0.5197    0.5615    0.5398   1158500
           1     0.5197    0.4777    0.4978   1151012

    accuracy                         0.5197   2309512
   macro avg     0.5197    0.5196    0.5188   2309512
weighted avg     0.5197    0.5197    0.5189   2309512

Confusion matrix:
 [[650446 508054]
 [601177 549835]]


In [12]:
#Now will try this with regression

# Define features and target
features = ['LogReturn', 'MA10', 'Volatility10', 'RSI14', 'MACD', 'MACD_Signal', 'MACD_Hist']
target = 'NextLogReturn'

# Drop rows with missing values
filtered_data = filtered_data.dropna(subset=features + [target])

# Split into train/test
train_df = filtered_data.groupby('Ticker', group_keys=False).apply(lambda x: x.iloc[:int(len(x)*0.8)])
test_df = filtered_data.groupby('Ticker', group_keys=False).apply(lambda x: x.iloc[int(len(x)*0.8):])

X_train = train_df[features]
y_train = train_df[target]

X_test = test_df[features]
y_test = test_df[target]


from xgboost import XGBRegressor

model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

model.fit(X_train, y_train)

#Serializing the model so it can be used in a web application
import pickle

# After training
with open("xgb_stock_model_regressor.pkl", "wb") as f:
    pickle.dump(model, f)

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.6f}")
print(f"MAE: {mae:.6f}")
print(f"R² Score: {r2:.4f}")


  train_df = filtered_data.groupby('Ticker', group_keys=False).apply(lambda x: x.iloc[:int(len(x)*0.8)])
  test_df = filtered_data.groupby('Ticker', group_keys=False).apply(lambda x: x.iloc[int(len(x)*0.8):])


RMSE: 0.027615
MAE: 0.015339
R² Score: -0.0034


In [39]:
#Now predicting for novel data
new_data = {
    'LogReturn': 0.0045,
    'MA10': 153.25,
    'Volatility10': 0.0132,
    'RSI14': 58.7,
    'MACD': 0.12,
    'MACD_Signal': 0.10,
    'MACD_Hist': 0.02
}

#Novel data needed to be generated and have the same features as the training data
X_new = pd.DataFrame([new_data])

#Make the prediction by plugging X_new into the trained model
predicted_return = model.predict(X_new)[0]
print(f"Predicted next-day log return: {predicted_return:.6f}")

percent_change = (np.exp(predicted_return) - 1) * 100
print(f"Expected % change: {percent_change:.2f}%")

#If you know todays price
current_price = 155.00
predicted_price = current_price * np.exp(predicted_return)
print(f"Predicted next-day price: ${predicted_price:.2f}")

#A function to do this for you
def predict_next_day_return(model, current_price, engineered_features: dict):
    import numpy as np
    X = pd.DataFrame([engineered_features])
    log_return = model.predict(X)[0]
    pct_change = (np.exp(log_return) - 1) * 100
    predicted_price = current_price * np.exp(log_return)
    print("Log return:", log_return)
    print("PercentChange", pct_change)
    print("PredictedPrice: ", predicted_price)
    return {
        "LogReturn": log_return,
        "PercentChange": pct_change,
        "PredictedPrice": predicted_price
    }

predict_next_day_return(model, 155.00, new_data)



Predicted next-day log return: 0.000388
Expected % change: 0.04%
Predicted next-day price: $155.06
Log return: 0.00038845808
PercentChange 0.038850307
PredictedPrice:  155.06021


{'LogReturn': np.float32(0.00038845808),
 'PercentChange': np.float32(0.038850307),
 'PredictedPrice': np.float32(155.06021)}