# UFCFAS-15-2 - Machine learning Group Project

## Machine Learning Stock Trading Tool

#### Sam Waxman
#### Temi Adeolu-Salako
#### Matt Nogodula

## Intro & Objectives

## Preparing Environment

In [None]:
# Standard Library
import os
import sys
import time
import threading
import random
import traceback
from datetime import datetime, timedelta

# Data Manipulation
import pandas as pd
import numpy as np

# HTTP / Web Scraping
import requests
from bs4 import BeautifulSoup

# Sentiment Analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Finance Data
import yfinance as yf

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Persistence
import joblib

# ML Preprocessing & Metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report

# ML Models
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier

# Imbalanced Data Handling
from imblearn.over_sampling import SMOTE

## Data Acquisition & Stock Parameters
Application Files: data_prep.py & finvis_sentiment.py

In [11]:
# ▶︎ 4.1 – Parameters: tickers & dates
tickers   = ["AAPL"]   # or pull full S&P 500 via get_sp500_list() below
start_date = "2024-01-01"
end_date   = "2025-01-01"

# ▶︎ 4.2 – Helper: get S&P 500 tickers (optional)
def get_sp500_list(cache_file="sp500_tickers.txt"):
    if os.path.exists(cache_file) and (
       pd.to_datetime("today") - pd.to_datetime(os.path.getmtime(cache_file), unit="s")
    ).days < 30:
        return pd.read_csv(cache_file, header=None)[0].tolist()
    df = pd.read_html("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies")[0]
    symbols = df.Symbol.str.replace(".", "-").tolist()
    pd.Series(symbols).to_csv(cache_file, index=False, header=False)
    return symbols

# ▶︎ 4.3 – Price download with caching
PRICE_CACHE = "price_cache"
os.makedirs(PRICE_CACHE, exist_ok=True)
def fetch_price_data(ticker, start, end, use_cache=True):
    """Download OHLCV via yfinance; cache to CSV for speed."""
    fn = os.path.join(PRICE_CACHE, f"{ticker}_{start}_{end}.csv")
    if use_cache and os.path.exists(fn):
        try:
            # Try reading with "Date" as index column
            df = pd.read_csv(fn, index_col="Date", parse_dates=True)
        except ValueError:
            try:
                # Try reading the first column as index
                df = pd.read_csv(fn, index_col=0, parse_dates=True)
            except Exception as e:
                # If all else fails, download fresh data
                print(f"⚠️ Error reading cache for {ticker}, downloading fresh data: {str(e)}")
                use_cache = False
        else:
            print(f"✔️ Loaded cached prices for {ticker}")
            return df
    
    # Download data if we didn't return from cache
    try:
        print(f"⬇️ Downloading prices for {ticker}")
        df = yf.download(ticker, start=start, end=end, progress=False)
        
        if df.empty:
            print(f"⚠️ Warning: No data returned for {ticker}")
            # Return empty DataFrame with proper columns to avoid errors
            return pd.DataFrame(columns=['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], 
                                index=pd.DatetimeIndex([], name='Date'))
        
        # Ensure index is properly named before saving
        df.index.name = 'Date'
        if use_cache:
            df.to_csv(fn)
        return df
    except Exception as e:
        print(f"⚠️ Error downloading data for {ticker}: {str(e)}")
        # Return empty DataFrame with proper columns
        return pd.DataFrame(columns=['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'],) 

# ▶︎ 4.4 – Weekly news sentiment from Finviz

SENT_CACHE = "sentiment_cache"
os.makedirs(SENT_CACHE, exist_ok=True)
SENT_CSV = os.path.join(SENT_CACHE, "sentiment_data.csv")
analyzer = SentimentIntensityAnalyzer()

def get_weekly_sentiment(ticker, for_date=None):
    """
    Scrape Finviz headlines once per Monday and cache a 7-day average compound score.
    """
    if for_date is None:
        for_date = datetime.today()
    
    # Convert to datetime if needed
    if not isinstance(for_date, datetime):
        for_date = pd.to_datetime(for_date)
    
    # align to Monday
    monday = for_date - timedelta(days=for_date.weekday())
    monday_date = monday.date()
    
    # Load existing sentiment cache or create new dataframe
    if os.path.exists(SENT_CSV):
        sentiment_df = pd.read_csv(SENT_CSV)
    else:
        sentiment_df = pd.DataFrame(columns=["ticker", "date", "score"])
    
    # Check if we already have this ticker/date combination
    entry = sentiment_df[(sentiment_df.ticker == ticker) & 
                         (sentiment_df.date == str(monday_date))]
    
    if not entry.empty:
        print(f"✓ Using cached sentiment for {ticker} on {monday_date}")
        return entry.score.values[0]
    
    # If not in cache, fetch new data
    print(f"⬇️ Fetching sentiment for {ticker} on {monday_date}")
    url = f"https://finviz.com/quote.ashx?t={ticker}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml",
        "Accept-Language": "en-US,en;q=0.9"
    }
    
    try:
        resp = requests.get(url, headers=headers, timeout=10)
        if resp.status_code != 200:
            print(f"⚠️ HTTP error {resp.status_code} for {ticker}")
            return 0.0
            
        soup = BeautifulSoup(resp.text, "html.parser")
        table = soup.find("table", class_="fullview-news-outer")
        
        if not table:
            print(f"⚠️ News table not found for {ticker}")
            return 0.0
            
        headlines = []
        scores = []
        rows = table.find_all("tr")
        cutoff = monday - timedelta(days=7)
        
        for r in rows:
            try:
                ts = r.td.text.strip()
                title = r.a.text.strip()
                
                # Parse date from finviz format
                try:
                    if " " in ts:
                        date_str = ts.split(" ")[0]
                        dt = datetime.strptime(date_str, "%b-%d-%y")
                    else:
                        dt = monday  # Today's headlines
                except Exception as e:
                    print(f"Date parsing error: {e} - using Monday")
                    dt = monday
                    
                if dt >= cutoff:
                    sentiment = analyzer.polarity_scores(title)["compound"]
                    scores.append(sentiment)
                    headlines.append(f"{dt.date()}: {title[:50]}... ({sentiment:.2f})")
            except Exception as e:
                print(f"Error processing headline: {e}")
                continue
        
        # Show some sample headlines
        if headlines:
            print(f"Found {len(headlines)} headlines. First 3:")
            for h in headlines[:3]:
                print(f"  - {h}")
        else:
            print("⚠️ No headlines found within the 7-day window")
        
        avg_score = float(np.mean(scores)) if scores else 0.0
        print(f"Average sentiment score: {avg_score:.4f}")
        
        # Add new entry to dataframe
        new_entry = pd.DataFrame([{"ticker": ticker, "date": str(monday_date), "score": avg_score}])
        sentiment_df = pd.concat([sentiment_df, new_entry], ignore_index=True)
        
        # Save updated dataframe
        sentiment_df.to_csv(SENT_CSV, index=False)
        
        return avg_score
    
    except requests.RequestException as e:
        print(f"⚠️ Network error for {ticker}: {e}")
        return 0.0
    except Exception as e:
        print(f"⚠️ Unexpected error for {ticker}: {e}")
        import traceback
        traceback.print_exc()
        return 0.0

# ▶︎ 4.5 – Fetch & merge per‐ticker DataFrames
all_data = {}
for t in tickers:
    # Get price data
    prices = fetch_price_data(t, start_date, end_date)
    
    # Skip if no data is available
    if prices.empty:
        print(f"Skipping {t} - no price data available")
        continue
    
    # Add sentiment directly to the prices DataFrame instead of joining
    df = prices.copy()
    sentiment_series = df.index.to_series().apply(lambda d: get_weekly_sentiment(t, d))
    
    # Handle MultiIndex columns if present
    if isinstance(df.columns, pd.MultiIndex):
        # Add sentiment as a new column with proper MultiIndex level
        df[('Sentiment', '')] = sentiment_series
    else:
        # Simple column addition for single-level columns
        df['Sentiment'] = sentiment_series
    
    # Forward fill missing sentiment values
    if isinstance(df.columns, pd.MultiIndex):
        df[('Sentiment', '')].ffill(inplace=True)
    else:
        df['Sentiment'].ffill(inplace=True)
    
    all_data[t] = df

# ▶︎ 4.6 – Preview one DataFrame
sample = all_data[tickers[0]]
print(sample.head(), "\n")
print("Missing values per column:\n", sample.isna().sum())


  df = pd.read_csv(fn, index_col=0, parse_dates=True)


⬇️ Downloading prices for AAPL
⬇️ Fetching sentiment for AAPL on 2024-01-01
Date parsing error: time data 'Today' does not match format '%b-%d-%y' - using Monday
Found 100 headlines. First 3:
  - 2024-01-01: Warren Buffett Steps Down as CEO and Other Key Tak... (0.00)
  - 2024-01-01: Buffett's $348B Signal: Why He's Trimming the Fat ... (0.00)
  - 2024-01-01: Apple CEO sends blunt message on tariffs impact... (0.00)
Average sentiment score: 0.0615
✓ Using cached sentiment for AAPL on 2024-01-01
✓ Using cached sentiment for AAPL on 2024-01-01
✓ Using cached sentiment for AAPL on 2024-01-01
⬇️ Fetching sentiment for AAPL on 2024-01-08


  sentiment_df = pd.concat([sentiment_df, new_entry], ignore_index=True)


Date parsing error: time data 'Today' does not match format '%b-%d-%y' - using Monday
Found 100 headlines. First 3:
  - 2024-01-08: Warren Buffett Steps Down as CEO and Other Key Tak... (0.00)
  - 2024-01-08: Buffett's $348B Signal: Why He's Trimming the Fat ... (0.00)
  - 2024-01-08: Apple CEO sends blunt message on tariffs impact... (0.00)
Average sentiment score: 0.0615
✓ Using cached sentiment for AAPL on 2024-01-08
✓ Using cached sentiment for AAPL on 2024-01-08
✓ Using cached sentiment for AAPL on 2024-01-08
✓ Using cached sentiment for AAPL on 2024-01-08
⬇️ Fetching sentiment for AAPL on 2024-01-15
Date parsing error: time data 'Today' does not match format '%b-%d-%y' - using Monday
Found 100 headlines. First 3:
  - 2024-01-15: Warren Buffett Steps Down as CEO and Other Key Tak... (0.00)
  - 2024-01-15: Buffett's $348B Signal: Why He's Trimming the Fat ... (0.00)
  - 2024-01-15: Apple CEO sends blunt message on tariffs impact... (0.00)
Average sentiment score: 0.0615
✓ Using cac

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[('Sentiment', '')].ffill(inplace=True)


## Feature Engineering & Indicators
Application Files: indicators.py

In [15]:
# 5. FEATURE ENGINEERING – Technical + Sentiment Indicators

def calculate_rsi(series, window=14):
    delta = series.diff()
    gain  = delta.clip(lower=0)
    loss  = -delta.clip(upper=0)
    avg_gain = gain.rolling(window).mean()
    avg_loss = loss.rolling(window).mean().replace(0, 1e-10)
    rs = avg_gain / avg_loss
    return 100 - (100 / (1 + rs))

def calculate_macd(series, slow=26, fast=12, signal=9):
    ema_fast = series.ewm(span=fast, adjust=False).mean()
    ema_slow = series.ewm(span=slow, adjust=False).mean()
    macd     = ema_fast - ema_slow
    sig_line = macd.ewm(span=signal, adjust=False).mean()
    return macd, sig_line

def calculate_sma(series, window=20):
    return series.rolling(window).mean()

def calculate_ema(series, span=20):
    return series.ewm(span=span, adjust=False).mean()

def calculate_bollinger(series, window=20, num_std=2):
    sma = series.rolling(window).mean()
    std = series.rolling(window).std()
    upper = sma + num_std * std
    lower = sma - num_std * std
    return sma, upper, lower

def calculate_stochastic(high, low, close, k=14, d=3):
    low_min  = low.rolling(window=k).min()
    high_max = high.rolling(window=k).max()
    percent_k = 100 * (close - low_min) / (high_max - low_min).replace(0,1e-10)
    percent_d = percent_k.rolling(window=d).mean()
    return percent_k, percent_d

def calculate_atr(high, low, close, window=14):
    tr1 = high - low
    tr2 = (high - close.shift()).abs()
    tr3 = (low  - close.shift()).abs()
    tr  = pd.concat([tr1, tr2, tr3], axis=1).max(axis=1)
    return tr.rolling(window).mean()

def calculate_obv(close, volume):
    """Calculate On-Balance Volume using vectorized operations instead of apply."""
    # Get price changes
    diff = close.diff()
    
    # Create direction array (-1, 0, or 1) using numpy's where
    direction = np.zeros_like(diff)
    direction = np.where(diff > 0, 1, direction)
    direction = np.where(diff < 0, -1, direction)
    
    # Calculate cumulative sum of volume * direction
    return (volume * direction).cumsum()

def get_column(df, col_name):
    """Safely access columns regardless of column index structure."""
    if isinstance(df.columns, pd.MultiIndex):
        # Try both options - first level only or first level + empty string
        if col_name in df.columns.get_level_values(0):
            return df[col_name]
        else:
            return df[(col_name, '')]
    else:
        return df[col_name]

def generate_features(df):
    feats = pd.DataFrame(index=df.index)
    
    # Get required columns safely
    close = get_column(df, "Close") 
    high = get_column(df, "High")
    low = get_column(df, "Low")
    volume = get_column(df, "Volume")
    
    # Convert to Series if they are DataFrames
    if isinstance(close, pd.DataFrame):
        close = close.iloc[:, 0]
    if isinstance(high, pd.DataFrame):
        high = high.iloc[:, 0]
    if isinstance(low, pd.DataFrame):
        low = low.iloc[:, 0]
    if isinstance(volume, pd.DataFrame):
        volume = volume.iloc[:, 0]
    
    # Technical indicators
    feats["RSI"] = calculate_rsi(close)
    macd, macd_sig = calculate_macd(close)
    feats["MACD"], feats["MACD_Signal"] = macd, macd_sig
    feats["SMA_20"] = calculate_sma(close)
    feats["EMA_20"] = calculate_ema(close)
    bb_mid, bb_up, bb_low = calculate_bollinger(close)
    feats["BB_Mid"], feats["BB_Upper"], feats["BB_Lower"] = bb_mid, bb_up, bb_low
    st_k, st_d = calculate_stochastic(high, low, close)
    feats["Stoch_%K"], feats["Stoch_%D"] = st_k, st_d
    feats["ATR"] = calculate_atr(high, low, close)
    feats["OBV"] = calculate_obv(close, volume)
    
    # Engineered differences & ratios
    feats["RSI_diff"] = feats["RSI"] - feats["RSI"].shift(3)
    feats["MACD_diff"] = feats["MACD"] - feats["MACD_Signal"]
    
    # Ensure SMA_20 is a Series for division
    sma_20 = feats["SMA_20"]
    if isinstance(sma_20, pd.DataFrame):
        sma_20 = sma_20.iloc[:, 0]
    
    feats["Price_vs_SMA"] = close / sma_20
    
    # Get sentiment column safely
    sentiment_col = get_column(df, "Sentiment")
    if isinstance(sentiment_col, pd.DataFrame):
        sentiment_col = sentiment_col.iloc[:, 0]
    
    feats["Sentiment"] = sentiment_col
    
    return feats.dropna()

# Apply to one ticker's data
sample_df = all_data[tickers[0]].copy()
features_df = generate_features(sample_df)
features_df.tail()


Unnamed: 0_level_0,RSI,MACD,MACD_Signal,SMA_20,EMA_20,BB_Mid,BB_Upper,BB_Lower,Stoch_%K,Stoch_%D,ATR,OBV,RSI_diff,MACD_diff,Price_vs_SMA,Sentiment
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2024-12-24,76.180657,6.067327,5.390177,246.023002,246.397574,246.023002,259.105495,232.940509,99.939373,98.044297,4.113335,1312412000.0,2.474417,0.67715,1.048343,0.06152
2024-12-26,76.812079,6.300019,5.572145,247.219685,247.572615,247.219685,260.358798,234.080573,94.114346,97.138301,4.117617,1339649000.0,0.871943,0.727874,1.046581,0.06152
2024-12-27,69.300685,6.137217,5.685159,248.251551,248.309441,248.251551,260.449258,236.053844,75.422374,89.825365,4.360919,1297294000.0,-4.564697,0.452057,1.02843,0.06152
2024-12-30,58.382055,5.669595,5.682047,248.994234,248.653592,248.994234,260.083639,237.904829,46.477039,72.004587,4.314541,1261736000.0,-17.798602,-0.012451,1.011762,0.06152
2024-12-31,53.982582,5.096776,5.564992,249.53514,248.795628,249.53514,259.651778,239.418502,32.87106,51.590158,4.384464,1222255000.0,-22.829497,-0.468217,1.002444,0.06152


  df = pd.read_csv(fn, index_col=0, parse_dates=True)


⬇️ Downloading prices for AAPL
⬇️ Fetching sentiment for AAPL on 2024-01-01
Date parsing error: time data 'Today' does not match format '%b-%d-%y' - using Monday
Found 100 headlines. First 3:
  - 2024-01-01: Warren Buffett Steps Down as CEO and Other Key Tak... (0.00)
  - 2024-01-01: Buffett's $348B Signal: Why He's Trimming the Fat ... (0.00)
  - 2024-01-01: Apple CEO sends blunt message on tariffs impact... (0.00)
Average sentiment score: 0.0615
✓ Using cached sentiment for AAPL on 2024-01-01
✓ Using cached sentiment for AAPL on 2024-01-01
✓ Using cached sentiment for AAPL on 2024-01-01
⬇️ Fetching sentiment for AAPL on 2024-01-08


  sentiment_df = pd.concat([sentiment_df, new_entry], ignore_index=True)


Date parsing error: time data 'Today' does not match format '%b-%d-%y' - using Monday
Found 100 headlines. First 3:
  - 2024-01-08: Warren Buffett Steps Down as CEO and Other Key Tak... (0.00)
  - 2024-01-08: Buffett's $348B Signal: Why He's Trimming the Fat ... (0.00)
  - 2024-01-08: Apple CEO sends blunt message on tariffs impact... (0.00)
Average sentiment score: 0.0615
✓ Using cached sentiment for AAPL on 2024-01-08
✓ Using cached sentiment for AAPL on 2024-01-08
✓ Using cached sentiment for AAPL on 2024-01-08
✓ Using cached sentiment for AAPL on 2024-01-08
⬇️ Fetching sentiment for AAPL on 2024-01-15
Date parsing error: time data 'Today' does not match format '%b-%d-%y' - using Monday
Found 100 headlines. First 3:
  - 2024-01-15: Warren Buffett Steps Down as CEO and Other Key Tak... (0.00)
  - 2024-01-15: Buffett's $348B Signal: Why He's Trimming the Fat ... (0.00)
  - 2024-01-15: Apple CEO sends blunt message on tariffs impact... (0.00)
Average sentiment score: 0.0615
✓ Using cac

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[('Sentiment', '')].ffill(inplace=True)


## Prediction & Buy/Sell Signals
Application Files: train_models.py

## Market-Regime & Risk Filters
Application Files: filters.py & mass_backtesting.py

## Model Performance


## Backtesting

## Perfomance Visualisations

## Benchmark Comparisons

## Conclusion