<a href="https://colab.research.google.com/github/Ape108/Quantum_Portfolio_Optimization/blob/main/STTM_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Environment Setup

In [None]:
import pandas as pd
import os
from google.colab import drive
from datetime import datetime, timedelta
import yfinance as yf
import random
import time
from torch.utils.data import Dataset
import json
import torch
import torch.nn as nn
from torch.nn import MSELoss
from torch.optim import Adam
from tqdm import tqdm
import numpy as np
from torch.utils.data import DataLoader, random_split
import shutil
from torch.cuda.amp import autocast, GradScaler
from concurrent.futures import ThreadPoolExecutor, as_completed


In [None]:
# Mount Google Drive at /content/drive (Colab-only; `drive` comes from google.colab).
drive.mount('/content/drive')

In [None]:
# Define base directory on Google Drive
base_dir = "/content/drive/MyDrive/StockProject/StockProjectDataPrep"
data_dir = os.path.join(base_dir, "data")  # one <ticker>.csv per downloaded symbol
log_dir = os.path.join(base_dir, "logs")  # failure logs and ticker change history
ticker_path = os.path.join(base_dir, "tickers", "all_us_tickers.csv")  # master symbol list
valid_tickers_path = os.path.join(base_dir, "valid_tickers.csv")  # symbols that pass validation
ticker_log_path = os.path.join(log_dir, "ticker_changes.txt")  # added/removed symbols per refresh

# Create the output folders up front; exist_ok makes re-running this cell harmless.
os.makedirs(data_dir, exist_ok=True)
os.makedirs(log_dir, exist_ok=True)
os.makedirs(os.path.join(base_dir, "tickers"), exist_ok=True)

# === CONFIG ===
history_window = 30  # presumably rows of history fed to the model per sample; confirm against the dataset code
prediction_window = 7  # presumably rows to predict per sample; confirm against the dataset code
min_required_days = history_window + prediction_window  # minimum clean rows a ticker needs to be kept
# Download window starts 365 calendar days before the moment this cell runs.
start_date = (datetime.now() - timedelta(days=365)).strftime('%Y-%m-%d')

# Load Tickers

In [None]:
# === 1. LOAD RAW TICKERS ===
def _read_listed_symbols(url, symbol_column):
    """Return the non-test-issue symbols from one pipe-delimited symbol file."""
    # dtype=str + keep_default_na=False keep every cell as literal text, so a
    # symbol spelled like a pandas missing-value marker (e.g. "NA") is not
    # converted to NaN and silently lost.
    table = pd.read_csv(url, sep="|", dtype=str, keep_default_na=False)
    table = table[table['Test Issue'] == 'N']
    # Blank cells now arrive as "" rather than NaN; skip them.
    return [symbol for symbol in table[symbol_column] if symbol]


def load_all_us_tickers():
    """Refresh the master ticker list and record what changed.

    Downloads the NASDAQ-listed and other-listed symbol files, drops test
    issues, merges and de-duplicates the symbols, and writes them sorted to
    ``ticker_path`` under a single ``Ticker`` column. When a previous list
    exists, the added and removed symbols are appended to ``ticker_log_path``
    with a timestamp before the list is overwritten.

    Returns:
        None. Any failure is caught and reported with a printed message
        rather than raised.
    """
    try:
        nasdaq_url = "ftp://ftp.nasdaqtrader.com/SymbolDirectory/nasdaqlisted.txt"
        other_url = "ftp://ftp.nasdaqtrader.com/SymbolDirectory/otherlisted.txt"

        nasdaq_tickers = _read_listed_symbols(nasdaq_url, 'Symbol')
        nyse_tickers = _read_listed_symbols(other_url, 'ACT Symbol')

        all_tickers = sorted(set(nasdaq_tickers + nyse_tickers))

        # Log changes
        if os.path.exists(ticker_path):
            # Read the previous list as literal text for the same reason as
            # above; a NaN here would also break sorted() and ', '.join().
            old_df = pd.read_csv(ticker_path, dtype=str, keep_default_na=False)
            old_tickers = set(old_df['Ticker'])
            new_tickers = set(all_tickers)

            added = sorted(new_tickers - old_tickers)
            removed = sorted(old_tickers - new_tickers)

            timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            with open(ticker_log_path, "a") as log:
                log.write(f"\n[{timestamp}]\n")
                log.write(f"Added ({len(added)}): {', '.join(added)}\n")
                log.write(f"Removed ({len(removed)}): {', '.join(removed)}\n")

        pd.DataFrame({"Ticker": all_tickers}).to_csv(ticker_path, index=False)
        print(f"✅ Saved {len(all_tickers)} tickers to: {ticker_path}")
    except Exception as e:
        # Deliberate best-effort: report the problem and let the notebook continue.
        print(f"❌ Failed to load tickers: {e}")

# Historical Data

In [None]:
# === 2. DOWNLOAD TICKER HISTORICAL DATA WITH RETRY ===
def download_ticker_data(ticker, start=start_date, max_retries=3):
    """Download price history for one ticker and cache it as a CSV file.

    Args:
        ticker: Symbol passed to ``yf.download``; also used as the CSV file stem.
        start: Start date passed to ``yf.download`` (defaults to ``start_date``).
        max_retries: Maximum number of download attempts.

    Returns:
        True as soon as a non-empty frame has been written to
        ``<data_dir>/<ticker>.csv``; False when every attempt failed, in which
        case the last error is appended to ``failed_downloads.txt`` in
        ``log_dir``.
    """
    csv_path = os.path.join(data_dir, f"{ticker}.csv")
    for attempt in range(max_retries):
        try:
            df = yf.download(ticker, start=start, progress=False)
            if df.empty:
                # Treat an empty result like any other failure so it is retried.
                raise ValueError("No data returned")
            df.reset_index().to_csv(csv_path, index=False)
            return True
        except Exception as e:
            if attempt == max_retries - 1:
                with open(os.path.join(log_dir, "failed_downloads.txt"), "a") as f:
                    f.write(f"{ticker}: {e}\n")
                # Out of attempts: stop here instead of sleeping for nothing.
                break
            # Brief pause before the next attempt.
            time.sleep(1)
    return False

# Validate Data

In [None]:
# === 3. VALIDATE DATA ===
def is_valid_ticker(ticker):
    """Report whether a ticker's cached CSV holds enough clean price rows.

    A ticker is valid when ``<data_dir>/<ticker>.csv`` exists and, after
    coercing the Open/High/Low/Close/Volume columns to numbers and dropping
    every row with a missing value, at least ``min_required_days`` rows remain.

    Args:
        ticker: Symbol whose cached CSV should be checked.

    Returns:
        True if the file passes the check, otherwise False. A file that cannot
        be read or lacks an expected column is logged to
        ``corrupt_tickers.txt`` in ``log_dir`` and reported as invalid.
    """
    path = os.path.join(data_dir, f"{ticker}.csv")
    if not os.path.exists(path):
        return False
    try:
        df = pd.read_csv(path)
        # Kept from the original flow: a file without a "Date" column raises
        # here and is therefore logged as corrupt.
        df = df.sort_values("Date")
        price_columns = ["Open", "High", "Low", "Close", "Volume"]
        # Non-numeric cells become NaN and their rows are discarded, so the
        # remaining frame is fully numeric and needs no second null check.
        df = df[price_columns].apply(pd.to_numeric, errors="coerce").dropna()

        if len(df) < min_required_days:
            print(f"❌ {ticker} skipped (only {len(df)} valid rows)")
            return False
        return True
    except Exception as e:
        with open(os.path.join(log_dir, "corrupt_tickers.txt"), "a") as f:
            f.write(f"{ticker}: {e}\n")
        return False


# Parallel Download Wrapper

In [None]:
# === 4. PARALLEL DOWNLOAD WRAPPER ===
def download_all_tickers(tickers, max_workers=8):
    """Fetch every ticker's history concurrently on a thread pool.

    Args:
        tickers: Symbols to hand to ``download_ticker_data``, one task each.
        max_workers: Size of the thread pool.

    Returns:
        None. An exception escaping a worker is appended to
        ``failed_downloads.txt`` in ``log_dir`` and the remaining tasks
        continue.
    """
    print(f"🔄 Downloading {len(tickers)} tickers in parallel with {max_workers} workers...")
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Remember which symbol each future belongs to for error reporting.
        symbol_by_task = {}
        for symbol in tickers:
            symbol_by_task[pool.submit(download_ticker_data, symbol)] = symbol

        for task in as_completed(symbol_by_task):
            symbol = symbol_by_task[task]
            try:
                # The boolean result is not needed; this only surfaces exceptions.
                task.result()
            except Exception as err:
                with open(os.path.join(log_dir, "failed_downloads.txt"), "a") as log_file:
                    log_file.write(f"{symbol} (thread error): {err}\n")

In [None]:
# Refresh the master ticker CSV at ticker_path (failures are printed, not raised).
load_all_us_tickers()

In [None]:
# Read the symbols as literal text: with pandas' default NA handling a symbol
# spelled like a missing-value marker (e.g. "NA") would come back as float NaN.
tickers = pd.read_csv(ticker_path, dtype=str, keep_default_na=False)['Ticker'].tolist()
download_all_tickers(tickers, max_workers=8)

✅ Saved 11328 tickers to: /content/drive/MyDrive/StockProject/StockProjectDataPrep/tickers/all_us_tickers.csv
🔄 Downloading 11328 tickers in parallel with 8 workers...
YF.download() has changed argument auto_adjust default to True


ERROR:yfinance:
1 Failed download:
ERROR:yfinance:['AACT.U']: HTTPError('HTTP Error 401: ')
ERROR:yfinance:
1 Failed download:
ERROR:yfinance:['AACT.U']: HTTPError('HTTP Error 401: ')
ERROR:yfinance:
1 Failed download:
ERROR:yfinance:['AACT.U']: HTTPError('HTTP Error 401: ')
ERROR:yfinance:
1 Failed download:
ERROR:yfinance:['AACT.U']: HTTPError('HTTP Error 401: ')
ERROR:yfinance:
1 Failed download:
ERROR:yfinance:['AACT.U']: HTTPError('HTTP Error 401: ')
ERROR:yfinance:
1 Failed download:
ERROR:yfinance:['AACT.U']: HTTPError('HTTP Error 401: ')
ERROR:yfinance:
1 Failed download:
ERROR:yfinance:['AACT.U']: HTTPError('HTTP Error 401: ')
ERROR:yfinance:
1 Failed download:
ERROR:yfinance:['AACT.U']: HTTPError('HTTP Error 401: ')
ERROR:yfinance:
1 Failed download:
ERROR:yfinance:
1 Failed download:
ERROR:yfinance:
1 Failed download:
ERROR:yfinance:['ABLVW']: YFInvalidPeriodError("ABLVW: Period 'max' is invalid, must be of the format 1d, 5d, etc.")
ERROR:yfinance:
1 Failed download:
ERROR:y

✅ 0 valid tickers saved to: /content/drive/MyDrive/StockProject/StockProjectDataPrep/valid_tickers.csv


In [None]:
# Keep, in their original order, the tickers whose cached CSV passes validation.
valid = list(filter(is_valid_ticker, tickers))
pd.DataFrame({"Ticker": valid}).to_csv(valid_tickers_path, index=False)
print(f"✅ {len(valid)} valid tickers saved to: {valid_tickers_path}")

❌ AAPG skipped (only 29 valid rows)
❌ AAPL skipped (only 29 valid rows)
❌ AAPR skipped (only 29 valid rows)
❌ AAPU skipped (only 29 valid rows)
❌ AAPW skipped (only 29 valid rows)
❌ AAPX skipped (only 29 valid rows)
❌ AAPY skipped (only 29 valid rows)
❌ AARD skipped (only 29 valid rows)
❌ ACVF skipped (only 27 valid rows)
❌ ACVT skipped (only 27 valid rows)
❌ ACWI skipped (only 27 valid rows)
❌ ACWV skipped (only 27 valid rows)
❌ ACWX skipped (only 27 valid rows)
❌ ACXP skipped (only 27 valid rows)
❌ ADAG skipped (only 27 valid rows)
❌ ADAP skipped (only 27 valid rows)
❌ ADVM skipped (only 18 valid rows)
❌ ADVWW skipped (only 18 valid rows)
❌ ADX skipped (only 18 valid rows)
❌ ADXN skipped (only 18 valid rows)
❌ AEE skipped (only 18 valid rows)
❌ AEF skipped (only 18 valid rows)
❌ AEFC skipped (only 18 valid rows)
❌ AEG skipped (only 18 valid rows)
❌ AEO skipped (only 18 valid rows)
❌ AEON skipped (only 18 valid rows)
❌ AEP skipped (only 18 valid rows)
❌ AER skipped (only 18 valid rows