In [1]:
#Importing drive model from the google.colab package
from google.colab import drive

#Mounting the google drive to a specific path
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive/GitHub_Projects/FTSE_350_Anomaly_Detection

/content/drive/MyDrive/GitHub_Projects/FTSE_350_Anomaly_Detection


In [3]:
# Configuring Git user details
!git config --global user.email "dorothy.sarpongk@gmail.com"
!git config --global user.name "01DorSarpong"

In [37]:
# Importing libraries for code

import pandas as pd
import numpy  as np
import yfinance as yf
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from tqdm import tqdm


In [32]:
def download_and_save_FTSE_stocks(tickers: list, start_date: str, end_date: str, directory: str):
  """
  This function downloads historical stock data for a list of tickers and saves it to CSV files.

  Args:
    tickers_list (list): A list of stock ticker symbols (e.g., ['TSCO.L', 'BARC.L']).
    start_date (str): The start date for data download in 'YYYY-MM-DD' format.
    end_date (str): The end date for data download in 'YYYY-MM-DD' format.
    directory (str): The path to the directory where CSV files will be saved.
  """

  # Ensure the save directory exists
  if not os.path.exists(directory):
    os.makedirs(directory)
    print(f"Created directory: {directory}")

  print(f"Starting download for {len(tickers)} tickers from {start_date} to {end_date}...")

  for ticker in tqdm(tickers, desc="Downloading Stocks"):
    # Format the filename: remove '.L' and add date range for clarity
    cleaned_ticker = ticker.replace('.L', '')
    file_name = f"{cleaned_ticker}_{start_date.replace('-', '')}_{end_date.replace('-', '')}.csv"
    full_file_path = os.path.join(directory, file_name)

    try:
      df = yf.download(ticker, start=start_date, end=end_date, auto_adjust=False)

      if not df.empty:
        df.to_csv(full_file_path)
        # print(f"✅ Saved data for {ticker} to {full_file_path}") # Optional: uncomment for more verbose output
      else:
        print(f"⚠️ No data available for {ticker} for the specified period.")
    except Exception as e:
      print(f"❌ Error downloading or saving data for {ticker}: {e}")

  print("Download process completed.")


In [33]:
#Creating a list of FTSE 100 and FTSE 250 tickers

FTSE_100_tickers = ["AZN.L", "HSBA.L", "ULVR.L", "REL.L", "BATS.L", "BP.L", "GSK.L", "DGE.L",
                   "RR.L", "NG.L", "BARC.L", "TSCO.L", "PRU.L", "BHP.L", "BT-A.L",]

FTSE_250_tickers = ["BWY.L", "EMG.L", "JUST.L", "SXS.L", "CKN.L", "LRE.L", "RAT.L", "THG.L",
                    "JDW.L", "SCT.L", "DOM.L", "SRE.L", "HIK.L", "ICGT.L", "HSX.L"]

In [34]:
#Defining the period for stocks range
start_date = "2014-01-01"
end_date = "2024-12-31"

In [35]:
# Defining the path to save the CSVs

ftse_100_path = '/content/drive/MyDrive/GitHub_Projects/FTSE_350_Anomaly_Detection/FTSE_100'
ftse_250_path = '/content/drive/MyDrive/GitHub_Projects/FTSE_350_Anomaly_Detection/FTSE_250'


In [36]:
# Calling the function for FTSE 100 and FTSE 250 tickers

download_and_save_FTSE_stocks(
    tickers=FTSE_100_tickers,
    start_date=start_date,
    end_date=end_date,
    directory=ftse_100_path
)

download_and_save_FTSE_stocks(
    tickers=FTSE_250_tickers,
    start_date=start_date,
    end_date=end_date,
    directory=ftse_250_path
)

Starting download for 15 tickers from 2014-01-01 to 2024-12-31...


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

Download process completed.
Starting download for 15 tickers from 2014-01-01 to 2024-12-31...


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

Download process completed.





In [38]:
# Creating a function to load stocks and pre-process into a dataframe

def load_and_structure_stock_data(folder_path: str) -> pd.DataFrame:
    """
    Loads historical stock data from CSV files in a specified folder,
    cleans and processes each DataFrame, and consolidates them into
    a single structured DataFrame with a MultiIndex (Date, Ticker).

    Args:
        folder_path (str): The path to the directory containing stock data CSVs.

    Returns:
        pd.DataFrame: A single DataFrame containing data for all tickers,
                      indexed by 'Date' and 'Ticker', sorted by Date and then Ticker.
                      Returns an empty DataFrame if no data is loaded.
    """
    all_dfs = []

    if not os.path.exists(folder_path):
        print(f"❌ Error: Folder not found at {folder_path}")
        return pd.DataFrame() # Return empty DataFrame if folder doesn't exist

    # Use os.scandir for potentially better performance with many files
    # and tqdm for a progress bar if there are many files.
    file_list = [f.name for f in os.scandir(folder_path) if f.name.endswith(".csv")]

    if not file_list:
        print(f"⚠️ No CSV files found in {folder_path}")
        return pd.DataFrame()

    print(f"Loading data from {len(file_list)} CSV files in {folder_path}...")

    for filename in tqdm(file_list, desc="Processing Stock Files"):
        file_path = os.path.join(folder_path, filename)

        try:
            # Read CSV: use first row as header, skip second row (which often contains ticker name repeated)
            # The original yfinance CSV header is often 'Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'
            # and sometimes it puts the ticker symbol in the second row for some tools/configs,
            # so skiprows=[1] is a good precaution if that's how yfinance saves it for you.
            df = pd.read_csv(file_path, header=0, skiprows=[1], encoding='utf-8-sig')

            # --- Data Cleaning and Preparation ---

            # Rename the first column to 'Date' if it's not already, and handle potential variants
            if df.columns[0].lower().strip() != 'date':
                df.rename(columns={df.columns[0]: 'Date'}, inplace=True)

            # Normalize column names to lowercase and remove leading/trailing spaces
            df.columns = [col.strip().lower() for col in df.columns]

            # Clean and prepare the 'date' column: ensure string, strip spaces, drop 'Date' literals
            df['date'] = df['date'].astype(str).str.strip()
            df = df[df['date'].str.lower() != 'date'] # Drop any rows where the date column contains the string 'Date'

            # Convert to datetime, coercing errors will turn unparseable dates into NaT
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            df.dropna(subset=['date'], inplace=True) # Drop rows where date conversion failed (NaT)

            # Set datetime index
            df.set_index('date', inplace=True)

            # Drop rows missing essential data (e.g., open price). Assuming 'open' is critical.
            if 'open' in df.columns:
                df.dropna(subset=['open'], inplace=True)
            else:
                print(f"⚠️ 'open' column not found in {filename}. Skipping this file.")
                continue  # Skip this file if 'open' is missing

            # --- Add Ticker Information ---
            # Extract ticker from filename (assuming format like TICKER_STARTDATE_ENDDATE.csv)
            # Example: 'TSCO_20140101_20241231.csv' -> 'TSCO'
            ticker_symbol = filename.split('_')[0]
            df['ticker'] = ticker_symbol # Add a new 'ticker' column to this DataFrame

            all_dfs.append(df)

        except pd.errors.EmptyDataError:
            print(f"⚠️ {filename} is empty. Skipping.")
        except pd.errors.ParserError as e:
            print(f"❌ Error parsing {filename}: {type(e).__name__} - {e}. Skipping.")
        except Exception as e:
            print(f"❌ Error processing {filename}: {type(e).__name__} - {e}. Skipping.")

    if not all_dfs:
        print("No valid stock data loaded.")
        return pd.DataFrame()

    # Concatenate all individual DataFrames into one
    combined_df = pd.concat(all_dfs)

    # Set a MultiIndex: primary index is 'date', secondary is 'ticker'
    # Ensure 'ticker' is part of the index for efficient slicing and grouping
    combined_df.set_index('ticker', append=True, inplace=True)
    combined_df.index.names = ['Date', 'Ticker']

    # Sort the MultiIndex for better performance and consistency
    combined_df.sort_index(inplace=True)

    print("All stock data loaded and structured successfully.")
    return combined_df

In [None]:
#Calling the function

structured_ftse_data = load_and_structure_stock_data(ftse_100_path)

# Display the first few rows and info of the combined DataFrame
if not structured_ftse_data.empty:
  print("\nCombined DataFrame Head:")
  print(structured_ftse_data.head())
  print("\nCombined DataFrame Info:")
  structured_ftse_data.info()
  print("\nNumber of unique tickers:", structured_ftse_data.index.get_level_values('Ticker').nunique())

In [None]:
!git add .
!git commit -m "Added a new function to load and process stocks into a dataframe"
!git push origin main
