In [1]:
# Import the drive module from the google.colab package
from google.colab import drive

# Mount Google Drive at /content/drive so the files stored there can be reached as local paths
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Change the working directory to the project folder on the mounted Drive
%cd /content/drive/MyDrive/GitHub_Projects/FTSE_350_Anomaly_Detection

/content/drive/MyDrive/GitHub_Projects/FTSE_350_Anomaly_Detection


In [2]:
# Configure the Git author identity (email and user name); --global writes it to the
# user-level Git configuration rather than to a single repository.
# NOTE(review): this stores a personal email address in the notebook itself — consider
# supplying it outside the shared notebook.
!git config --global user.email "dorothy.sarpongk@gmail.com"
!git config --global user.name "01DorSarpong"

In [19]:
# Importing libraries for code

import pandas as pd
import numpy  as np
import yfinance as yf
import matplotlib.pyplot as plt
import seaborn as sns
import os


from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from tqdm import tqdm
from typing import Tuple, Union


In [7]:
def download_and_save_FTSE_stocks(tickers: list, start_date: str, end_date: str, directory: str) -> None:
  """
  Download historical stock data for each ticker and save one CSV file per ticker.

  Every ticker is fetched with ``yf.download(..., auto_adjust=False)`` and written to
  ``<directory>/<TICKER>_<START>_<END>.csv``, where a trailing '.L' is removed from the
  ticker and the dashes are removed from both dates. A ticker that yields no rows, or
  that raises while downloading or saving, is reported with a printed message and
  skipped, so the remaining tickers are still processed.

  Args:
    tickers (list): A list of stock ticker symbols (e.g., ['TSCO.L', 'BARC.L']).
    start_date (str): The start date for data download in 'YYYY-MM-DD' format.
    end_date (str): The end date for data download in 'YYYY-MM-DD' format.
    directory (str): The path to the directory where CSV files will be saved.
                     It is created if it does not exist yet.

  Returns:
    None. The results are the CSV files written to ``directory``.
  """

  # Ensure the save directory exists. exist_ok=True keeps this safe if the directory
  # appears between the check and the creation.
  if not os.path.exists(directory):
    os.makedirs(directory, exist_ok=True)
    print(f"Created directory: {directory}")

  print(f"Starting download for {len(tickers)} tickers from {start_date} to {end_date}...")

  # The date part of the file name is identical for every ticker, so build it once
  date_suffix = f"{start_date.replace('-', '')}_{end_date.replace('-', '')}"

  for ticker in tqdm(tickers, desc="Downloading Stocks"):
    # Strip only a trailing '.L'; str.replace would also remove a '.L' found elsewhere in the symbol
    cleaned_ticker = ticker[:-2] if ticker.endswith('.L') else ticker
    full_file_path = os.path.join(directory, f"{cleaned_ticker}_{date_suffix}.csv")

    try:
      df = yf.download(ticker, start=start_date, end=end_date, auto_adjust=False)

      # Treat a missing result the same way as an empty one instead of failing on `.empty`
      if df is None or df.empty:
        print(f"⚠️ No data available for {ticker} for the specified period.")
      else:
        df.to_csv(full_file_path)
    except Exception as e:
      # Deliberately best-effort: one failing ticker must not stop the rest of the batch
      print(f"❌ Error downloading or saving data for {ticker}: {e}")

  print("Download process completed.")


In [8]:
# Ticker symbols to download: 15 symbols in each of the FTSE 100 and FTSE 250 lists.
# Every symbol carries a '.L' suffix, which the download function removes when naming the CSV files.

FTSE_100_tickers = ["AZN.L", "HSBA.L", "ULVR.L", "REL.L", "BATS.L", "BP.L", "GSK.L", "DGE.L",
                   "RR.L", "NG.L", "BARC.L", "TSCO.L", "PRU.L", "BHP.L", "BT-A.L",]

FTSE_250_tickers = ["BWY.L", "EMG.L", "JUST.L", "SXS.L", "CKN.L", "LRE.L", "RAT.L", "THG.L",
                    "JDW.L", "SCT.L", "DOM.L", "SRE.L", "HIK.L", "ICGT.L", "HSX.L"]

In [9]:
# Date range for the stock download, written as 'YYYY-MM-DD' strings
# (the format the download function documents for its start_date/end_date arguments)
start_date = "2014-01-01"
end_date = "2024-12-31"

In [10]:
# Output directories for the CSVs, one per ticker list, inside the project folder
# on the mounted Drive. The same paths are later passed to the loading function.

ftse_100_path = '/content/drive/MyDrive/GitHub_Projects/FTSE_350_Anomaly_Detection/FTSE_100'
ftse_250_path = '/content/drive/MyDrive/GitHub_Projects/FTSE_350_Anomaly_Detection/FTSE_250'


In [None]:
# Download both ticker lists over the same date range, each into its own folder

for _tickers, _directory in (
    (FTSE_100_tickers, ftse_100_path),
    (FTSE_250_tickers, ftse_250_path),
):
    download_and_save_FTSE_stocks(
        tickers=_tickers,
        start_date=start_date,
        end_date=end_date,
        directory=_directory,
    )

In [11]:
# Creating a function to load stocks and pre-process into a dataframe

def _clean_single_stock_file(file_path: str, filename: str, numerical_cols_to_fill: list) -> Union[pd.DataFrame, None]:
    """
    Read one stock CSV and return it cleaned and indexed by date, or None if it must be skipped.

    Args:
        file_path (str): Full path of the CSV file to read.
        filename (str): Bare file name; the text before the first '_' becomes the ticker.
        numerical_cols_to_fill (list): Lower-case column names to forward/backward fill
                                       when they are present in the file.

    Returns:
        pd.DataFrame or None: The cleaned frame (date index, 'ticker' column), or None when
                              the file has no 'open' column or no rows survive the cleaning.
                              Read/parse errors are not caught here; the caller handles them.
    """
    # Read CSV: use first row as header, skip second row (often contains ticker name repeated)
    df = pd.read_csv(file_path, header=0, skiprows=[1], encoding='utf-8-sig')

    # The first column is treated as the date column whatever its header says; every
    # header is then normalised to lower case without surrounding spaces.
    if df.columns[0].strip().lower() != 'date':
        df = df.rename(columns={df.columns[0]: 'Date'})
    df.columns = [col.strip().lower() for col in df.columns]

    # Drop leftover header rows whose date cell is literally 'Date'. The explicit copy
    # makes the following column assignments act on an independent frame, not on a slice.
    df['date'] = df['date'].astype(str).str.strip()
    df = df[df['date'].str.lower() != 'date'].copy()

    # Unparseable dates become NaT and are dropped
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    df = df.dropna(subset=['date']).copy()

    # Extract ticker from filename (e.g., 'TSCO_20140101_20241231.csv' -> 'TSCO')
    df['ticker'] = filename.split('_')[0]

    # Index by date, then keep Monday-Friday only (.dayofweek: Monday=0, ..., Sunday=6)
    df = df.set_index('date')
    df = df[df.index.dayofweek < 5].copy()

    # Forward-fill then backward-fill the numerical columns this file actually has;
    # this only touches this ticker's own series.
    existing_numerical_cols = [col for col in numerical_cols_to_fill if col in df.columns]
    if existing_numerical_cols:
        df[existing_numerical_cols] = df[existing_numerical_cols].ffill().bfill()

    # 'open' is treated as essential: without the column the file is skipped, and rows
    # still missing it after filling (e.g. the whole series was NaN) are dropped.
    if 'open' not in df.columns:
        print(f"⚠️ 'open' column not found in {filename}. Skipping this file as essential data is missing.")
        return None
    df = df.dropna(subset=['open'])

    if df.empty:
        print(f"⚠️ No valid weekday data remaining for {filename} after filtering. Skipping.")
        return None

    return df


def load_and_structure_stock_data(folder_path: str) -> pd.DataFrame:
    """
    Loads historical stock data from the CSV files in a folder, cleans each file,
    keeps weekday rows only, forward/backward fills NaNs in the price/volume columns,
    and consolidates everything into a single DataFrame with a MultiIndex.

    Existing rows are filled; no rows are inserted for dates missing from a file.
    A file that is empty, cannot be parsed, lacks an 'open' column, or has no rows left
    after cleaning is reported with a printed message and skipped.

    Args:
        folder_path (str): The path to the directory containing stock data CSVs.

    Returns:
        pd.DataFrame: A single DataFrame containing data for all tickers,
                      indexed by 'Date' and 'Ticker', sorted by Date and then Ticker.
                      Returns an empty DataFrame if the path is not a directory, holds
                      no CSV files, or no file could be processed.
    """
    # isdir (rather than exists) also rejects a path that points at a regular file,
    # which would otherwise make os.scandir raise.
    if not os.path.isdir(folder_path):
        print(f"❌ Error: Folder not found at {folder_path}")
        return pd.DataFrame()

    # Collect CSV files only (not directories that happen to end in '.csv'); the context
    # manager closes the directory handle, and sorting makes the processing order reproducible.
    with os.scandir(folder_path) as entries:
        file_list = sorted(
            entry.name for entry in entries
            if entry.is_file() and entry.name.endswith(".csv")
        )

    if not file_list:
        print(f"⚠️ No CSV files found in {folder_path}")
        return pd.DataFrame()

    print(f"Loading and processing data from {len(file_list)} CSV files in {folder_path}...")

    # Columns that typically contain numerical stock data to be filled
    numerical_cols_to_fill = ['open', 'high', 'low', 'close', 'adj close', 'volume']

    all_dfs = []
    for filename in tqdm(file_list, desc="Processing Stock Files"):
        file_path = os.path.join(folder_path, filename)

        try:
            cleaned = _clean_single_stock_file(file_path, filename, numerical_cols_to_fill)
        except pd.errors.EmptyDataError:
            print(f"⚠️ {filename} is empty. Skipping.")
            continue
        except pd.errors.ParserError as e:
            print(f"❌ Error parsing {filename}: {type(e).__name__} - {e}. Skipping.")
            continue
        except Exception as e:
            # Deliberately best-effort: one bad file must not abort the whole load
            print(f"❌ Error processing {filename}: {type(e).__name__} - {e}. Skipping.")
            continue

        if cleaned is not None:
            all_dfs.append(cleaned)

    if not all_dfs:
        print("No valid stock data loaded after processing. Returning empty DataFrame.")
        return pd.DataFrame()

    # Stack the per-ticker frames, then move 'ticker' into the index next to the date
    combined_df = pd.concat(all_dfs).set_index('ticker', append=True)
    combined_df.index.names = ['Date', 'Ticker']

    # Sort the MultiIndex for better performance and consistency
    combined_df = combined_df.sort_index()

    print("All stock data loaded, structured, filtered, and filled successfully.")
    return combined_df



In [None]:
# Load and structure the FTSE 100 CSVs, then print a few sanity checks on the result

ready_ftse100_data = load_and_structure_stock_data(ftse_100_path)

if ready_ftse100_data.empty:
    print("No data was successfully loaded and processed.")
else:
    print("\n--- Final Structured and Cleaned DataFrame ---")
    # 15 rows so that several dates/tickers are visible
    print(ready_ftse100_data.head(15))

    print("\nDataFrame Info:")
    ready_ftse100_data.info()

    print("\nSample of weekdays (should only be Mon-Fri):")
    print(ready_ftse100_data.index.get_level_values('Date').day_name().value_counts())

    print("\nNaNs after processing (should be very few or none in numerical columns):")
    print(ready_ftse100_data.isna().sum())

In [None]:
# Load and structure the FTSE 250 CSVs, then print a few sanity checks on the result

ready_ftse250_data = load_and_structure_stock_data(ftse_250_path)

if ready_ftse250_data.empty:
    print("No data was successfully loaded and processed.")
else:
    print("\n--- Final Structured and Cleaned DataFrame ---")
    # 15 rows so that several dates/tickers are visible
    print(ready_ftse250_data.head(15))

    print("\nDataFrame Info:")
    ready_ftse250_data.info()

    print("\nSample of weekdays (should only be Mon-Fri):")
    print(ready_ftse250_data.index.get_level_values('Date').day_name().value_counts())

    print("\nNaNs after processing (should be very few or none in numerical columns):")
    print(ready_ftse250_data.isna().sum())

In [14]:
# A function to generate the stock features needed to build the ML model

def generate_all_stock_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Generates time-series and cross-sectional numerical features for stock data,
    intended as input for unsupervised anomaly detection models.

    The input must have a MultiIndex with levels named 'Date' and 'Ticker' and the
    columns 'open', 'close' and 'volume'; 'high', 'low' and 'adj close' are used when
    present. The input frame itself is not modified.

    Features added:
        - Returns per ticker: log_return, simple_return, log_adj_close_return
          (only with 'adj close'), return_5d, return_20d.
        - Volatility per ticker: rolling_std_5d_log_return, rolling_std_20d_log_return,
          and, with 'high'/'low', daily_range_norm and garman_klass_vol.
        - Volume per ticker: volume_change, avg_volume_20d, relative_volume.
        - Trend per ticker: sma_5d, sma_20d, deviation_from_sma_20d.
        - typical_price (with 'high'/'low').
        - Cross-sectional per date: daily_market_mean_log_return,
          daily_market_median_log_return, deviation_from_daily_mean_return,
          deviation_from_daily_median_return, daily_return_rank_pct.

    Args:
        df (pd.DataFrame): Clean DataFrame with MultiIndex (Date, Ticker)
                           and base stock price/volume data.

    Returns:
        pd.DataFrame: The original columns plus the engineered features, sorted by
                      index. Rows whose 'open', 'close' or 'volume' is not numeric are
                      dropped. +/-inf values are replaced by NaN, and NaNs introduced by
                      the calculations (e.g., at the start of rolling windows) remain,
                      except in 'garman_klass_vol', where they are filled with 0.
                      An empty DataFrame is returned when the input is empty or no
                      rows survive the numeric check.

    Raises:
        KeyError: If 'open', 'close' or 'volume' is missing from the input.
    """
    if df.empty:
        print("Input DataFrame is empty for feature generation. Returning empty DataFrame.")
        return pd.DataFrame()

    processed_df = df.copy() # Always work on a copy to keep the original untouched

    # --- Ensure critical columns are numerical for calculations ---
    for col in ['open', 'high', 'low', 'close', 'adj close', 'volume']:
        if col in processed_df.columns:
            processed_df[col] = pd.to_numeric(processed_df[col], errors='coerce')
    processed_df = processed_df.dropna(subset=['open', 'close', 'volume'])
    if processed_df.empty:
        print("No data remaining after ensuring essential columns are numerical and not NaN.")
        return pd.DataFrame()

    # shift/rolling/pct_change below work on row order, so the rows of each ticker must be
    # in date order *before* the features are computed, not only in the returned frame.
    processed_df = processed_df.sort_index()

    epsilon = 1e-9  # stands in for a zero denominator so the divisions stay finite

    def _log_return(prices: pd.Series) -> pd.Series:
        # Log of price over previous price; +/-inf (e.g. from a zero price) becomes NaN
        ratio = prices / prices.shift(1).replace(0, epsilon)
        return np.log(ratio).replace([-np.inf, np.inf], np.nan)

    by_ticker = processed_df.groupby(level='Ticker')

    print("Generating Time-Series Based Features (per stock ticker)...")

    # 1. Return-Based Features:
    processed_df['log_return'] = by_ticker['close'].transform(_log_return)
    processed_df['simple_return'] = by_ticker['close'].transform(
        lambda x: (x / x.shift(1).replace(0, epsilon)) - 1
    )
    if 'adj close' in processed_df.columns:
        processed_df['log_adj_close_return'] = by_ticker['adj close'].transform(_log_return)
    # fill_method=None: 'close' has no NaNs at this point, so nothing needs padding, and
    # it avoids pandas' deprecated implicit forward-fill inside pct_change.
    processed_df['return_5d'] = by_ticker['close'].transform(
        lambda x: x.pct_change(periods=5, fill_method=None)
    )
    processed_df['return_20d'] = by_ticker['close'].transform(
        lambda x: x.pct_change(periods=20, fill_method=None)
    )

    print("Generating Volatility Measures...")
    # 2. Volatility Measures:
    # Group again here so the new 'log_return' column is selected from a fresh groupby
    # rather than relying on the earlier groupby object seeing the added column.
    log_return_by_ticker = processed_df.groupby(level='Ticker')['log_return']
    processed_df['rolling_std_5d_log_return'] = log_return_by_ticker.transform(lambda x: x.rolling(window=5).std())
    processed_df['rolling_std_20d_log_return'] = log_return_by_ticker.transform(lambda x: x.rolling(window=20).std())

    has_high_low = 'high' in processed_df.columns and 'low' in processed_df.columns

    if has_high_low:
        processed_df['daily_range_norm'] = (processed_df['high'] - processed_df['low']) / (processed_df['close'] + epsilon)

        log_high_div_low = np.log((processed_df['high'] / processed_df['low'].replace(0, epsilon)).clip(lower=epsilon))
        log_close_div_open = np.log((processed_df['close'] / processed_df['open'].replace(0, epsilon)).clip(lower=epsilon))

        gk_term = 0.5 * (log_high_div_low)**2 - (2 * np.log(2) - 1) * (log_close_div_open)**2
        gk_term = gk_term.where(gk_term >= 0) # Negative values cannot be square-rooted -> NaN

        # Assign the filled result back instead of calling fillna(inplace=True) on the
        # column selection: that chained in-place call does not update the frame under
        # pandas Copy-on-Write.
        processed_df['garman_klass_vol'] = (
            np.sqrt(gk_term)
            .replace([-np.inf, np.inf], np.nan)
            .fillna(0) # Fill with 0 for Garman-Klass specific NaNs
        )
    else:
        print("⚠️ Missing 'high' or 'low' columns for some volatility features. Skipping.")

    print("Generating Volume-Based Features...")
    # 3. Volume-Based Features:
    processed_df['volume_change'] = by_ticker['volume'].transform(
        lambda x: x.pct_change(periods=1, fill_method=None)
    )
    processed_df['avg_volume_20d'] = by_ticker['volume'].transform(lambda x: x.rolling(window=20).mean())
    processed_df['relative_volume'] = processed_df['volume'] / (processed_df['avg_volume_20d'] + epsilon)

    print("Generating Momentum/Trend Indicators...")
    # 4. Momentum/Trend Indicators:
    processed_df['sma_5d'] = by_ticker['close'].transform(lambda x: x.rolling(window=5).mean())
    processed_df['sma_20d'] = by_ticker['close'].transform(lambda x: x.rolling(window=20).mean())
    processed_df['deviation_from_sma_20d'] = (processed_df['close'] - processed_df['sma_20d']) / (processed_df['sma_20d'] + epsilon)

    print("Generating Price-Volume Interaction Features...")
    # 5. Price-Volume Interaction Features:
    if has_high_low:
        processed_df['typical_price'] = (processed_df['high'] + processed_df['low'] + processed_df['close']) / 3
    else:
        print("⚠️ Missing 'high' or 'low' columns. Skipping 'typical_price'.")

    print("Generating Cross-Sectional Features (comparing stocks on the same day)...")
    # 6. Cross-Sectional Features ('log_return' always exists here, it was created above):
    log_return_by_date = processed_df.groupby(level='Date')['log_return']
    processed_df['daily_market_mean_log_return'] = log_return_by_date.transform('mean')
    processed_df['daily_market_median_log_return'] = log_return_by_date.transform('median')

    processed_df['deviation_from_daily_mean_return'] = processed_df['log_return'] - processed_df['daily_market_mean_log_return']
    processed_df['deviation_from_daily_median_return'] = processed_df['log_return'] - processed_df['daily_market_median_log_return']
    processed_df['daily_return_rank_pct'] = log_return_by_date.rank(pct=True, method='average')

    # --- FINAL CLEANUP: Replace any remaining inf/-inf with NaN across all numerical columns ---
    print("Finalizing features: cleaning up any remaining inf/-inf values...")
    for col in processed_df.select_dtypes(include=np.number).columns:
        processed_df[col] = processed_df[col].replace([np.inf, -np.inf], np.nan)

    print("Feature generation complete. NaNs from calculations are present and will need further handling.")
    # Already sorted above; adding columns does not change the row order
    return processed_df

In [None]:

# Build the engineered features for the FTSE 100 data and print a summary of the result

df_with_all_100_features = generate_all_stock_features(ready_ftse100_data)

if df_with_all_100_features.empty:
    print("The feature generation function returned an empty DataFrame.")
else:
    # Lift pandas' display limits so that every column shows up in the printed output
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 1000)

    print("\n--- NEWLY GENERATED DataFrame with ALL Engineered Features (Head) ---")
    print(df_with_all_100_features.head(15))

    print("\n--- NEWLY GENERATED DataFrame Info (should show many more columns) ---")
    df_with_all_100_features.info()

    # Explicit list of every column name in the new frame
    print("\n--- ALL Column Names in the NEW DataFrame ---")
    print(df_with_all_100_features.columns.tolist())

    # The 20 columns with the most NaNs, largest count first
    print("\n--- Count of NaNs per column (expect some NaNs from rolling/shifting, esp. at start of series) ---")
    print(df_with_all_100_features.isna().sum().sort_values(ascending=False).head(20))

In [None]:
# Build the engineered features for the FTSE 250 data and print a summary of the result

df_with_all_250_features = generate_all_stock_features(ready_ftse250_data)

if df_with_all_250_features.empty:
    print("The feature generation function returned an empty DataFrame.")
else:
    # Lift pandas' display limits so that every column shows up in the printed output
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 1000)

    print("\n--- NEWLY GENERATED DataFrame with ALL Engineered Features (Head) ---")
    print(df_with_all_250_features.head(15))

    print("\n--- NEWLY GENERATED DataFrame Info (should show many more columns) ---")
    df_with_all_250_features.info()

    # Explicit list of every column name in the new frame
    print("\n--- ALL Column Names in the NEW DataFrame ---")
    print(df_with_all_250_features.columns.tolist())

    # The 20 columns with the most NaNs, largest count first
    print("\n--- Count of NaNs per column (expect some NaNs from rolling/shifting, esp. at start of series) ---")
    print(df_with_all_250_features.isna().sum().sort_values(ascending=False).head(20))

In [21]:
# A function that removes all NaNs from the features and scales the values using StandardScaler
#from sklearn.preprocessing import StandardScaler

def preprocess_features_for_model(
    df_with_all_features: pd.DataFrame,
    columns_to_exclude_from_features: list = None,
    apply_scaling: bool = True,
    scaler_obj: StandardScaler = None # Optional: provide a pre-fitted scaler for consistent scaling
) -> Tuple[pd.DataFrame, Union[StandardScaler, None]]:
    """
    Selects the numerical feature columns, removes rows with missing or non-finite
    values, and optionally standardises the features. This prepares the data for
    anomaly detection models. The input frame is not modified.

    Args:
        df_with_all_features (pd.DataFrame): The DataFrame containing all engineered
                                             features, with a MultiIndex.
                                             (Output of `generate_all_stock_features`).
        columns_to_exclude_from_features (list, optional): Column names that should NOT
                                             be treated as features for the model (e.g.,
                                             raw 'open', 'close', or helper columns like
                                             'weekday'). If None, the default list
                                             ['open', 'high', 'low', 'close', 'adj close',
                                             'volume', 'weekday'] is used. If the exclusion
                                             leaves no numerical column, all numerical
                                             columns are used instead.
        apply_scaling (bool): Whether to apply StandardScaler to the features.
                              Defaults to True.
        scaler_obj (StandardScaler, optional): A pre-fitted StandardScaler. When given
                                             (and `apply_scaling` is True) it is only used
                                             to transform; when None a new scaler is fitted
                                             on this data. Useful for consistent scaling
                                             between train/test datasets.

    Returns:
        tuple: A tuple containing:
            - pd.DataFrame: The selected, cleaned, and optionally scaled numerical
                            features, with the index of the surviving rows. An empty
                            DataFrame if the input is empty or no row survives.
            - StandardScaler or None: The fitted or supplied scaler if scaling was
                                      applied, else None.
    """
    if df_with_all_features.empty:
        print("Input DataFrame is empty for preprocessing. Returning empty DataFrame.")
        return pd.DataFrame(), None

    print("\n--- Starting Feature Preprocessing for Model ---")

    # --- 1. Feature Selection (Identify Numerical Features to Use) ---
    # Default list of columns that are typically not features, but base data or helpers
    if columns_to_exclude_from_features is None:
        columns_to_exclude_from_features = ['open', 'high', 'low', 'close', 'adj close', 'volume', 'weekday']

    all_numerical_cols = df_with_all_features.select_dtypes(include=np.number).columns.tolist()
    features_for_model_names = [
        col for col in all_numerical_cols
        if col not in columns_to_exclude_from_features
    ]

    if not features_for_model_names:
        print("⚠️ No valid numerical features identified after exclusion. Using all numerical original columns.")
        features_for_model_names = all_numerical_cols # Fallback to all if custom exclusion leads to empty list

    print(f"Selected {len(features_for_model_names)} features for the model.")

    # --- 2. NaN Removal (Final Handling for Model Input) ---
    # +/-inf is turned into NaN first so such rows are dropped together with the missing
    # ones; StandardScaler would otherwise fail on non-finite input. replace() returns a
    # new frame, so the caller's DataFrame is never touched.
    df_features_only = df_with_all_features[features_for_model_names].replace([np.inf, -np.inf], np.nan)
    original_rows_count = df_features_only.shape[0]
    df_features_only = df_features_only.dropna()
    rows_after_nan_drop = df_features_only.shape[0]

    if original_rows_count > rows_after_nan_drop:
        print(f"Dropped {original_rows_count - rows_after_nan_drop} rows due to NaNs in selected features.")
    if df_features_only.empty:
        print("DataFrame is empty after NaN removal. Cannot proceed with preprocessing.")
        return pd.DataFrame(), None
    print(f"Data shape after NaN removal: {df_features_only.shape}")

    # --- 3. Feature Scaling ---
    if not apply_scaling:
        print("Skipping feature scaling.")
        print("--- Feature Preprocessing for Model Complete ---")
        return df_features_only.copy(), None

    print("Applying StandardScaler to features...")
    if scaler_obj is None:
        # No scaler supplied: fit a new one on this data
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(df_features_only)
    else:
        # Supplied scaler is assumed to be fitted already; only transform with it
        scaler = scaler_obj
        X_scaled = scaler.transform(df_features_only)

    # Convert scaled array back to DataFrame, retaining index and column names
    df_scaled_features = pd.DataFrame(X_scaled, index=df_features_only.index, columns=df_features_only.columns)
    print(f"Features scaled. Scaler: {'New' if scaler_obj is None else 'Existing'}.")

    print("--- Feature Preprocessing for Model Complete ---")
    return df_scaled_features, scaler

In [None]:
# Calling the preprocess_features_for_model function on the FTSE 100 and FTSE 250 stocks.
# Each index gets its own scaler, fitted on that index's features only.

# No .copy() is needed on the argument: the function works on its own copy of the input.
final_processed_FTSE100_df, fitted_scaler_FTSE100 = preprocess_features_for_model(
    df_with_all_100_features,
    columns_to_exclude_from_features=None, # Use default exclusion (removes original price/volume/weekday)
    apply_scaling=True,                     # Apply StandardScaler
    scaler_obj=None                         # Fit a new scaler
)
if not final_processed_FTSE100_df.empty:
    print("\n--- Processed DataFrame (Head) after fixing infs ---")
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 1000)
    print(final_processed_FTSE100_df.head(15))

    print("\n--- Processed DataFrame (Info) after fixing infs ---")
    final_processed_FTSE100_df.info()

    print("\n--- NaNs in Processed DataFrame (should be 0) ---")
    print(final_processed_FTSE100_df.isnull().sum().sum()) # Should be 0 NaNs if successful
else:
    print("Preprocessing returned an empty DataFrame.")



final_processed_FTSE250_df, fitted_scaler_FTSE250 = preprocess_features_for_model(
    df_with_all_250_features,
    columns_to_exclude_from_features=None, # Use default exclusion (removes original price/volume/weekday)
    apply_scaling=True,                     # Apply StandardScaler
    scaler_obj=None                         # Fit a new scaler
)
if not final_processed_FTSE250_df.empty:
    print("\n--- Processed DataFrame (Head) after fixing infs ---")
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 1000)
    # Show the FTSE 250 frame here (this previously printed the FTSE 100 frame again)
    print(final_processed_FTSE250_df.head(15))

    print("\n--- Processed DataFrame (Info) after fixing infs ---")
    final_processed_FTSE250_df.info()

    print("\n--- NaNs in Processed DataFrame (should be 0) ---")
    print(final_processed_FTSE250_df.isnull().sum().sum()) # Should be 0 NaNs if successful
else:
    print("Preprocessing returned an empty DataFrame.")

# Kept for any later cell that still reads `fitted_scaler`: as before, it refers to the
# scaler from the second (FTSE 250) call. The FTSE 100 scaler is no longer lost.
fitted_scaler = fitted_scaler_FTSE250

In [25]:
# NOTE(review): `git add` is given no pathspec here, so it would presumably stage nothing
# even inside a repository — TODO confirm which files were meant to be staged.
# The recorded output of this cell ("not a git repository") indicates it ran from a
# directory outside the project; the %cd cell presumably has to run first in the session.
!git add

fatal: not a git repository (or any of the parent directories): .git
