In [123]:
import pandas as pd
import numpy as np
import re
import glob
import os

folder_path = '/Volumes/Extreme SSD/SET_Intern/SET/MarginModel/MM_History/2025'
os.chdir(folder_path)

In [124]:
def process_margin_file(file_path):
    """
    Processes a single margin announcement Excel file to extract data.

    Args:
        file_path (str): The full path to the Excel file.

    Returns:
        pandas.DataFrame: A DataFrame containing the extracted data,
                          or None if processing fails.
    """
    print(f"Processing file: {os.path.basename(file_path)}...")
    try:
        # --- Extract the date (from B2) ---
        df_raw = pd.read_excel(file_path, sheet_name=0, header=None)
        date_value = df_raw.iloc[1, 1] # B2 is at index 1 for row and column

        effective_date = 'Date not found'
        if pd.notna(date_value) and isinstance(date_value, str):
            match = re.search(r'(\d{1,2}\s+\w+\s+\d{4})', date_value)
            if match:
                effective_date = match.group(1)

        # --- Extract the main data ---
        df = pd.read_excel(file_path, sheet_name=0, header=3)

        # Check if expected columns exist before renaming
        if 'Underlying' not in df.columns or 'Short Options Minimum Charge' not in df.columns:
            print(f"Warning: Expected columns not found in {os.path.basename(file_path)}. Skipping.")
            return None

        df.rename(columns={'Underlying': 'Product',
                           'Short Options Minimum Charge': 'SOMC'},
                  inplace=True)

        data = []
        current_market = None
        for index, row in df.iterrows():
            product_value = row.get('Product')
            somc = row.get('SOMC')

            if pd.notna(product_value):
                if pd.isna(somc):
                    market_match = re.match(r'\d+\.\s*(.*)', str(product_value))
                    if market_match:
                        current_market = market_match.group(1).strip()
                elif pd.notna(somc):
                    try:
                        somc = float(somc)
                        data.append({
                            'Product': str(product_value).strip(),
                            'Market': current_market,
                            'Type': 'Option',
                            'SOMC': somc,
                            'Effective Date': effective_date
                        })
                    except (ValueError, TypeError):
                         print(f"Warning: Could not convert SOMC '{somc}' to float in {os.path.basename(file_path)}. Skipping row.")

        if not data:
             print(f"Warning: No product data found in {os.path.basename(file_path)}. Skipping.")
             return None

        return pd.DataFrame(data)

    except Exception as e:
        print(f"Error processing {os.path.basename(file_path)}: {e}. Skipping.")
        return None

In [125]:
# --- Main part of the script ---
excel_files = glob.glob(os.path.join(folder_path, '*.xlsx'))

if not excel_files:
    print(f"No Excel files (.xlsx) found in the folder: {folder_path}")
else:
    print(f"Found {len(excel_files)} Excel files to process.")

    all_dataframes = []

    for file in excel_files:
        if not os.path.basename(file).startswith('~'):
            df_single = process_margin_file(file)
            if df_single is not None:
                all_dataframes.append(df_single)

    if all_dataframes:
        merged_df = pd.concat(all_dataframes, ignore_index=True)
        merged_df['SOMC'] = merged_df['SOMC'].apply(lambda x: x if x !=0 else np.nan)
        merged_df = merged_df.dropna(subset=['SOMC'])

        # Remove duplicate rows
        rows_before_dedupe = len(merged_df)
        merged_df = merged_df.drop_duplicates(keep='first', ignore_index=True) # <<< NEW LINE
        rows_after_dedupe = len(merged_df)
        print(f"Removed {rows_before_dedupe - rows_after_dedupe} duplicate rows.")
        
        # Display results
        print("\n--- Merged DataFrame (Cleaned) ---")
        print(f"Total rows: {len(merged_df)}")
        print("First 5 rows:")
        print(merged_df.head())
        print("\nLast 5 rows:")
        print(merged_df.tail())

    #     # Save the merged DataFrame
    #     output_filename = 'merged_margin_data_cleaned.csv'
    #     merged_df.to_csv(output_filename, index=False)
    #     print(f"\nSuccessfully merged data and saved to {output_filename}")
    # else:
    #     print("\nNo data was successfully processed from any file.")

Found 16 Excel files to process.
Processing file: Margin_Announcement_20250120-EN.xlsx...
Processing file: Margin_Announcement_20250318-EN.xlsx...
Processing file: Margin_Announcement_20250425-EN.xlsx...
Processing file: Margin_Announcement_20250226-EN.xlsx...
Processing file: Margin_Announcement_20250408-EN.xlsx...


  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""

Processing file: Margin_Announcement_20250218-EN.xlsx...
Processing file: Margin_Announcement_20250514.xlsx...
Processing file: Margin_Announcement_20250109-EN.xlsx...
Processing file: Margin_Announcement_20250522.xlsx...
Processing file: Margin_Announcement_20250507-EN.xlsx...
Processing file: Margin_Announcement_20250422-EN.xlsx...
Processing file: Margin_Announcement_20250306-EN.xlsx...
Processing file: Margin_Announcement_20250401 - EN.xlsx...
Processing file: Margin_Announcement_20250416-EN.xlsx...
Processing file: Margin_Announcement_20250509.xlsx...
Processing file: Margin_Announcement_20250206-EN.xlsx...
Removed 1 duplicate rows.

--- Merged DataFrame (Cleaned) ---
Total rows: 14
First 5 rows:
         Product        Market    Type   SOMC    Effective Date
0  SET50 Options  Index Market  Option   60.0   24 January 2025
1  SET50 Options  Index Market  Option  100.0     21 March 2025
2  SET50 Options  Index Market  Option  100.0     30 April 2025
3  SET50 Options  Index Market  O

  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")


In [126]:
os.chdir('/Users/fulinq/Documents/SET/MarginModel/SOMC_History')
year = folder_path.split('/')[-1]
file_name = f'{year}_merged_SOMC_data.xlsx'
with pd.ExcelWriter(file_name) as writer:
    merged_df.to_excel(writer, index=False, sheet_name='SOMC Data')