In [27]:
import pandas as pd
import re
import glob
import os

folder_path = '/Users/fulinq/Documents/SET/MarginModel/MM_History/2019'
os.chdir(folder_path)

In [28]:
def process_margin_file(file_path):
    """
    Processes a single margin announcement Excel file to extract data.

    Args:
        file_path (str): The full path to the Excel file.

    Returns:
        pandas.DataFrame: A DataFrame containing the extracted data,
                          or None if processing fails.
    """
    print(f"Processing file: {os.path.basename(file_path)}...")
    try:
        # --- Extract the date (from B2) ---
        df_raw = pd.read_excel(file_path, sheet_name=0, header=None)
        date_value = df_raw.iloc[1, 1] # B2 is at index 1 for row and column

        effective_date = 'Date not found'
        if pd.notna(date_value) and isinstance(date_value, str):
            match = re.search(r'(\d{1,2}\s+\w+\s+\d{4})', date_value)
            if match:
                effective_date = match.group(1)

        # --- Extract the main data ---
        df = pd.read_excel(file_path, sheet_name=0, header=3)

        # Check if expected columns exist before renaming
        if 'Underlying' not in df.columns or 'Maintenance Margin \n(per contract)' not in df.columns:
            print(f"Warning: Expected columns not found in {os.path.basename(file_path)}. Skipping.")
            return None

        df.rename(columns={'Underlying': 'Product',
                           'Maintenance Margin \n(per contract)': 'Maintenance_Margin'},
                  inplace=True)

        data = []
        current_market = None
        for index, row in df.iterrows():
            product_value = row.get('Product')
            maintenance_margin_value = row.get('Maintenance_Margin')

            if pd.notna(product_value):
                if pd.isna(maintenance_margin_value):
                    market_match = re.match(r'\d+\.\s*(.*)', str(product_value))
                    if market_match:
                        current_market = market_match.group(1).strip()
                elif pd.notna(maintenance_margin_value):
                    try:
                        margin = float(maintenance_margin_value)
                        data.append({
                            'Product': str(product_value).strip(),
                            'Market': current_market,
                            'Type': 'Future',
                            'Maintenance Margin': margin,
                            'Effective Date': effective_date
                        })
                    except (ValueError, TypeError):
                         print(f"Warning: Could not convert Maintenance Margin '{maintenance_margin_value}' to float in {os.path.basename(file_path)}. Skipping row.")

        if not data:
             print(f"Warning: No product data found in {os.path.basename(file_path)}. Skipping.")
             return None

        return pd.DataFrame(data)

    except Exception as e:
        print(f"Error processing {os.path.basename(file_path)}: {e}. Skipping.")
        return None

In [29]:
# --- Main part of the script ---
excel_files = glob.glob(os.path.join(folder_path, '*.xlsx'))

if not excel_files:
    print(f"No Excel files (.xlsx) found in the folder: {folder_path}")
else:
    print(f"Found {len(excel_files)} Excel files to process.")

    all_dataframes = []

    for file in excel_files:
        if not os.path.basename(file).startswith('~'):
            df_single = process_margin_file(file)
            if df_single is not None:
                all_dataframes.append(df_single)

    if all_dataframes:
        merged_df = pd.concat(all_dataframes, ignore_index=True)

        # Filter out rows where 'Product' is 'SET50 Options'
        initial_rows = len(merged_df)
        merged_df = merged_df[merged_df['Product'] != 'SET50 Options'].copy()
        rows_after_filter = len(merged_df)
        print(f"\nRemoved {initial_rows - rows_after_filter} rows where Product was 'SET50 Options'.")

        # Remove duplicate rows
        rows_before_dedupe = len(merged_df)
        merged_df = merged_df.drop_duplicates(keep='first', ignore_index=True) # <<< NEW LINE
        rows_after_dedupe = len(merged_df)
        print(f"Removed {rows_before_dedupe - rows_after_dedupe} duplicate rows.")


        # Display results
        print("\n--- Merged DataFrame (Cleaned) ---")
        print(f"Total rows: {len(merged_df)}")
        print("First 5 rows:")
        print(merged_df.head())
        print("\nLast 5 rows:")
        print(merged_df.tail())

        # Save the merged DataFrame
        output_filename = 'merged_margin_data_cleaned.csv'
        merged_df.to_csv(output_filename, index=False)
        print(f"\nSuccessfully merged data and saved to {output_filename}")
    else:
        print("\nNo data was successfully processed from any file.")

Found 17 Excel files to process.
Processing file: Margin_Announcement_120719en.xlsx...
Processing file: Margin_Announcement_020519en.xlsx...
Processing file: Margin_Announcement_230919en.xlsx...
Processing file: Margin_Announcement_260919EN.xlsx...


  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")


Processing file: Margin_Announcement_151119_MemberEN.xlsx...
Processing file: Margin_Announcement_181019EN.xlsx...
Processing file: Margin_Announcement_100619en.xlsx...
Processing file: Margin_Announcement_270619en.xlsx...


  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")


Processing file: Margin_Announcement_160819en.xlsx...
Processing file: Margin_Announcement_111219en.xlsx...
Processing file: Margin_Announcement_170519en.xlsx...


  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")


Processing file: Margin_Announcement_300719en.xlsx...
Processing file: MarginRate_110419_EN.xlsx...
Processing file: MarginAnnouncement_260419en.xlsx...
Processing file: Margin_Announcement_250419en.xlsx...
Processing file: Margin_Announcement_271219en.xlsx...
Processing file: Margin_Announcement_030919en.xlsx...

Removed 0 rows where Product was 'SET50 Options'.
Removed 0 duplicate rows.

--- Merged DataFrame (Cleaned) ---
Total rows: 2073
First 5 rows:
  Product        Market    Type  Maintenance Margin Effective Date
0    BANK  Index Market  Future             10600.0   12 July 2019
1    COMM  Index Market  Future             10300.0   12 July 2019
2   ENERG  Index Market  Future              6800.0   12 July 2019
3    FOOD  Index Market  Future              2400.0   12 July 2019
4     ICT  Index Market  Future              3880.0   12 July 2019

Last 5 rows:
     Product        Market    Type  Maintenance Margin    Effective Date
2068     TVO  Stock Market  Future               960

  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")


In [30]:
with pd.ExcelWriter('merged_margin_data.xlsx') as writer:
    merged_df.to_excel(writer, index=False, sheet_name='Margin Data')