In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [5]:
def clean_excel_data(file_path, output_path=None, sheet_name="6.3"):
    """
    Clean an Excel sheet whose column headers are spread over rows 3 and 4.

    The sheet is read without a header row and with every cell as a string
    (``dtype=str``), so numeric columns are NOT converted to numbers. One name
    per column is built from rows 3 and 4 (0-based indices 2 and 3); rows 5 and
    below are returned as data with fully empty rows and columns removed.

    Header names are combined per column as follows:
    - both rows have text: ``"<row 3> - <row 4>"``
    - only one row has text: that text unchanged (commas are preserved)
    - neither row has text: ``"Column_<0-based column position>"``

    No forward-fill is applied to the header rows. NOTE(review): a header cell
    merged across several columns will presumably only label the column in
    which its value is read - verify against the workbook if that matters.

    Parameters:
    file_path (str): Path to the input Excel file
    output_path (str): Path for the output CSV file (optional). Missing parent
        directories are created.
    sheet_name (str | int): Sheet to read; defaults to "6.3"

    Returns:
    pd.DataFrame: Cleaned dataframe with a contiguous 0..n-1 index

    Raises:
    IndexError: If the sheet has fewer than four rows.
    """
    df_raw = pd.read_excel(file_path, sheet_name=sheet_name, header=None, dtype=str)

    # Extract row 3 and row 4 (indices 2 and 3)
    header_row_1 = df_raw.iloc[2].fillna('')
    header_row_2 = df_raw.iloc[3].fillna('')

    combined_headers = []
    for col_idx, (top, bottom) in enumerate(zip(header_row_1, header_row_2)):
        h1 = str(top).strip()
        h2 = str(bottom).strip()

        # Remove 'nan' strings that might appear
        h1 = '' if h1 == 'nan' else h1
        h2 = '' if h2 == 'nan' else h2

        if h1 and h2:
            # Both rows have content
            combined = f"{h1} - {h2}"
        elif h1 or h2:
            # Exactly one of the two rows has content
            combined = h1 or h2
        else:
            # Neither has content: fall back to a positional placeholder
            combined = f"Column_{col_idx}"

        combined_headers.append(combined)

    # Get the data starting from row 5 (index 4)
    df_clean = df_raw.iloc[4:].copy()
    df_clean.columns = combined_headers

    df_clean = (
        df_clean
        # Normalise literal 'nan' strings first so that rows/columns holding
        # nothing else are recognised as empty by the dropna calls below.
        .replace('nan', np.nan)
        .dropna(how='all')
        .dropna(axis=1, how='all')
        # Reset last so the index has no gaps where empty rows were removed.
        .reset_index(drop=True)
    )

    # Save to output file if specified
    if output_path:
        # Only touch the filesystem for real paths; file-like targets are
        # handed to to_csv untouched.
        if isinstance(output_path, (str, Path)):
            Path(output_path).parent.mkdir(parents=True, exist_ok=True)
        df_clean.to_csv(output_path, index=False)
        print(f"Cleaned data saved to: {output_path}")

    return df_clean

In [6]:
def preview_headers(file_path, output_path=None):
    """
    Clean the workbook and return the result so the headers can be inspected.

    This is a thin wrapper around clean_excel_data: it performs the full
    cleaning step (including writing the CSV when output_path is given) and
    hands back the cleaned dataframe; it does not display anything itself.

    Parameters:
    file_path (str): Path to the Excel file
    output_path (str): Path for the cleaned CSV output (optional)

    Returns:
    pd.DataFrame: Cleaned dataframe as returned by clean_excel_data
    """
    # The workbook is read once, inside clean_excel_data; no separate raw
    # read is needed here.
    return clean_excel_data(file_path, output_path)

In [7]:
input_file = "data/SUT and IO By Divisions -En.xlsx"
output_file = "cleaned_data/sut_io_cleaned_data.csv"

# Clean the workbook, write the CSV, and keep the cleaned frame for inspection
df = preview_headers(file_path=input_file, output_path=output_file)


Cleaned data saved to: cleaned_data/sut_io_cleaned_data.csv


In [8]:
# First five cleaned rows (head() defaults to n=5) to sanity-check the headers
df.head()

Unnamed: 0,Input-Output Table at Current Prices 2023 - Code,Input-Output Tables (IOTs) 2018 (Thousands of Saudi riyals) - Economic Activities (ISIC Rev. 4),"01 - Crop and animal production, hunting and related service activities",02 - Forestry and logging,03 - Fishing and aquaculture,05 - Mining of coal and lignite,06 - Extraction of crude petroleum and natural gas,07 - Mining of metal ores,08 - Other mining and quarrying activities,09 - Mining support service activities,...,Fixed capital Formation,Change in inventories,Gross capital formation,Export of goods,Petroleum Exports,Export of services,Total Export,Final Demand,Total imports,Total Output
0,1,"Crop and animal production, hunting and relate...",12412576.486796822,14988.073132406833,340447.7671211301,0.0,0.0,0.0,0.0,0.0,...,6854921.378346342,1444773.847100884,8299695.225447225,1464580.1105911315,0.0,4138767.036450658,5603347.147041789,86171020.42806634,38149722.7814436,135466284.0
1,2,Forestry and logging,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,9110.666925780382,9110.666925780382,2348.876312570507,0.0,0.0,2348.876312570507,21075.109189987143,242740.0,416786.0
2,3,Fishing and aquaculture,0.0,0.0,162145.4335042523,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,32735.678445554728,0.0,109630.52640992276,142366.20485547747,3597559.754252812,701764.9999999999,5188960.999999999
3,5,Mining of coal and lignite,9.136619595328597,0.1018741097449567,0.2239704210497738,8.8189308691849,1883.281361266345,696.1124275537147,2476.4740937491133,1156.1682218045282,...,0.0,1709.2365309664074,1709.2365309664074,1283.816927120271,601.1365187441382,0.5822900390817174,1885.535735903491,3596.589897616952,284021.7417140534,25763.99999999994
4,6,Extraction of crude petroleum and natural gas,3953.867433730989,15.422350984342671,335.36053414972594,478.4191738875975,124916.07755764909,4677.014428123428,232364.3347631499,2042433.9494759548,...,3026295.083188528,-10372342.198536616,-7346047.115348089,0.0,753710437.9999999,254.40737114156863,753710692.407371,746777817.5631828,1473259.2318791265,1129066721.9999998
