# Add MSOA code column

In [36]:
import pandas as pd
import os

def add_msoa_code(file_path, lookup_path='data/gm_oa_lookup.xlsx', lsoa_column='lsoa_code', output_path=None):
    """
    Add MSOA code column to a dataset based on LSOA code lookup and save as CSV file.

    Parameters:
    -----------
    file_path : str or os.PathLike
        Path to the dataset file (CSV or Excel). The extension is matched
        case-insensitively.
    lookup_path : str or os.PathLike, default 'data/gm_oa_lookup.xlsx'
        Path to the lookup table file. It must provide 'LSOA11CD' and
        'MSOA11CD' columns. A '.csv' lookup is read directly; any other file
        is read as an Excel workbook from its 'Greater_Manchester' sheet.
    lsoa_column : str, default 'lsoa_code'
        Name of the column containing LSOA codes
    output_path : str or os.PathLike, optional
        Path to save the resulting CSV file. If None, the file is written to
        the current working directory as '<input name>_with_msoa.csv'.
        '.csv' is appended when the path does not already end with it.

    Returns:
    --------
    tuple
        (DataFrame with added 'msoa_code' column, path where the file was saved)

    Raises:
    -------
    ValueError
        If file_path is neither a CSV nor an Excel file.
    KeyError
        If lsoa_column is not a column of the dataset.
    """
    # Accept pathlib.Path (or any os.PathLike) as well as plain strings
    file_path = os.fspath(file_path)
    lookup_path = os.fspath(lookup_path)

    # Load the dataset; compare the extension in lower case so that
    # 'DATA.CSV' or 'data.XLSX' are not rejected as unsupported
    file_lower = file_path.lower()
    if file_lower.endswith('.csv'):
        df = pd.read_csv(file_path, low_memory=False)
    elif file_lower.endswith(('.xls', '.xlsx')):
        df = pd.read_excel(file_path)
    else:
        raise ValueError("Unsupported file format. Please use CSV or Excel.")

    # Load the lookup table and create the mapping
    if lookup_path.lower().endswith('.csv'):
        lookup_df = pd.read_csv(lookup_path)
    else:
        lookup_df = pd.read_excel(lookup_path, sheet_name='Greater_Manchester')
    lsoa_to_msoa = dict(zip(lookup_df['LSOA11CD'], lookup_df['MSOA11CD']))

    # Give a readable error instead of a bare KeyError('<column>')
    if lsoa_column not in df.columns:
        raise KeyError(
            f"Column '{lsoa_column}' not found in {file_path}. "
            f"Available columns: {list(df.columns)}"
        )

    # Add MSOA code column
    df['msoa_code'] = df[lsoa_column].map(lsoa_to_msoa)

    # Codes absent from the lookup map to NaN; report them rather than
    # letting them pass silently into the saved file
    unmatched = int(df['msoa_code'].isna().sum())
    if unmatched:
        print(f"Warning: {unmatched} row(s) have no MSOA match for '{lsoa_column}'")

    # Generate output path if not provided
    if output_path is None:
        file_base = os.path.splitext(os.path.basename(file_path))[0]
        output_path = f"{file_base}_with_msoa.csv"
    else:
        output_path = os.fspath(output_path)
        if not output_path.lower().endswith('.csv'):
            output_path = f"{output_path}.csv"

    # Save as CSV file
    df.to_csv(output_path, index=False)
    print(f"File saved as: {output_path}")

    return df, output_path


def process_multiple_files(file_paths, output_dir=None, **kwargs):
    """
    Process multiple files to add MSOA codes and save as CSV files.

    Parameters:
    -----------
    file_paths : list
        List of paths to dataset files. A single path given as a string is
        treated as a one-element list.
    output_dir : str, optional
        Directory to save processed files (created if missing). If None, each
        file is saved in the same directory as its own input file.
    **kwargs :
        Additional arguments to pass to add_msoa_code()

    Returns:
    --------
    dict
        Maps each successfully processed input path to a dict with keys
        'dataframe' and 'output_path'. Files that raise an error are reported
        with print() and left out of the result.
    """
    # A lone path would otherwise be iterated character by character
    if isinstance(file_paths, (str, os.PathLike)):
        file_paths = [file_paths]

    results = {}

    for file_path in file_paths:
        try:
            # Generate output path. The destination is computed per file and
            # output_dir itself is never reassigned: overwriting it would send
            # every later file to the first file's directory.
            file_base = os.path.splitext(os.path.basename(file_path))[0]
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)
                target_dir = output_dir
            else:
                target_dir = os.path.dirname(file_path)
            output_path = os.path.join(target_dir, f"{file_base}_with_msoa.csv")

            # Process the file and save as CSV
            df_with_msoa, saved_path = add_msoa_code(
                file_path,
                output_path=output_path,
                **kwargs
            )

            # Add to results
            results[file_path] = {
                'dataframe': df_with_msoa,
                'output_path': saved_path
            }

        except Exception as e:
            # Best-effort batch: report the failure and continue with the rest
            print(f"Error processing {file_path}: {e}")

    return results

In [39]:
# Add MSOA codes to the filtered indices-of-deprivation CSV, reading the
# LSOA -> MSOA mapping from data/gm_oa_lookup2011.xlsx and writing the result
# into the same data/preprocessed/water directory as the input file.
files = ['data/preprocessed/water/indices_of_deprivation_cleaned_gm_filtered2.csv']
results = process_multiple_files(files, lookup_path='data/gm_oa_lookup2011.xlsx', output_dir='data/preprocessed/water')

File saved as: data/preprocessed/water/indices_of_deprivation_cleaned_gm_filtered2_with_msoa.csv


In [40]:
# Reload the CSV written by the previous cell and preview its first rows to
# check that the msoa_code column was added.
# NOTE(review): the preview below includes an 'Unnamed: 0' column, presumably an
# index that was saved into the input CSV — confirm, and consider index_col=0.
df = pd.read_csv('data/preprocessed/water/indices_of_deprivation_cleaned_gm_filtered2_with_msoa.csv')
df.head()

Unnamed: 0,featurecode,datecode,measurement,value,indices_of_deprivation,lsoa_code,msoa_code
0,E01005278,2019,Rank,11281.0,b. Income Deprivation Domain,E01005278,E02001079
1,E01005236,2019,Rank,4565.0,b. Income Deprivation Domain,E01005236,E02001066
2,E01006030,2019,Rank,6317.0,b. Income Deprivation Domain,E01006030,E02001256
3,E01005118,2019,Rank,12595.0,b. Income Deprivation Domain,E01005118,E02001078
4,E01005317,2019,Rank,3409.0,b. Income Deprivation Domain,E01005317,E02001096
