# MSOA datasets preprocessing

In [16]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings

# Canonical MSOA code format: 'E02' followed by exactly six digits.
_MSOA_CODE_PATTERN = re.compile(r'^E02\d{6}$')


def _safe_ratio(part, whole):
    """Return part / whole as a float, or 0.0 when whole is zero."""
    return part / whole if whole else 0.0


def _standardize_msoa_code(msoa_code):
    """Normalise one MSOA code string; return NaN for missing-value markers."""
    # astype(str) turns real NaN/None/pd.NA into these literal strings.
    if msoa_code.lower() in ('nan', 'none', '<na>', ''):
        return np.nan

    # Remove any spaces or special characters
    msoa_code = re.sub(r'[^a-zA-Z0-9]', '', msoa_code)

    # A bare six-digit number is assumed to be missing its 'E02' prefix
    if msoa_code.isdigit() and len(msoa_code) == 6:
        return 'E02' + msoa_code

    if _MSOA_CODE_PATTERN.match(msoa_code):
        return msoa_code

    # Unknown format: keep the value unchanged but flag it for the user
    if msoa_code != '':
        print(f"Potential incorrect MSOA code: {msoa_code}")
    return msoa_code


def _count_valid_msoa_codes(codes):
    """Count the entries of `codes` that match the E02###### format."""
    return sum(
        1 for code in codes
        if isinstance(code, str) and _MSOA_CODE_PATTERN.match(code)
    )


def analyze_msoa_dataset(file_path, msoa_column_name, gm_msoa_lookup_path=None, output_dir="data/preprocessed"):
    """
    Process a dataset that contains MSOA codes, filter for Greater Manchester MSOAs,
    analyze missing values, and generate visualizations for reporting.

    All statistics in the report (valid-code counts, missing values) describe the
    final cleaned dataset, i.e. after de-duplication and after the optional
    Greater Manchester filter. If that dataset has no rows, the text/CSV outputs
    are still written but no plots are generated.

    Parameters:
    file_path (str): Path to the dataset file (.csv, .xls or .xlsx)
    msoa_column_name (str): Name of the column containing MSOA codes
    gm_msoa_lookup_path (str, optional): Path to Greater Manchester MSOA lookup file.
        The first column whose name contains both "MSOA" and "CD" is used. If the
        lookup cannot be loaded a warning is issued and no filtering is applied.
    output_dir (str): Directory to save output files and visualizations

    Returns:
    pd.DataFrame: Cleaned dataset filtered to Greater Manchester MSOAs
    dict: Paths of the output files that were actually written. Always contains
        'cleaned_file' and 'report_file'; 'missing_heatmap', 'msoa_quality',
        'missing_barchart' and 'gm_filtering' are present only when the
        corresponding plot was generated.

    Raises:
    ValueError: If the dataset file format is unsupported or the MSOA column
        is not present in the dataset.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(output_dir, exist_ok=True)

    # Generate a timestamp for unique filenames
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Extract filename without extension for reporting
    base_filename = os.path.basename(file_path).split('.')[0]
    output_prefix = f"{output_dir}/{base_filename}_{timestamp}"

    print(f"Processing dataset: {file_path}")

    # Determine file type (case-insensitively) and load accordingly
    lowered_path = file_path.lower()
    if lowered_path.endswith('.csv'):
        df = pd.read_csv(file_path, low_memory=False)
    elif lowered_path.endswith(('.xls', '.xlsx')):
        df = pd.read_excel(file_path)
    else:
        raise ValueError("Unsupported file format. Please use CSV or Excel.")

    print(f"Original dataset shape: {df.shape}")

    # Save a summary of the original dataset
    with open(f"{output_prefix}_summary.txt", "w", encoding="utf-8") as f:
        f.write(f"Dataset Analysis: {file_path}\n")
        f.write(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"Original dataset shape: {df.shape}\n\n")
        f.write("Column Names:\n")
        for col in df.columns:
            f.write(f"- {col}\n")
        f.write("\n")

    # Basic data cleaning
    # 1. Remove duplicate rows. The explicit copy detaches the result from `df`
    #    so the column assignments below do not trigger SettingWithCopyWarning.
    df_clean = df.drop_duplicates().copy()
    duplicates_removed = len(df) - len(df_clean)
    print(f"Removed {duplicates_removed} duplicate rows. New shape: {df_clean.shape}")

    # 2. Standardize column names (str() guards against non-string labels)
    df_clean.columns = [str(col).lower().replace(' ', '_') for col in df_clean.columns]
    msoa_column_name = msoa_column_name.lower().replace(' ', '_')

    # 3. Handle the MSOA column
    if msoa_column_name not in df_clean.columns:
        raise ValueError(f"MSOA column '{msoa_column_name}' not found in dataset. Available columns: {df_clean.columns.tolist()}")

    # Standardize MSOA codes (ensure proper format E02######)
    df_clean[msoa_column_name] = (
        df_clean[msoa_column_name].astype(str).str.strip().apply(_standardize_msoa_code)
    )

    # Format quality of the de-duplicated data, before any geographic filtering
    pre_filter_count = len(df_clean)
    pre_filter_valid = _count_valid_msoa_codes(df_clean[msoa_column_name])
    print(f"Standardized MSOA codes: {pre_filter_valid}/{pre_filter_count} ({_safe_ratio(pre_filter_valid, pre_filter_count):.2%})")

    # 4. Create a standardized MSOA column if original needs to be preserved
    df_clean['msoa_code'] = df_clean[msoa_column_name]

    # 5. Load Greater Manchester MSOA lookup if provided
    gm_msoa_codes = set()
    if gm_msoa_lookup_path:
        try:
            lowered_lookup = gm_msoa_lookup_path.lower()
            if lowered_lookup.endswith('.csv'):
                gm_lookup = pd.read_csv(gm_msoa_lookup_path)
            elif lowered_lookup.endswith(('.xls', '.xlsx')):
                gm_lookup = pd.read_excel(gm_msoa_lookup_path, sheet_name="Greater_Manchester")
            else:
                raise ValueError("Unsupported lookup file format. Please use CSV or Excel.")

            # Extract MSOA codes for Greater Manchester
            code_columns = [col for col in gm_lookup.columns if "MSOA" in str(col) and "CD" in str(col)]
            if not code_columns:
                raise ValueError("No column with 'MSOA' and 'CD' in its name found in lookup file")
            # Drop NaN and strip whitespace so lookup codes compare equal to the
            # stripped dataset codes.
            gm_msoa_codes = set(gm_lookup[code_columns[0]].dropna().astype(str).str.strip())
            print(f"Loaded {len(gm_msoa_codes)} unique MSOA codes for Greater Manchester from lookup file")
        except Exception as e:
            # Deliberate best-effort: a broken lookup degrades to "no filtering"
            warnings.warn(f"Error loading GM MSOA lookup file: {e}. Will not filter for GM MSOAs.")

    # 6. Filter for Greater Manchester MSOAs if lookup provided
    if gm_msoa_codes:
        df_clean = df_clean[df_clean['msoa_code'].isin(gm_msoa_codes)]
        print(f"Filtered for Greater Manchester MSOAs: {len(df_clean)}/{pre_filter_count} records retained ({_safe_ratio(len(df_clean), pre_filter_count):.2%})")

    # Recount on the final dataset so every reported figure is relative to the
    # same set of rows (the pre-filter count is stale once rows are dropped).
    record_count = len(df_clean)
    standardized_msoa_count = _count_valid_msoa_codes(df_clean['msoa_code'])
    invalid_msoa_count = record_count - standardized_msoa_count

    # 7. Analyze missing values across all columns
    missing_values = df_clean.isna().sum()
    if record_count:
        missing_percentage = (missing_values / record_count) * 100
    else:
        # No rows: report 0% rather than NaN from 0/0
        missing_percentage = missing_values * 0.0
    missing_df = pd.DataFrame({
        'Column': missing_values.index,
        'Missing Values': missing_values.values,
        'Percentage': missing_percentage.values
    })
    missing_df = missing_df.sort_values('Missing Values', ascending=False).reset_index(drop=True)

    # Save missing values report
    missing_df.to_csv(f"{output_prefix}_missing_values.csv", index=False)

    print("\nMissing Values Analysis:")
    print(missing_df)

    # 8. Generate visualizations for the report (skipped when there are no rows)
    top_missing = missing_df[missing_df['Missing Values'] > 0].head(20)  # Top 20 columns with missing values
    plot_files = {}

    if record_count:
        # 8.1 Missing values heatmap
        fig = plt.figure(figsize=(12, 8))
        sns.heatmap(df_clean.isna(), cbar=False, yticklabels=False, cmap='viridis')
        plt.title('Missing Values Heatmap', fontsize=16)
        plt.xlabel('Columns', fontsize=12)
        plt.ylabel('Records', fontsize=12)
        plt.tight_layout()
        plot_files['missing_heatmap'] = f"{output_prefix}_missing_heatmap.png"
        plt.savefig(plot_files['missing_heatmap'], dpi=300)
        plt.close(fig)  # release the figure; otherwise they accumulate per call

        # 8.2 Bar chart of missing values by column
        if not top_missing.empty:
            fig = plt.figure(figsize=(14, 8))
            sns.barplot(x='Percentage', y='Column', data=top_missing)
            plt.title('Percentage of Missing Values by Column', fontsize=16)
            plt.xlabel('Missing Values (%)', fontsize=12)
            plt.ylabel('Column', fontsize=12)
            plt.tight_layout()
            plot_files['missing_barchart'] = f"{output_prefix}_missing_barchart.png"
            plt.savefig(plot_files['missing_barchart'], dpi=300)
            plt.close(fig)
        else:
            print("No missing values to plot")

        # 8.3 MSOA code quality visualization
        fig = plt.figure(figsize=(10, 6))
        msoa_quality = pd.Series({
            'Valid Format': standardized_msoa_count,
            'Invalid/Missing': invalid_msoa_count
        })
        msoa_quality.plot(kind='pie', autopct='%1.1f%%', colors=['#66b3ff', '#ff9999'])
        plt.title('MSOA Code Quality', fontsize=16)
        plt.ylabel('')  # Hide the ylabel
        plt.tight_layout()
        plot_files['msoa_quality'] = f"{output_prefix}_msoa_quality.png"
        plt.savefig(plot_files['msoa_quality'], dpi=300)
        plt.close(fig)

        # 8.4 If GM filtering was applied, add a visualization of GM vs non-GM records
        if gm_msoa_codes:
            fig = plt.figure(figsize=(10, 6))
            gm_filter = pd.Series({
                'Greater Manchester': record_count,
                'Outside GM (filtered out)': pre_filter_count - record_count
            })
            gm_filter.plot(kind='pie', autopct='%1.1f%%', colors=['#66b3ff', '#ff9999'])
            plt.title('Greater Manchester MSOA Filtering', fontsize=16)
            plt.ylabel('')  # Hide the ylabel
            plt.tight_layout()
            plot_files['gm_filtering'] = f"{output_prefix}_gm_filtering.png"
            plt.savefig(plot_files['gm_filtering'], dpi=300)
            plt.close(fig)
    else:
        print("No records in cleaned dataset; skipping visualizations")

    # 9. Data type analysis
    dtypes_df = pd.DataFrame({
        'Column': [str(col) for col in df_clean.columns],
        'Data Type': [str(dtype) for dtype in df_clean.dtypes]
    })
    dtypes_df.to_csv(f"{output_prefix}_data_types.csv", index=False)

    # 10. Save the cleaned dataset
    cleaned_file_path = f"{output_dir}/{base_filename}_cleaned.csv"
    df_clean.to_csv(cleaned_file_path, index=False)
    print(f"\nCleaned dataset saved to: {cleaned_file_path}")

    # 11. Generate a comprehensive report
    report_path = f"{output_prefix}_report.txt"
    invalid_ratio = _safe_ratio(invalid_msoa_count, record_count)
    columns_with_missing = int((missing_values > 0).sum())
    with open(report_path, "w", encoding="utf-8") as f:
        f.write("MSOA Dataset Analysis Report\n")
        f.write("===========================\n\n")
        f.write(f"Dataset: {file_path}\n")
        f.write(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")

        f.write("Dataset Statistics:\n")
        f.write("------------------\n")
        f.write(f"Original records: {len(df)}\n")
        f.write(f"Cleaned records: {record_count}\n")
        f.write(f"Duplicates removed: {duplicates_removed}\n")
        f.write(f"Total columns: {len(df_clean.columns)}\n\n")

        f.write("MSOA Code Analysis:\n")
        f.write("-----------------\n")
        f.write(f"MSOA column: {msoa_column_name}\n")
        f.write(f"Records with standardized MSOA codes: {standardized_msoa_count} ({_safe_ratio(standardized_msoa_count, record_count):.2%})\n")
        f.write(f"Records with invalid or missing MSOA codes: {invalid_msoa_count} ({invalid_ratio:.2%})\n\n")

        if gm_msoa_codes:
            f.write("Greater Manchester Filtering:\n")
            f.write("--------------------------\n")
            f.write(f"Records before GM filtering: {pre_filter_count}\n")
            f.write(f"Records after GM filtering: {record_count}\n")
            f.write(f"Records outside GM (filtered out): {pre_filter_count - record_count}\n\n")

        f.write("Missing Values Summary:\n")
        f.write("---------------------\n")
        f.write(f"Columns with no missing values: {int((missing_values == 0).sum())}\n")
        f.write(f"Columns with missing values: {columns_with_missing}\n\n")

        if columns_with_missing > 0:
            f.write("Top 10 columns with most missing values:\n")
            for _, row in missing_df[missing_df['Missing Values'] > 0].head(10).iterrows():
                f.write(f"- {row['Column']}: {row['Missing Values']} values ({row['Percentage']:.2f}%)\n")

        f.write("\nGenerated Visualizations:\n")
        f.write("------------------------\n")
        if 'missing_heatmap' in plot_files:
            f.write(f"1. Missing Values Heatmap: {base_filename}_{timestamp}_missing_heatmap.png\n")
        if 'missing_barchart' in plot_files:
            f.write(f"2. Missing Values Bar Chart: {base_filename}_{timestamp}_missing_barchart.png\n")
        if 'msoa_quality' in plot_files:
            f.write(f"3. MSOA Code Quality Pie Chart: {base_filename}_{timestamp}_msoa_quality.png\n")
        if 'gm_filtering' in plot_files:
            f.write(f"4. GM Filtering Pie Chart: {base_filename}_{timestamp}_gm_filtering.png\n")
        if not plot_files:
            f.write("None (cleaned dataset contains no records)\n")

        f.write("\nNext Steps Recommendation:\n")
        f.write("------------------------\n")
        if invalid_ratio > 0.05:
            f.write("- High priority: Address the invalid or missing MSOA codes (>5% of dataset)\n")

        if missing_df['Percentage'].max() > 10:
            f.write("- Consider imputation strategies for columns with >10% missing values\n")

        f.write("- Review data types and ensure they are appropriate for analysis\n")
        f.write("- Consider spatial validation with Greater Manchester MSOA boundaries\n")

    print(f"\nAnalysis report saved to: {report_path}")

    # Return the cleaned dataframe and the files that were actually written
    output_files = {
        'cleaned_file': cleaned_file_path,
        'report_file': report_path,
    }
    output_files.update(plot_files)

    return df_clean, output_files

In [24]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings

def analyze_msoa_dataset(file_path, msoa_column_name, gm_msoa_lookup_path=None, output_dir="data/preprocessed"):
    """
    Process a dataset that contains MSOA codes, filter for Greater Manchester MSOAs,
    analyze missing values, and generate visualizations for reporting.

    Parameters:
    file_path (str): Path to the dataset file (CSV, Excel, etc.)
    msoa_column_name (str): Name of the column containing MSOA codes
    gm_msoa_lookup_path (str, optional): Path to Greater Manchester MSOA lookup file
    output_dir (str): Directory to save output files and visualizations

    Returns:
    pd.DataFrame: Cleaned dataset filtered to Greater Manchester MSOAs
    dict: Information about output files and visualizations
    """
    # Create output directory if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Generate a timestamp for unique filenames
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Extract filename without extension for reporting
    base_filename = os.path.basename(file_path).split('.')[0]

    print(f"Processing dataset: {file_path}")

    # Determine file type and load accordingly
    try:
        if file_path.endswith('.csv'):
            df = pd.read_csv(file_path, low_memory=False)
        elif file_path.endswith(('.xls', '.xlsx')):
            df = pd.read_excel(file_path)
        else:
            raise ValueError("Unsupported file format. Please use CSV or Excel.")
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return None, {"error": f"Failed to load dataset: {str(e)}"}

    print(f"Original dataset shape: {df.shape}")

    # Check if dataset is empty
    if df.empty:
        print("Warning: Dataset is empty")
        return df, {"warning": "Dataset is empty"}

    # Save a summary of the original dataset
    with open(f"{output_dir}/{base_filename}_{timestamp}_summary.txt", "w") as f:
        f.write(f"Dataset Analysis: {file_path}\n")
        f.write(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"Original dataset shape: {df.shape}\n\n")
        f.write("Column Names:\n")
        for col in df.columns:
            f.write(f"- {col}\n")
        f.write("\n")

    # Basic data cleaning
    # 1. Remove duplicate rows
    df_clean = df.drop_duplicates()
    duplicates_removed = len(df) - len(df_clean)
    print(f"Removed {duplicates_removed} duplicate rows. New shape: {df_clean.shape}")

    # 2. Standardize column names
    original_columns = df_clean.columns.tolist()
    df_clean.columns = [col.lower().replace(' ', '_') for col in df_clean.columns]
    msoa_column_name = msoa_column_name.lower().replace(' ', '_')

    # 3. Handle the MSOA column
    # Make sure MSOA column exists
    if msoa_column_name not in df_clean.columns:
        error_msg = f"MSOA column '{msoa_column_name}' not found in dataset. Available columns: {df_clean.columns.tolist()}"
        print(error_msg)
        return df_clean, {"error": error_msg}

    # Standardize MSOA codes (ensure proper format E02######)
    df_clean[msoa_column_name] = df_clean[msoa_column_name].astype(str)

    # Print sample of original MSOA codes for debugging
    print("\nSample of original MSOA codes:")
    print(df_clean[msoa_column_name].head(10).tolist())

    # Clean MSOA codes (remove spaces, ensure proper format)
    df_clean[msoa_column_name] = df_clean[msoa_column_name].str.strip()

    # Check for and fix common MSOA format issues
    def standardize_msoa(msoa_code):
        # Skip NaN values
        if pd.isna(msoa_code) or msoa_code.lower() in ['nan', 'none', '']:
            return np.nan

        # Remove any spaces or special characters
        msoa_code = re.sub(r'[^a-zA-Z0-9]', '', msoa_code)

        # If it's a numeric string of exactly 6 digits, add 'E02' prefix
        if msoa_code.isdigit() and len(msoa_code) == 6:
            return 'E02' + msoa_code

        # If it has 9 digits and starts with 'E02', it's likely already correct
        if re.match(r'^E02\d{6}$', msoa_code):
            return msoa_code

        # If it's just the 2-3 digit part of an MSOA code
        if msoa_code.isdigit() and len(msoa_code) in [2, 3]:
            return f'E02001{msoa_code.zfill(3)}'

        # If it's another format used for MSOAs, log it but don't modify
        if msoa_code != '':
            print(f"Potential non-standard MSOA code format: {msoa_code}")
        return msoa_code

    df_clean[msoa_column_name] = df_clean[msoa_column_name].apply(standardize_msoa)

    # Print sample of standardized MSOA codes for debugging
    print("\nSample of standardized MSOA codes:")
    print(df_clean[msoa_column_name].head(10).tolist())

    # Count standardized vs non-standardized MSOA codes
    standardized_msoa_count = df_clean[msoa_column_name].apply(
        lambda x: bool(isinstance(x, str) and re.match(r'^E02\d{6}$', x))
    ).sum()

    print(f"Standardized MSOA codes: {standardized_msoa_count}/{len(df_clean)} ({standardized_msoa_count/len(df_clean):.2%})")

    # 4. Create a standardized MSOA column if original needs to be preserved
    df_clean['msoa_code'] = df_clean[msoa_column_name]

    # 5. Load Greater Manchester MSOA lookup if provided
    gm_msoa_codes = set()
    if gm_msoa_lookup_path:
        try:
            if gm_msoa_lookup_path.endswith('.csv'):
                gm_lookup = pd.read_csv(gm_msoa_lookup_path)
            elif gm_msoa_lookup_path.endswith(('.xls', '.xlsx')):
                # Try different sheet names in case there's an issue
                try:
                    gm_lookup = pd.read_excel(gm_msoa_lookup_path, sheet_name="Greater_Manchester")
                except Exception:
                    # Try the first sheet if "Greater_Manchester" sheet doesn't exist
                    gm_lookup = pd.read_excel(gm_msoa_lookup_path)
            else:
                raise ValueError("Unsupported lookup file format. Please use CSV or Excel.")

            print(f"\nLoaded GM lookup file with shape: {gm_lookup.shape}")
            print(f"Columns in lookup file: {gm_lookup.columns.tolist()}")

            # Try to identify the MSOA code column - be more flexible in column name matching
            msoa_col_candidates = [
                col for col in gm_lookup.columns if
                ("MSOA" in col.upper() and "CD" in col.upper()) or
                ("MSOA" in col.upper() and "CODE" in col.upper()) or
                col.upper() == "MSOA11CD"
            ]

            if not msoa_col_candidates:
                # If no obvious MSOA column, look for any column that might contain MSOA codes
                for col in gm_lookup.columns:
                    if gm_lookup[col].dtype == 'object':  # String column
                        # Check if values match E02###### format
                        if gm_lookup[col].astype(str).str.contains(r'E02\d{6}').any():
                            msoa_col_candidates = [col]
                            print(f"Found potential MSOA column: {col}")
                            break

            if msoa_col_candidates:
                msoa_col = msoa_col_candidates[0]
                print(f"Using column '{msoa_col}' for GM MSOA codes")

                # Print sample values for debugging
                print(f"Sample values from '{msoa_col}':")
                print(gm_lookup[msoa_col].head(5).tolist())

                # Standardize GM MSOA codes using the same function
                gm_lookup[msoa_col] = gm_lookup[msoa_col].astype(str).apply(standardize_msoa)
                gm_msoa_codes = set(gm_lookup[msoa_col].dropna().unique())

                print(f"Loaded {len(gm_msoa_codes)} unique MSOA codes for Greater Manchester from lookup file")
                print(f"Sample standardized GM MSOA codes:")
                print(list(gm_msoa_codes)[:5])
            else:
                raise ValueError(f"Could not identify MSOA code column in lookup file")

        except Exception as e:
            warnings.warn(f"Error loading GM MSOA lookup file: {e}. Will not filter for GM MSOAs.")
            print(f"Error details: {str(e)}")

    # 6. Filter for Greater Manchester MSOAs if lookup provided
    pre_filter_count = len(df_clean)
    if gm_msoa_codes:
        # First check if there would be any matches with the current approach
        matching_msoas = set(df_clean['msoa_code'].dropna().unique()).intersection(gm_msoa_codes)
        print(f"\nMatching MSOA codes between dataset and GM lookup: {len(matching_msoas)}")

        if len(matching_msoas) == 0:
            print("WARNING: No exact matches found between dataset MSOA codes and GM lookup")
            print("Trying alternative matching approaches:")

            # Try partial string matching
            partial_matches = []
            for data_msoa in df_clean['msoa_code'].dropna().unique():
                for gm_msoa in gm_msoa_codes:
                    # Try various matching techniques
                    if (str(data_msoa) in str(gm_msoa) or
                        str(gm_msoa) in str(data_msoa) or
                        str(data_msoa)[-6:] == str(gm_msoa)[-6:]):  # Match last 6 digits
                        partial_matches.append((data_msoa, gm_msoa))
                        break

            if partial_matches:
                print(f"Found {len(partial_matches)} potential partial matches")
                for data_msoa, gm_msoa in partial_matches[:5]:  # Show first 5
                    print(f"  Dataset: {data_msoa} <-> GM Lookup: {gm_msoa}")

                # Create a mapping dictionary
                msoa_mapping = {data_msoa: gm_msoa for data_msoa, gm_msoa in partial_matches}

                # Apply the mapping to create a new column with GM MSOA codes
                df_clean['gm_msoa_code'] = df_clean['msoa_code'].map(msoa_mapping)

                # Filter using the mapped column
                df_filtered = df_clean.dropna(subset=['gm_msoa_code'])

                if len(df_filtered) > 0:
                    print(f"After mapping: {len(df_filtered)}/{pre_filter_count} records retained ({len(df_filtered)/pre_filter_count:.2%})")
                    df_clean = df_filtered
                else:
                    print("Still no matches after mapping. Will keep all records but flag as not GM-specific.")
            else:
                print("No partial matches found. Will keep all records but flag as not GM-specific.")
        else:
            # Use the original exact matching approach
            df_clean = df_clean[df_clean['msoa_code'].isin(gm_msoa_codes)]
            print(f"Filtered for Greater Manchester MSOAs: {len(df_clean)}/{pre_filter_count} records retained ({len(df_clean)/pre_filter_count:.2%})")

    # Handle the case where filtering resulted in an empty DataFrame
    if len(df_clean) == 0:
        print("WARNING: Filtering resulted in an empty dataset")
        print("Reverting to pre-filtered dataset for analysis")
        df_clean = df.copy()  # Revert to original dataset
        df_clean.columns = [col.lower().replace(' ', '_') for col in df_clean.columns]

        # Add a flag to indicate this is not GM-specific
        not_gm_specific = True
    else:
        not_gm_specific = False

    # 7. Analyze missing values across all columns
    missing_values = df_clean.isna().sum()
    missing_percentage = (missing_values / len(df_clean)) * 100
    missing_df = pd.DataFrame({
        'Column': missing_values.index,
        'Missing Values': missing_values.values,
        'Percentage': missing_percentage.values
    })
    missing_df = missing_df.sort_values('Missing Values', ascending=False).reset_index(drop=True)

    # Save missing values report
    missing_df.to_csv(f"{output_dir}/{base_filename}_{timestamp}_missing_values.csv", index=False)

    print("\nMissing Values Analysis:")
    print(missing_df)

    # 8. Generate visualizations only if the dataset is not empty
    if len(df_clean) > 0:
        # 8.1 Missing values heatmap
        plt.figure(figsize=(12, 8))
        # Handle case where dataset is too large for heatmap visualization
        if len(df_clean) > 1000:
            # Sample a subset of rows for visualization
            sample_size = min(1000, len(df_clean))
            sample_df = df_clean.sample(sample_size)
            sns.heatmap(sample_df.isna(), cbar=False, yticklabels=False, cmap='viridis')
            plt.title('Missing Values Heatmap (Sample)', fontsize=16)
        else:
            sns.heatmap(df_clean.isna(), cbar=False, yticklabels=False, cmap='viridis')
            plt.title('Missing Values Heatmap', fontsize=16)

        plt.xlabel('Columns', fontsize=12)
        plt.ylabel('Records', fontsize=12)
        plt.tight_layout()
        plt.savefig(f"{output_dir}/{base_filename}_{timestamp}_missing_heatmap.png", dpi=300)
        plt.close()

        # 8.2 Bar chart of missing values by column
        plt.figure(figsize=(14, 8))
        top_missing = missing_df[missing_df['Missing Values'] > 0].head(20)  # Top 20 columns with missing values

        if not top_missing.empty:
            sns.barplot(x='Percentage', y='Column', data=top_missing)
            plt.title('Percentage of Missing Values by Column', fontsize=16)
            plt.xlabel('Missing Values (%)', fontsize=12)
            plt.ylabel('Column', fontsize=12)
            plt.tight_layout()
            plt.savefig(f"{output_dir}/{base_filename}_{timestamp}_missing_barchart.png", dpi=300)
            plt.close()
        else:
            print("No missing values to plot")

        # 8.3 MSOA code quality visualization
        plt.figure(figsize=(10, 6))
        msoa_quality = pd.Series({
            'Valid Format': min(standardized_msoa_count, len(df_clean)),
            'Invalid/Missing': max(0, len(df_clean) - min(standardized_msoa_count, len(df_clean)))
        })
        if msoa_quality['Invalid/Missing'] > 0 or msoa_quality['Valid Format'] > 0:
            msoa_quality.plot(kind='pie', autopct='%1.1f%%', colors=['#66b3ff', '#ff9999'])
            plt.title('MSOA Code Quality', fontsize=16)
            plt.ylabel('')  # Hide the ylabel
            plt.tight_layout()
            plt.savefig(f"{output_dir}/{base_filename}_{timestamp}_msoa_quality.png", dpi=300)
            plt.close()

        # 8.4 If GM filtering was applied, add a visualization of GM vs non-GM records
        if gm_msoa_codes and pre_filter_count > 0 and not not_gm_specific:
            plt.figure(figsize=(10, 6))
            gm_filter = pd.Series({
                'Greater Manchester': len(df_clean),
                'Outside GM (filtered out)': pre_filter_count - len(df_clean)
            })
            gm_filter.plot(kind='pie', autopct='%1.1f%%', colors=['#66b3ff', '#ff9999'])
            plt.title('Greater Manchester MSOA Filtering', fontsize=16)
            plt.ylabel('')  # Hide the ylabel
            plt.tight_layout()
            plt.savefig(f"{output_dir}/{base_filename}_{timestamp}_gm_filtering.png", dpi=300)
            plt.close()

    # 9. Data type analysis
    dtypes_dict = {str(col): str(dtype) for col, dtype in zip(df_clean.columns, df_clean.dtypes)}
    dtypes_df = pd.DataFrame({
        'Column': list(dtypes_dict.keys()),
        'Data Type': list(dtypes_dict.values())
    })
    dtypes_df.to_csv(f"{output_dir}/{base_filename}_{timestamp}_data_types.csv", index=False)

    # 10. Save the cleaned dataset
    cleaned_file_path = f"{output_dir}/{base_filename}_cleaned.csv"
    df_clean.to_csv(cleaned_file_path, index=False)
    print(f"\nCleaned dataset saved to: {cleaned_file_path}")

    # 11. Generate a comprehensive report
    with open(f"{output_dir}/{base_filename}_{timestamp}_report.txt", "w") as f:
        f.write(f"MSOA Dataset Analysis Report\n")
        f.write(f"===========================\n\n")
        f.write(f"Dataset: {file_path}\n")
        f.write(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")

        f.write(f"Dataset Statistics:\n")
        f.write(f"------------------\n")
        f.write(f"Original records: {len(df)}\n")
        f.write(f"Cleaned records: {len(df_clean)}\n")
        f.write(f"Duplicates removed: {duplicates_removed}\n")
        f.write(f"Total columns: {len(df_clean.columns)}\n\n")

        f.write(f"MSOA Code Analysis:\n")
        f.write(f"-----------------\n")
        f.write(f"MSOA column: {msoa_column_name}\n")

        # Handle division by zero for percentage calculations
        if len(df_clean) > 0:
            f.write(f"Records with standardized MSOA codes: {standardized_msoa_count} ({standardized_msoa_count/len(df_clean):.2%})\n")
            f.write(f"Records with invalid or missing MSOA codes: {len(df_clean) - standardized_msoa_count} ({(len(df_clean) - standardized_msoa_count)/len(df_clean):.2%})\n\n")
        else:
            f.write(f"Records with standardized MSOA codes: {standardized_msoa_count} (0.00%)\n")
            f.write(f"Records with invalid or missing MSOA codes: {len(df_clean) - standardized_msoa_count} (0.00%)\n\n")

        if gm_msoa_codes:
            f.write(f"Greater Manchester Filtering:\n")
            f.write(f"--------------------------\n")
            f.write(f"Records before GM filtering: {pre_filter_count}\n")
            f.write(f"Records after GM filtering: {len(df_clean)}\n")

            if not_gm_specific:
                f.write(f"NOTE: Dataset does not appear to contain Greater Manchester specific data.\n")
                f.write(f"      Analysis was performed on the full dataset.\n\n")
            else:
                f.write(f"Records outside GM (filtered out): {pre_filter_count - len(df_clean)}\n\n")

        f.write(f"Missing Values Summary:\n")
        f.write(f"---------------------\n")
        f.write(f"Columns with no missing values: {sum(missing_values == 0)}\n")
        f.write(f"Columns with missing values: {sum(missing_values > 0)}\n\n")

        if sum(missing_values > 0) > 0:
            f.write("Top 10 columns with most missing values:\n")
            for i, row in missing_df[missing_df['Missing Values'] > 0].head(10).iterrows():
                f.write(f"- {row['Column']}: {row['Missing Values']} values ({row['Percentage']:.2f}%)\n")

        f.write("\nGenerated Visualizations:\n")
        f.write(f"------------------------\n")
        f.write(f"1. Missing Values Heatmap: {base_filename}_{timestamp}_missing_heatmap.png\n")
        if not top_missing.empty:
            f.write(f"2. Missing Values Bar Chart: {base_filename}_{timestamp}_missing_barchart.png\n")
        if msoa_quality['Invalid/Missing'] > 0 or msoa_quality['Valid Format'] > 0:
            f.write(f"3. MSOA Code Quality Pie Chart: {base_filename}_{timestamp}_msoa_quality.png\n")
        if gm_msoa_codes and pre_filter_count > 0 and not not_gm_specific:
            f.write(f"4. GM Filtering Pie Chart: {base_filename}_{timestamp}_gm_filtering.png\n")

        f.write("\nNext Steps Recommendation:\n")
        f.write(f"------------------------\n")
        if len(df_clean) > 0 and (len(df_clean) - standardized_msoa_count)/len(df_clean) > 0.05:
            f.write("- High priority: Address the invalid or missing MSOA codes (>5% of dataset)\n")

        if missing_df['Percentage'].max() > 10:
            f.write("- Consider imputation strategies for columns with >10% missing values\n")

        f.write("- Review data types and ensure they are appropriate for analysis\n")
        f.write("- Consider spatial validation with Greater Manchester MSOA boundaries\n")

        if not_gm_specific:
            f.write("- IMPORTANT: This dataset does not appear to contain Greater Manchester specific data.\n")
            f.write("  Consider finding alternative data sources or expanding the geographic scope of analysis.\n")

    print(f"\nAnalysis report saved to: {output_dir}/{base_filename}_{timestamp}_report.txt")

    # Return the cleaned dataframe and output file information
    output_files = {
        'cleaned_file': cleaned_file_path,
        'report_file': f"{output_dir}/{base_filename}_{timestamp}_report.txt",
        'missing_heatmap': f"{output_dir}/{base_filename}_{timestamp}_missing_heatmap.png",
    }

    if len(df_clean) > 0:
        if msoa_quality['Invalid/Missing'] > 0 or msoa_quality['Valid Format'] > 0:
            output_files['msoa_quality'] = f"{output_dir}/{base_filename}_{timestamp}_msoa_quality.png"

        if not top_missing.empty:
            output_files['missing_barchart'] = f"{output_dir}/{base_filename}_{timestamp}_missing_barchart.png"

        if gm_msoa_codes and pre_filter_count > 0 and not not_gm_specific:
            output_files['gm_filtering'] = f"{output_dir}/{base_filename}_{timestamp}_gm_filtering.png"

    # Add warning if not GM-specific
    if not_gm_specific:
        output_files['warning'] = "Dataset does not appear to contain Greater Manchester specific data"

    return df_clean, output_files

### company_financial_records

In [28]:
# Run the shared MSOA analysis on the company financial records workbook.
# The first two arguments are file_path and msoa_column_name, in that order.
df, outputs = analyze_msoa_dataset(
    "data/rawdata/finance/company_financial_records.xlsx",
    "msoa_code",
    gm_msoa_lookup_path="data/gm_oa_lookup2011.csv",
    output_dir="data/preprocessed/finance",
)

Processing dataset: data/rawdata/finance/company_financial_records.xlsx
Original dataset shape: (87, 5)
Removed 0 duplicate rows. New shape: (87, 5)

Sample of original MSOA codes:
['E02001045', 'E02001046', 'E02001047', 'E02001048', 'E02001049', 'E02001050', 'E02001051', 'E02001052', 'E02001053', 'E02001055']

Sample of standardized MSOA codes:
['E02001045', 'E02001046', 'E02001047', 'E02001048', 'E02001049', 'E02001050', 'E02001051', 'E02001052', 'E02001053', 'E02001055']
Standardized MSOA codes: 87/87 (100.00%)

Loaded GM lookup file with shape: (8684, 12)
Columns in lookup file: ['OA11CD', 'LAD16CD', 'LAD16NM', 'LSOA11CD', 'LSOA11NM', 'MSOA11CD', 'MSOA11NM', 'LEP17CD1', 'LEP17NM1', 'LEP17CD2', 'LEP17NM2', 'FID']
Using column 'MSOA11CD' for GM MSOA codes
Sample values from 'MSOA11CD':
['E02000988', 'E02000988', 'E02000988', 'E02000988', 'E02000988']
Loaded 346 unique MSOA codes for Greater Manchester from lookup file
Sample standardized GM MSOA codes:
['E02001302', 'E02001123', 'E02

<Figure size 1400x800 with 0 Axes>

In [30]:
# Reload the cleaned company financial records from disk and preview
# the first five rows.
df = pd.read_csv(
    filepath_or_buffer="data/preprocessed/finance/company_financial_records_cleaned.csv",
)
df.head(n=5)

Unnamed: 0,msoa_code,count,employment,employees,turnover_(£'000s)
0,E02001045,160,599,563,30797
1,E02001046,195,663,604,41844
2,E02001047,160,689,645,29742
3,E02001048,185,620,587,40969
4,E02001049,195,764,717,41564


In [31]:
# Per-column count of missing values in the reloaded table.
print(df.isna().sum())

msoa_code            0
count                0
employment           0
employees            0
turnover_(£'000s)    0
dtype: int64


In [32]:
# Inspect the current column labels before renaming them.
df.columns

Index(['msoa_code', 'count', 'employment', 'employees', 'turnover_(£'000s)'], dtype='object')

In [33]:
# Rename only the turnover column, addressed by its label, instead of
# overwriting every column name positionally. A positional overwrite would
# silently mislabel the data if the column order in the CSV ever changed.
# The source header records the unit as thousands of pounds (£'000s).
# With the default errors="ignore" this cell is safe to re-run after the
# column has already been renamed.
df = df.rename(columns={"turnover_(£'000s)": "turnover"})
df.head()

Unnamed: 0,msoa_code,count,employment,employees,turnover
0,E02001045,160,599,563,30797
1,E02001046,195,663,604,41844
2,E02001047,160,689,645,29742
3,E02001048,185,620,587,40969
4,E02001049,195,764,717,41564


In [35]:
# Write the renamed table to a second cleaned CSV, without the row index.
df.to_csv(
    path_or_buf='data/preprocessed/finance/company_financial_records_cleaned2.csv',
    index=False,
)

### households_in_poverty_estimates_bhc

In [36]:
# Run the shared MSOA analysis on the households-in-poverty estimates
# (BHC workbook). The first two arguments are file_path and
# msoa_column_name, in that order.
df, outputs = analyze_msoa_dataset(
    "data/rawdata/finance/households_in_poverty_estimates_bhc.xls",
    "msoa_code",
    gm_msoa_lookup_path="data/gm_oa_lookup2011.csv",
    output_dir="data/preprocessed/finance",
)

Processing dataset: data/rawdata/finance/households_in_poverty_estimates_bhc.xls
Original dataset shape: (7202, 9)
Removed 0 duplicate rows. New shape: (7202, 9)

Sample of original MSOA codes:
['E02004297', 'E02004290', 'E02004298', 'E02004299', 'E02004291', 'E02004300', 'E02004292', 'E02004301', 'E02004302', 'E02004303']
Potential non-standard MSOA code format: W02000001
Potential non-standard MSOA code format: W02000002
Potential non-standard MSOA code format: W02000003
Potential non-standard MSOA code format: W02000004
Potential non-standard MSOA code format: W02000005
Potential non-standard MSOA code format: W02000006
Potential non-standard MSOA code format: W02000007
Potential non-standard MSOA code format: W02000008
Potential non-standard MSOA code format: W02000009
Potential non-standard MSOA code format: W02000010
Potential non-standard MSOA code format: W02000011
Potential non-standard MSOA code format: W02000012
Potential non-standard MSOA code format: W02000013
Potential no

  plt.tight_layout()


No missing values to plot

Cleaned dataset saved to: data/preprocessed/finance/households_in_poverty_estimates_bhc_cleaned.csv

Analysis report saved to: data/preprocessed/finance/households_in_poverty_estimates_bhc_20250410_214623_report.txt


<Figure size 1400x800 with 0 Axes>

In [37]:
# Reload the cleaned BHC poverty estimates from disk and preview the
# first five rows.
df = pd.read_csv(
    filepath_or_buffer="data/preprocessed/finance/households_in_poverty_estimates_bhc_cleaned.csv",
)
df.head(n=5)

Unnamed: 0,msoa_code,msoa_name,local_authority_code,local_authority_name,region_code,region_name,percentage_of_households_below_60%_of_the_median_income;_(before_housing_costs),percentage_of_households_below_60%_of_the_median_income;_(before_housing_costs);_95%_confidence_interval_lower_limit,percentage_of_households_below_60%_of_the_median_income;_(before_housing_costs);_95%_confidence_interval_upper_limit
0,E02000984,Bolton 001,E08000001,Bolton,E12000002,North West,10.1,7.9,12.8
1,E02000985,Bolton 002,E08000001,Bolton,E12000002,North West,12.3,9.8,15.3
2,E02000986,Bolton 003,E08000001,Bolton,E12000002,North West,12.9,10.4,15.8
3,E02000987,Bolton 004,E08000001,Bolton,E12000002,North West,18.0,14.7,21.9
4,E02000988,Bolton 005,E08000001,Bolton,E12000002,North West,21.8,18.0,26.2


### households_in_poverty_estimates_ahc

In [38]:
# Run the shared MSOA analysis on the households-in-poverty estimates
# (AHC workbook — presumably "after housing costs"; confirm against the
# source file). The first two arguments are file_path and msoa_column_name.
df, outputs = analyze_msoa_dataset(
    "data/rawdata/finance/households_in_poverty_estimates_ahc.xls",
    "msoa_code",
    gm_msoa_lookup_path="data/gm_oa_lookup2011.csv",
    output_dir="data/preprocessed/finance",
)

Processing dataset: data/rawdata/finance/households_in_poverty_estimates_ahc.xls
Original dataset shape: (7202, 9)
Removed 0 duplicate rows. New shape: (7202, 9)

Sample of original MSOA codes:
['E02004297', 'E02004290', 'E02004298', 'E02004299', 'E02004291', 'E02004300', 'E02004292', 'E02004301', 'E02004302', 'E02004303']
Potential non-standard MSOA code format: W02000001
Potential non-standard MSOA code format: W02000002
Potential non-standard MSOA code format: W02000003
Potential non-standard MSOA code format: W02000004
Potential non-standard MSOA code format: W02000005
Potential non-standard MSOA code format: W02000006
Potential non-standard MSOA code format: W02000007
Potential non-standard MSOA code format: W02000008
Potential non-standard MSOA code format: W02000009
Potential non-standard MSOA code format: W02000010
Potential non-standard MSOA code format: W02000011
Potential non-standard MSOA code format: W02000012
Potential non-standard MSOA code format: W02000013
Potential no

  plt.tight_layout()


No missing values to plot

Cleaned dataset saved to: data/preprocessed/finance/households_in_poverty_estimates_ahc_cleaned.csv

Analysis report saved to: data/preprocessed/finance/households_in_poverty_estimates_ahc_20250410_214719_report.txt


<Figure size 1400x800 with 0 Axes>