# In [1]:
# Load packages
import os
import re
import sys

import numpy as np
import pandas as pd
from scipy.spatial import KDTree
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, RobustScaler, StandardScaler

# Make modules in the current working directory importable. The membership
# check keeps sys.path free of duplicates when this cell is executed again.
if os.getcwd() not in sys.path:
    sys.path.append(os.getcwd())

# In [2]:
class PreprocessorPipeline:
    """Chain of DataFrame preprocessing steps with an action log.

    Each step appends a human-readable entry to ``self.log`` and returns the
    DataFrame it worked on. Fitted helpers are kept on the instance:

    * ``self.scaler``   - scaler created by :meth:`scale_numeric_features`
    * ``self.imputers`` - column name -> fitted imputer
    * ``self.encoders`` - column name -> fitted label encoder
    """

    # Outward code, optional whitespace, inward code (input already upper-cased).
    _POSTCODE_PATTERN = re.compile(r'^([A-Z]{1,2}[0-9][A-Z0-9]?)(\s*?)([0-9][A-Z]{2})$')

    def __init__(self):
        self.scaler = None
        self.imputers = {}
        self.encoders = {}
        self.log = []

    def drop_duplicates(self, df):
        """Remove duplicate rows from ``df`` in place and log how many went.

        Returns:
            The same ``df`` object, so the step can be chained like the others.
        """
        rows_before = len(df)
        df.drop_duplicates(inplace=True)
        self.log.append(f"Dropped {rows_before - len(df)} duplicate rows.")
        return df

    @staticmethod
    def standardize_postcode(postcode):
        """Normalize a postcode string to the form ``'SW1A 1AA'``.

        Strings are stripped and upper-cased; when they match the postcode
        pattern, the two halves are re-joined with exactly one space. Strings
        that do not match are returned stripped and upper-cased. Non-string
        values are returned unchanged.
        """
        if not isinstance(postcode, str):
            return postcode
        postcode = postcode.strip().upper()
        match = PreprocessorPipeline._POSTCODE_PATTERN.match(postcode)
        if match:
            return f"{match.group(1)} {match.group(3)}"
        return postcode

    def merge_datasets(self, df, sector_data=None, station_data=None, district_data=None):
        """Left-join supporting tables onto ``df`` and add a station distance.

        Args:
            df: Main DataFrame.
            sector_data: Optional table joined on ``"sector"``.
            station_data: Optional table of station coordinates used to compute
                ``"distance_to_station"``.
            district_data: Optional table joined on ``"postcodeDistrict"``.

        Returns:
            The merged DataFrame. A join is skipped when its key column is not
            present in ``df``.
        """
        if sector_data is not None and "sector" in df.columns:
            df = df.merge(sector_data, on="sector", how="left")
            self.log.append("Merged 'sector_data' into the main dataset.")

        if district_data is not None and "postcodeDistrict" in df.columns:
            df = df.merge(district_data, on="postcodeDistrict", how="left")
            self.log.append("Merged 'district_data' into the main dataset.")

        if station_data is not None:
            df = self._add_distance_to_station(df, station_data)

        return df

    def _add_distance_to_station(self, df, station_data):
        """Add the Euclidean distance from each row to its nearest station."""
        if not {"northing", "easting"}.issubset(df.columns):
            self.log.append(
                "Skipped 'distance_to_station': 'northing'/'easting' columns not found."
            )
            return df

        # Prefer station coordinates expressed in the same system as the rows.
        if {"northing", "easting"}.issubset(station_data.columns):
            station_columns = ["northing", "easting"]
        else:
            # NOTE(review): latitude/longitude are compared directly against
            # northing/easting here, as the original code did. If these are
            # different coordinate systems the distances are not meaningful -
            # confirm, and project the stations before calling if so.
            station_columns = ["latitude", "longitude"]

        station_coords = station_data[station_columns].to_numpy(dtype=float)
        postcode_coords = df[["northing", "easting"]].to_numpy(dtype=float)

        # Only finite points take part in the query; rows with missing
        # coordinates get NaN.
        station_coords = station_coords[np.isfinite(station_coords).all(axis=1)]
        has_coords = np.isfinite(postcode_coords).all(axis=1)

        distances = np.full(len(df), np.nan)
        if len(station_coords) and has_coords.any():
            # Nearest-neighbour lookup avoids building the full rows x stations
            # distance matrix.
            nearest, _ = KDTree(station_coords).query(postcode_coords[has_coords])
            distances[has_coords] = nearest

        df["distance_to_station"] = distances
        self.log.append("Added 'distance_to_station' feature using station data.")
        return df

    def handle_missing_data(self, df, categorical_columns, numeric_columns, method="median"):
        """Impute missing values column by column.

        Args:
            df: DataFrame to modify.
            categorical_columns: Columns imputed with their most frequent value.
            numeric_columns: Columns imputed with ``method``.
            method: ``SimpleImputer`` strategy used for the numeric columns.

        Returns:
            ``df`` with the listed columns imputed. Columns that are absent, or
            that contain no observed values at all, are left untouched. Each
            fitted imputer is stored in ``self.imputers`` under its column name.
        """
        for col in categorical_columns:
            if self._impute_column(df, col, "most_frequent"):
                self.log.append(f"Imputed missing values in '{col}' using mode.")

        for col in numeric_columns:
            if self._impute_column(df, col, method):
                self.log.append(f"Imputed missing values in '{col}' using {method} strategy.")
        return df

    def _impute_column(self, df, col, strategy):
        """Fit an imputer on one column; return True when the column was filled."""
        if col not in df.columns:
            return False
        if df[col].isna().all():
            # SimpleImputer drops features with no observed values, which would
            # make the assignment below fail on a shape mismatch.
            self.log.append(f"Skipped imputation for '{col}': column has no observed values.")
            return False

        imputer = SimpleImputer(strategy=strategy)
        # fit_transform returns an (n, 1) array; flatten it to a single column.
        df[col] = imputer.fit_transform(df[[col]]).ravel()
        self.imputers[col] = imputer
        return True

    def scale_numeric_features(self, df, numeric_columns, scaling_type="standard"):
        """Scale the given numeric columns with a freshly fitted scaler.

        Args:
            df: DataFrame to modify.
            numeric_columns: Candidate columns; those missing from ``df`` are
                ignored.
            scaling_type: ``"standard"``, ``"minmax"`` or ``"robustscaler"``.

        Returns:
            ``df`` with the columns scaled. The scaler is kept in ``self.scaler``.

        Raises:
            ValueError: If ``scaling_type`` is not one of the supported names.
        """
        if scaling_type == "standard":
            self.scaler = StandardScaler()
        elif scaling_type == "minmax":
            self.scaler = MinMaxScaler()
        elif scaling_type == 'robustscaler':
            self.scaler = RobustScaler()
        else:
            raise ValueError(
                "Unsupported scaling_type. Use 'standard', 'minmax' or 'robustscaler'."
            )

        present = [col for col in numeric_columns if col in df.columns]
        if present:
            df[present] = self.scaler.fit_transform(df[present])
            self.log.append(f"Scaled numeric columns: {present} using {scaling_type} scaling.")
        return df

    def feature_engineering(self, df, sector_data=None, station_data=None):
        """Add the derived ``proximity_risk`` and ``population_density`` columns.

        Args:
            df: DataFrame to modify.
            sector_data: Optional table with ``"sector"``, ``"population"`` and
                ``"households"`` columns.
            station_data: Accepted for signature compatibility; not used here.

        Returns:
            ``df`` with whichever features could be computed.
        """
        if "distanceToWatercourse" in df.columns and "elevation" in df.columns:
            df["proximity_risk"] = df["distanceToWatercourse"] / (df["elevation"] + 1)
            self.log.append("Added 'proximity_risk' feature.")

        if sector_data is not None and "sector" in df.columns:
            if {"population", "households"}.issubset(df.columns):
                # Already merged onto the rows, so the values are row-aligned.
                population = df["population"]
                households = df["households"]
                # Zero households would give an infinite ratio; leave it missing.
                df["population_density"] = population / households.replace(0, np.nan)
            else:
                # Look the ratio up by sector key. Dividing the sector_data
                # columns directly would align on row index, not on sector.
                lookup = sector_data.drop_duplicates(subset="sector").set_index("sector")
                density = lookup["population"] / lookup["households"].replace(0, np.nan)
                df["population_density"] = df["sector"].map(density)
            self.log.append("Added 'population_density' feature from sector data.")
        return df

    @staticmethod
    def _quantile_categories(series, labels, percentiles=(0, 0.25, 0.5, 0.75, 1.0)):
        """Bin ``series`` at its own quantiles and label the bins.

        Repeated quantile values are collapsed so the bin edges stay unique;
        the label list is shortened to match the number of bins that remain.
        """
        edges = np.unique(series.quantile(list(percentiles)).dropna().to_numpy())
        if len(edges) < 2:
            # Constant or empty column: every observed value shares one bin.
            single_bin = pd.Series(np.nan, index=series.index, dtype=object)
            single_bin[series.notna()] = labels[0]
            return single_bin
        return pd.cut(
            series,
            bins=edges,
            labels=list(labels[:len(edges) - 1]),
            include_lowest=True,
        )

    def interaction_general(self, df):
        """Add two label-encoded interaction columns to ``df``.

        ``'soilType/Elevation'`` combines the encoded soil type with the
        elevation quartile; ``'distanceToWatercourse/nearestWatercourse'``
        combines the distance quartile with the encoded watercourse. Expects
        ``soilType``, ``nearestWatercourse``, ``elevation`` and
        ``distanceToWatercourse`` columns. The intermediate columns are dropped
        before returning.
        """
        soil_encoder = LabelEncoder()
        watercourse_encoder = LabelEncoder()
        df['soilType_encoded'] = soil_encoder.fit_transform(df['soilType'])
        df['nearestWatercourse_encoded'] = watercourse_encoder.fit_transform(df['nearestWatercourse'])

        bin_labels = ['Low-Mid', 'Mid', 'Mid-High', 'High']
        df['elevation_category'] = self._quantile_categories(df['elevation'], bin_labels)
        df['distanceToWatercourse_category'] = self._quantile_categories(
            df['distanceToWatercourse'], bin_labels
        )

        # Build the interaction terms as strings, then encode them as integers.
        df['soilType/Elevation'] = (
            df['soilType_encoded'].astype(str) + '/' + df['elevation_category'].astype(str)
        )
        df['distanceToWatercourse/nearestWatercourse'] = (
            df['distanceToWatercourse_category'].astype(str)
            + '/'
            + df['nearestWatercourse_encoded'].astype(str)
        )
        for col in ('soilType/Elevation', 'distanceToWatercourse/nearestWatercourse'):
            df[col] = LabelEncoder().fit_transform(df[col])

        df.drop(
            columns=[
                'soilType_encoded',
                'nearestWatercourse_encoded',
                'elevation_category',
                'distanceToWatercourse_category',
            ],
            inplace=True,
        )
        return df

    def encode_categorical_features(self, df, categorical_columns):
        """Label-encode the given categorical columns.

        Values are cast to ``str`` before encoding so mixed-type columns sort
        consistently. Each fitted encoder is stored in ``self.encoders`` under
        its column name. Columns missing from ``df`` are ignored.

        NOTE(review): ``preprocess`` called this method but the class never
        defined it; label encoding is an assumed choice - confirm it matches
        the intended encoding.
        """
        for col in categorical_columns:
            if col in df.columns:
                encoder = LabelEncoder()
                df[col] = encoder.fit_transform(df[col].astype(str))
                self.encoders[col] = encoder
                self.log.append(f"Label-encoded categorical column '{col}'.")
        return df

    def preprocess(self, df, categorical_columns=None, numeric_columns=None, scaling_type="standard",
                   imputation_method="median", sector_data=None, station_data=None, district_data=None):
        """Run the full pipeline: standardize, merge, impute, encode, engineer, scale.

        Args:
            df: Main DataFrame.
            categorical_columns: Columns treated as categorical; defaults to the
                object-dtype columns of ``df``.
            numeric_columns: Columns treated as numeric; defaults to the
                numeric-dtype columns of ``df``.
            scaling_type: Passed to :meth:`scale_numeric_features`.
            imputation_method: Passed to :meth:`handle_missing_data`.
            sector_data, station_data, district_data: Optional supporting tables
                passed to :meth:`merge_datasets`.

        Returns:
            The processed DataFrame.

        The default column lists are resolved before the supporting tables are
        merged, so merged-in and engineered columns are only imputed, encoded
        or scaled when they are listed explicitly.
        """
        if categorical_columns is None:
            categorical_columns = df.select_dtypes(include=["object"]).columns.tolist()
        if numeric_columns is None:
            numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()

        if "postcode" in df.columns:
            df["postcode"] = df["postcode"].apply(self.standardize_postcode)
            self.log.append("Standardized 'postcode' column.")

        df = self.merge_datasets(df, sector_data, station_data, district_data)
        df = self.handle_missing_data(df, categorical_columns, numeric_columns, method=imputation_method)
        df = self.encode_categorical_features(df, categorical_columns)
        df = self.feature_engineering(df, sector_data, station_data)
        df = self.scale_numeric_features(df, numeric_columns, scaling_type=scaling_type)

        return df

    def generate_report(self):
        """Return the logged preprocessing actions, one per line."""
        return "\n".join(self.log)