In [2]:
# ============================================================
# SPSV Project - Step 1: Load, Inspect, and Clean 

# Purpose: This script prepares the 5 synthetic SPSV datasets for analysis,
#          visualisation, and machine learning by:
#          1) loading each dataset,
#          2) inspecting data quality issues,
#          3) applying justified cleaning steps,
#          4) saving cleaned outputs for reproducibility.
# ============================================================

import pandas as pd
import numpy as np

In [3]:
# ------------------------------------------------------------
# 1) LOAD DATASETS (each one individually)
# ------------------------------------------------------------

# I load each dataset separately (rather than combining immediately) so I can:
# - understand the structure and meaning of each dataset,
# - identify issues specific to each dataset (missing values, wrong types, duplicates),
# - clean them in a controlled way before integration and dashboard development.

complaints = pd.read_csv("spsv_complaints.csv")
enforcement = pd.read_csv("spsv_enforcement.csv")
inspections = pd.read_csv("spsv_inspections.csv")
licences = pd.read_csv("spsv_licences.csv")
monthly_kpis = pd.read_csv("spsv_monthly_kpis.csv")

datasets = {
    "complaints": complaints,
    "enforcement": enforcement,
    "inspections": inspections,
    "licences": licences,
    "monthly_kpis": monthly_kpis
}

In [12]:
# ------------------------------------------------------------
# 2) DATA INSPECTION (baseline checks before cleaning)
# ------------------------------------------------------------

def quick_profile(df: pd.DataFrame, name: str) -> None:
    """
    Before cleaning, I run a basic inspection to document data quality.
    This supports the preprocessing rubric because it shows:
    - dataset size and columns (scope),
    - data types (e.g., dates stored as strings),
    - missing values (null handling decisions),
    - duplicates (risk of double-counting in analysis),
    - sample records (sanity checking).
    """
    print("\n" + "=" * 80)
    print(f"DATASET: {name}")
    print("=" * 80)

    print(f"Shape (rows, columns): {df.shape}")

    print("\nColumn names:")
    print(list(df.columns))

    print("\nData types and non-null counts:")
    df.info()

    print("\nMissing values per column (top 15 shown):")
    missing = df.isna().sum().sort_values(ascending=False)
    print(missing.head(15))

    duplicates = df.duplicated().sum()
    print(f"\nDuplicate rows found: {duplicates}")

    print("\nFirst 5 rows (sample records):")
    print(df.head())

# ------------------------------------------------------------
# 3) CLEANING FUNCTIONS (applied consistently across datasets)
# ------------------------------------------------------------

def standardise_text_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Many dashboards rely on grouping/aggregating categories (e.g., Region, Complaint Type).
    If categories are inconsistent (e.g., 'Dublin', ' dublin ', 'DUBLIN'), results become unreliable.
    Therefore, I standardise text columns by:
    - trimming whitespace,
    - converting blank/null-like strings to NaN,
    - uppercasing likely categorical columns (to reduce category drift).
    """
    
    df = df.copy()
    object_cols = df.select_dtypes(include=["object"]).columns.tolist()

    for col in object_cols:
        # Trim whitespace to avoid categories that differ only due to spacing.
        df[col] = df[col].astype(str).str.strip()

        # Convert common null-like text to actual missing values (NaN),
        # so missingness is handled consistently later.
        df[col] = df[col].replace(
            {"": np.nan, "NA": np.nan, "N/A": np.nan, "NULL": np.nan, "NONE": np.nan},
            regex=False
        )

        # Heuristic for category columns:
        # If the column has relatively few unique values compared to number of rows,
        # it is likely categorical and benefits from uppercasing.
        
        unique_count = df[col].nunique(dropna=True)
        unique_ratio = unique_count / max(len(df), 1)

        if unique_count > 0 and unique_ratio < 0.2:
            df[col] = df[col].str.upper()

    return df


def detect_date_columns(df: pd.DataFrame) -> list:
    """
    I detect likely date/time columns using column name patterns.
    Converting these columns to datetime is necessary for:
    - time series charts,
    - monthly trend analysis,
    - forecasting (ML component).
    """
    candidates = []
    for c in df.columns:
        c_lower = str(c).lower()
        if any(word in c_lower for word in ["date", "month", "time", "year"]):
            candidates.append(c)
    return candidates


def parse_dates(df: pd.DataFrame, name: str) -> pd.DataFrame:
    """
    Dates are often read as strings from CSV. I convert date-like columns to datetime.
    I use errors='coerce' so invalid dates become NaT rather than stopping execution.
    After conversion, I print how many values could not be parsed, which helps justify cleaning.
    """
    
    df = df.copy()
    date_cols = detect_date_columns(df)

    if date_cols:
        print(f"\n[{name}] Date-like columns detected: {date_cols}")

    for col in date_cols:
        before_missing = df[col].isna().sum()

        df[col] = pd.to_datetime(df[col], errors="coerce", infer_datetime_format=True)

        after_missing = df[col].isna().sum()
        newly_failed = after_missing - before_missing

        print(f"[{name}] Converted '{col}' to datetime. Newly failed parses: {newly_failed}")

    return df


def handle_missing_values(df: pd.DataFrame, name: str) -> pd.DataFrame:
    """
    Missing value handling must be justified rather than applying a single rule.
    My strategy is:
    - For categorical columns: fill missing with 'UNKNOWN' (keeps records for analysis and avoids dropping too much).
    - For numeric columns: fill missing with the median (more robust than mean if outliers exist).
    - For critical datetime columns: drop rows with missing dates because time-based analysis cannot use them.
    This approach balances data retention with analytical validity.
    """
    df = df.copy()

    # Fill missing text/categorical values.
    object_cols = df.select_dtypes(include=["object"]).columns.tolist()
    for col in object_cols:
        df[col] = df[col].fillna("UNKNOWN")

    # Fill missing numeric values with the column median.
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    for col in numeric_cols:
        if df[col].isna().any():
            df[col] = df[col].fillna(df[col].median())

    # Drop rows where datetime columns are missing (if datetime conversion occurred).
    date_cols = detect_date_columns(df)
    for col in date_cols:
        if col in df.columns and pd.api.types.is_datetime64_any_dtype(df[col]):
            before = len(df)
            df = df.dropna(subset=[col])
            after = len(df)
            dropped = before - after
            if dropped > 0:
                print(f"[{name}] Dropped {dropped} rows due to missing '{col}' (required for time-based analysis).")

    return df


def remove_duplicates(df: pd.DataFrame, name: str) -> pd.DataFrame:
    """
    Duplicate rows can inflate totals and distort trends (e.g., double-counting complaints).
    Therefore, I remove exact duplicate rows as a standard quality step.
    """
    before = len(df)
    df = df.drop_duplicates()
    after = len(df)

    if before != after:
        print(f"[{name}] Removed {before - after} duplicate rows.")

    return df


def clean_dataset(df: pd.DataFrame, name: str) -> pd.DataFrame:
    """
    This function applies the cleaning pipeline consistently across datasets.
    Consistency is important for integration and dashboard filtering later.
    Cleaning steps:
    1) standardise text categories,
    2) parse date columns,
    3) handle missing values with justified rules,
    4) remove duplicates,
    5) report final dataset shape and remaining missing values.
    """
    print("\n" + "-" * 80)
    print(f"START CLEANING: {name}")
    print("-" * 80)

    df = standardise_text_columns(df)
    df = parse_dates(df, name)
    df = handle_missing_values(df, name)
    df = remove_duplicates(df, name)

    print(f"[{name}] Cleaning complete. Final shape: {df.shape}")
    print(f"[{name}] Remaining missing values (top 10):")
    print(df.isna().sum().sort_values(ascending=False).head(10))

    return df


In [13]:
# ------------------------------------------------------------
# 4) INSPECT DATASETS BEFORE CLEANING
# ------------------------------------------------------------
# This documents baseline data quality so that cleaning decisions are transparent.

for name, df in datasets.items():
    quick_profile(df, name)

# ------------------------------------------------------------
# 5) CLEAN EACH DATASET
# ------------------------------------------------------------

# I clean each dataset separately, because each has different columns and potential issues.
complaints_clean = clean_dataset(complaints, "complaints")
enforcement_clean = clean_dataset(enforcement, "enforcement")
inspections_clean = clean_dataset(inspections, "inspections")
licences_clean = clean_dataset(licences, "licences")
monthly_kpis_clean = clean_dataset(monthly_kpis, "monthly_kpis")

cleaned_datasets = {
    "complaints_clean": complaints_clean,
    "enforcement_clean": enforcement_clean,
    "inspections_clean": inspections_clean,
    "licences_clean": licences_clean,
    "monthly_kpis_clean": monthly_kpis_clean
}



DATASET: complaints
Shape (rows, columns): (15000, 7)

Column names:
['Complaint_ID', 'Licence_ID', 'Complaint_Type', 'Date_Received', 'Resolved', 'Days_To_Resolution', 'Escalated']

Data types and non-null counts:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Complaint_ID        15000 non-null  object 
 1   Licence_ID          15000 non-null  object 
 2   Complaint_Type      15000 non-null  object 
 3   Date_Received       15000 non-null  object 
 4   Resolved            15000 non-null  object 
 5   Days_To_Resolution  10619 non-null  float64
 6   Escalated           15000 non-null  object 
dtypes: float64(1), object(6)
memory usage: 820.4+ KB

Missing values per column (top 15 shown):
Days_To_Resolution    4381
Licence_ID               0
Complaint_ID             0
Complaint_Type           0
Date_Received            0
Res

  df[col] = pd.to_datetime(df[col], errors="coerce", infer_datetime_format=True)
  df[col] = pd.to_datetime(df[col], errors="coerce", infer_datetime_format=True)
  df[col] = pd.to_datetime(df[col], errors="coerce", infer_datetime_format=True)



[inspections] Date-like columns detected: ['Inspection_Date']
[inspections] Converted 'Inspection_Date' to datetime. Newly failed parses: 0
[inspections] Cleaning complete. Final shape: (20000, 7)
[inspections] Remaining missing values (top 10):
Inspection_ID         0
Licence_ID            0
Inspection_Date       0
Inspection_Type       0
Outcome               0
Breach_Category       0
Follow_Up_Required    0
dtype: int64

--------------------------------------------------------------------------------
START CLEANING: licences
--------------------------------------------------------------------------------

[licences] Date-like columns detected: ['Issue_Date', 'Expiry_Date', 'Vehicle_Plate_Year', 'Driver_Experience_Years']
[licences] Converted 'Issue_Date' to datetime. Newly failed parses: 0
[licences] Converted 'Expiry_Date' to datetime. Newly failed parses: 0
[licences] Converted 'Vehicle_Plate_Year' to datetime. Newly failed parses: 0
[licences] Converted 'Driver_Experience_Years'

  df[col] = pd.to_datetime(df[col], errors="coerce", infer_datetime_format=True)
  df[col] = pd.to_datetime(df[col], errors="coerce", infer_datetime_format=True)
  df[col] = pd.to_datetime(df[col], errors="coerce", infer_datetime_format=True)
  df[col] = pd.to_datetime(df[col], errors="coerce", infer_datetime_format=True)
  df[col] = pd.to_datetime(df[col], errors="coerce", infer_datetime_format=True)


In [14]:
# ------------------------------------------------------------
# 6) QUICK VERIFICATION AFTER CLEANING
# ------------------------------------------------------------

# This verifies that datasets are ready for the next steps:
# - Exploratory Data Analysis (EDA),
# - dashboard visualisations,
# - predictive modelling (machine learning).
for name, df in cleaned_datasets.items():
    print("\n" + "=" * 80)
    print(f"AFTER CLEANING CHECK: {name}")
    print("=" * 80)
    print(f"Shape: {df.shape}")
    print("Missing values (top 10):")
    print(df.isna().sum().sort_values(ascending=False).head(10))
    print("First 3 rows:")
    print(df.head(3))


AFTER CLEANING CHECK: complaints_clean
Shape: (15000, 7)
Missing values (top 10):
Complaint_ID          0
Licence_ID            0
Complaint_Type        0
Date_Received         0
Resolved              0
Days_To_Resolution    0
Escalated             0
dtype: int64
First 3 rows:
  Complaint_ID  Licence_ID   Complaint_Type Date_Received Resolved  \
0   CMP_000001  SPSV_02362  REFUSAL OF HIRE    2025-05-12      YES   
1   CMP_000002  SPSV_04709     OVERCHARGING    2022-05-25       NO   
2   CMP_000003  SPSV_01411            OTHER    2024-07-26       NO   

   Days_To_Resolution Escalated  
0                17.0        NO  
1                14.0        NO  
2                14.0        NO  

AFTER CLEANING CHECK: enforcement_clean
Shape: (4000, 5)
Missing values (top 10):
Enforcement_ID    0
Licence_ID        0
Action_Type       0
Action_Date       0
Outcome           0
dtype: int64
First 3 rows:
  Enforcement_ID  Licence_ID Action_Type Action_Date   Outcome
0     ENF_000001  SPSV_01329  SU

In [16]:
# ------------------------------------------------------------
# 7) SAVE CLEANED DATASETS
# ------------------------------------------------------------

# Saving cleaned outputs supports reproducibility and makes later dashboard/ML steps cleaner.
# The cleaned files can be tracked in GitHub and re-used without repeating preprocessing each run.
complaints_clean.to_csv("spsv_complaints_clean.csv", index=False)
enforcement_clean.to_csv("spsv_enforcement_clean.csv", index=False)
inspections_clean.to_csv("spsv_inspections_clean.csv", index=False)
licences_clean.to_csv("spsv_licences_clean.csv", index=False)
monthly_kpis_clean.to_csv("spsv_monthly_kpis_clean.csv", index=False)

print("\nSaved cleaned datasets: spsv_*_clean.csv.")


Saved cleaned datasets: spsv_*_clean.csv.
