# Sprint 1: Foundational Setup

## Task 1 (CIR-8): Load and Validate all Datasets

In [None]:
import pandas as pd
import numpy as np
import os
import logging

# Plots
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Set up logging
# logging.FileHandler opens its target file as soon as it is constructed, so
# the "logs" directory has to exist before the handler list is built.
os.makedirs("logs", exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # Persist every record to disk ...
        logging.FileHandler("logs/CIR-8_data_logs.log"),
        # ... and echo it to the console / notebook output as well
        logging.StreamHandler()
    ]
)

In [None]:
# CSVs Directory
data_path = "../04_ANN/CSV/exports/split_set/without_multiple_rows"
# os.listdir gives no ordering guarantee; sorting keeps the load order, and
# with it the row order of every later log and summary, reproducible.
all_files = sorted(os.listdir(data_path))

logging.info("++++++++++++++++++++++++++++++++++++++++++")
logging.info("Start Loading Dataframes.")

# Load CSVs into a dictionary of dataframes, keyed by the file name with
# ".csv" removed and "-" replaced by "_" (so each key is a valid identifier)
dataframes = {}
for file in all_files:
    if file.endswith(".csv"):
        var_name = file.replace(".csv", "").replace("-", "_")
        logging.info(f"Loading... -> {file}")
        # Every column is cast to float32 right after reading
        dataframes[var_name] = pd.read_csv(os.path.join(data_path, file)).astype('float32')

# Log loaded datasets and publish each one as a module-level variable as well;
# a later cell refers to o1_X_train ... o4_X_train directly by name.
for var_name, df in dataframes.items():
    globals()[var_name] = df
    logging.info(f"{var_name} loaded successfully with shape {df.shape}")
logging.info("Load Complete.")
logging.info("++++++++++++++++++++++++++++++++++++++++++")

In [None]:
# Report the dtype composition (number of columns per dtype) of each dataset
logging.info("++++++++++++++++++++++++++++++++++++++++++")
logging.info("Check Datatypes...")
for name, df in dataframes.items():
    dtype_counts = df.dtypes.value_counts()
    logging.info(f"{name} types:\n{dtype_counts}")
logging.info("Check Complete.")
logging.info("++++++++++++++++++++++++++++++++++++++++++")

In [None]:
# Log, per dataset, the total number of missing cells and how many columns
# contain at least one of them
logging.info("++++++++++++++++++++++++++++++++++++++++++")
logging.info("Calculate missing values...")
for name, df in dataframes.items():
    null_mask = df.isnull()
    missing_total = null_mask.sum().sum()
    has_missing = null_mask.any()
    missing_cols = df.columns[has_missing].tolist()
    n_missing_cols = len(missing_cols)
    logging.info(f"{name}: {missing_total} missing values across {n_missing_cols} columns")
logging.info("Missing values calculation complete.")
logging.info("++++++++++++++++++++++++++++++++++++++++++")

In [None]:
# For every dataset whose name contains 'y_', log the number of distinct
# values found in each of its columns
logging.info("++++++++++++++++++++++++++++++++++++++++++")
logging.info("Calculate unique values.")
for name, df in dataframes.items():
    if 'y_' not in name:
        continue
    unique_vals = df.nunique()
    per_column_counts = unique_vals.to_dict()
    logging.info(f"{name} unique target values: {per_column_counts}")

logging.info("Unique values calculation complete.")
logging.info("++++++++++++++++++++++++++++++++++++++++++")

In [None]:
# Build a one-row-per-dataset overview of shape and overall missingness
summary = []
for name, df in dataframes.items():
    # Compute the null mask once and derive both counts from it
    null_mask = df.isnull()
    missing_values = null_mask.sum().sum()
    missing_cols = null_mask.any().sum()
    total_cells = df.shape[0] * df.shape[1]
    summary.append({
        'Dataset': name,
        'Shape': df.shape,
        'Total Missing Values': missing_values,
        # NOTE: key spelling kept unchanged because it is the exported CSV header
        'Accross Missing Columns': missing_cols,
        'Missing %': 100 * missing_values / total_cells
    })

summary_df = pd.DataFrame(summary)
# Save to file; to_csv does not create missing parent directories, so make
# sure the export folder exists first
summary_path = "CSV/exports/01_dataset_summary.csv"
os.makedirs(os.path.dirname(summary_path), exist_ok=True)
summary_df.to_csv(summary_path, index=False)

In [None]:
# Threshold: a row is counted when at most this percentage of its cells is missing
percent = 7

# Training feature sets to inspect, keyed by the name used in the printout
datasets = dict(
    o1_X_train=o1_X_train,
    o2_X_train=o2_X_train,
    o3_X_train=o3_X_train,
    o4_X_train=o4_X_train,
)

# Print, for each dataset, how many rows stay within the threshold and what
# share of all rows that is
for name, df in datasets.items():
    total_rows, total_columns = df.shape
    missing_percentage_per_row = df.isnull().mean(axis=1) * 100
    within_threshold = missing_percentage_per_row <= percent
    missing_rows = within_threshold.sum()
    percent_between = (missing_rows * 100) / total_rows

    report_lines = [
        f"\nDataset: {name}",
        f"Total Rows: {total_rows}, Total Columns: {total_columns}",
        f"Number of rows with missing values up to {percent}%: {missing_rows}",
        f"The percentage between total and missing values sets is {percent_between:.2f}%",
    ]
    print("\n".join(report_lines))

## Task 2 (CIR-9): Analyze Missingness (per row/column)

In [None]:
# Collect per-dataset missingness statistics (overall, per column, per row)
missingness_summary = []

for name, df in dataframes.items():
    if not isinstance(df, pd.DataFrame):
        continue

    # Compute the null mask once; every statistic below is derived from it
    null_mask = df.isnull()

    total_cells = df.shape[0] * df.shape[1]
    total_missing = null_mask.sum().sum()

    # Percentage of missing cells per column and per row
    col_missing = null_mask.mean() * 100
    row_missing = null_mask.mean(axis=1) * 100

    summary = {
        'Dataset': name,
        'Shape': df.shape,
        'Total Missing Cells': total_missing,
        'Total Missing %': round(100 * total_missing / total_cells, 2),
        # Despite the "(%)" in the key, this is a count of columns
        'Columns with Missing (%)': (col_missing > 0).sum(),
        'Max % Missing in Row': round(row_missing.max(), 2),
        'Mean % Missing in Row': round(row_missing.mean(), 2),
        'Min % Missing in Row': round(row_missing.min(), 2)
    }
    missingness_summary.append(summary)

# Create DataFrame for summary
missingness_df = pd.DataFrame(missingness_summary)

# Sort by most missing
missingness_df.sort_values("Total Missing %", ascending=False, inplace=True)

# Save to file; to_csv does not create missing parent directories, so make
# sure the export folder exists first
missingness_path = "CSV/exports/02_missingness_summary.csv"
os.makedirs(os.path.dirname(missingness_path), exist_ok=True)
missingness_df.to_csv(missingness_path, index=False)

# Show the 16 datasets with the highest overall missingness. No filtering is
# applied here: label (y_) datasets are absent from the view only if they
# rank below the first 16 rows.
display(missingness_df.head(16))

## Task 3 (CIR-10): Visualize Missingness

In [None]:
# Destination folder for the Task 3 figures; created if it is not there yet
output_dir = "figures/task3_missingness"
if not os.path.isdir(output_dir):
    os.makedirs(output_dir, exist_ok=True)

# Functions for plotting
def plot_missing_heatmap(df, title, max_rows=200, save_path=None, suffix_filter='(Min)'):
    """Draw a heatmap of missing cells for the columns ending in *suffix_filter*.

    Only the first *max_rows* rows are plotted. The figure is written to
    *save_path* when one is given and is always closed afterwards.

    Args:
        df: DataFrame to inspect.
        title: Label appended to the plot title (the dataset name).
        max_rows: Maximum number of leading rows to include.
        save_path: Output image path; nothing is saved when falsy.
        suffix_filter: Only columns whose label ends with this text are used.

    Returns:
        None. When no column matches or there are no rows, a message is
        printed and no figure is created.
    """
    # str() lets non-string column labels pass through the suffix test
    filtered_cols = [col for col in df.columns if str(col).endswith(suffix_filter)]
    df_filtered = df[filtered_cols].head(max_rows)

    # A frame with no rows or no columns cannot be drawn as a heatmap
    if df_filtered.empty:
        print(f"Skipping heatmap for {title}: no data for columns ending with {suffix_filter!r}")
        return

    plt.figure(figsize=(16, 6))
    try:
        sns.heatmap(df_filtered.isnull(), cbar=False, yticklabels=False)
        plt.title(f"Missing Data Heatmap ({suffix_filter}) Rows: {title}")
        plt.xlabel("Filtered Features")
        plt.ylabel("Rows")
        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300)
    finally:
        # Release the figure even when plotting or saving fails
        plt.close()

def plot_column_missing_bar(df, title, save_path=None, top_n=60, suffix_filter='(Min)'):
    """Draw a horizontal bar chart of the columns with the most missing values.

    Only columns ending in *suffix_filter* that have at least one missing
    value are considered; the *top_n* with the highest missing percentage
    are plotted. The figure is written to *save_path* when one is given and
    is always closed afterwards.

    Args:
        df: DataFrame to inspect.
        title: Label appended to the plot title (the dataset name).
        save_path: Output image path; nothing is saved when falsy.
        top_n: Maximum number of columns to show.
        suffix_filter: Only columns whose label ends with this text are used.

    Returns:
        None. When no column matches or none has missing values, a message
        is printed and no figure is created.
    """
    # str() lets non-string column labels pass through the suffix test
    filtered_cols = [col for col in df.columns if str(col).endswith(suffix_filter)]
    if not filtered_cols:
        print(f"Skipping bar plot for {title}: no columns ending with {suffix_filter!r}")
        return

    missing_perc = df[filtered_cols].isnull().mean() * 100
    missing_perc = missing_perc[missing_perc > 0].sort_values(ascending=False).head(top_n)

    # Plotting an empty Series fails, and the figure height would be zero
    if missing_perc.empty:
        print(f"Skipping bar plot for {title}: no missing values in columns ending with {suffix_filter!r}")
        return

    # Height grows with the number of bars; the floor keeps a chart with only
    # a few bars tall enough for the title and axis labels
    fig_height = max(2.0, 0.4 * len(missing_perc))
    plt.figure(figsize=(10, fig_height))
    try:
        missing_perc.plot(kind='barh')
        plt.title(f"Top {len(missing_perc)} ({suffix_filter}) Missing: {title}")
        plt.xlabel("Percentage Missing")
        plt.ylabel("Feature")
        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300)
    finally:
        # Release the figure even when plotting or saving fails
        plt.close()


# Filter only X datasets from the summary.
# Label datasets are recognised by 'y_' anywhere in the name, the same test
# the unique-values check uses; a prefix-only test would not exclude names
# where 'y_' is not at the start.
# NOTE(review): assumes no feature dataset name contains 'y_' - confirm.
is_label = missingness_df['Dataset'].str.contains('y_', regex=False)
is_loaded = missingness_df['Dataset'].isin(dataframes.keys())
top_datasets = missingness_df[~is_label & is_loaded].head(16)['Dataset'].tolist()

# Generate plots: one heatmap and one bar chart per selected dataset
for dataset_name in top_datasets:
    df = dataframes.get(dataset_name)
    if df is not None:
        print(f"Generating plots for: {dataset_name}")
        heatmap_path = os.path.join(output_dir, f"{dataset_name}_heatmap.png")
        barplot_path = os.path.join(output_dir, f"{dataset_name}_barplot.png")

        plot_missing_heatmap(df, dataset_name, save_path=heatmap_path)
        plot_column_missing_bar(df, dataset_name, save_path=barplot_path)
    else:
        print(f"Dataset not found in dataframes: {dataset_name}")