In [None]:
import pandas as pd
import os
from google.colab import files

# Folder configuration for the cleaning helpers
DATA_DIR = "/content"  # Colab's default working directory
OUTPUT_DIR = f"{DATA_DIR}/cleaned"  # evaluates to "/content/cleaned"

# Create the output folder up front; exist_ok makes re-running this cell harmless
os.makedirs(OUTPUT_DIR, exist_ok=True)

def clean_dataframe(df):
    """Return a cleaned copy of ``df``; the caller's frame is not modified.

    Steps, in order:
      1. Column labels are stripped and spaces/hyphens become underscores.
      2. Non-missing values in text columns are converted to ``str`` and
         stripped of surrounding whitespace; missing values stay missing.
      3. Fully duplicated rows are dropped.
      4. Missing values become ``"Unknown"`` in text columns and ``0`` in
         numeric columns. Columns of any other dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table to clean.

    Returns
    -------
    pandas.DataFrame
        A new frame with the steps above applied.
    """
    # Work on a copy so the renaming below never leaks into the caller's frame.
    cleaned = df.copy()

    # 1. Standardize column names (str() tolerates non-string labels such as ints)
    cleaned.columns = [
        str(name).strip().replace(" ", "_").replace("-", "_")
        for name in cleaned.columns
    ]

    def is_text(series):
        # Text means classic object dtype or a dedicated pandas string dtype.
        dtype = series.dtype
        return pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)

    # 2. Strip whitespace from text columns.
    # A blanket astype(str) would turn NaN/None into the literal strings
    # "nan"/"None", hiding them from the fill in step 4, so missing cells are
    # kept as-is and only the remaining cells are replaced by stripped strings.
    for col in [name for name in cleaned.columns if is_text(cleaned[name])]:
        values = cleaned[col]
        cleaned[col] = values.where(values.isna(), values.astype(str).str.strip())

    # 3. Handle duplicates (after stripping, so " a" and "a" count as equal)
    cleaned = cleaned.drop_duplicates()

    # 4. Handle missing values with a per-column default chosen by dtype
    fill_values = {}
    for col in cleaned.columns:
        if is_text(cleaned[col]):
            fill_values[col] = "Unknown"
        elif pd.api.types.is_numeric_dtype(cleaned[col].dtype):
            fill_values[col] = 0
    if fill_values:
        cleaned = cleaned.fillna(fill_values)

    return cleaned

def clean_and_save(filename, date_cols=None):
    """Load one CSV, clean it with ``clean_dataframe`` and write a ``_clean`` copy.

    Parameters
    ----------
    filename : str
        CSV to load. It is joined onto ``DATA_DIR`` for reading, and the
        derived output name is joined onto ``OUTPUT_DIR`` for writing.
        NOTE: ``os.path.join`` discards the earlier component when the later
        one is absolute, so an absolute ``filename`` is read as given and its
        cleaned copy is written next to it, not under ``OUTPUT_DIR``.
    date_cols : str or iterable of str, optional
        Column name(s), as they appear after cleaning, to re-format as text
        dates. ``"Timestamp"`` is written as ``%Y-%m-%d %H:%M:%S``; any other
        column as ``%Y-%m-%d``. Values that cannot be parsed become missing
        (``errors="coerce"``).

    Returns
    -------
    str
        Path of the CSV that was written.

    Raises
    ------
    KeyError
        If a requested date column is not present in the cleaned frame.
    """
    df = pd.read_csv(os.path.join(DATA_DIR, filename))
    df = clean_dataframe(df)

    # Format date columns properly
    if date_cols:
        # A bare string would otherwise be iterated character by character.
        date_cols = [date_cols] if isinstance(date_cols, str) else list(date_cols)

        # Fail early with a message that names the file and the real columns.
        missing = [col for col in date_cols if col not in df.columns]
        if missing:
            raise KeyError(
                f"Date column(s) {missing} not found in {filename}; "
                f"available columns: {list(df.columns)}"
            )

        for col in date_cols:
            date_format = "%Y-%m-%d %H:%M:%S" if col == "Timestamp" else "%Y-%m-%d"
            df[col] = pd.to_datetime(df[col], errors="coerce").dt.strftime(date_format)

    # Insert "_clean" before the extension only. str.replace(".csv", ...) would
    # touch every ".csv" in the path and leave names such as "x.CSV" unchanged,
    # which for an absolute filename means overwriting the raw input file.
    root, ext = os.path.splitext(filename)
    cleaned_file = os.path.join(OUTPUT_DIR, f"{root}_clean{ext}")
    df.to_csv(cleaned_file, index=False)
    print(f"Cleaned {filename} → {cleaned_file}")
    return cleaned_file

# Clean all datasets
# Each entry is (CSV path, date columns to re-format or None).
# The paths are absolute, so the cleaned copies are written beside the inputs
# in /content (see the printed paths in this cell's output).
dataset_specs = [
    ("/content/providers_data.csv", None),
    ("/content/receivers_data.csv", None),
    ("/content/food_listings_data.csv", ["Expiry_Date"]),
    ("/content/claims_data.csv", ["Timestamp"]),
]
files_to_download = [
    clean_and_save(csv_path, date_cols=date_columns)
    for csv_path, date_columns in dataset_specs
]

# Download cleaned files (one browser download per cleaned CSV)
for cleaned_path in files_to_download:
    files.download(cleaned_path)


Cleaned /content/providers_data.csv → /content/providers_data_clean.csv
Cleaned /content/receivers_data.csv → /content/receivers_data_clean.csv
Cleaned /content/food_listings_data.csv → /content/food_listings_data_clean.csv
Cleaned /content/claims_data.csv → /content/claims_data_clean.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# --- Combine all cleaned datasets ---
import pandas as pd
from google.colab import files

# Read back the four cleaned CSVs
providers = pd.read_csv("/content/providers_data_clean.csv")
receivers = pd.read_csv("/content/receivers_data_clean.csv")
food_listings = pd.read_csv("/content/food_listings_data_clean.csv")
claims = pd.read_csv("/content/claims_data_clean.csv")

# Build one wide table, starting from the food listings and left-joining:
#   1. providers on Provider_ID
#   2. claims on Food_ID
#   3. receivers on Receiver_ID
# Left joins keep every food listing even when nothing matches on the right.
# NOTE(review): pandas applies `suffixes` only to column names present on both
# sides, so right-hand columns that do not collide keep their bare names —
# confirm the resulting headers are unambiguous.
merged = (
    food_listings
    .merge(providers, on="Provider_ID", how="left", suffixes=("", "_Provider"))
    .merge(claims, on="Food_ID", how="left", suffixes=("", "_Claim"))
    .merge(receivers, on="Receiver_ID", how="left", suffixes=("", "_Receiver"))
)

# Write the consolidated table to disk, then hand it to the browser
final_path = "/content/food_wastage_system_combined.csv"
merged.to_csv(final_path, index=False)
files.download(final_path)

print("Combined dataset created and downloaded:", final_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Combined dataset created and downloaded: /content/food_wastage_system_combined.csv
