In [14]:
import pandas as pd
from datetime import datetime
import os

# === Parameters ===
# Absolute paths of the raw input CSV and of the cleaned output CSV.
raw_data_path = r"C:\Users\aless\Desktop\gcd_project\green-city-dashboard-padova\data\raw\padua_1505_1708_raw.csv"
processed_data_path = r"C:\Users\aless\Desktop\gcd_project\green-city-dashboard-padova\data\processed\padua_1505_1708_clean.csv"

# Timestamp layout expected in the "time" column (e.g. 2023-05-15T13:00).
TIME_FORMAT = "%Y-%m-%dT%H:%M"


def light_preprocess(df):
    """Return a lightly cleaned, chronologically ordered copy of *df*.

    Steps, in order:
      1. Normalise column names (trimmed, lower-case, spaces/hyphens -> "_").
      2. Parse the "time" column with ``TIME_FORMAT``; values that do not
         match become NaT and their rows are dropped.
      3. Sort by "time".
      4. Forward-fill, then backward-fill, gaps in numeric columns.
      5. Drop fully duplicated rows.

    Args:
        df: Raw frame; after name normalisation it must contain a "time"
            column.

    Returns:
        A new DataFrame. The input frame is not modified.

    Raises:
        KeyError: If no "time" column exists after name normalisation.
    """
    # Work on a copy so the caller's frame keeps its original columns/values.
    df = df.copy()

    # === Col rename ===
    df.columns = (
        df.columns
        .str.strip()
        .str.lower()
        .str.replace(" ", "_", regex=False)
        .str.replace("-", "_", regex=False)
    )

    # Every later step depends on "time", so fail early with a clear message
    # rather than with an opaque KeyError from dropna/sort_values.
    if "time" not in df.columns:
        raise KeyError(
            f"'time' column not found; available columns: {list(df.columns)}"
        )

    # === Date to datetime ===
    df["time"] = pd.to_datetime(df["time"], errors="coerce", format=TIME_FORMAT)

    # === Remove rows with a missing or unparseable timestamp ===
    df = df.dropna(subset=["time"])

    # === Data sorting ===
    # Sort BEFORE filling: ffill/bfill propagate along row order, so rows must
    # be chronological or a gap could be filled from a later timestamp.
    # A stable sort keeps the file order of rows sharing a timestamp.
    df = df.sort_values(by="time", kind="stable")

    # === Missing values ===
    # "number" covers every numeric dtype, not only float64/int64.
    numeric_cols = df.select_dtypes(include="number").columns
    # bfill only affects leading gaps that have no earlier value to carry.
    df[numeric_cols] = df[numeric_cols].ffill().bfill()

    # === Remove duplicates ===
    return df.drop_duplicates()


if __name__ == "__main__":
    # === Raw data load ===
    df = pd.read_csv(raw_data_path, sep=',', encoding='utf-8')
    print(f"📂 Raw ds: {df.shape[0]} rows, {df.shape[1]} cols")

    df = light_preprocess(df)

    # === Clean ds save ===
    # Create the destination folder on first run so to_csv cannot fail on it.
    output_dir = os.path.dirname(processed_data_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(processed_data_path, index=False, encoding='utf-8')
    print(f"✅ Light preprocessing done. File saved in: {processed_data_path}")


📂 Raw ds: 2280 rows, 8 cols
✅ Light preprocessing done. File saved in: C:\Users\aless\Desktop\gcd_project\green-city-dashboard-padova\data\processed\padua_1505_1708_clean.csv
