In [None]:
import os
import pandas as pd

base_dir = "GroundTruth_Data"   # root folder containing 39 station subfolders

# Fail early with an actionable message. os.listdir() would raise the same
# exception type, but without any hint about what to change.
if not os.path.exists(base_dir):
    raise FileNotFoundError(
        f"{base_dir} not found! Point base_dir at the folder that holds the station subfolders."
    )

all_data = []

for station_folder in os.listdir(base_dir):
    station_path = os.path.join(base_dir, station_folder)

    if not os.path.isdir(station_path):  # only go into folders
        continue

    for file in os.listdir(station_path):
        if not file.endswith((".csv", ".xlsx")):
            continue
        file_path = os.path.join(station_path, file)

        # Read CSV or Excel
        if file.endswith(".csv"):
            df = pd.read_csv(file_path)
        else:
            df = pd.read_excel(file_path)

        # A file lacking either required column cannot contribute a row;
        # skip it (and say so) instead of aborting the whole run with KeyError.
        missing = [col for col in ("From Date", "NO2") if col not in df.columns]
        if missing:
            print(f"Skipping {file_path}: missing column(s) {missing}")
            continue

        # Standardize
        df["datetime"] = pd.to_datetime(df["From Date"], errors="coerce")
        df["date"] = df["datetime"].dt.date
        df["Station Name"] = station_folder  # assign folder as station name

        all_data.append(df[["Station Name", "date", "NO2"]])

if all_data:
    # Combine all stations
    cpcb = pd.concat(all_data, ignore_index=True)

    # Daily mean per station
    cpcb_daily = cpcb.groupby(["Station Name","date"])["NO2"].mean().reset_index()
else:
    # pd.concat() raises ValueError on an empty list, so build the empty
    # result explicitly; cpcb and cpcb_daily are then always defined.
    cpcb = pd.DataFrame(columns=["Station Name", "date", "NO2"])
    cpcb_daily = cpcb.copy()
    print(f"No usable files found under {base_dir}")

print(cpcb_daily.head())


FileNotFoundError: [Errno 2] No such file or directory: 'GroundTruth_Data'

In [None]:
import os
import pandas as pd
from google.colab import files
from google.colab import drive

# ============= OPTION 1: UPLOAD LOCAL FOLDER =============
# (You will upload a ZIP of your folder, we unzip it in Colab)
# def upload_local_folder():
#     print("Please upload a ZIP file containing the 'groundtruth' folder...")
#     uploaded = files.upload()
#     for fn in uploaded.keys():
#         print(f"Unzipping {fn} ...")
#         os.system(f"unzip -q {fn}")
#     return "groundtruth"   # assuming folder inside zip is named groundtruth


# ============= OPTION 2: USE GOOGLE DRIVE =============
def mount_drive_folder(folder_path="groundtruth"):
    """Mount Google Drive at /content/drive and return the ground-truth folder.

    Note: ``folder_path`` is accepted but never consulted; the returned
    location is always the fixed Drive path assigned below.

    Returns:
        str: the ground-truth directory path on the mounted Drive.

    Raises:
        FileNotFoundError: if that directory does not exist after mounting.
    """
    drive.mount('/content/drive')
    groundtruth_root = "/content/drive/MyDrive/DELHI/GroundTruth_Data (1)"   # modify path if needed
    if os.path.exists(groundtruth_root):
        return groundtruth_root
    raise FileNotFoundError(f"{groundtruth_root} not found in Drive!")


# ============= MAIN PIPELINE =============
def load_groundtruth(base_dir, excel_skiprows=None):
    """Build a per-station daily-mean NO2 table from CSV/Excel files.

    Every immediate subdirectory of ``base_dir`` is treated as one station and
    its name becomes the ``Station Name``. Each ``.csv``/``.xlsx`` file inside
    is read; files without a ``From Date`` (or fallback ``date``) column, or
    without an ``NO2`` column, are ignored.

    Parameters
    ----------
    base_dir : str
        Directory whose subdirectories hold the station files.
    excel_skiprows : int or None, optional
        Forwarded to ``pandas.read_excel`` as ``skiprows`` for ``.xlsx`` files,
        for workbooks whose column header is not on the first row. The default
        (``None``) reads from the first row, as before.

    Returns
    -------
    pandas.DataFrame
        Columns ``Station Name``, ``date``, ``NO2`` with one row per
        station/day holding the mean NO2. Empty (with those columns) when no
        file qualifies, instead of the ValueError ``pd.concat`` raises on an
        empty list.
    """
    columns = ["Station Name", "date", "NO2"]
    frames = []

    for station_folder in os.listdir(base_dir):
        station_path = os.path.join(base_dir, station_folder)
        if not os.path.isdir(station_path):
            continue

        for file_name in os.listdir(station_path):
            if not file_name.endswith((".csv", ".xlsx")):
                continue
            file_path = os.path.join(station_path, file_name)

            # Read CSV or Excel
            if file_name.endswith(".csv"):
                df = pd.read_csv(file_path)
            else:
                df = pd.read_excel(file_path, skiprows=excel_skiprows)

            # Pick the timestamp column: "From Date" first, "date" as fallback
            if "From Date" in df.columns:
                date_source = "From Date"
            elif "date" in df.columns:
                date_source = "date"
            else:
                continue

            # Only keep files that actually carry NO2 readings
            if "NO2" not in df.columns:
                continue

            # Unparseable timestamps become NaT and drop out of the groupby
            df["datetime"] = pd.to_datetime(df[date_source], errors="coerce")
            df["date"] = df["datetime"].dt.date
            df["Station Name"] = station_folder
            frames.append(df[columns])

    if not frames:
        return pd.DataFrame(columns=columns)

    # Combine all stations, then take the daily mean per station
    cpcb = pd.concat(frames, ignore_index=True)
    cpcb_daily = cpcb.groupby(["Station Name", "date"])["NO2"].mean().reset_index()

    return cpcb_daily


# ============= RUN =============
# Uncomment ONE of the two options:

# 1. If you upload a ZIP manually
# base_dir = upload_local_folder()

# 2. If you use Google Drive
# base_dir = mount_drive_folder("groundtruth")

# Once chosen, load data
# cpcb_daily = load_groundtruth(base_dir)
# print(cpcb_daily.head())


In [None]:
import os
import pandas as pd
from google.colab import drive

# ============= MOUNT GOOGLE DRIVE =============
drive.mount('/content/drive')

# Path to your "groundtruth" folder in Google Drive
# Make sure you placed it inside MyDrive
base_dir = "/content/drive/MyDrive/DELHI/GroundTruth_Data (1)"

if not os.path.exists(base_dir):
    raise FileNotFoundError(f"{base_dir} not found! Please check path and folder name.")
else:
    print(f"✅ Base directory found: {base_dir}")

# ============= LOAD & PROCESS DATA =============
all_data = []

for station_folder in os.listdir(base_dir):
    station_path = os.path.join(base_dir, station_folder)
    print(f"Checking folder: {station_path}")

    if os.path.isdir(station_path):  # Only go into subfolders (stations)
        for file in os.listdir(station_path):
            print(f"  Checking file: {file}")
            if file.endswith(".csv") or file.endswith(".xlsx"):
                file_path = os.path.join(station_path, file)
                print(f"    Processing file: {file_path}")

                try:
                    # Read CSV or Excel
                    if file.endswith(".csv"):
                        df = pd.read_csv(file_path)
                    else:
                        # The station workbooks have 16 rows above the column
                        # header (header is on row 17). Reading from row 1
                        # made every workbook look like it had no
                        # 'From Date' column, so all of them were skipped.
                        df = pd.read_excel(file_path, skiprows=16)

                    # Standardize datetime ("From Date" preferred, "date" as fallback).
                    # The else branch is the only way a file can lack a date
                    # column, so no separate "found" flag is needed.
                    if "From Date" in df.columns:
                        df["datetime"] = pd.to_datetime(df["From Date"], errors="coerce")
                    elif "date" in df.columns:
                        df["datetime"] = pd.to_datetime(df["date"], errors="coerce")
                    else:
                        print(f"      Skipping file: {file_path} - No 'From Date' or 'date' column found.")
                        continue  # skip if no recognizable date column

                    df["date"] = df["datetime"].dt.date
                    df["Station Name"] = station_folder  # Assign folder name as station

                    # Keep only required columns
                    if "NO2" in df.columns:
                        all_data.append(df[["Station Name", "date", "NO2"]])
                        print(f"      Added data from file: {file_path}")
                    else:
                        print(f"      Skipping file: {file_path} - No 'NO2' column found.")

                except Exception as e:
                    # Best effort: report the unreadable file and move on to the next one
                    print(f"      Error processing file {file_path}: {e}")
                    continue


# Combine all stations into one dataframe
if all_data: # Check if all_data is not empty before concatenating
    cpcb = pd.concat(all_data, ignore_index=True)

    # Daily mean per station
    cpcb_daily = cpcb.groupby(["Station Name", "date"])["NO2"].mean().reset_index()

    # ============= SAVE OUTPUT =============
    output_path = "/content/drive/MyDrive/cpcb_daily.csv"
    cpcb_daily.to_csv(output_path, index=False)

    print("\n✅ Processing complete!")
    print(f"Saved file at: {output_path}")
    print(cpcb_daily.head())
else:
    print("\n❌ No data was processed from the specified directory. Please check the path, folder structure, file types, and column names.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Base directory found: /content/drive/MyDrive/DELHI/GroundTruth_Data (1)
Checking folder: /content/drive/MyDrive/DELHI/GroundTruth_Data (1)/Najafgarh (1)
  Checking file: oct (1).xlsx
    Processing file: /content/drive/MyDrive/DELHI/GroundTruth_Data (1)/Najafgarh (1)/oct (1).xlsx
      Skipping file: /content/drive/MyDrive/DELHI/GroundTruth_Data (1)/Najafgarh (1)/oct (1).xlsx - No 'From Date' or 'date' column found.
  Checking file: june (1).xlsx
    Processing file: /content/drive/MyDrive/DELHI/GroundTruth_Data (1)/Najafgarh (1)/june (1).xlsx
      Skipping file: /content/drive/MyDrive/DELHI/GroundTruth_Data (1)/Najafgarh (1)/june (1).xlsx - No 'From Date' or 'date' column found.
  Checking file: Jan (1).xlsx
    Processing file: /content/drive/MyDrive/DELHI/GroundTruth_Data (1)/Najafgarh (1)/Jan (1).xlsx
      Skipping file: /content/drive/MyDrive/DELHI/G

In [None]:
import os
import pandas as pd
from google.colab import drive

# ============= MOUNT GOOGLE DRIVE =============
drive.mount('/content/drive')

# Ground-truth root on Drive: one subfolder per station
base_dir = "/content/drive/MyDrive/DELHI/GroundTruth_Data (1)"

if not os.path.exists(base_dir):
    raise FileNotFoundError(f"{base_dir} not found! Please check path and folder name.")

# ============= LOAD & PROCESS DATA =============
all_data = []

for station_folder in os.listdir(base_dir):
    station_path = os.path.join(base_dir, station_folder)

    # Anything that is not a directory is not a station
    if not os.path.isdir(station_path):
        continue

    for file in os.listdir(station_path):
        # Only CSV and Excel files are of interest
        if not file.endswith((".csv", ".xlsx")):
            continue

        file_path = os.path.join(station_path, file)
        print(f"Processing file: {file_path}")

        try:
            if file.endswith(".xlsx"):
                # Header sits on row 17, so drop the 16 rows above it
                df = pd.read_excel(file_path, skiprows=16)
            else:
                df = pd.read_csv(file_path)

            # Timestamp source: "From Date" first, "date" as fallback
            if "From Date" in df.columns:
                df["datetime"] = pd.to_datetime(df["From Date"], errors="coerce")
            elif "date" in df.columns:
                df["datetime"] = pd.to_datetime(df["date"], errors="coerce")
            else:
                print(f"  Skipping file: {file_path} - No 'From Date' or 'date' column found.")
                continue

            df["date"] = df["datetime"].dt.date
            df["Station Name"] = station_folder

            # Files without NO2 readings contribute nothing
            if "NO2" not in df.columns:
                print(f"  Skipping file: {file_path} - NO2 column not found.")
            else:
                all_data.append(df[["Station Name", "date", "NO2"]])

        except Exception as e:
            # Report the failing file and carry on with the rest
            print(f"  Error processing {file_path}: {e}")

# Combine all stations
if not all_data:
    print("⚠️ No valid files were processed. Check headers/columns again.")
else:
    cpcb = pd.concat(all_data, ignore_index=True)
    # Daily mean per station
    cpcb_daily = cpcb.groupby(["Station Name","date"])["NO2"].mean().reset_index()

    # ============= SAVE OUTPUT =============
    output_path = "/content/drive/MyDrive/cpcb_daily.csv"
    cpcb_daily.to_csv(output_path, index=False)

    print("✅ Processing complete!")
    print(f"Saved file at: {output_path}")
    print(cpcb_daily.head())


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Processing file: /content/drive/MyDrive/DELHI/GroundTruth_Data (1)/Najafgarh (1)/oct (1).xlsx
Processing file: /content/drive/MyDrive/DELHI/GroundTruth_Data (1)/Najafgarh (1)/june (1).xlsx
Processing file: /content/drive/MyDrive/DELHI/GroundTruth_Data (1)/Najafgarh (1)/Jan (1).xlsx
Processing file: /content/drive/MyDrive/DELHI/GroundTruth_Data (1)/Najafgarh (1)/dec (1).xlsx
Processing file: /content/drive/MyDrive/DELHI/GroundTruth_Data (1)/Najafgarh (1)/sept (1).xlsx
Processing file: /content/drive/MyDrive/DELHI/GroundTruth_Data (1)/Najafgarh (1)/may (1).xlsx
Processing file: /content/drive/MyDrive/DELHI/GroundTruth_Data (1)/Najafgarh (1)/nov (1).xlsx
Processing file: /content/drive/MyDrive/DELHI/GroundTruth_Data (1)/Najafgarh (1)/july (1).xlsx
Processing file: /content/drive/MyDrive/DELHI/GroundTruth_Data (1)/Najafgarh (1)/feb (1).xlsx
Processing file: /cont

In [None]:
import pandas as pd
import geopandas as gpd
import ast
import os
from shapely.geometry import Point

# --------------------
# STEP 1: Load CPCB daily groundtruth
# --------------------
# Read the daily table from the location the processing cells save it to
# (output_path = "/content/drive/MyDrive/cpcb_daily.csv"). The former
# ".../MyDrive/DELHI/cpcb_daily.csv" path is never written by this notebook
# and raised FileNotFoundError.
cpcb = pd.read_csv("/content/drive/MyDrive/cpcb_daily.csv")
cpcb["date"] = pd.to_datetime(cpcb["date"])
cpcb["month"] = cpcb["date"].dt.to_period("M")  # YYYY-MM
# Monthly mean of the daily means, per station
cpcb_monthly = cpcb.groupby(["Station Name","month"])["NO2"].mean().reset_index()

# --------------------
# STEP 2: Load station locations
# --------------------
stations = pd.read_csv("/content/drive/MyDrive/DELHI/delhi_station_population_2024.csv")
# Point geometry per station built from its Longitude/Latitude columns (WGS84)
gdf_stations = gpd.GeoDataFrame(
    stations,
    geometry=gpd.points_from_xy(stations["Longitude"], stations["Latitude"]),
    crs="EPSG:4326"
)

# --------------------
# STEP 3: Function to parse one satellite file
# --------------------
def parse_satellite_file(file_path, month_str):
    """Attach the nearest satellite NO2 value in one CSV to every station.

    Parameters
    ----------
    file_path : str
        CSV containing a ``.geo`` column (point geometry serialized as a
        GeoJSON or Python-dict string) and a
        ``tropospheric_NO2_column_number_density`` column.
    month_str
        Value written to the ``month`` column of the result.

    Returns
    -------
    DataFrame
        Columns ``STATION NAME``, ``month`` and
        ``tropospheric_NO2_column_number_density``, produced by a left
        nearest-neighbour join from the module-level ``gdf_stations``.
    """
    import json  # local import: only needed for the .geo parsing below

    sat = pd.read_csv(file_path)

    # Parse lat/lon from geo column
    def parse_coords(geo_str):
        """Return (lon, lat) from a geometry string, or (None, None) if unusable."""
        try:
            # Strict JSON first: handles literals such as false/null that
            # ast.literal_eval rejects.
            geo_dict = json.loads(geo_str)
        except (TypeError, ValueError):
            try:
                # Fall back to a Python-literal dict (e.g. single-quoted keys)
                geo_dict = ast.literal_eval(geo_str)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                return None, None
        try:
            return geo_dict["coordinates"][0], geo_dict["coordinates"][1]
        except (KeyError, IndexError, TypeError):
            return None, None

    # Build both columns from one pass over the strings (avoids creating a
    # pandas Series per row and also works for a file with zero rows).
    coords = [parse_coords(g) for g in sat[".geo"]]
    sat["lon"] = [lon for lon, _ in coords]
    sat["lat"] = [lat for _, lat in coords]

    # Rows without coordinates cannot be located, so leave them out of the
    # join (and say so) rather than turning them into invalid points.
    unusable = sat["lon"].isna() | sat["lat"].isna()
    if unusable.any():
        print(f"  {int(unusable.sum())} row(s) in {file_path} have no usable coordinates; ignoring them")
        sat = sat.loc[~unusable].copy()

    gdf_sat = gpd.GeoDataFrame(
        sat,
        geometry=gpd.points_from_xy(sat["lon"], sat["lat"]),
        crs="EPSG:4326"
    )

    # nearest join
    # NOTE(review): distances are computed on EPSG:4326 degrees here; consider
    # projecting both layers to a metric CRS first — confirm whether that matters
    # at this scale.
    joined = gpd.sjoin_nearest(gdf_stations, gdf_sat, how="left")

    # assign month
    joined["month"] = month_str

    return joined[["STATION NAME", "month", "tropospheric_NO2_column_number_density"]]

# --------------------
# STEP 4: Loop over all satellite monthly files
# --------------------
sat_dir = "/content/drive/MyDrive/DELHI/Delhi_NO2_Daily_Satellite"
all_sat = []

for file in os.listdir(sat_dir):
    if file.endswith(".csv"):
        # Extract month from filename
        # Example: "TROPOMI_NO2_Delhi_April2024.csv"
        month_name = file.split("_")[-1].replace(".csv","")  # e.g. "April2024"
        month_str = pd.to_datetime(month_name, format="%B%Y").to_period("M")  # YYYY-MM

        print(f"Processing {file} as {month_str}")
        sat_data = parse_satellite_file(os.path.join(sat_dir, file), month_str)
        all_sat.append(sat_data)

# pd.concat() on an empty list fails with an uninformative
# "No objects to concatenate"; raise the same exception type with the cause.
if not all_sat:
    raise ValueError(f"No .csv files found in {sat_dir}; nothing to merge.")

satellite_all = pd.concat(all_sat, ignore_index=True)

# --------------------
# STEP 5: Merge CPCB monthly averages with satellite
# --------------------
final = pd.merge(
    cpcb_monthly,
    satellite_all,
    left_on=["Station Name","month"],
    right_on=["STATION NAME","month"],
    how="inner"
)

# An inner join silently drops every row whose station name or month has no
# counterpart on the other side, so make an empty result visible.
if final.empty:
    print("⚠️ Merge produced no rows - check that 'Station Name' values match 'STATION NAME' and that the months overlap.")

final.to_csv("/content/drive/MyDrive/DELHI/merged_NO2_station_satellite.csv", index=False)
print("✅ Final merged dataset saved!")


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/DELHI/cpcb_daily.csv'