<a href="https://colab.research.google.com/github/Deepti-Shringare/Downscaling_of_no2map_XGBoost/blob/main/COLAB_CODES/Final_feature_Set_merging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

***FINAL DATASET MERGING***

***1.***

In [None]:
import pandas as pd
from google.colab import drive

drive.mount('/content/drive')

# Builds the per-station feature table for one station: the NO2 table is the
# left-hand base, and two auxiliary tables (surface meteorology, then BLH +
# cloud cover) are left-joined onto it by (station, date).

# =========================
# 1. Load datasets
# =========================
no2_path = "/content/drive/MyDrive/MAJOR_PROJECT/Ground_Satellite_Pop_Nighttime/MAJOR_DHYAND_CHAND_NATIONAL_STADIUM_2024_NO2_with_population_nightlights.csv"
met_path = "/content/drive/MyDrive/MAJOR_PROJECT/ERA5_LAND_U10_V10_Winddspeed_temp(c)_pressure/ERA5_LAND_Daily_Met_MAJOR DHYAND CHAND NATIONAL STADIUM_2024.csv"
blh_path = "/content/drive/MyDrive/MAJOR_PROJECT/ERA5_ATMOS/ERA5_ATMOS_DAILY_BLH_TCC_MAJOR DHYAND CHAND NATIONAL STADIUM_2024.csv"

no2_df = pd.read_csv(no2_path)
met_df = pd.read_csv(met_path)
blh_df = pd.read_csv(blh_path)

# =========================
# 2. Standardize columns
# =========================
merge_keys = ['station', 'date']

standardized = []
for frame in (no2_df, met_df, blh_df):
    # Any of the inputs may carry 'station_name' instead of 'station';
    # normalise all three so the key lookup below cannot fail on one of them.
    if 'station_name' in frame.columns and 'station' not in frame.columns:
        frame = frame.rename(columns={'station_name': 'station'})

    # Whitespace/case differences in the station label would make the join
    # miss otherwise-matching rows.
    frame['station'] = frame['station'].str.strip().str.upper()
    frame['date'] = pd.to_datetime(frame['date'])
    standardized.append(frame)

no2_df, met_df, blh_df = standardized

# Drop unwanted columns (ignored when absent)
met_df = met_df.drop(columns=['system:index', '.geo'], errors='ignore')

# =========================
# 3. Select feature columns and guard the join keys
# =========================
# A left merge emits one output row per matching right-hand row, so duplicate
# (station, date) keys in an auxiliary table would silently multiply NO2 rows.
# Keep the first occurrence and report how many were dropped.
feature_tables = {}
for label, frame, value_cols in (
    ('met_df', met_df,
     ['u10', 'v10', 'wind_speed', 'temperature_2m_C', 'surface_pressure_hPa']),
    ('blh_df', blh_df,
     ['BLH', 'total_cloud_cover']),
):
    subset = frame[merge_keys + value_cols]
    n_dupes = int(subset.duplicated(subset=merge_keys).sum())
    if n_dupes:
        print(f"⚠️ {label}: dropping {n_dupes} duplicate station/date row(s) before merging")
        subset = subset.drop_duplicates(subset=merge_keys, keep='first')
    feature_tables[label] = subset

# =========================
# 4. Merge STEP 1: NO2 + Surface Meteorology
# =========================
df = no2_df.merge(
    feature_tables['met_df'],
    on=merge_keys,
    how='left',
    validate='many_to_one',  # right-hand keys are unique after the guard above
)

# =========================
# 5. Merge STEP 2: Add BLH + Cloud Cover
# =========================
df = df.merge(
    feature_tables['blh_df'],
    on=merge_keys,
    how='left',
    validate='many_to_one',
)

# =========================
# 6. Final column order
# =========================
final_cols = [
    'station',
    'lat',
    'lon',
    'date',
    'groundtruth_no2',
    'tropospheric_no2',
    'population_density',
    'night_light',
    'u10',
    'v10',
    'wind_speed',
    'temperature_2m_C',
    'surface_pressure_hPa',
    'BLH',
    'total_cloud_cover',
]

# Selecting by list both reorders the columns and discards any extras;
# a missing column raises KeyError here.
df = df[final_cols]

# =========================
# 7. Save final dataset
# =========================
output_path = "/content/drive/MyDrive/MAJOR_PROJECT/Final_ML_features/MAJOR DHYAND CHAND NATIONAL STADIUM_FINAL_ML_FEATURES.csv"
df.to_csv(output_path, index=False)

print("✅ FINAL ML DATASET SAVED:")
print(output_path)
print(df.head())

In [None]:
import pandas as pd
import glob
import os # Import os module for directory listing

# Stacks every per-station feature CSV in the folder into one master dataset,
# keeping only the agreed feature columns in a fixed order.

# ================================
# 1. Folder containing station CSVs
# ================================
input_path = "/content/drive/MyDrive/MAJOR_PROJECT/Final_ML_features/*.csv"
output_path = "/content/drive/MyDrive/MAJOR_PROJECT/Final_ML_features/MASTER_NO2_DATASET_2024.csv"

# ================================
# 2. Required columns (FINAL ORDER)
# ================================
final_columns = [
    "station",
    "lat",
    "lon",
    "date",
    "groundtruth_no2",
    "tropospheric_no2",
    "population_density",
    "night_light",
    "u10",
    "v10",
    "wind_speed",
    "temperature_2m_C",
    "surface_pressure_hPa",
    "BLH",
    "total_cloud_cover"
]

# ================================
# 3. Read and merge files
# ================================

# Diagnostic: Check directory contents before glob
folder_to_check = os.path.dirname(input_path)
print(f"Checking contents of directory: {folder_to_check}")
if os.path.exists(folder_to_check):
    print(f"Directory contents: {os.listdir(folder_to_check)}")
else:
    print(f"Directory does not exist: {folder_to_check}")

matched = glob.glob(input_path)
print(f"Files found by glob: {len(matched)}")

# The master CSV is written into the same folder that is globbed, so on a
# re-run the previous output would be read back in as if it were a station
# file and every row would be duplicated. Exclude it explicitly. Sorting makes
# the processing order deterministic (glob order is filesystem-dependent).
output_abs = os.path.abspath(output_path)
files = sorted(p for p in matched if os.path.abspath(p) != output_abs)
if len(files) != len(matched):
    print(f"Skipping previous output file: {output_path}")

df_list = []

if not files:
    print("❌ No CSV files found to merge. Please ensure the previous step generated the files correctly and they are visible in the file system.")
else:
    for file in files:
        df = pd.read_csv(file)

        # Rename common inconsistencies
        df = df.rename(columns={
            "station_name": "station",
            "latitude": "lat",
            "longitude": "lon",
            "surface_pressure_hp": "surface_pressure_hPa"
        })

        # Fail with the offending file name rather than a bare column error.
        missing = [col for col in final_columns if col not in df.columns]
        if missing:
            raise KeyError(f"{file} is missing required columns: {missing}")

        # Keep only required columns
        df_list.append(df[final_columns])

    # ================================
    # 4. Combine into master dataset
    # ================================
    master_df = pd.concat(df_list, ignore_index=True)

    # Convert date & sort
    master_df["date"] = pd.to_datetime(master_df["date"])
    master_df = master_df.sort_values(["station", "date"])

    # ================================
    # 5. Save output
    # ================================
    master_df.to_csv(output_path, index=False)

    print("✅ MERGE COMPLETE")
    print("Final shape:", master_df.shape)
    print("Stations:", master_df["station"].nunique())


In [None]:
import pandas as pd
import glob
import os

# Row-wise concatenation of every station CSV in the feature folder into a
# single master file saved one directory above it.

# ==============================
# 1. Paths
# ==============================
folder_path = "/content/drive/MyDrive/MAJOR_PROJECT/Final_ML_features"
output_path = "/content/drive/MyDrive/MAJOR_PROJECT/MASTER_NO2_DATASET_2024.csv"

# ==============================
# 2. Read all CSV files
# ==============================
# The input folder can itself contain a MASTER_NO2_DATASET_2024.csv (an earlier
# cell of this notebook saves one there). Concatenating it with the station
# files would duplicate every row, so any file with the master's name is
# skipped. Sorting makes the read order deterministic.
master_name = os.path.basename(output_path)
csv_files = sorted(
    path
    for path in glob.glob(os.path.join(folder_path, "*.csv"))
    if os.path.basename(path) != master_name
)

print("Total files found:", len(csv_files))

df_list = []

for file in csv_files:
    df = pd.read_csv(file)

    # Ensure date is datetime
    df['date'] = pd.to_datetime(df['date'])

    df_list.append(df)

if not df_list:
    # pd.concat raises ValueError on an empty list; report the problem instead.
    print("❌ No CSV files found to merge in:", folder_path)
else:
    # ==============================
    # 3. Vertical merge (row-wise)
    # ==============================
    master_df = pd.concat(df_list, ignore_index=True)

    # ==============================
    # 4. Sort properly
    # ==============================
    master_df = master_df.sort_values(
        by=["station", "date"]
    ).reset_index(drop=True)

    # ==============================
    # 5. Save master dataset
    # ==============================
    master_df.to_csv(output_path, index=False)

    # ==============================
    # 6. Quick sanity check
    # ==============================
    print("Merged dataset shape:", master_df.shape)
    print("Unique stations:", master_df['station'].nunique())
    print("Date range:", master_df['date'].min(), "to", master_df['date'].max())


In [None]:
import pandas as pd
from google.colab import drive

drive.mount('/content/drive')

# Appends one station's feature table to an existing merged dataset and saves
# the result under a new file name (…MERGED37 + one station -> …MERGED38).

# -----------------------------
# 1. File paths (update if needed)
# -----------------------------
file_1 = "/content/drive/MyDrive/DELHI_2024_FINAL_ML_FEATURES_MERGED37.csv"
file_2 = "/content/drive/MyDrive/MAJOR_PROJECT/Final_ML_features/MAJOR DHYAND CHAND NATIONAL STADIUM_FINAL_ML_FEATURES.csv"

# -----------------------------
# 2. Read CSV files
# -----------------------------
df1 = pd.read_csv(file_1)
df2 = pd.read_csv(file_2)

# Labels describe the files actually loaded above (the previous labels named
# other stations and did not match file_1 / file_2).
print("Existing merged dataset shape:", df1.shape)
print("New station dataset shape:", df2.shape)

# pd.concat aligns on column names and fills the gaps with NaN, so a schema
# mismatch would otherwise go unnoticed in the saved file.
mismatched_columns = sorted(set(df1.columns) ^ set(df2.columns))
if mismatched_columns:
    print("⚠️ Columns not present in both files (will be NaN-filled):", mismatched_columns)

# -----------------------------
# 3. Vertical merge (row-wise)
# -----------------------------
merged_df = pd.concat([df1, df2], axis=0, ignore_index=True)

print("Merged shape:", merged_df.shape)

# -----------------------------
# 4. Save merged file
# -----------------------------
output_path = "/content/drive/MyDrive/DELHI_2024_FINAL_ML_FEATURES_MERGED38.csv"
merged_df.to_csv(output_path, index=False)

print("✅ Merged file saved at:", output_path)
