In [2]:
import pandas as pd


In [4]:
# Load the cleaned dataset from Module 2
df = pd.read_csv("/content/cleaned_featured_dataset.csv")

df.head()


Unnamed: 0,sensor_id,sensor_name,sensor_latitude,sensor_longitude,area_type,measurement_timestamp,pollutant,pollutant_value,pollutant_unit,date,...,educational,medical,transportation,building_density,aqi,pollution_category,traffic_influence,month,day_of_week,is_rush_hour
0,sensor_01,Pune_City_Center,18.5204,73.8567,urban,2025-08-27 00:00:00,PM2.5,0.884885,μg/m³,2025-08-27,...,83.0,166.0,38.0,10774.0,200,Unhealthy,Low,8.0,2.0,0.0
1,sensor_01,Pune_City_Center,18.5204,73.8567,urban,2025-08-27 00:00:00,PM10,0.348376,μg/m³,2025-08-27,...,83.0,166.0,38.0,10774.0,50,Good,Low,8.0,2.0,0.0
2,sensor_01,Pune_City_Center,18.5204,73.8567,urban,2025-08-27 00:00:00,NO2,-0.167272,μg/m³,2025-08-27,...,83.0,166.0,38.0,10774.0,50,Good,Low,8.0,2.0,0.0
3,sensor_01,Pune_City_Center,18.5204,73.8567,urban,2025-08-27 00:00:00,CO,-1.112479,mg/m³,2025-08-27,...,83.0,166.0,38.0,10774.0,50,Good,Low,8.0,2.0,0.0
4,sensor_01,Pune_City_Center,18.5204,73.8567,urban,2025-08-27 00:00:00,SO2,-0.449923,μg/m³,2025-08-27,...,83.0,166.0,38.0,10774.0,50,Good,Low,8.0,2.0,0.0


In [5]:
# Create pollution source column
def label_source(row):
    # Vehicular
    if (row['road_length_km'] > 0.5) and (row['pollutant'] == "NO2") and (row['pollutant_value'] > 1.0):
        return "Vehicular"

    # Industrial
    elif (row['industrial_area'] == 1) and (row['pollutant'] == "SO2") and (row['pollutant_value'] > 1.0):
        return "Industrial"

    # Agricultural
    elif (row['area_type'] == "farmland") and (row['season'] == "dry") and (row['pollutant'] == "PM") and (row['pollutant_value'] > 1.0):
        return "Agricultural"

    # Burning (optional rule → CO or PM sudden spikes)
    elif (row['pollutant'] in ["CO","PM"]) and (row['pollutant_value'] > 2.0):
        return "Burning"

    # Natural (default if no major source)
    else:
        return "Natural"

df['pollution_source'] = df.apply(label_source, axis=1)

In [6]:
# How many records in each pollution source
df['pollution_source'].value_counts()


Unnamed: 0_level_0,count
pollution_source,Unnamed: 1_level_1
Natural,2652
Vehicular,34


In [9]:
# If dataset is very small or imbalanced, simulate extra records
# Example: Randomly add synthetic samples for training
import numpy as np
sources = ["Vehicular","Industrial","Agricultural","Burning","Natural"]

simulated = pd.DataFrame({
    "sensor_id": np.random.randint(1000,2000, size=200),
    "pollutant": np.random.choice(["NO2","SO2","PM","CO"], size=200),
    "pollutant_value": np.random.uniform(0.5, 3.0, size=200),
    "pollution_source": np.random.choice(sources, size=200)
})

# Append simulated data
df_final = pd.concat([df, simulated], ignore_index=True)


In [10]:
# Final labeled dataset ready
df_final.to_csv("labeled_dataset.csv", index=False)
print("✅ Final labeled dataset saved as 'labeled_dataset.csv'")


✅ Final labeled dataset saved as 'labeled_dataset.csv'
