In [None]:
import numpy as np
import pandas as pd
from geopy.distance import geodesic
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Load dataset
# The source CSV location is kept in one named constant so it is easy to
# repoint; the file is read into the notebook-wide DataFrame `df`.
RAW_CSV_PATH = "/content/COMPREHENSIVE_EnviroScan_Pune_20250926_170535.csv"
df = pd.read_csv(RAW_CSV_PATH)

In [None]:
# Step 3: Dataset information
# Print the frame's (rows, columns) shape, then the number of null entries
# per column, and finish with pandas' own dtype / non-null summary.
missing_per_column = df.isna().sum()

print("Dataset Shape:", df.shape)
print("\nMissing values:\n", missing_per_column)
df.info()

Dataset Shape: (10800, 40)

Missing values:
 sensor_id                   0
sensor_name                 0
sensor_latitude             0
sensor_longitude            0
area_type                   0
measurement_timestamp       0
pollutant                   0
pollutant_value             0
pollutant_unit              0
date                        0
hour                        0
is_weekend                  0
season                      0
timestamp_rounded           0
weather_timestamp           0
temperature_c               0
humidity_percent            0
pressure_hpa                0
wind_speed_ms               0
wind_direction_deg          0
precipitation_mm            0
weather_condition           0
visibility_km               0
road_edges               1080
road_length_km           1080
industrial_area          1080
commercial_area          1080
residential_area         1080
green_space              1080
water_body               1080
educational              1080
medical                  

In [None]:
# Step 4: Remove duplicate records
df = df.drop_duplicates()

# Keep only rows whose pollutant reading is non-negative and whose sensor
# coordinates fall inside the valid latitude / longitude ranges.
# Comparisons against NaN evaluate to False, so rows with a missing value in
# any of these three columns are dropped by this filter as well.
reading_ok = df['pollutant_value'].ge(0)
latitude_ok = df['sensor_latitude'].between(-90, 90)
longitude_ok = df['sensor_longitude'].between(-180, 180)
df = df.loc[reading_ok & latitude_ok & longitude_ok]

print("Shape after cleaning:", df.shape)

Shape after cleaning: (10800, 40)


In [None]:
# Step 5: Handle Missing Values
# Columns imputed with their median.
num_cols = ['road_length_km','building_density']
# Columns imputed with their most frequent value (mode).
cat_cols = ['road_edges','industrial_area','commercial_area',
            'residential_area','green_space','water_body',
            'educational','medical','transportation']

# Collect one fill value per column first, then apply them in a single call.
fill_values = {col: df[col].median() for col in num_cols}

for col in cat_cols:
    modes = df[col].mode()
    # mode() returns an empty Series when the column is entirely null;
    # skip such a column instead of failing on the missing first element.
    if not modes.empty:
        fill_values[col] = modes.iloc[0]

# `df[col].fillna(..., inplace=True)` operates on an intermediate Series and,
# as pandas itself warns, stops updating `df` from pandas 3.0 onwards.
# DataFrame.fillna with a {column: value} mapping returns a new frame, so
# rebinding `df` works on every pandas version and never writes into a slice.
df = df.fillna(value=fill_values)

print("Missing values after imputation:\n", df.isnull().sum())


Missing values after imputation:
 sensor_id                0
sensor_name              0
sensor_latitude          0
sensor_longitude         0
area_type                0
measurement_timestamp    0
pollutant                0
pollutant_value          0
pollutant_unit           0
date                     0
hour                     0
is_weekend               0
season                   0
timestamp_rounded        0
weather_timestamp        0
temperature_c            0
humidity_percent         0
pressure_hpa             0
wind_speed_ms            0
wind_direction_deg       0
precipitation_mm         0
weather_condition        0
visibility_km            0
road_edges               0
road_length_km           0
industrial_area          0
commercial_area          0
residential_area         0
green_space              0
water_body               0
educational              0
medical                  0
transportation           0
building_density         0
aqi                      0
pollution_category   

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [None]:
# Step 6: Standardize timestamps
# Parse both timestamp columns into pandas datetime values, one column at a
# time, then preview the converted columns.
timestamp_cols = ['measurement_timestamp', 'weather_timestamp']
for ts_col in timestamp_cols:
    df[ts_col] = pd.to_datetime(df[ts_col])

df[timestamp_cols].head()


Unnamed: 0,measurement_timestamp,weather_timestamp
0,2025-08-27,2025-08-27
1,2025-08-27,2025-08-27
2,2025-08-27,2025-08-27
3,2025-08-27,2025-08-27
4,2025-08-27,2025-08-27


In [None]:
# Step 7: Standardize pollutant & weather values
# StandardScaler rescales each listed column to zero mean and unit variance
# (z-scores). The scaled values overwrite the original columns in `df`.
# StandardScaler is imported in the notebook's top import cell.
# NOTE(review): pollutant_value is scaled over all rows at once, regardless of
# the `pollutant` / `pollutant_unit` columns — confirm a single global scale
# across pollutants is what downstream steps expect.
scale_cols = ['pollutant_value','temperature_c','humidity_percent',
              'pressure_hpa','wind_speed_ms','precipitation_mm','visibility_km']

scaler = StandardScaler()
df[scale_cols] = scaler.fit_transform(df[scale_cols])

df[scale_cols].head()


Unnamed: 0,pollutant_value,temperature_c,humidity_percent,pressure_hpa,wind_speed_ms,precipitation_mm,visibility_km
0,0.884885,-0.452455,0.38326,-1.389686,-0.253735,-0.673505,0.887578
1,0.348376,-0.452455,0.38326,-1.389686,-0.253735,-0.673505,0.887578
2,-0.167272,-0.452455,0.38326,-1.389686,-0.253735,-0.673505,0.887578
3,-1.112479,-0.452455,0.38326,-1.389686,-0.253735,-0.673505,0.887578
4,-0.449923,-0.452455,0.38326,-1.389686,-0.253735,-0.673505,0.887578


In [None]:
# Step 8: Derive temporal features
# All four columns are computed from `measurement_timestamp`. `hour` and
# `is_weekend` are assigned unconditionally, so any values already present in
# those columns are replaced.
# NOTE(review): confirm that overwriting the pre-existing `hour` /
# `is_weekend` columns is intended.
measured_at = df['measurement_timestamp'].dt
df['hour'] = measured_at.hour
df['day_of_week'] = measured_at.dayofweek
df['month'] = measured_at.month
# dayofweek counts Monday as 0, so 5 and 6 are Saturday and Sunday.
df['is_weekend'] = df['day_of_week'].ge(5).astype(int)

temporal_preview_cols = ['measurement_timestamp','hour','day_of_week','month','is_weekend']
df[temporal_preview_cols].head()


Unnamed: 0,measurement_timestamp,hour,day_of_week,month,is_weekend
0,2025-08-27,0,2,8,0
1,2025-08-27,0,2,8,0
2,2025-08-27,0,2,8,0
3,2025-08-27,0,2,8,0
4,2025-08-27,0,2,8,0


In [None]:
# Step 9: Save final dataset
# Write the processed frame to a CSV (relative path, row index omitted) and
# print a confirmation message.
OUTPUT_CSV_PATH = "cleaned_featured_dataset.csv"
df.to_csv(OUTPUT_CSV_PATH, index=False)
print("✅ Cleaned and feature-engineered dataset saved as 'cleaned_featured_dataset.csv'")

✅ Cleaned and feature-engineered dataset saved as 'cleaned_featured_dataset.csv'
