In [None]:
import pandas as pd
import geopandas as gpd
from sklearn.impute import KNNImputer

def load_data(sensor_file, shapefile):
    """Read the sensor CSV and the boundary shapefile from disk.

    Args:
        sensor_file: Path of the CSV file, read with ``pd.read_csv``.
        shapefile: Path of the vector file, read with ``gpd.read_file``.

    Returns:
        A ``(sensor_frame, boundary_frame)`` tuple, in that order.
    """
    # Tuple elements are evaluated left to right, so the CSV is still read
    # before the shapefile.
    return pd.read_csv(sensor_file), gpd.read_file(shapefile)

def perform_eda(df):
    """Print a quick exploratory summary of *df* to stdout.

    The report contains, in order: the ``DataFrame.info()`` listing, the
    per-column count of missing values, ``describe()`` statistics, a
    ``describe()`` of the object/category columns (only when there are any),
    and the first rows of the frame.

    Args:
        df: The DataFrame to summarise. It is not modified.

    Returns:
        None. All output goes to stdout.
    """
    print("\nData Summary:")
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would emit a stray "None" line after the report.
    df.info()
    print()

    print("Missing Values:")
    print(df.isnull().sum(), "\n")

    print("Statistical Summary:")
    print(df.describe(), "\n")

    categorical_cols = df.select_dtypes(include=['object', 'category'])
    if not categorical_cols.empty:
        print("Categorical Column Summary:")
        print(categorical_cols.describe(), "\n")

    print("First Few Rows:")
    print(df.head(), "\n")

def filter_data_by_region(df, lat_min, lat_max, long_min, long_max):
    """Return the rows of *df* that fall inside a lat/long bounding box.

    Both bounds of each range are inclusive (``Series.between`` default).
    Rows whose ``lat`` or ``long`` is NaN never match and are excluded.

    Args:
        df: DataFrame with numeric ``lat`` and ``long`` columns.
        lat_min: Lower latitude bound.
        lat_max: Upper latitude bound.
        long_min: Lower longitude bound.
        long_max: Upper longitude bound.

    Returns:
        A new, independent DataFrame holding only the matching rows.
    """
    in_lat_band = df['lat'].between(lat_min, lat_max)
    in_long_band = df['long'].between(long_min, long_max)
    # Return an explicit copy: callers go on to modify the result in place,
    # and doing that on a bare boolean-mask selection makes pandas raise
    # SettingWithCopyWarning.
    return df[in_lat_band & in_long_band].copy()

def impute_missing_values(df):
    """Drop unusable rows and fill the remaining gaps in the sensor columns.

    Steps, in order:

    1. Drop rows missing any of ``lat``, ``long``, ``pm_25`` or ``pm_10``.
    2. Fill ``temp`` gaps with the column mean.
    3. Linearly interpolate ``pm_25``, ``pm_10``, ``no2`` and ``co`` in the
       frame's current row order.
    4. Replace any ``co`` gaps that interpolation left behind with the
       sentinel ``-999`` and drop the ``co2`` and ``ch4`` columns.
    5. Run ``KNNImputer(n_neighbors=5)`` over the four pollutant columns.
    6. Fill ``rh`` gaps with the column mean.

    Args:
        df: DataFrame containing at least the columns ``lat``, ``long``,
            ``pm_25``, ``pm_10``, ``no2``, ``co``, ``co2``, ``ch4``, ``temp``
            and ``rh``. It is not modified.

    Returns:
        A new DataFrame with the rows and columns described above.

    Raises:
        KeyError: If one of the required columns is absent.
    """
    pollutant_cols = ['pm_25', 'pm_10', 'no2', 'co']

    # Work on an independent copy so the caller's frame is never mutated and
    # pandas has no reason to raise SettingWithCopyWarning.
    df = df.dropna(subset=['lat', 'long', 'pm_25', 'pm_10']).copy()

    # No fill is needed for lat, long or pm_25: the dropna above already
    # guarantees those columns contain no missing values.

    # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
    # the chained in-place form acts on a temporary object and silently does
    # nothing once pandas Copy-on-Write is enabled.
    df['temp'] = df['temp'].fillna(df['temp'].mean())

    # NOTE(review): interpolation follows the current row order, which is
    # neither sorted by time nor split per device here - confirm intended.
    df[pollutant_cols] = df[pollutant_cols].interpolate(method='linear')

    # NOTE(review): -999 stays in `co` as an ordinary number, so the KNN
    # step below treats it as a real reading - confirm this is intended.
    df['co'] = df['co'].fillna(-999)
    df = df.drop(columns=['co2', 'ch4'])

    # KNNImputer rejects an input with zero rows, so skip it when every row
    # was dropped above.
    if not df.empty:
        imputer = KNNImputer(n_neighbors=5)
        df[pollutant_cols] = imputer.fit_transform(df[pollutant_cols])

    df['rh'] = df['rh'].fillna(df['rh'].mean())
    return df

def assign_lat_long_medians(df):
    """Attach each device's median coordinates, rounded to 2 decimals.

    Adds the columns ``lat_median`` and ``long_median`` to *df* in place;
    every row receives the rounded median of ``lat`` / ``long`` taken over
    all rows sharing its ``device_name``.

    Args:
        df: DataFrame with ``device_name``, ``lat`` and ``long`` columns.

    Returns:
        The same DataFrame object, now carrying the two extra columns.
    """
    def rounded_median(values):
        return round(values.median(), 2)

    for coord in ('lat', 'long'):
        per_device = df.groupby('device_name')[coord]
        df[coord + '_median'] = per_device.transform(rounded_median)
    return df

def resample_data(df):
    """Aggregate the readings to hourly medians for each device.

    ``data_created_time`` is parsed to datetimes (any timezone information
    is stripped without conversion), used as the time index, and the frame
    is resampled to one-hour bins within each ``device_name`` group using
    the median. Hours with no readings for a device appear as NaN rows.

    Args:
        df: DataFrame with ``device_name``, ``data_created_time``,
            ``lat_median``, ``long_median``, ``id``, ``pm_25``, ``pm_10``,
            ``no2``, ``co``, ``temp`` and ``rh`` columns. It is not modified.

    Returns:
        A new DataFrame with one row per device and hour, restricted to the
        columns listed in the return statement, in that order.
    """
    timestamps = pd.to_datetime(df['data_created_time']).dt.tz_localize(None)
    # Build a separate indexed frame instead of overwriting the column and
    # calling set_index(inplace=True): mutating the argument removed the
    # caller's 'data_created_time' column, so a second call on the same
    # frame failed with KeyError.
    indexed = df.assign(data_created_time=timestamps).set_index('data_created_time')
    # 'h' is the current hourly alias; the upper-case 'H' is deprecated.
    df_resampled = indexed.groupby('device_name').resample('h').median().reset_index()
    return df_resampled[['device_name', 'lat_median', 'long_median', 'data_created_time', 'id', 'pm_25', 'pm_10', 'no2', 'co', 'temp', 'rh']]

def main(sensor_file=None, shapefile=None):
    """Run the whole pipeline: load, summarise, filter, impute, resample.

    Args:
        sensor_file: Path of the sensor CSV. When omitted, the original
            hard-coded location is used.
        shapefile: Path of the boundary shapefile. When omitted, the
            original hard-coded location is used.

    Returns:
        The hourly per-device DataFrame after a final backward fill.
    """
    if sensor_file is None:
        sensor_file = '/Users/sumitchand/Downloads/Sensor Data/vayu_Gurugram_static_sensor_data_combined.csv'
    if shapefile is None:
        shapefile = '/Users/sumitchand/Downloads/Gurugram_ward.shp'

    # The boundary geometry is loaded alongside the readings but is not used
    # anywhere in this pipeline, so it is discarded.
    df_static, _ = load_data(sensor_file, shapefile)
    perform_eda(df_static)

    # Bounding box used to select the readings of interest.
    lat_min, lat_max = 28.3692, 28.5318
    long_min, long_max = 76.9282, 77.1658
    df_filtered = filter_data_by_region(df_static, lat_min, lat_max, long_min, long_max)
    print(f"Number of records in region: {df_filtered.shape[0]}")

    df_imputed = impute_missing_values(df_filtered)
    df_processed = assign_lat_long_medians(df_imputed)
    df_resampled = resample_data(df_processed)

    # NOTE(review): bfill runs over the whole frame, so a gap at the end of
    # one device's rows is filled from the next device's first row - confirm
    # this is intended.
    df_resampled_filled = df_resampled.bfill()

    # DataFrame.info() prints its own report and returns None; passing it to
    # print() would output "Final DataFrame: None" after the report.
    print("Final DataFrame:")
    df_resampled_filled.info()
    return df_resampled_filled

# Run the pipeline only when this file is executed as a script (not when it
# is imported), binding the returned frame to the module-level name final_df.
if __name__ == "__main__":
    final_df = main()



import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In the next release, GeoPandas will switch to using Shapely by default, even if PyGEOS is installed. If you only have PyGEOS installed to get speed-ups, this switch should be smooth. However, if you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd



Data Summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15830720 entries, 0 to 15830719
Data columns (total 13 columns):
 #   Column             Dtype  
---  ------             -----  
 0   id                 int64  
 1   device_name        object 
 2   lat                float64
 3   long               float64
 4   pm_25              float64
 5   pm_10              float64
 6   no2                float64
 7   co                 float64
 8   co2                float64
 9   ch4                float64
 10  temp               float64
 11  rh                 float64
 12  data_created_time  object 
dtypes: float64(10), int64(1), object(2)
memory usage: 1.5+ GB
None 

Missing Values:
id                          0
device_name                 0
lat                    172492
long                   172492
pm_25                  156284
pm_10                  156304
no2                    686980
co                    5407700
co2                  11031820
ch4                  15562568
temp

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(subset=['lat', 'long', 'pm_25', 'pm_10'], inplace=True)
