# Air Delhi Dataset Modeling

The purpose of this notebook is to model additional parameters such as traffic, greenery, etc.

For the statistical models, I will only be adding traffic.

**Config**

In [1]:
# Directory holding the per-day "<date>_all.csv" PM measurement files.
DATA_FOLDER = '/kaggle/input/airdelhi-dataset/PM Datasets/'
# NOTE(review): not referenced in the visible cells; presumably minutes, like `grouptime` — confirm.
TIME_INTERVAL = 30
# NOTE(review): not referenced in the visible cells; units are not shown here — confirm.
TRAFFIC_LAG = 15

In [2]:
import tqdm
import torch
import argparse
import os
import math
import pickle as pkl
import pandas as pd
import numpy as np
from haversine import haversine, haversine_vector, Unit
from sklearn.model_selection import KFold

# Progress-bar alias: the hard-coded `if 0` always selects the plain tqdm.tqdm.
# NOTE(review): flipping the 0 to 1 would evaluate tqdm.notebook.tqdm; `import tqdm` alone
# may not load the `tqdm.notebook` submodule — confirm before switching.
mytqdm = tqdm.notebook.tqdm if 0 else tqdm.tqdm


In [3]:
# Peek at one raw day of measurements. The path is built from DATA_FOLDER so the
# dataset location is configured in exactly one place (the Config cell).
df = pd.read_csv(os.path.join(DATA_FOLDER, '2020-11-03_all.csv'))
df.head()

Unnamed: 0.1,Unnamed: 0,uid,dateTime,deviceId,lat,long,pm1_0,pm2_5,pm10
0,0,2c9b97a9-be40-416e-8c95-da9b53533eea,2020-11-03 00:00:02+05:30,0000000024568afd,28.579224,77.22657,113.0,185.0,198.0
1,1,e5b872ff-b4e6-400e-aabf-7db7cbce5ce7,2020-11-03 00:00:05+05:30,0000000024568afd,28.579226,77.226565,117.0,187.0,201.0
2,2,6e8cd4f9-6b88-43ec-8831-22b3ea576e34,2020-11-03 00:00:06+05:30,00000000d4bc37f2,28.57935,77.23481,121.0,183.0,195.0
3,3,66bdf0ae-866d-4603-bb7a-35b20baacd7f,2020-11-03 00:00:07+05:30,0000000024568afd,28.579227,77.226562,117.0,187.0,203.0
4,4,e6bfb221-13c1-4062-95e5-71a96701c919,2020-11-03 00:00:09+05:30,00000000d4bc37f2,28.57935,77.23481,119.0,181.0,198.0


In [4]:
# Size of the sample day as (rows, columns).
df.shape

(96213, 9)

In [5]:
# Index used to pick an entry from the two-option list below (1 -> fold = 5).
rand_sensor_locs = 1

fold = [3,5][rand_sensor_locs]

heatmap = 0
datadir = DATA_FOLDER
# Pollutant column names; the trailing index selects which name each variable holds.
pm = ['pm10','pm2_5'][1]
pm2 = ['pm10','pm2_5'][0]
sensorTyp = ['random'][0]

# Scale factor applied to the grid steps in ll_off below.
km = 1
# Width, in minutes, of the bucket that process_time rounds timestamps to.
grouptime = 30
# Decimal places kept when rounding grid coordinates and the ll_off steps.
latlongprecise = 4
metricprecise = 3

# Lower / upper bounds; entries 1 and 2 of ll_min are used as the latitude and longitude
# grid origin in round_coordinates.
# NOTE(review): the meaning of entry 0 (11.0 / 45.0) is not shown in this notebook — confirm.
ll_min, ll_max = [11.0, 28.48, 77.1], [45.0, 28.72, 77.33]
# Grid step per axis; entries 1 and 2 are the latitude / longitude steps used by
# round_coordinates (presumably degrees per km, scaled by `km` — confirm).
ll_off = [1, np.round(0.00902*km,latlongprecise), np.round(0.01017*km,latlongprecise)]

## Preprocessing

In [6]:
def to_dt(time_string):
    """Parse `time_string`, label it as UTC, and return it expressed in Asia/Kolkata time."""
    naive_ts = pd.to_datetime(time_string)
    utc_ts = naive_ts.tz_localize('UTC')
    return utc_ts.tz_convert('Asia/Kolkata')

def filter_fields(df):
    """Keep the timestamp, device, coordinate and two PM columns; rename `long` to `lon`.

    The PM column names come from the module-level `pm` and `pm2` settings.
    """
    wanted = ['dateTime', 'deviceId', 'lat', 'long', pm, pm2]
    return df[wanted].rename(columns={'long': 'lon'})

def process_time(df, hour=None):
    """Derive time features from `dateTime` and drop the raw timestamp.

    Adds `timeOfDay` (minutes since midnight of the timestamp rounded to `grouptime`
    minutes), `day_of_week` and `date_value`. Rows whose `timeOfDay` falls outside
    300..1350 are discarded. When `hour` is a non-empty collection, only rows whose
    (unrounded) hour is in it are kept.
    """
    out = df.copy()
    out['timeOfDay'] = out['dateTime'].copy()

    if hour:
        # Optional whitelist on the hour of the raw timestamp.
        out = out[out['timeOfDay'].dt.hour.isin(hour)]

    # Snap to the bucket width, then express the bucket as minutes since midnight.
    bucket = pd.to_datetime(out['timeOfDay'].dt.round('{}min'.format(grouptime)))
    out['timeOfDay'] = (bucket.dt.hour * 60 + bucket.dt.minute) % 1440

    in_window = (out['timeOfDay'] >= 300) & (out['timeOfDay'] <= 1350)
    out = (
        out[in_window]
        .sort_values(by=['dateTime', 'deviceId'])
        .reset_index(drop=True)
    )

    out['dateTime'] = pd.to_datetime(out['dateTime'])
    out['day_of_week'] = out['dateTime'].dt.weekday
    out['date_value'] = out['dateTime'].dt.date

    return out.drop(columns=['dateTime'])

def distance_approximation(lat1, lon1, lat2, lon2):
    """
    Computes the approximate distance (in km) between two close latitude/longitude points using
    the Euclidean (equirectangular) approximation.

    All inputs are in decimal degrees and may be scalars or equally-shaped arrays.
    Returns the distance(s) in km.
    """
    lat1 = np.asarray(lat1, dtype=float)
    lon1 = np.asarray(lon1, dtype=float)
    lat2 = np.asarray(lat2, dtype=float)
    lon2 = np.asarray(lon2, dtype=float)

    # The factors below are km per DEGREE, so the deltas must stay in degrees;
    # only the argument of cos() needs to be in radians.
    lat_factor = 110.57  # Approximate km per degree latitude
    lon_factor = 111.32 * np.cos(np.radians((lat1 + lat2) / 2))  # km per degree longitude at the mean latitude

    dlat = (lat2 - lat1) * lat_factor
    dlon = (lon2 - lon1) * lon_factor

    return np.sqrt(dlat**2 + dlon**2)

def haversine(lat1, lon1, lat2, lon2):
    """
    Great-circle distance (in km) between two latitude/longitude points, via the Haversine formula.

    Inputs are in decimal degrees; scalars and numpy arrays are both accepted.
    """
    earth_radius_km = 6371

    # Degrees -> radians for every coordinate.
    phi1, phi2 = lat1 / 180 * np.pi, lat2 / 180 * np.pi
    lam1, lam2 = lon1 / 180 * np.pi, lon2 / 180 * np.pi

    delta_phi = phi2 - phi1
    delta_lam = lam2 - lam1

    # Haversine term, then the central angle it corresponds to.
    a = np.sin(delta_phi/2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lam/2)**2
    central_angle = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return earth_radius_km * central_angle

def record_distance(df):
    """Distance (km) from each reading to the same device's next reading.

    Readings are ordered by device and timestamp; the last reading of each device
    is paired with itself. Returns the `dateTime`, `deviceId` and `distance` columns.
    """
    track = df.copy().sort_values(by=['deviceId', 'dateTime'])

    # Position of the following reading of the same device (NaN for the last one).
    following = track.groupby(['deviceId'])[['lat', 'lon']].shift(-1)
    track['next_lat'] = following['lat']
    track['next_lon'] = following['lon']
    # Despite its name this flag is True where a next reading exists; it is not returned.
    track['is_na'] = track['next_lon'].notna()

    # No successor: fall back to the reading's own position.
    track['next_lat'] = track['next_lat'].fillna(track['lat'])
    track['next_lon'] = track['next_lon'].fillna(track['lon'])

    track['distance'] = haversine(
        track['lat'].to_numpy(), track['lon'].to_numpy(),
        track['next_lat'].to_numpy(), track['next_lon'].to_numpy(),
    )

    return track[['dateTime', 'deviceId', 'distance']]

def aggregate_speed(speed_df):
    """Replace per-reading distances with per-(timeOfDay, deviceId) totals.

    Must run after `timeOfDay` has been derived. Each row receives the summed
    distance of its time slot and device; totals that are not greater than 1
    (or are missing) are replaced by the device's mean of those per-row totals.
    """
    slot_keys = ['timeOfDay', 'deviceId']

    slot_totals = (
        speed_df[slot_keys + ['distance']]
        .groupby(slot_keys)['distance']
        .sum()
        .reset_index()
    )
    merged = pd.merge(
        speed_df.drop(columns=['distance']), slot_totals,
        how='outer', on=slot_keys,
    )

    device_means = (
        merged[['deviceId', 'distance']]
        .groupby(['deviceId'])['distance']
        .mean()
        .reset_index()
        .rename(columns={'distance': 'mean_dist'})
    )
    merged = pd.merge(merged, device_means, how='outer', on='deviceId')

    # Keep the slot total only when it exceeds 1; otherwise use the device mean.
    keep_total = merged['distance'] > 1
    merged['distance'] = merged['distance'].where(keep_total, merged['mean_dist'])

    return merged.drop(columns=['mean_dist'])

def round_coordinates(df, lat_origin=None, lon_origin=None, lat_step=None, lon_step=None, precision=None):
    """Snap `lat` / `lon` to the lower-left corner of their grid cell.

    Parameters
    ----------
    df : DataFrame with `lat` and `lon` columns (values convertible to float).
    lat_origin, lon_origin : grid origin; default to `ll_min[1]` / `ll_min[2]`.
    lat_step, lon_step : cell size; default to `ll_off[1]` / `ll_off[2]`.
    precision : decimals kept in the result; defaults to `latlongprecise`.

    Returns
    -------
    A new DataFrame; the input frame is left unmodified.
    """
    if lat_origin is None:
        lat_origin = ll_min[1]
    if lon_origin is None:
        lon_origin = ll_min[2]
    if lat_step is None:
        lat_step = ll_off[1]
    if lon_step is None:
        lon_step = ll_off[2]
    if precision is None:
        precision = latlongprecise

    def snap(values, origin, step):
        # floor (not int-truncation) so points below the origin fall into the
        # cell beneath it instead of being pulled up into cell 0.
        cell = np.floor((values - origin) / step)
        return (cell * step + origin).round(precision)

    df = df.copy()  # do not mutate the caller's frame
    df['lat'] = snap(df['lat'].astype(float), lat_origin, lat_step)
    df['lon'] = snap(df['lon'].astype(float), lon_origin, lon_step)

    return df

def count_buses(df):
    """Count the distinct devices seen in each (timeOfDay, lat, lon) cell as `bus_count`."""
    cell_keys = ['timeOfDay', 'lat', 'lon']
    devices_per_cell = df.groupby(cell_keys)['deviceId'].nunique()
    return devices_per_cell.rename('bus_count').reset_index()

def preprocess_data(date, data_dir = "./data/", hour = None):
    """Load one day of raw readings and aggregate them onto the time/space grid.

    Parameters
    ----------
    date : str, e.g. '2020-11-12'; the file read is `<date>_all.csv` in `data_dir`.
    data_dir : directory containing the daily CSV files (trailing slash optional).
    hour : optional collection of hours forwarded to `process_time`; None keeps all.

    Returns
    -------
    DataFrame with one row per (date_value, timeOfDay, lat, lon) cell holding the mean
    of the remaining numeric columns plus `bus_count`, with both PM columns rounded
    to two decimals.
    """
    file = date + "_all.csv"
    print('Reading', file)
    # os.path.join keeps working whether or not data_dir ends with a separator.
    df = pd.read_csv(os.path.join(data_dir, file), index_col = 0, parse_dates = ["dateTime"])

    # to_dt labels the naive string as UTC and converts to Asia/Kolkata, so the
    # window bounds are shifted by +05:30 relative to the literal strings.
    in_window = (df.dateTime >= to_dt(date)) & (df.dateTime <= to_dt(date + " 18:00:00"))
    df = df[in_window].reset_index(drop = True)
    df = filter_fields(df)

    # Per-reading distance to the device's next reading, attached back by key.
    # NOTE(review): if a device has several readings with the same timestamp this
    # key-based merge multiplies those rows — confirm the data has no such duplicates.
    speed_data = record_distance(df)
    df = pd.merge(df, speed_data, how='outer', on = ['dateTime', 'deviceId'])

    # Forward the caller's hour filter (it was previously accepted but ignored).
    df = process_time(df, hour = hour)

    df = aggregate_speed(df)

    df = round_coordinates(df)

    # Must be computed while deviceId is still present.
    bus_count = count_buses(df)

    # Average the PM values (and other numeric columns) within each grid cell.
    df = df.drop(columns = ['deviceId'])
    df = df.groupby(['date_value', 'timeOfDay', 'lat','lon']).mean().reset_index()
    df = pd.merge(df, bus_count, how = 'outer', on = ['timeOfDay', 'lat', 'lon'])

    df.loc[:, pm] = df.loc[:, pm].round(2)
    df.loc[:, pm2] = df.loc[:, pm2].round(2)

    return df


In [7]:
# Run the pipeline on a single day as a smoke test before processing the whole folder.
df = preprocess_data('2020-11-12', data_dir = datadir, hour = None)

Reading 2020-11-12_all.csv


In [8]:
# Preview the aggregated rows for the single test day.
df.head(5)

Unnamed: 0,date_value,timeOfDay,lat,lon,pm2_5,pm10,day_of_week,distance,bus_count
0,2020-11-12,330,28.489,77.2938,165.02,177.14,3.0,6.505617,1
1,2020-11-12,330,28.525,77.2836,214.07,233.64,3.0,7.71088,1
2,2020-11-12,330,28.534,77.2836,219.0,243.0,3.0,7.71088,1
3,2020-11-12,330,28.543,77.2734,239.44,264.48,3.0,7.71088,1
4,2020-11-12,330,28.552,77.253,212.07,228.0,3.0,7.890497,1


In [9]:
def get_tabular_dataset(datadir = None):
    """Preprocess every daily `<date>_all.csv` file in `datadir` and stack the results.

    Parameters
    ----------
    datadir : directory to scan; defaults to `DATA_FOLDER`.

    Returns
    -------
    The concatenated per-day frames. As before, the per-file row index is kept as
    an `index` column by the final `reset_index()`.

    Raises
    ------
    FileNotFoundError if the directory contains no `*_all.csv` file.
    """
    if datadir is None:
        datadir = DATA_FOLDER

    suffix = "_all.csv"
    # Context manager closes the directory iterator; sorting makes the read order
    # (and therefore the output) deterministic across runs and file systems.
    with os.scandir(datadir) as entries:
        files = sorted(
            entry.name for entry in entries
            if entry.is_file() and entry.name.endswith(suffix)
        )

    if not files:
        raise FileNotFoundError("No '*{}' files found in {!r}".format(suffix, datadir))

    # Build all frames first and concatenate once instead of growing a frame in a loop.
    frames = [
        preprocess_data(name[:-len(suffix)], data_dir = datadir, hour = None)
        for name in files
    ]
    return pd.concat(frames).reset_index()

In [10]:
# Build the full table from every daily file in DATA_FOLDER (prints one "Reading ..." line per file).
df = get_tabular_dataset()

Reading 2020-12-09_all.csv
Reading 2020-11-26_all.csv
Reading 2021-01-29_all.csv
Reading 2020-12-26_all.csv
Reading 2020-12-25_all.csv
Reading 2020-12-05_all.csv
Reading 2020-12-27_all.csv
Reading 2020-11-07_all.csv
Reading 2020-12-21_all.csv
Reading 2020-12-16_all.csv
Reading 2021-01-15_all.csv
Reading 2020-12-06_all.csv
Reading 2020-12-13_all.csv
Reading 2021-01-04_all.csv
Reading 2020-11-22_all.csv
Reading 2020-11-09_all.csv
Reading 2021-01-23_all.csv
Reading 2020-11-03_all.csv
Reading 2020-11-28_all.csv
Reading 2020-11-27_all.csv
Reading 2021-01-20_all.csv
Reading 2020-12-11_all.csv
Reading 2020-12-31_all.csv
Reading 2020-12-17_all.csv
Reading 2020-12-08_all.csv
Reading 2020-11-23_all.csv
Reading 2021-01-21_all.csv
Reading 2021-01-17_all.csv
Reading 2020-12-02_all.csv
Reading 2020-11-10_all.csv
Reading 2020-11-19_all.csv
Reading 2021-01-26_all.csv
Reading 2021-01-14_all.csv
Reading 2020-11-21_all.csv
Reading 2021-01-08_all.csv
Reading 2020-11-15_all.csv
Reading 2020-11-06_all.csv
R

In [11]:
# Order rows chronologically, then by grid cell.
df = df.sort_values(by = ['date_value', 'timeOfDay', 'lat', 'lon'])

In [12]:
# Size of the combined table as (rows, columns).
df.shape

(123165, 10)

In [13]:
# Remove the leftover `index` column if it is present. errors='ignore' makes the
# cell safe to re-run without a bare `except` that would hide unrelated failures.
df = df.drop(columns = ['index'], errors = 'ignore')

In [14]:
# Renumber rows 0..n-1 after the sort, discarding the old index, then preview.
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,date_value,timeOfDay,lat,lon,pm2_5,pm10,day_of_week,distance,bus_count
0,2020-11-01,330,28.543,77.2734,481.37,522.53,6.0,7.76689,1
1,2020-11-01,330,28.552,77.2632,471.18,513.5,6.0,7.76689,1
2,2020-11-01,330,28.561,77.253,462.44,503.81,6.0,7.76689,1
3,2020-11-01,330,28.561,77.2632,468.14,507.55,6.0,7.76689,1
4,2020-11-01,330,28.57,77.253,462.68,505.21,6.0,7.76689,1


In [15]:
# Persist the assembled table. The DataFrame index is written as well (to_csv default),
# so it comes back as an extra unnamed first column unless read with index_col=0.
df.to_csv('tabular_data.csv')