# Global EDA

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
data = pd.read_csv('data/raw_data/Delhi.csv')
print(f'Shape of the dataset: {data.shape}')
data.head()

Shape of the dataset: (43825, 9)


Unnamed: 0,timestamp,location_name,co,no2,o3,pm10,pm25,so2,aqi
0,2019-01-01 00:00:00,Delhi,0.598089,24.943655,18.081507,108.281832,75.196148,4.378213,221.62967
1,2019-01-01 01:00:00,Delhi,1.040468,21.877824,55.547468,110.226722,65.451836,5.78011,211.475071
2,2019-01-01 02:00:00,Delhi,0.435431,24.014983,19.36457,44.680995,24.225843,4.823612,76.601125
3,2019-01-01 03:00:00,Delhi,0.641405,46.650006,49.052488,106.661734,65.056997,6.407425,211.063608
4,2019-01-01 04:00:00,Delhi,0.634266,45.782184,47.741928,113.304567,70.506701,9.329313,216.742773


### Sanity Checks

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43825 entries, 0 to 43824
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   timestamp      43825 non-null  object 
 1   location_name  43825 non-null  object 
 2   co             43825 non-null  float64
 3   no2            43825 non-null  float64
 4   o3             43825 non-null  float64
 5   pm10           43825 non-null  float64
 6   pm25           43825 non-null  float64
 7   so2            43825 non-null  float64
 8   aqi            43825 non-null  float64
dtypes: float64(7), object(2)
memory usage: 3.0+ MB


In [10]:
# timestamp should be datetime and sorted
df = data.copy()
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.sort_values('timestamp').reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43825 entries, 0 to 43824
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   timestamp      43825 non-null  datetime64[ns]
 1   location_name  43825 non-null  object        
 2   co             43825 non-null  float64       
 3   no2            43825 non-null  float64       
 4   o3             43825 non-null  float64       
 5   pm10           43825 non-null  float64       
 6   pm25           43825 non-null  float64       
 7   so2            43825 non-null  float64       
 8   aqi            43825 non-null  float64       
dtypes: datetime64[ns](1), float64(7), object(1)
memory usage: 3.0+ MB


### Time Integrity

In [13]:
print(f'Monotonic increase: {df["timestamp"].is_monotonic_increasing}')
print(f'Unique timestamps: {df["timestamp"].nunique()}')
print(f'Total records: {len(df)}')

Monotonic increase: True
Unique timestamps: 43825
Total records: 43825


In [None]:
# check for missing hours 
expected = pd.date_range(
    start=df['timestamp'].min(),
    end=df['timestamp'].max(),
    freq='h'
)
missing_hours = expected.difference(df['timestamp'])
print(f'Missing hours: {missing_hours}')

Missing hours: DatetimeIndex([], dtype='datetime64[ns]', freq='h')


In [16]:
# check for missing days
expected = pd.date_range(
    start=df['timestamp'].min().normalize(),
    end=df['timestamp'].max().normalize(),
    freq='D'
)

actual = df['timestamp'].dt.normalize().unique()

missing_days = expected.difference(actual)
missing_days

DatetimeIndex([], dtype='datetime64[ns]', freq='D')

### Target- AQI

### Any sudden discontinuities?

In [22]:
rolling_mean = df.groupby("location_name")["aqi"].rolling(24).mean().reset_index(level=0, drop=True)
rolling_std = df.groupby("location_name")["aqi"].rolling(24).std().reset_index(level=0, drop=True)

z = (df["aqi"] - rolling_mean) / rolling_std
df[z.abs() > 5]

Unnamed: 0,timestamp,location_name,co,no2,o3,pm10,pm25,so2,aqi,aqi_diff
