In [1]:
import pandas as pd
import numpy as np

# Load the dataset from a CSV next to the notebook (the file name suggests
# daily per-station records -- confirm against the data source).
df = pd.read_csv('./stations_day_dataset.csv')
# NOTE: this is not the last expression of the cell, so Jupyter does not
# render the preview; only the null-count Series below is displayed.
df.head()
# Missing-value count for every column, largest first.
df.isnull().sum().sort_values(ascending=False)

NH3                     28310
PM10                    24885
Toluene                 24126
Benzene                 17514
SO2                      9562
O3                       7640
NOx                      3752
CO                       2950
AQI_Bucket               2873
AQI                      2873
NO                       2074
NO2                      1402
location                    0
temperature_2m              0
industry_distance           0
relative_humidity_2m        0
river_distance              0
road_distance               0
longitude                   0
latitude                    0
Unnamed: 0                  0
FullAddress                 0
State                       0
City                        0
StationName                 0
StationId                   0
PM2.5                       0
Date                        0
windspeed_10m               0
dtype: int64

In [2]:
import numpy as np
import pandas as pd

def impute_strategy(df):
    """Summarise missingness and skewness per station and numeric column.

    The frame is grouped by ``'StationId'``; for each group and each of its
    numeric columns one record is produced.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'StationId'`` column.

    Returns
    -------
    pandas.DataFrame
        One row per (station, numeric column) with the columns
        ``StationId``, ``Column``, ``Nulls``, ``TotalRows``, ``NullPct``
        (``Nulls / TotalRows``) and ``Skew`` (NaN when the column has no
        observed values in that station).
    """
    records = []
    for station_id, station_rows in df.groupby('StationId'):
        total = len(station_rows)
        for name in station_rows.select_dtypes(include='number').columns:
            series = station_rows[name]
            missing = series.isnull().sum()
            # A column with no observations at all has no defined skewness.
            skewness = series.dropna().skew() if missing != total else np.nan
            records.append({
                'StationId': station_id,
                'Column': name,
                'Nulls': missing,
                'TotalRows': total,
                'NullPct': missing / total,
                'Skew': skewness,
            })
    return pd.DataFrame(records)
# Per-station, per-numeric-column null counts and skewness for the loaded frame.
summary_df = impute_strategy(df)

In [3]:
import pandas as pd
import numpy as np

def impute_values_station_wise(df, null_threshold=0.5, skew_threshold=0.5):
    """Fill missing numeric values in place, one station at a time.

    For every ``'StationId'`` group and every numeric column, the column is
    filled only when the share of missing values in that group is at most
    ``null_threshold``. The fill value is the group's median when the
    absolute skewness of the observed values exceeds ``skew_threshold``,
    otherwise the group's mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'StationId'`` column. Modified in place.
        Assumes index labels are unique, since rows are written back by label.
    null_threshold : float, default 0.5
        Maximum fraction of missing values in a group for which the column
        is still imputed from that group's own statistics.
    skew_threshold : float, default 0.5
        Absolute skewness above which the median is used instead of the mean.

    Returns
    -------
    None
    """
    for _, group in df.groupby('StationId'):
        row_count = len(group)
        for col in group.select_dtypes(include='number').columns:
            values = group[col]
            null_count = values.isnull().sum()
            # Skip columns with nothing to fill, and columns too sparse in
            # this group for its own statistic to be trusted.
            if null_count == 0 or null_count / row_count > null_threshold:
                continue
            skew_value = values.dropna().skew()
            # Skew is NaN with fewer than three observations; the comparison
            # is then False and the mean is used.
            if abs(skew_value) > skew_threshold:
                fill_value = values.median()
            else:
                fill_value = values.mean()
            df.loc[group.index, col] = values.fillna(fill_value)
# Run the station-level imputation pass on df (the function writes into df),
# then list the per-column null counts, largest first.
impute_values_station_wise(df)
df.isnull().sum().sort_values(ascending=False)

NH3                     26938
PM10                    23576
Toluene                 19290
Benzene                 12471
SO2                      7620
O3                       5629
AQI_Bucket               2873
NOx                      1063
NO                        684
CO                        435
longitude                   0
road_distance               0
river_distance              0
Unnamed: 0                  0
location                    0
industry_distance           0
temperature_2m              0
relative_humidity_2m        0
latitude                    0
AQI                         0
FullAddress                 0
State                       0
City                        0
StationName                 0
StationId                   0
NO2                         0
PM2.5                       0
Date                        0
windspeed_10m               0
dtype: int64

In [4]:
import pandas as pd
import numpy as np

def impute_values_city_wise(df, null_threshold=0.5, skew_threshold=0.5):
    """Fill missing numeric values in place, one city at a time.

    For every ``'City'`` group and every numeric column, the column is
    filled only when the share of missing values in that group is at most
    ``null_threshold``. The fill value is the group's median when the
    absolute skewness of the observed values exceeds ``skew_threshold``,
    otherwise the group's mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'City'`` column. Modified in place.
        Assumes index labels are unique, since rows are written back by label.
    null_threshold : float, default 0.5
        Maximum fraction of missing values in a group for which the column
        is still imputed from that group's own statistics.
    skew_threshold : float, default 0.5
        Absolute skewness above which the median is used instead of the mean.

    Returns
    -------
    None
    """
    for _, group in df.groupby('City'):
        row_count = len(group)
        for col in group.select_dtypes(include='number').columns:
            values = group[col]
            null_count = values.isnull().sum()
            # Skip columns with nothing to fill, and columns too sparse in
            # this group for its own statistic to be trusted.
            if null_count == 0 or null_count / row_count > null_threshold:
                continue
            skew_value = values.dropna().skew()
            # Skew is NaN with fewer than three observations; the comparison
            # is then False and the mean is used.
            if abs(skew_value) > skew_threshold:
                fill_value = values.median()
            else:
                fill_value = values.mean()
            df.loc[group.index, col] = values.fillna(fill_value)
# Run the city-level imputation pass on df (the function writes into df),
# then list the per-column null counts, largest first.
impute_values_city_wise(df)
df.isnull().sum().sort_values(ascending=False)

PM10                    18172
NH3                     12883
Toluene                  7565
AQI_Bucket               2873
Benzene                  2019
NOx                       379
CO                        225
O3                        158
latitude                    0
longitude                   0
Unnamed: 0                  0
road_distance               0
FullAddress                 0
river_distance              0
industry_distance           0
temperature_2m              0
relative_humidity_2m        0
location                    0
AQI                         0
State                       0
City                        0
StationName                 0
StationId                   0
SO2                         0
NO2                         0
NO                          0
PM2.5                       0
Date                        0
windspeed_10m               0
dtype: int64

In [None]:
import pandas as pd
import numpy as np

def impute_values_state_wise(df, null_threshold=0.5, skew_threshold=0.5):
    """Fill missing numeric values in place, one state at a time.

    For every ``'State'`` group and every numeric column, the column is
    filled only when the share of missing values in that group is at most
    ``null_threshold``. The fill value is the group's median when the
    absolute skewness of the observed values exceeds ``skew_threshold``,
    otherwise the group's mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'State'`` column. Modified in place.
        Assumes index labels are unique, since rows are written back by label.
    null_threshold : float, default 0.5
        Maximum fraction of missing values in a group for which the column
        is still imputed from that group's own statistics.
    skew_threshold : float, default 0.5
        Absolute skewness above which the median is used instead of the mean.

    Returns
    -------
    None
    """
    for _, group in df.groupby('State'):
        row_count = len(group)
        # 'number' selects the numeric columns; 'state' is not a dtype and
        # makes select_dtypes raise TypeError.
        for col in group.select_dtypes(include='number').columns:
            values = group[col]
            null_count = values.isnull().sum()
            # Skip columns with nothing to fill, and columns too sparse in
            # this group for its own statistic to be trusted.
            if null_count == 0 or null_count / row_count > null_threshold:
                continue
            skew_value = values.dropna().skew()
            # Skew is NaN with fewer than three observations; the comparison
            # is then False and the mean is used.
            if abs(skew_value) > skew_threshold:
                fill_value = values.median()
            else:
                fill_value = values.mean()
            df.loc[group.index, col] = values.fillna(fill_value)
# Run the state-level imputation pass on df, then list the per-column null
# counts, largest first.
impute_values_state_wise(df)
df.isnull().sum().sort_values(ascending=False)

In [None]:
import pandas as pd
import numpy as np

def impute_values_remaining(df, group_col='State', null_threshold=0.5,
                            skew_threshold=0.5):
    """Fill missing numeric values in place using group-level statistics.

    For every group and every numeric column, the column is filled only when
    the share of missing values in that group is at most ``null_threshold``.
    The fill value is the group's median when the absolute skewness of the
    observed values exceeds ``skew_threshold``, otherwise the group's mean.

    NOTE(review): with the default ``group_col='State'`` this repeats a
    state-level pass, so running it after one changes nothing. If this is
    meant to be the last-resort fill over the whole dataset, call it with
    ``group_col=None`` (and, if needed, a larger ``null_threshold``) --
    confirm the intended behaviour.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. Modified in place. Assumes index labels are unique,
        since rows are written back by label.
    group_col : str or None, default 'State'
        Column to group by. ``None`` treats the whole frame as one group.
    null_threshold : float, default 0.5
        Maximum fraction of missing values in a group for which the column
        is still imputed.
    skew_threshold : float, default 0.5
        Absolute skewness above which the median is used instead of the mean.

    Returns
    -------
    None
    """
    if group_col is None:
        groups = [(None, df)]
    else:
        groups = df.groupby(group_col)

    for _, group in groups:
        row_count = len(group)
        # 'number' selects the numeric columns; 'state' is not a dtype and
        # makes select_dtypes raise TypeError.
        for col in group.select_dtypes(include='number').columns:
            values = group[col]
            null_count = values.isnull().sum()
            # The zero check comes first so an empty frame never divides by 0.
            if null_count == 0 or null_count / row_count > null_threshold:
                continue
            skew_value = values.dropna().skew()
            # Skew is NaN with fewer than three observations; the comparison
            # is then False and the mean is used.
            if abs(skew_value) > skew_threshold:
                fill_value = values.median()
            else:
                fill_value = values.mean()
            df.loc[group.index, col] = values.fillna(fill_value)
# Run the final imputation pass on df, then list the per-column null counts,
# largest first.
impute_values_remaining(df)
df.isnull().sum().sort_values(ascending=False)