In [17]:
! pip3 install pandas scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting scipy>=1.3.2 (from scikit-learn)
  Downloading scipy-1.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting joblib>=1.1.1 (from scikit-learn)
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting threadpoolctl>=2.0.0 (from scikit-learn)
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.2.0 scikit-

In [2]:
# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer

from sklearn.linear_model import LinearRegression

import pandas as pd
import sys
import os

In [3]:
path_to_package = os.path.abspath(os.path.join('..'))
if path_to_package not in sys.path:
    sys.path.append(path_to_package)
    
from src.utils.data_model_region import Region

# Global Constants

In [4]:
# Dataset
DATA_DIR_PATH = "./../data"
DATASET_PATH = f"{DATA_DIR_PATH}/PRSA2017_Data_20130301-20170228"
DATSET_PREPROCESSED_PATH = f"{DATA_DIR_PATH}/preprocessed"

# Model 
RANDOM_STATE = 1

# Helper Functions

In [5]:
def get_dataset_path(region:Region, base_path:str=DATASET_PATH)->str|None:
    """Get path to dataset

    Args:
        region (Region): Specifies the region of the dataset
        base_path (str, optional): root dir of the dataset. Defaults to DATASET_PATH.

    Raises:
        ValueError: incorrect region provided

    Returns:
        str|None: returns the datset file path for the region or raises error
    """
    if type(region) == Region:
        return f"{base_path}/PRSA_Data_{region.value}_20130301-20170228.csv"
    else:
        raise ValueError("Unkown REGION name!!!")

In [6]:
def get_preprocessed_dataset_path(region:Region, base_path:str=DATSET_PREPROCESSED_PATH)->str|None:
    """Get path to pre-processed dataset

    Args:
        region (Region): Specifies the region of the dataset
        base_path (str, optional): root dir of the dataset. Defaults to DATASET_PATH.

    Raises:
        ValueError: incorrect region provided

    Returns:
        str|None: returns the datset file path for the region or raises error
    """
    if type(region) == Region:
        return f"{base_path}/{region.value}.csv"
    else:
        raise ValueError("Unkown REGION name!!!")

In [7]:
def one_hot_wind_direction(wind_direction:str)->list:
    # TODO: fix later
    n = s = e = w = 0
    if type(wind_direction) != str:
        if wind_direction != wind_direction:
            return [float('nan'), float('nan'), float('nan'), float('nan')]
        raise ValueError("wind direction should be a string")
    if (len(wind_direction) > 4):
        raise ValueError("There should only be 4 wind directions ('N'|'S'|'E'|'W')")
    if ('N' in wind_direction) or ('n' in wind_direction):
        n = 1
    if ('S' in wind_direction) or ('s' in wind_direction):
        s = 1
    if ('E' in wind_direction) or ('e' in wind_direction):
        e = 1
    if ('W' in wind_direction) or ('w' in wind_direction):
        w = 1
    return [n,s,e,w]

one_hot_wind_direction('aES')

[0, 1, 1, 0]

# Dataset Exploration

In [8]:
dataset_files = sorted(os.listdir(DATASET_PATH))
print(f"List of files in the datset: {dataset_files}")

List of files in the datset: ['PRSA_Data_Aotizhongxin_20130301-20170228.csv', 'PRSA_Data_Changping_20130301-20170228.csv', 'PRSA_Data_Dingling_20130301-20170228.csv', 'PRSA_Data_Dongsi_20130301-20170228.csv', 'PRSA_Data_Guanyuan_20130301-20170228.csv', 'PRSA_Data_Gucheng_20130301-20170228.csv', 'PRSA_Data_Huairou_20130301-20170228.csv', 'PRSA_Data_Nongzhanguan_20130301-20170228.csv', 'PRSA_Data_Shunyi_20130301-20170228.csv', 'PRSA_Data_Tiantan_20130301-20170228.csv', 'PRSA_Data_Wanliu_20130301-20170228.csv', 'PRSA_Data_Wanshouxigong_20130301-20170228.csv']


In [9]:
print(f"Test generate datset path: {get_dataset_path(region=Region.AOTIZHONGXIN)}")

Test generate datset path: ./../data/PRSA2017_Data_20130301-20170228/PRSA_Data_Aotizhongxin_20130301-20170228.csv


In [10]:
assert len(Region) == len(dataset_files), "Region count mismatch"

In [11]:
# load all regions as DataFrames
regions_df = {}
for region in Region:
    region_dataset_path = get_dataset_path(region)
    region_df = pd.read_csv(region_dataset_path)
    # droping S.No and Station as it is not needed
    region_df.drop(
        ['No','station'], 
        axis=1, 
        inplace=True
    ) 
    # apply 1-hot encoding for wind direction
    region_df[["N","S","E","W"]] = region_df.apply(
        lambda row: one_hot_wind_direction(wind_direction=row['wd']), 
        axis=1, 
        result_type='expand'
    )
    # drop wind direcdtion
    region_df.drop(
        ['wd'], 
        axis=1, 
        inplace=True
    )
    regions_df[region] = region_df
    del region_dataset_path

In [12]:
regions_df[Region.AOTIZHONGXIN]

Unnamed: 0,year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,WSPM,N,S,E,W
0,2013,3,1,0,4.0,4.0,4.0,7.0,300.0,77.0,-0.7,1023.0,-18.8,0.0,4.4,1.0,0.0,0.0,1.0
1,2013,3,1,1,8.0,8.0,4.0,7.0,300.0,77.0,-1.1,1023.2,-18.2,0.0,4.7,1.0,0.0,0.0,0.0
2,2013,3,1,2,7.0,7.0,5.0,10.0,300.0,73.0,-1.1,1023.5,-18.2,0.0,5.6,1.0,0.0,0.0,1.0
3,2013,3,1,3,6.0,6.0,11.0,11.0,300.0,72.0,-1.4,1024.5,-19.4,0.0,3.1,1.0,0.0,0.0,1.0
4,2013,3,1,4,3.0,3.0,12.0,12.0,300.0,72.0,-2.0,1025.2,-19.5,0.0,2.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35059,2017,2,28,19,12.0,29.0,5.0,35.0,400.0,95.0,12.5,1013.5,-16.2,0.0,2.4,1.0,0.0,0.0,1.0
35060,2017,2,28,20,13.0,37.0,7.0,45.0,500.0,81.0,11.6,1013.6,-15.1,0.0,0.9,1.0,0.0,0.0,1.0
35061,2017,2,28,21,16.0,37.0,10.0,66.0,700.0,58.0,10.8,1014.2,-13.3,0.0,1.1,1.0,0.0,0.0,1.0
35062,2017,2,28,22,21.0,44.0,12.0,87.0,700.0,35.0,10.5,1014.4,-12.9,0.0,1.2,1.0,0.0,0.0,1.0


In [13]:
regions_df[Region.AOTIZHONGXIN].describe()

Unnamed: 0,year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,WSPM,N,S,E,W
count,35064.0,35064.0,35064.0,35064.0,34139.0,34346.0,34129.0,34041.0,33288.0,33345.0,35044.0,35044.0,35044.0,35044.0,35050.0,34983.0,34983.0,34983.0,34983.0
mean,2014.66256,6.52293,15.729637,11.5,82.773611,110.060391,17.375901,59.305833,1262.945145,56.353358,13.584607,1011.84692,3.123062,0.067421,1.708496,0.518852,0.373124,0.52091,0.382757
std,1.177213,3.448752,8.800218,6.922285,82.135694,95.223005,22.823017,37.1162,1221.436236,57.916327,11.399097,10.404047,13.688896,0.910056,1.204071,0.499652,0.483642,0.49957,0.486067
min,2013.0,1.0,1.0,0.0,3.0,2.0,0.2856,2.0,100.0,0.2142,-16.8,985.9,-35.3,0.0,0.0,0.0,0.0,0.0,0.0
25%,2014.0,4.0,8.0,5.75,22.0,38.0,3.0,30.0,500.0,8.0,3.1,1003.3,-8.1,0.0,0.9,0.0,0.0,0.0,0.0
50%,2015.0,7.0,16.0,11.5,58.0,87.0,9.0,53.0,900.0,42.0,14.5,1011.4,3.8,0.0,1.4,1.0,0.0,1.0,0.0
75%,2016.0,10.0,23.0,17.25,114.0,155.0,21.0,82.0,1500.0,82.0,23.3,1020.1,15.6,0.0,2.2,1.0,1.0,1.0,1.0
max,2017.0,12.0,31.0,23.0,898.0,984.0,341.0,290.0,10000.0,423.0,40.5,1042.0,28.5,72.5,11.2,1.0,1.0,1.0,1.0


In [14]:
regions_df[Region.AOTIZHONGXIN][regions_df[Region.AOTIZHONGXIN].columns[1:-2]].corr().style.background_gradient(cmap='coolwarm')


Unnamed: 0,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,WSPM,N,S
month,1.0,0.010522,0.0,0.025505,-0.027277,-0.249277,0.118648,0.05292,-0.096478,0.130377,-0.005658,0.273236,0.013497,-0.161326,0.019151,-0.043765
day,0.010522,1.0,-0.0,0.00792,0.032191,-0.018845,0.015714,-0.020578,0.003657,0.014323,0.022537,0.023425,-0.002491,-0.016707,-0.014282,0.01391
hour,0.0,-0.0,1.0,-0.010388,0.022421,0.002886,-0.043453,-0.047077,0.298161,0.141158,-0.037611,-0.013429,0.011531,0.155515,-0.204309,0.232694
PM2.5,0.025505,0.00792,-0.010388,1.0,0.879104,0.481025,0.684986,0.785006,-0.161448,-0.129415,-0.005727,0.120112,-0.014004,-0.27874,-0.090402,0.052282
PM10,-0.027277,0.032191,0.022421,0.879104,1.0,0.469045,0.652325,0.680404,-0.139914,-0.111147,-0.035594,0.061667,-0.027795,-0.183132,-0.099647,0.075569
SO2,-0.249277,-0.018845,0.002886,0.481025,0.469045,1.0,0.432081,0.540275,-0.200193,-0.350608,0.203921,-0.282565,-0.041609,-0.113115,-0.016109,0.022742
NO2,0.118648,0.015714,-0.043453,0.684986,0.652325,0.432081,1.0,0.690202,-0.50038,-0.241059,0.081636,0.067344,-0.040077,-0.49157,-0.047358,-0.014745
CO,0.05292,-0.020578,-0.047077,0.785006,0.680404,0.540275,0.690202,1.0,-0.326462,-0.366898,0.215556,-0.09812,-0.018429,-0.284888,-0.005328,-0.049262
O3,-0.096478,0.003657,0.298161,-0.161448,-0.139914,-0.200193,-0.50038,-0.326462,1.0,0.589063,-0.424294,0.294944,0.025099,0.3411,-0.28403,0.34316
TEMP,0.130377,0.014323,0.141158,-0.129415,-0.111147,-0.350608,-0.241059,-0.366898,0.589063,1.0,-0.827133,0.823099,0.036601,0.037913,-0.238465,0.249972


In [150]:
estimator = LinearRegression()
imputer = IterativeImputer(estimator=estimator, verbose=1, max_iter=500, tol=1e-5, imputation_order='roman')
imputer.fit(regions_df[Region.AOTIZHONGXIN])

[IterativeImputer] Completing matrix with shape (35064, 19)
[IterativeImputer] Change: 6256.486783885394, scaled tolerance: 0.1 
[IterativeImputer] Change: 551.4903627223525, scaled tolerance: 0.1 
[IterativeImputer] Change: 156.17030951164816, scaled tolerance: 0.1 
[IterativeImputer] Change: 140.56211580907984, scaled tolerance: 0.1 
[IterativeImputer] Change: 122.59430971977596, scaled tolerance: 0.1 
[IterativeImputer] Change: 104.7334324707517, scaled tolerance: 0.1 
[IterativeImputer] Change: 89.07678364944869, scaled tolerance: 0.1 
[IterativeImputer] Change: 75.71570924976274, scaled tolerance: 0.1 
[IterativeImputer] Change: 64.35762948285537, scaled tolerance: 0.1 
[IterativeImputer] Change: 54.704573472869725, scaled tolerance: 0.1 
[IterativeImputer] Change: 46.49990605329799, scaled tolerance: 0.1 
[IterativeImputer] Change: 39.5259849044688, scaled tolerance: 0.1 
[IterativeImputer] Change: 33.598083753950135, scaled tolerance: 0.1 
[IterativeImputer] Change: 28.559267190

In [177]:
# transform all datsets
columns = regions_df[Region.AOTIZHONGXIN].columns
for region in Region:
    print(f"Transforming region '{region.value}':", end=' ')
    regions_df[region] = pd.DataFrame(
        data=imputer.transform(regions_df[region]),
        columns=columns)
    
# TODO: round wind rirectionds

Transforming region 'Aotizhongxin': [IterativeImputer] Completing matrix with shape (35064, 19)
Transforming region 'Changping': [IterativeImputer] Completing matrix with shape (35064, 19)
Transforming region 'Dingling': [IterativeImputer] Completing matrix with shape (35064, 19)
Transforming region 'Dongsi': [IterativeImputer] Completing matrix with shape (35064, 19)
Transforming region 'Guanyuan': [IterativeImputer] Completing matrix with shape (35064, 19)
Transforming region 'Gucheng': [IterativeImputer] Completing matrix with shape (35064, 19)
Transforming region 'Huairou': [IterativeImputer] Completing matrix with shape (35064, 19)
Transforming region 'Nongzhanguan': [IterativeImputer] Completing matrix with shape (35064, 19)
Transforming region 'Shunyi': [IterativeImputer] Completing matrix with shape (35064, 19)
Transforming region 'Tiantan': [IterativeImputer] Completing matrix with shape (35064, 19)
Transforming region 'Wanliu': [IterativeImputer] Completing matrix with shape 

In [178]:
regions_df[Region.AOTIZHONGXIN].describe()

Unnamed: 0,year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,WSPM,N,S,E,W
count,35064.0,35064.0,35064.0,35064.0,35064.0,35064.0,35064.0,35064.0,35064.0,35064.0,35064.0,35064.0,35064.0,35064.0,35064.0,35064.0,35064.0,35064.0,35064.0
mean,2014.66256,6.52293,15.729637,11.5,82.575349,109.972815,17.412495,59.44247,1268.219418,55.081988,13.582356,1011.849488,3.119446,0.067423,1.708547,0.518792,0.373038,0.521335,0.382427
std,1.177213,3.448752,8.800218,6.922285,81.407735,94.49453,22.615482,36.84281,1219.372486,57.549686,11.398007,10.402215,13.687569,0.909797,1.203877,0.499159,0.483171,0.499138,0.485611
min,2013.0,1.0,1.0,0.0,-27.00711,2.0,-11.432007,1.816481,-463.187412,-50.909429,-16.8,985.9,-35.3,-0.003366,0.0,-0.069056,0.0,0.0,-0.064896
25%,2014.0,4.0,8.0,5.75,22.0,38.0,3.0,31.0,500.0,7.0,3.1,1003.3,-8.1,0.0,0.9,0.0,0.0,0.0,0.0
50%,2015.0,7.0,16.0,11.5,59.0,87.0,9.0,54.0,900.0,41.0,14.5,1011.4,3.8,0.0,1.4,1.0,0.0,1.0,0.0
75%,2016.0,10.0,23.0,17.25,114.0,153.0,22.0,82.0,1600.0,81.474103,23.3,1020.1,15.6,0.0,2.2,1.0,1.0,1.0,1.0
max,2017.0,12.0,31.0,23.0,898.0,984.0,341.0,290.0,10000.0,423.0,40.5,1042.0,28.5,72.5,11.2,1.0,1.0,1.044486,1.0


In [179]:
# saveing datsets

In [215]:
for region in regions_df:
    regions_df[region].to_csv(get_preprocessed_dataset_path(region=region), index=False)