# PM10

In [4]:
import pandas as pd
# Load the raw PM10 dataset from the Excel file in the working directory.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which looks like a
# row index written by an earlier export. It is kept as an ordinary column, so it is
# passed to the imputer and to the model as a feature -- confirm this is intended.
data = pd.read_excel('PM10.xlsx')

# Preview the first rows to check the columns that were read.
data.head()

Unnamed: 0,PM10,Tempreture,Humid,Wind Speed,Wind Direction Sin,Wind Direction COS,Pressure,RainFall
0,133.916667,19.0,47.125,1.875,-0.9397,-0.34202,1011.76,0.0
1,147.833333,20.7,45.0,2.625,0.766,-0.642788,1012.04,0.1
2,157.958333,19.9,54.375,5.42857,0.4999,-0.866025,1007.88,0.61
3,48.708333,16.6,91.25,2.875,0.9397,-0.34202,1008.66,22.0
4,47.541667,18.9,70.125,1.875,-0.5,0.866025,1007.7,1.0


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.impute import KNNImputer
from math import sqrt
def rmse(y, yhat):
    """Return the root-mean-squared error between targets ``y`` and predictions ``yhat``.

    Written as a ``def`` instead of a lambda bound to a name so the function
    shows up under its own name in tracebacks and can carry documentation.
    """
    return np.sqrt(mean_squared_error(y, yhat))

In [3]:
def optimize_k(data, target, k_values=range(1, 20, 2), n_estimators=10,
               test_size=0.2, split_seed=42, model_seed=0):
    """Score candidate ``n_neighbors`` values for KNN imputation of ``data``.

    For each k the whole frame is imputed with ``KNNImputer``, a random forest
    is trained to predict ``target`` from the remaining columns, and the RMSE
    on a held-out split is recorded.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute; must contain the ``target`` column.
    target : str
        Name of the column the forest is trained to predict.
    k_values : iterable of int, optional
        Neighbour counts to try. Defaults to the odd values 1..19.
    n_estimators : int, optional
        Number of trees in the evaluation forest.
    test_size : float, optional
        Fraction of rows held out for scoring.
    split_seed : int, optional
        ``random_state`` for the train/test split, so every k is scored on
        the same rows.
    model_seed : int, optional
        ``random_state`` for the random forest.

    Returns
    -------
    list of dict
        One ``{'K': k, 'RMSE': error}`` entry per candidate, in iteration order.

    Notes
    -----
    The imputer is fitted on the full frame (target column and held-out rows
    included) before the split, so any missing target values are imputed too.
    Treat the RMSE as a relative score for comparing k values, not as an
    unbiased estimate of predictive error.
    """
    errors = []
    for k in k_values:
        # Impute everything with the current neighbour count and restore labels.
        imputed = KNNImputer(n_neighbors=k).fit_transform(data)
        df_imputed = pd.DataFrame(imputed, columns=data.columns)

        X = df_imputed.drop(target, axis=1)
        y = df_imputed[target]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=split_seed
        )

        model = RandomForestRegressor(n_estimators=n_estimators, random_state=model_seed)
        model.fit(X_train, y_train)
        errors.append({'K': k, 'RMSE': rmse(y_test, model.predict(X_test))})

    return errors

In [4]:
# Run the neighbour-count search with PM10 as the prediction target.
k_errors = optimize_k(data, target='PM10')
# Show the per-k RMSE entries as the cell output.
k_errors

[{'K': 1, 'RMSE': 95.83549279459801},
 {'K': 3, 'RMSE': 85.05935649255721},
 {'K': 5, 'RMSE': 84.34453368925482},
 {'K': 7, 'RMSE': 84.75359639788478},
 {'K': 9, 'RMSE': 83.6283244428201},
 {'K': 11, 'RMSE': 82.65099695247478},
 {'K': 13, 'RMSE': 83.6051716640284},
 {'K': 15, 'RMSE': 83.42420740480875},
 {'K': 17, 'RMSE': 83.48188703936003},
 {'K': 19, 'RMSE': 83.59384522700275}]

In [5]:
from sklearn.impute import KNNImputer

# Pick the neighbour count with the lowest RMSE from the search instead of
# hard-coding it, so the choice stays in sync with `k_errors` when the search
# is re-run (K=11 for the recorded results). Ties resolve to the smallest k
# because `k_errors` is in ascending-k order and min() keeps the first minimum.
best_k = min(k_errors, key=lambda entry: entry['RMSE'])['K']

imputer = KNNImputer(n_neighbors=best_k)
imputed = imputer.fit_transform(data)
# Rewrap the imputer output in a DataFrame with the original column labels.
df_imputed = pd.DataFrame(imputed, columns=data.columns)

In [6]:
# Persist the imputed PM10 frame next to the notebook.
# NOTE(review): to_excel writes the DataFrame index as an extra leading column by
# default, in addition to the existing "Unnamed: 0" column -- consider index=False
# if downstream readers do not need it.
df_imputed.to_excel('PM10_process.xlsx')

# PM2.5

In [7]:
import pandas as pd
# Load the raw PM2.5 dataset; the pollutant column is named "TT" in this file
# (see the preview below).
# NOTE(review): the preview also shows an "Unnamed: 0" column, which looks like a
# saved row index. It is kept as an ordinary column, so it is passed to the imputer
# and to the model as a feature -- confirm this is intended.
data1 = pd.read_excel('PM25.xlsx')

# Preview the first rows to check the columns that were read.
data1.head()

Unnamed: 0,TT,Tempreture,Humid,Wind Speed,Wind Direction Sin,Wind Direction COS,Pressure,RainFall
0,40.363636,19.0,47.125,1.875,-0.9397,-0.34202,1011.76,0.0
1,41.541667,20.7,45.0,2.625,0.766,-0.642788,1012.04,0.1
2,43.416667,19.9,54.375,5.42857,0.4999,-0.866025,1007.88,0.61
3,38.291667,16.6,91.25,2.875,0.9397,-0.34202,1008.66,22.0
4,23.681818,18.9,70.125,1.875,-0.5,0.866025,1007.7,1.0


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.impute import KNNImputer
from math import sqrt
def rmse1(y1, yhat1):
    """Return the root-mean-squared error between targets ``y1`` and predictions ``yhat1``.

    Written as a ``def`` instead of a lambda bound to a name so the function
    shows up under its own name in tracebacks and can carry documentation.
    """
    return np.sqrt(mean_squared_error(y1, yhat1))

In [9]:
def optimize_k1(data1, target1, k_values=range(1, 20, 2), n_estimators=5,
                test_size=0.2, split_seed=42, model_seed=0):
    """Score candidate ``n_neighbors`` values for KNN imputation of ``data1``.

    For each k the whole frame is imputed with ``KNNImputer``, a random forest
    is trained to predict ``target1`` from the remaining columns, and the RMSE
    on a held-out split is recorded.

    Parameters
    ----------
    data1 : pandas.DataFrame
        Frame to impute; must contain the ``target1`` column.
    target1 : str
        Name of the column the forest is trained to predict.
    k_values : iterable of int, optional
        Neighbour counts to try. Defaults to the odd values 1..19.
    n_estimators : int, optional
        Number of trees in the evaluation forest.
    test_size : float, optional
        Fraction of rows held out for scoring.
    split_seed : int, optional
        ``random_state`` for the train/test split, so every k is scored on
        the same rows.
    model_seed : int, optional
        ``random_state`` for the random forest.

    Returns
    -------
    list of dict
        One ``{'K': k, 'RMSE': error}`` entry per candidate, in iteration order.

    Notes
    -----
    The imputer is fitted on the full frame (target column and held-out rows
    included) before the split, so any missing target values are imputed too.
    Treat the RMSE as a relative score for comparing k values, not as an
    unbiased estimate of predictive error.
    """
    errors1 = []
    for k1 in k_values:
        # Impute everything with the current neighbour count and restore labels.
        imputed1 = KNNImputer(n_neighbors=k1).fit_transform(data1)
        df_imputed1 = pd.DataFrame(imputed1, columns=data1.columns)

        X1 = df_imputed1.drop(target1, axis=1)
        y1 = df_imputed1[target1]
        X_train1, X_test1, y_train1, y_test1 = train_test_split(
            X1, y1, test_size=test_size, random_state=split_seed
        )

        model1 = RandomForestRegressor(n_estimators=n_estimators, random_state=model_seed)
        model1.fit(X_train1, y_train1)
        errors1.append({'K': k1, 'RMSE': rmse1(y_test1, model1.predict(X_test1))})

    return errors1

In [10]:
# Run the neighbour-count search with the "TT" column of the PM2.5 data as the target.
k_errors1 = optimize_k1(data1, target1='TT')
# Show the per-k RMSE entries as the cell output.
k_errors1

[{'K': 1, 'RMSE': 60.3943939416283},
 {'K': 3, 'RMSE': 54.89952773720319},
 {'K': 5, 'RMSE': 56.689705816889145},
 {'K': 7, 'RMSE': 55.710650600326325},
 {'K': 9, 'RMSE': 52.910005436647005},
 {'K': 11, 'RMSE': 59.072645800340425},
 {'K': 13, 'RMSE': 60.84033970882923},
 {'K': 15, 'RMSE': 62.89467956806421},
 {'K': 17, 'RMSE': 53.11009312657742},
 {'K': 19, 'RMSE': 56.26864961441388}]

In [11]:
from sklearn.impute import KNNImputer

# Pick the neighbour count with the lowest RMSE from the search instead of
# hard-coding it, so the choice stays in sync with `k_errors1` when the search
# is re-run (K=9 for the recorded results). Ties resolve to the smallest k
# because `k_errors1` is in ascending-k order and min() keeps the first minimum.
best_k1 = min(k_errors1, key=lambda entry: entry['RMSE'])['K']

imputer1 = KNNImputer(n_neighbors=best_k1)
imputed1 = imputer1.fit_transform(data1)
# Rewrap the imputer output in a DataFrame with the original column labels.
df_imputed1 = pd.DataFrame(imputed1, columns=data1.columns)

In [12]:
# Persist the imputed PM2.5 frame next to the notebook.
# NOTE(review): to_excel writes the DataFrame index as an extra leading column by
# default, in addition to the existing "Unnamed: 0" column -- consider index=False
# if downstream readers do not need it.
df_imputed1.to_excel('PM25_process.xlsx')

# NOx

In [13]:
import pandas as pd
# Load the raw NOx dataset from the Excel file in the working directory.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which looks like a
# saved row index. It is kept as an ordinary column, so it is passed to the imputer
# and to the model as a feature -- confirm this is intended.
data2 = pd.read_excel('NOx.xlsx')

# Preview the first rows to check the columns that were read.
data2.head()

Unnamed: 0,NOx,Tempreture,Humid,Wind Speed,Wind Direction Sin,Wind Direction COS,Pressure,RainFall
0,10.434783,19.0,47.125,1.875,-0.9397,-0.34202,1011.76,0.0
1,14.041667,20.7,45.0,2.625,0.766,-0.642788,1012.04,0.1
2,12.541667,19.9,54.375,5.42857,0.4999,-0.866025,1007.88,0.61
3,9.958333,16.6,91.25,2.875,0.9397,-0.34202,1008.66,22.0
4,12.916667,18.9,70.125,1.875,-0.5,0.866025,1007.7,1.0


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.impute import KNNImputer
from math import sqrt
def rmse2(y2, yhat2):
    """Return the root-mean-squared error between targets ``y2`` and predictions ``yhat2``.

    Written as a ``def`` instead of a lambda bound to a name so the function
    shows up under its own name in tracebacks and can carry documentation.
    """
    return np.sqrt(mean_squared_error(y2, yhat2))

In [15]:
def optimize_k2(data2, target2, k_values=range(1, 20, 2), n_estimators=6,
                test_size=0.2, split_seed=42, model_seed=0):
    """Score candidate ``n_neighbors`` values for KNN imputation of ``data2``.

    For each k the whole frame is imputed with ``KNNImputer``, a random forest
    is trained to predict ``target2`` from the remaining columns, and the RMSE
    on a held-out split is recorded.

    Parameters
    ----------
    data2 : pandas.DataFrame
        Frame to impute; must contain the ``target2`` column.
    target2 : str
        Name of the column the forest is trained to predict.
    k_values : iterable of int, optional
        Neighbour counts to try. Defaults to the odd values 1..19.
    n_estimators : int, optional
        Number of trees in the evaluation forest.
    test_size : float, optional
        Fraction of rows held out for scoring.
    split_seed : int, optional
        ``random_state`` for the train/test split, so every k is scored on
        the same rows.
    model_seed : int, optional
        ``random_state`` for the random forest.

    Returns
    -------
    list of dict
        One ``{'K': k, 'RMSE': error}`` entry per candidate, in iteration order.

    Notes
    -----
    The imputer is fitted on the full frame (target column and held-out rows
    included) before the split, so any missing target values are imputed too.
    Treat the RMSE as a relative score for comparing k values, not as an
    unbiased estimate of predictive error.
    """
    errors2 = []
    for k2 in k_values:
        # Impute everything with the current neighbour count and restore labels.
        imputed2 = KNNImputer(n_neighbors=k2).fit_transform(data2)
        df_imputed2 = pd.DataFrame(imputed2, columns=data2.columns)

        X2 = df_imputed2.drop(target2, axis=1)
        y2 = df_imputed2[target2]
        X_train2, X_test2, y_train2, y_test2 = train_test_split(
            X2, y2, test_size=test_size, random_state=split_seed
        )

        model2 = RandomForestRegressor(n_estimators=n_estimators, random_state=model_seed)
        model2.fit(X_train2, y_train2)
        errors2.append({'K': k2, 'RMSE': rmse2(y_test2, model2.predict(X_test2))})

    return errors2

In [16]:
# Run the neighbour-count search with NOx as the prediction target.
k_errors2 = optimize_k2(data2, target2='NOx')
# Show the per-k RMSE entries as the cell output.
k_errors2

[{'K': 1, 'RMSE': 15.293453973704409},
 {'K': 3, 'RMSE': 14.908294574925247},
 {'K': 5, 'RMSE': 13.975034204542105},
 {'K': 7, 'RMSE': 15.06869139421989},
 {'K': 9, 'RMSE': 14.168674761367114},
 {'K': 11, 'RMSE': 14.010824707288434},
 {'K': 13, 'RMSE': 14.729963180516796},
 {'K': 15, 'RMSE': 14.403091334351146},
 {'K': 17, 'RMSE': 14.980009334469454},
 {'K': 19, 'RMSE': 14.916561387286995}]

In [17]:
from sklearn.impute import KNNImputer

# Pick the neighbour count with the lowest RMSE from the search instead of
# hard-coding it, so the choice stays in sync with `k_errors2` when the search
# is re-run (K=5 for the recorded results). Ties resolve to the smallest k
# because `k_errors2` is in ascending-k order and min() keeps the first minimum.
best_k2 = min(k_errors2, key=lambda entry: entry['RMSE'])['K']

imputer2 = KNNImputer(n_neighbors=best_k2)
imputed2 = imputer2.fit_transform(data2)
# Rewrap the imputer output in a DataFrame with the original column labels.
df_imputed2 = pd.DataFrame(imputed2, columns=data2.columns)

In [18]:
# Persist the imputed NOx frame next to the notebook.
# NOTE(review): to_excel writes the DataFrame index as an extra leading column by
# default, in addition to the existing "Unnamed: 0" column -- consider index=False
# if downstream readers do not need it.
df_imputed2.to_excel('NOx_process.xlsx')