# Batch Learning

## Import Libraries

In [285]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

import numpy as np
from rich import print

import pandas as pd

## Load and split data

In [286]:
# Read the air-quality table, then detach the "T" column so it can serve as
# the regression target; whatever remains in `data` is the feature set.
data = pd.read_csv("./air.csv")
targets = data["T"]
data = data.drop(columns="T")

# NOTE(review): the summary reports a minimum of -200 for every column, which
# looks like a missing-value sentinel rather than a real reading -- confirm
# before relying on the scaled features.
data.describe()

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),RH,AH
count,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0
mean,-34.207524,1048.990061,-159.090093,1.865683,894.595276,168.616971,794.990168,58.148873,1391.479641,975.072032,39.48538,-6.837604
std,77.65717,329.83271,139.789093,41.380206,342.333252,257.433866,321.993552,126.940455,467.210125,456.938184,51.216145,38.97667
min,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0
25%,0.6,921.0,-200.0,4.0,711.0,50.0,637.0,53.0,1185.0,700.0,34.1,0.6923
50%,1.5,1053.0,-200.0,7.9,895.0,141.0,794.0,96.0,1446.0,942.0,48.6,0.9768
75%,2.6,1221.0,-200.0,13.6,1105.0,284.0,960.0,133.0,1662.0,1255.0,61.9,1.2962
max,11.9,2040.0,1189.0,63.7,2214.0,1479.0,2683.0,340.0,2775.0,2523.0,88.7,2.231


In [287]:
# TODO: Test if it's really necessary to do the preprocessing.

# Only referenced by the disabled "Time" encoding experiment below.
encoder = LabelEncoder()

# Feature-engineering experiments, currently disabled:
# data["Time"] = encoder.fit_transform(data["Time"])
# data["Month"] = data.Date.map(lambda x: int(x.split("/")[1]))
# data["Season"] = (data.Month % 12)//3
# data["Time_cos"] = data.Time.map(np.cos)
# data["Time_sin"] = data.Time.map(np.sin)

# data = pd.get_dummies(data, columns = ["Month", "Season"])
# data = pd.get_dummies(data, columns = ["Season"])

# Rebind instead of dropping in place so the cell is safe to re-run on the
# same kernel: with errors="ignore" a second run is a no-op instead of a
# KeyError once "Date" has already been removed.
data = data.drop(columns=["Date"], errors="ignore")


In [294]:
# Preview the first five rows after "Date" was dropped. "Time" is still a
# string column here, so the int/float selectors used by the pipelines will
# not pick it up.
data.head(5)

Unnamed: 0,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),RH,AH
0,18:00:00,2.6,1360.0,150.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,48.9,0.7578
1,19:00:00,2.0,1292.0,112.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,47.7,0.7255
2,20:00:00,2.2,1402.0,88.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,54.0,0.7502
3,21:00:00,2.2,1376.0,80.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,60.0,0.7867
4,22:00:00,1.6,1272.0,51.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,59.6,0.7888


In [289]:
# Order-preserving hold-out: the first 75% of the rows are used for training
# and the remaining 25% for evaluation (no shuffling).
len_train_set = int(0.75 * len(targets))

X_train, X_test = data.iloc[:len_train_set], data.iloc[len_train_set:]
y_train, y_test = targets.iloc[:len_train_set], targets.iloc[len_train_set:]

### Decision Tree Regressor 

In [290]:
# Preprocessing + model for the decision-tree baseline.
# Only the int/float columns are standardised. With the column transformer's
# default remainder handling, columns that are not listed (e.g. the string
# "Time" column) are presumably dropped -- confirm this is intended.
numeric_features = list(
    X_train.select_dtypes(include=["int", "float"]).columns
)
numeric_transformer = make_pipeline(StandardScaler())
preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
)

# Fixed random_state keeps the fitted tree reproducible across runs.
pipeline = make_pipeline(
    preprocessor,
    DecisionTreeRegressor(random_state=42),
)
pipeline

In [291]:
# Hyperparameters to explore; the "decisiontreeregressor__" prefix addresses
# the tree step inside the pipeline.
param_grid = dict(
    decisiontreeregressor__max_depth=[None, 10, 50],
    decisiontreeregressor__min_samples_split=[2, 5],
)

# 5-fold cross-validated search, ranked by negated mean absolute error.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="neg_mean_absolute_error",
    cv=5,
)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_pipeline = grid_search.best_estimator_

# Evaluate the selected pipeline on the held-out rows.
y_pred = best_pipeline.predict(X_test)

print(f"MAE is {mean_absolute_error(y_test, y_pred)}")
print("Best model:", best_pipeline)
print("Best hyperparameters:", best_params)

### Random Forest Regressor

In [292]:
# Preprocessing + model for the random-forest baseline.
# Only the int/float columns are standardised. With the column transformer's
# default remainder handling, columns that are not listed (e.g. the string
# "Time" column) are presumably dropped -- confirm this is intended.
numeric_features = list(
    X_train.select_dtypes(include=["int", "float"]).columns
)
numeric_transformer = make_pipeline(StandardScaler())
preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
)

# Fixed random_state keeps the fitted forest reproducible across runs.
pipeline = make_pipeline(
    preprocessor,
    RandomForestRegressor(random_state=42),
)
pipeline

In [293]:
# Baseline: fit the untuned random-forest pipeline and score it on the
# held-out rows.
pipeline.fit(X_train, y_train)

# Predict with the pipeline that was just fitted. `best_pipeline` still refers
# to the tuned decision tree from the earlier grid search at this point, so
# using it here would report the wrong model's error.
y_pred = pipeline.predict(X_test)

print(f"MAE is {mean_absolute_error(y_test, y_pred)}")

In [110]:
# Hyperparameters to explore; the "randomforestregressor__" prefix addresses
# the forest step inside the pipeline.
param_grid = dict(
    randomforestregressor__n_estimators=[50, 100],
    randomforestregressor__max_depth=[None, 10],
    randomforestregressor__min_samples_split=[2, 5],
    randomforestregressor__min_samples_leaf=[1, 2],
)

# 5-fold cross-validated search, ranked by negated mean absolute error.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="neg_mean_absolute_error",
    cv=5,
)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_pipeline = grid_search.best_estimator_

# Evaluate the selected pipeline on the held-out rows.
y_pred = best_pipeline.predict(X_test)

print(f"MAE is {mean_absolute_error(y_test, y_pred)}")
print("Best model:", best_pipeline)
print("Best hyperparameters:", best_params)