In [1]:
# training.py - impute PB003.csv with KNN, then train and save PM2.5 and PM10 random-forest models
import pandas as pd
import joblib
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split



# Load the raw dataset from the working directory.
try:
    df = pd.read_csv("PB003.csv")
except FileNotFoundError:
    print("Error: PB003.csv not found. Please check the file path.")
    # The bare `exit()` helper is injected by the `site` module for
    # interactive use and is not guaranteed to exist in every runtime; it
    # also ends the process with status 0. Raising SystemExit directly
    # always works and reports the failure with a non-zero exit status.
    raise SystemExit(1)

# Parse both interval-boundary columns as day-month-year timestamps.
for date_col in ('From Date', 'To Date'):
    df[date_col] = pd.to_datetime(df[date_col], format="%d-%m-%Y %H:%M")

# Derive calendar features from the start timestamp of each row.
interval_start = df['From Date'].dt
df['Hour'] = interval_start.hour
df['DayOfWeek'] = interval_start.dayofweek
df['Month'] = interval_start.month

# Impute missing values with k-nearest neighbours (k=5) on every column except
# the two timestamp columns, which are set aside and re-attached afterwards.
#
# Columns with no observed values at all are dropped first: by default
# KNNImputer leaves such columns out of its output, which would make the
# returned array narrower than `df_knn.columns` and cause the DataFrame
# construction below to fail with a shape mismatch.
#
# NOTE(review): the imputer is fitted on every row and every remaining column,
# including both PM target columns, before the train/test splits later in the
# file - confirm this is intended.
df_knn = df.drop(['From Date', 'To Date'], axis=1).dropna(axis=1, how='all')
imputer = KNNImputer(n_neighbors=5)
df_imputed_knn = pd.DataFrame(
    imputer.fit_transform(df_knn),
    columns=df_knn.columns,
    # Keep the source index so the timestamp columns assigned below align
    # row-for-row rather than relying on both frames sharing a default index.
    index=df_knn.index,
)

# Re-attach the original timestamp columns to a copy of the imputed frame.
df_imputed = df_imputed_knn.copy()
df_imputed['From Date'] = df['From Date']
df_imputed['To Date'] = df['To Date']



# PM2.5: fit a random-forest regressor and persist it with joblib.
target_column = 'PM2.5 (ug/m3)'

# Predictors are every column other than the target and the two timestamps,
# kept in their original column order (so the PM10 column is a predictor here).
features_pm25 = df_imputed.columns[
    ~df_imputed.columns.isin([target_column, 'From Date', 'To Date'])
].tolist()
X_pm25, y_pm25 = df_imputed[features_pm25], df_imputed[target_column]

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train_pm25, X_test_pm25, y_train_pm25, y_test_pm25 = train_test_split(
    X_pm25,
    y_pm25,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training chain.
model_pm25 = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train_pm25, y_train_pm25
)
joblib.dump(model_pm25, "trained_model_pm25.joblib")



# PM10: fit a random-forest regressor and persist it with joblib.
target_column = 'PM10 (ug/m3)'

# Predictors are every column other than the target and the two timestamps,
# kept in their original column order (so the PM2.5 column is a predictor here).
features_pm10 = df_imputed.columns[
    ~df_imputed.columns.isin([target_column, 'From Date', 'To Date'])
].tolist()
X_pm10, y_pm10 = df_imputed[features_pm10], df_imputed[target_column]

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train_pm10, X_test_pm10, y_train_pm10, y_test_pm10 = train_test_split(
    X_pm10,
    y_pm10,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training chain.
model_pm10 = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train_pm10, y_train_pm10
)
joblib.dump(model_pm10, "trained_model_pm10.joblib")




# Persist the imputed DataFrame and both feature-name lists with joblib,
# in the same order as before: frame first, then PM2.5 and PM10 features.
for artifact, destination in (
    (df_imputed, "PB003_preprocessed_df.joblib"),
    (features_pm25, "PB003_features_pm25.joblib"),
    (features_pm10, "PB003_features_pm10.joblib"),
):
    joblib.dump(artifact, destination)

print("Preprocessing and model training complete.  Files saved.")

Preprocessing and model training complete.  Files saved.
