# In [3]:
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# 1. Load and Preprocess Data
# Reads the raw CSV, parses the interval timestamps, derives calendar
# features from the interval start, and fills missing values with KNN
# imputation. Produces `df_imputed`, `features`, `X` and `y`.
try:
    df = pd.read_csv("PB003.csv")
except FileNotFoundError:
    print("Error: PB003.csv not found. Please check the file path.")
    # `exit()` is an interactive-shell helper (it is not guaranteed to exist
    # in every interpreter start-up mode) and it reports success (status 0).
    # Raise SystemExit directly with a non-zero status so the failure is
    # visible to whatever launched the script.
    raise SystemExit(1)

# Timestamps are expected as day-month-year hour:minute; a row that does not
# match this format makes `to_datetime` raise.
df['From Date'] = pd.to_datetime(df['From Date'], format="%d-%m-%Y %H:%M")
df['To Date'] = pd.to_datetime(df['To Date'], format="%d-%m-%Y %H:%M")

# Calendar features taken from the start of each interval.
df['Hour'] = df['From Date'].dt.hour
df['DayOfWeek'] = df['From Date'].dt.dayofweek
df['Month'] = df['From Date'].dt.month

# The imputer cannot handle datetime columns, so they are set aside here and
# re-attached after imputation.
df_knn = df.drop(['From Date', 'To Date'], axis=1)
# KNNImputer discards columns that contain no observed value at all, which
# would make the output narrower than `df_knn.columns` and break the
# DataFrame construction below. Drop those columns up front so the column
# labels always match the imputed array.
df_knn = df_knn.dropna(axis=1, how='all')
# NOTE(review): the imputer is fit on every row and every remaining column,
# including the target column, before any train/test split is made. Consider
# fitting it on training rows only if a leakage-free evaluation is required.
imputer = KNNImputer(n_neighbors=5)
df_imputed_knn = pd.DataFrame(
    imputer.fit_transform(df_knn),
    columns=df_knn.columns,
    index=df_knn.index,  # keep the original row labels for the alignment below
)

# Re-attach the timestamp columns; assignment aligns on the index, which is
# why the imputed frame is built with the original index above.
df_imputed = df_imputed_knn.copy()
df_imputed['From Date'] = df['From Date']
df_imputed['To Date'] = df['To Date']

target_column = 'PM10 (ug/m3)'  # Or PM2.5 if needed
# Every imputed column except the target and the raw timestamps is a feature.
features = [col for col in df_imputed.columns if col not in [target_column, 'From Date', 'To Date']]
X = df_imputed[features]
y = df_imputed[target_column]



# 2. Train-Test Split and Train Model
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Reuse the model cached on disk when there is one; otherwise fit a fresh
# random forest on the training rows and cache it for the next run.
# NOTE: joblib.load unpickles the file, so only load artifacts you trust.
try:
    model = joblib.load("trained_model_PM10.joblib")  # Use correct filename for PM2.5 if needed
except FileNotFoundError:
    print("Training new model...")
    model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
    joblib.dump(model, "trained_model_PM10.joblib")  # Use correct filename for PM2.5 if needed
    print("Trained and saved new model.")
else:
    print("Loaded pre-trained model.")

# Persist the imputed DataFrame and the list of feature column names.
joblib.dump(df_imputed, "PB003_preprocessed_df.joblib")
joblib.dump(features, "PB003_features.joblib")

# 3. Predict
y_pred = model.predict(X_test)  # Predictions on the test set

# Predictions for every row of X. X holds the rows of both the train and the
# test partitions, so this column is not a held-out estimate.
df_imputed['Predictions'] = model.predict(X)

# 4. Evaluate
# The `squared=` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, so it is avoided here: compute the MSE
# once and take its square root for the RMSE. This works on old and new
# scikit-learn releases alike.
mse = mean_squared_error(y_test, y_pred)
rmse = float(np.sqrt(mse))
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse}")
print(f"MSE: {mse}")
print(f"R-squared: {r2}")

# Output:
# Loaded pre-trained model.
# RMSE: 38.326304942691806
# MSE: 1468.9056505602025
# R-squared: 0.7783514878987591


