In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [None]:
# Seed NumPy's legacy global RNG so any np.random draws are repeatable.
# The scikit-learn calls in this notebook pass random_state=42 themselves.
np.random.seed(seed=42)

In [None]:
# Read the CSV (path is relative to the current working directory) into `df`
csv_path = "dataset.csv"
df = pd.read_csv(csv_path)

In [None]:
# Quick look at the loaded frame: its (rows, columns) shape, then the first rows
dataset_shape = df.shape
first_rows = df.head()
print("Dataset Shape:", dataset_shape)
print(first_rows)

In [None]:
# Count the null entries in every column and print the per-column totals
missing_per_column = df.isna().sum()
print("\nMissing values:\n", missing_per_column)

In [None]:
# Discard every row that has at least one missing value.
# `df` is rebound to the filtered frame, so later cells only see complete rows.
df = df.dropna(axis=0, how="any")

In [None]:
# Separate the AQI target from the predictors, then hold out 20% of the rows
# for testing (random_state=42 keeps the split reproducible).
# NOTE(review): every non-AQI column is used as a feature — presumably they are
# all numeric; confirm against dataset.csv.
y = df["AQI"]
X = df.drop(columns=["AQI"])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit a random-forest regressor on the training split.
# Hyperparameters: 100 trees, fixed random_state so refits give the same forest.
rf_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestRegressor(**rf_params)
# Kept as the cell's last expression so the fitted estimator is still displayed.
model.fit(X_train, y_train)

In [None]:
# Produce AQI predictions for the held-out test features
y_pred = model.predict(X=X_test)

In [None]:
# Score the test-set predictions: mean squared error and R², both printed
# rounded to two decimals.
mse, r2 = (metric(y_test, y_pred) for metric in (mean_squared_error, r2_score))
print(
    f"Mean Squared Error: {mse:.2f}",
    f"R² Score: {r2:.2f}",
    sep="\n",
)


In [None]:
# Scatter of actual vs. predicted AQI on the test split, with a dashed red
# y = x reference line spanning the full range of the target column `y`.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, color='blue', alpha=0.7, edgecolors='k')
diagonal = [y.min(), y.max()]  # same endpoints on both axes -> the identity line
ax.plot(diagonal, diagonal, 'r--', lw=2)
ax.set(
    xlabel="Actual AQI",
    ylabel="Predicted AQI",
    title="Actual vs Predicted AQI",
)
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
# Gather the inputs for the feature-importance chart: the fitted forest's
# importance scores and the matching feature (column) names from X.
importances, features = model.feature_importances_, X.columns

In [None]:
# Horizontal bar chart of the model's feature importances.
# Bars are sorted by importance so the ranking can be read at a glance; the
# original drew them in raw column order.
order = np.argsort(importances)  # ascending: barh draws the first entry at the bottom, so the largest bar ends up on top
sorted_features = np.asarray(features)[order]
sorted_importances = np.asarray(importances)[order]

plt.figure(figsize=(10, 6))
plt.barh(sorted_features, sorted_importances, color='green')
plt.xlabel("Importance")
plt.ylabel("Feature")  # label the categorical axis so the figure stands alone
plt.title("Feature Importance in Predicting AQI")
plt.tight_layout()
plt.show()