In [39]:
import pandas as pd
df = pd.read_csv('https://drive.google.com/uc?id=1--gD_pPailfQ9CABN0wnlVF8qdQHBUYk')

In [40]:
# Drop rows with missing values
df_cleaned = df.dropna()

# Verify if there are any missing values left
print("Number of missing values:", df_cleaned.isnull().sum().sum())

# Define features and target
features = [
    'PM10 1H (µg/m3)', 'PM2.5 1H (µg/m3)', 'SO2 1H (ppm)', 'NO2 1H (ppm)',
    'O3 1H (ppm)', 'CO 1H (ppm)', 'WIND DIRECTION 1H (°)',
    'WIND SPEED 1H (m/s)', 'RELATIVE HUMIDITY 1H (%)', 'Ambient Temperature 1H (°c)'
]
X = df_cleaned[features]  # Feature matrix
y = df_cleaned['API']     # Target variable

# Train-Test Split
from sklearn.model_selection import train_test_split

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Confirm shapes of the datasets
print("Training set:", X_train.shape, y_train.shape)
print("Validation set:", X_val.shape, y_val.shape)
print("Test set:", X_test.shape, y_test.shape)

Number of missing values: 0
Training set: (171808, 10) (171808,)
Validation set: (36816, 10) (36816,)
Test set: (36816, 10) (36816,)


In [41]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')  # Choose 'mean', 'median', or 'most_frequent'
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_val = pd.DataFrame(imputer.transform(X_val), columns=X_val.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

In [42]:
X_train = X_train.dropna()
X_val = X_val.dropna()
X_test = X_test.dropna()

In [43]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [44]:
rf_default = RandomForestRegressor(random_state=42)

In [45]:
rf_default.fit(X_train, y_train)

In [46]:
joblib.dump(rf_default, "random_forest_model.pkl")

['random_forest_model.pkl']

In [13]:
pip install bz2file

Collecting bz2fileNote: you may need to restart the kernel to use updated packages.

  Downloading bz2file-0.98.tar.gz (11 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: bz2file
  Building wheel for bz2file (setup.py): started
  Building wheel for bz2file (setup.py): finished with status 'done'
  Created wheel for bz2file: filename=bz2file-0.98-py3-none-any.whl size=6890 sha256=887486d7a64044278ec6cc1ff5f2d61317176e5922b4656f2d100ad1648bebc5
  Stored in directory: c:\users\divaa\appdata\local\pip\cache\wheels\f8\dc\a2\d5648eee379349a57b03ccf386862f09826575087464f070b2
Successfully built bz2file
Installing collected packages: bz2file
Successfully installed bz2file-0.98




In [47]:
import bz2file as bz2

In [48]:
import joblib

In [49]:
def compressed_joblib(title, data):

    with bz2.BZ2File(title +  '.pbz2', 'w') as f:
         joblib.dump(data,f)

In [50]:
compressed_joblib('random_forest_model.pkl',rf_default)

In [51]:
def decompress_joblib(file):
    data = bz2.BZ2File(file, 'rb')
    data = joblib.load(data)
    return data

In [52]:
model = decompress_joblib('random_forest_model.pkl.pbz2')

In [31]:
import numpy as np

In [38]:
sample_input = np.random.rand(2,10)  # Replace with actual sample input
print(model.predict(sample_input))

[61.06 61.42]




In [10]:
y_pred_rf_test = rf_default.predict(X_test)

In [11]:
y_pred_rf_val = rf_default.predict(X_val)

In [12]:
mae_rf_test = mean_absolute_error(y_test, y_pred_rf_test)
mse_rf_test = mean_squared_error(y_test, y_pred_rf_test)
rmse_rf_test = np.sqrt(mse_rf_test)
r2_rf_test = r2_score(y_test, y_pred_rf_test)

print("\nEvaluation Metrics for Default Random Forest:")
print(f"  MAE: {mae_rf_test:.4f}")
print(f"  MSE: {mse_rf_test:.4f}")
print(f"  RMSE: {rmse_rf_test:.4f}")
print(f"  R²: {r2_rf_test:.4f}")

NameError: name 'np' is not defined

In [None]:
from sklearn.model_selection import cross_val_score
import numpy as np

# Perform cross-validation
cv_scores_rf_default = cross_val_score(rf_default, X_train, y_train, cv=10, scoring='neg_mean_squared_error')

# Convert negative MSE to positive and calculate RMSE
mse_cv_rf_default = -cv_scores_rf_default
rmse_cv_rf_default = np.sqrt(mse_cv_rf_default)

# Mean and Standard Deviation of RMSE
mean_rmse_cv_rf_default = rmse_cv_rf_default.mean()
std_rmse_cv_rf_default = rmse_cv_rf_default.std()

print("\nCross-Validation Metrics for Default Random Forest:")
print(f"  Mean RMSE: {mean_rmse_cv_rf_default:.4f}")
print(f"  Std RMSE: {std_rmse_cv_rf_default:.4f}")