## From ChatGPT

In [5]:
# ----------------------------------------
# STEP 1: Import libraries
# ----------------------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error, r2_score, accuracy_score, classification_report

# ----------------------------------------
# STEP 2: Load dataset
# ----------------------------------------
# Location of the CSV, relative to the notebook's working directory
csv_path = "./datasets/ratnapark_pm25_after_imputation.csv"
data = pd.read_csv(csv_path)

# Quick sanity check: the column index, then the first five rows
print("Columns:", data.columns)
print(data.head(n=5))





Columns: Index(['Unnamed: 0', 'PM2.5', 'YEAR', 'MO', 'DY', 'HR', 'PS', 'WS2M', 'WD2M',
       'WS10M', 'WD10M', 'PRECTOTCORR', 'RH2M', 'QV2M', 'T2M', 'Unnamed: 15',
       'Unnamed: 16', 'Unnamed: 17'],
      dtype='object')
   Unnamed: 0      PM2.5  YEAR  MO  DY  HR     PS  WS2M   WD2M  WS10M  WD10M  \
0           0  75.780952  2022   1   1   0  88.03  0.47  317.6   0.77  319.2   
1           1  56.584127  2022   1   1   1  88.00  0.40  310.0   0.64  313.7   
2           2  49.538710  2022   1   1   2  87.96  0.26  319.6   0.46  323.9   
3           3  47.398438  2022   1   1   3  87.93  0.28  300.3   0.46  306.1   
4           4  42.821875  2022   1   1   4  87.91  0.29  329.0   0.51  330.4   

   PRECTOTCORR   RH2M  QV2M   T2M  Unnamed: 15  Unnamed: 16  Unnamed: 17  
0          0.0  83.08  6.70  8.85          NaN          NaN          NaN  
1          0.0  80.28  6.48  8.86          NaN          NaN          NaN  
2          0.0  78.29  6.27  8.74          NaN          NaN          

In [8]:
# ----------------------------------------
# STEP 3: Create Datetime and Date
# ----------------------------------------
# Combine Year, Month, Day, Hour columns into one datetime
data['Datetime'] = pd.to_datetime(dict(year=data['YEAR'], month=data['MO'], day=data['DY'], hour=data['HR']))
# Calendar date (no time component) used as the per-day grouping key
data['Date'] = data['Datetime'].dt.date

# ----------------------------------------
# STEP 4: Create daily peak PM2.5 and occurrence hour
# ----------------------------------------
# A row without a PM2.5 reading can never be the peak. Excluding such rows up
# front also keeps idxmax from failing (or yielding NaN, depending on the
# pandas version) on a day whose readings are all missing.
pm25_valid = data.dropna(subset=['PM2.5'])

# Row label of the highest PM2.5 reading for each date; on ties the first
# occurrence wins. Groups come back sorted by Date.
peak_rows = pm25_valid.groupby('Date')['PM2.5'].idxmax()

# Pull the peak rows directly instead of running a Python lambda per day.
# Result columns: Date, peak_pm25, peak_hour. peak_hour keeps the integer
# dtype of HR rather than being upcast to float.
daily_peak = (
    pm25_valid.loc[peak_rows.to_numpy(), ['Date', 'PM2.5', 'HR']]
    .rename(columns={'PM2.5': 'peak_pm25', 'HR': 'peak_hour'})
    .reset_index(drop=True)
)


In [12]:
# Hourly meteorological columns that are averaged into daily predictors
meteo_cols = ['T2M', 'RH2M', 'WS2M', 'PS']   # Temperature, Humidity, Wind Speed, Pressure

# Fail fast when the loaded frame lacks any of the requested columns
missing = list(filter(lambda col: col not in data.columns, meteo_cols))
if len(missing) > 0:
    raise ValueError(f"⚠️ Missing columns in dataset: {missing}")

# One row per date holding the mean of each meteorological column
daily_features = (
    data
    .groupby('Date')[meteo_cols]
    .mean()
    .reset_index()
)

# Inner join on Date pairs each day's peak info with its averaged features
df = daily_peak.merge(daily_features, on='Date')

print("\n✅ Prepared daily dataset:")
print(df.head())


✅ Prepared daily dataset:
         Date   peak_pm25  peak_hour        T2M       RH2M      WS2M  \
0  2022-01-01  117.401562        9.0  10.721667  74.199583  1.054583   
1  2022-01-02   84.952381        9.0  10.407917  68.840833  1.001250   
2  2022-01-03   85.859375        9.0  10.692500  68.275417  1.221250   
3  2022-01-04   95.376562        9.0  10.615833  69.006250  1.030833   
4  2022-01-05   58.518182        9.0  11.260417  72.972083  0.914583   

          PS  
0  87.982083  
1  87.855000  
2  87.722500  
3  87.746667  
4  87.762917  


In [14]:
# ----------------------------------------
# STEP 5: Regression – predict daily peak PM2.5
# ----------------------------------------
# Features: the meteorological columns of the daily frame; target: the day's peak PM2.5
X_reg, y_reg = df[meteo_cols], df['peak_pm25']

# Hold out 20% of the days for evaluation; the fixed seed makes the split repeatable
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_reg,
    y_reg,
    test_size=0.2,
    random_state=42,
)

# fit() returns the fitted estimator, so construction and training are chained
regressor = RandomForestRegressor(random_state=42).fit(X_train_r, y_train_r)
y_pred_r = regressor.predict(X_test_r)



In [15]:
# Score the held-out predictions first, then report both metrics
mae_peak = mean_absolute_error(y_test_r, y_pred_r)
r2_peak = r2_score(y_test_r, y_pred_r)

print("\n--- Regression Results (Peak PM2.5) ---")
print("MAE:", mae_peak)
print("R²:", r2_peak)


--- Regression Results (Peak PM2.5) ---
MAE: 19.08899909530418
R²: 0.6029928372651501


In [26]:
# Export the regression predictions as a single-column CSV.
# y_pred_r is written with numpy.savetxt; the true values (y_test_r) could be
# exported the same way, e.g. numpy.savetxt("foo.csv", y_test_r, delimiter=",").
# NOTE: this import is kept because a later cell refers to the bare `numpy` name.
import numpy

numpy.savetxt(fname="oof.csv", X=y_pred_r, delimiter=",")

In [None]:
# ----------------------------------------
# STEP 6: Classification – predict hour of peak occurrence
# ----------------------------------------
X_clf = df[meteo_cols]
# Hours are discrete class labels; cast to int so classes are reported as
# 9 rather than 9.0 regardless of how peak_hour was stored upstream
y_clf = df['peak_hour'].astype(int)

# Hold out 20% of the days for evaluation; the fixed seed makes the split repeatable
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_clf, y_clf, test_size=0.2, random_state=42)

classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train_c, y_train_c)
y_pred_c = classifier.predict(X_test_c)

print("\n--- Classification Results (Peak Hour) ---")
print("Accuracy:", accuracy_score(y_test_c, y_pred_c))
# zero_division=0: an hour that is never predicted gets precision 0 explicitly
# instead of triggering an UndefinedMetricWarning; the reported numbers are unchanged
print("\nClassification Report:\n", classification_report(y_test_c, y_pred_c, zero_division=0))

In [27]:
# Export the predicted peak hours as a single-column CSV.
# Uses the `np` alias bound by the notebook's import cell, so this cell does not
# depend on the bare `import numpy` that lives in a different cell.
np.savetxt("too.csv", y_pred_c, delimiter=",")