In [6]:
import boto3
import pandas as pd
from io import StringIO
import os
from dotenv import load_dotenv
import numpy as np
load_dotenv()

True

In [7]:
# Your AWS credentials and bucket info
bucket_name = 'my-feature-store-data'
s3_key = 'pipeline-data/data.csv'  # Example: "pipeline-data/data.csv"

# Create an S3 client
s3 = boto3.client(
    's3',
    aws_access_key_id= os.getenv('AWS_ACCESS_KEY_ID'),
    aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
)

# Fetch the object from S3
response = s3.get_object(Bucket=bucket_name, Key=s3_key)

# Read the CSV content
csv_data = response['Body'].read().decode('utf-8')

# Convert to DataFrame
df = pd.read_csv(StringIO(csv_data))

# Done!
print(df.isnull().sum())

index                         0
aqi_index                     0
co                            0
no                            0
no2                           0
o3                            0
so2                           0
pm2_5                         0
pm10                          0
nh3                           0
temperature_2m                0
relative_humidity_2m          0
precipitation                 0
wind_speed_10m                0
wind_direction_10m            0
surface_pressure              0
dew_point_2m                  0
apparent_temperature          0
shortwave_radiation           0
et0_fao_evapotranspiration    0
year                          0
month                         0
day                           0
hour                          0
Calculated_AQI                0
dtype: int64


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split


# Step 4: Define target and features
# Step 4: Define targets and features
target_columns = ["aqi_index", "Calculated_AQI"]
targets = df[target_columns]
X = df.drop(columns=target_columns)

# Step 5: Final check for datetime columns
X = X.select_dtypes(exclude=["datetime64[ns]"])

# Step 6: Split
X_train, X_test, y_train, y_test = train_test_split(X, targets, test_size=0.2, random_state=42)


In [9]:
# Import necessary libraries
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import joblib
import numpy as np

In [10]:
# Import additional required libraries
from sklearn.multioutput import MultiOutputRegressor

# ✅ Step 3: Define Models with MultiOutput capability
models = {
    "Random_Forest": RandomForestRegressor(n_estimators=300, max_depth=10, random_state=42),
    "Gradient_Boosting": MultiOutputRegressor(GradientBoostingRegressor(n_estimators=300, max_depth=3, random_state=42)),
    "Linear_Regression": LinearRegression(),
    "Ridge_Regression": Ridge(alpha=1.0),
    "SVR": MultiOutputRegressor(SVR()),
    "Neural_Network": MultiOutputRegressor(MLPRegressor(max_iter=200, random_state=42))
}

# -----------------------
# ✅ Step 4: Train and Evaluate
results = []
best_model = None
best_model_name = None
best_avg_rmse = float("inf")

for model_name, model in models.items():
    print(f"Training {model_name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Calculate metrics for each target
    model_results = {"Model": model_name}
    rmse_scores = []
    
    for i, col in enumerate(target_columns):
        rmse = np.sqrt(mean_squared_error(y_test.iloc[:, i], y_pred[:, i]))
        mae = mean_absolute_error(y_test.iloc[:, i], y_pred[:, i])
        r2 = r2_score(y_test.iloc[:, i], y_pred[:, i])
        
        rmse_scores.append(rmse)
        model_results[f"RMSE_{col}"] = rmse
        model_results[f"MAE_{col}"] = mae
        model_results[f"R²_{col}"] = r2
        
        print(f"  Target: {col}")
        print(f"    RMSE: {rmse}")
        print(f"    MAE: {mae}")
        print(f"    R²: {r2}")
    
    # Calculate average RMSE across all targets
    avg_rmse = np.mean(rmse_scores)
    model_results["Avg_RMSE"] = avg_rmse
    results.append(model_results)
    
    print(f"  Average RMSE: {avg_rmse}\n")
    
    if avg_rmse < best_avg_rmse:
        best_avg_rmse = avg_rmse
        best_model = model
        best_model_name = model_name

# -----------------------
# Step 5: Save Best Model Locally
print(f"\nBest model: {best_model_name} (Average RMSE = {best_avg_rmse:.2f})")
joblib.dump(best_model, "best_model.pkl")

S3_KEY = "models/best_model.pkl"

# Upload
s3.upload_file("best_model.pkl", bucket_name, S3_KEY)
print(f"Model uploaded to s3://{bucket_name}/{S3_KEY}")

# -----------------------
# Step 7: Summary
results_df = pd.DataFrame(results)
print("\nSummary of Model Performance:")
print(results_df)

Training Random_Forest...
  Target: aqi_index
    RMSE: 0.3251133628643251
    MAE: 0.20052905399796558
    R²: 0.8741970653873768
  Target: Calculated_AQI
    RMSE: 7.3012199883308915
    MAE: 1.6130456403737168
    R²: 0.9949732428020813
  Average RMSE: 3.8131666755976084

Training Gradient_Boosting...
  Target: aqi_index
    RMSE: 0.057114738483790624
    MAE: 0.01719769675896231
    R²: 0.9961174458941501
  Target: Calculated_AQI
    RMSE: 6.021153129142665
    MAE: 3.1203345172653214
    R²: 0.9965813362665047
  Average RMSE: 3.039133933813228

Training Linear_Regression...
  Target: aqi_index
    RMSE: 0.5878464857987662
    MAE: 0.47073217604526535
    R²: 0.5887092104702165
  Target: Calculated_AQI
    RMSE: 84.24911700233025
    MAE: 62.0492469971663
    R²: 0.33068931851285344
  Average RMSE: 42.41848174406451

Training Ridge_Regression...
  Target: aqi_index
    RMSE: 0.5899124503583473
    MAE: 0.4741457893723793
    R²: 0.5858131981393443
  Target: Calculated_AQI
    RMSE:



  Target: aqi_index
    RMSE: 2.378264076384967
    MAE: 1.7940469739235767
    R²: -5.731956224492621
  Target: Calculated_AQI
    RMSE: 33.13262619148422
    MAE: 18.027772420892347
    R²: 0.8964836949409979
  Average RMSE: 17.755445133934593


Best model: Gradient_Boosting (Average RMSE = 3.04)
Model uploaded to s3://my-feature-store-data/models/best_model.pkl

Summary of Model Performance:
               Model  RMSE_aqi_index  MAE_aqi_index  R²_aqi_index  \
0      Random_Forest        0.325113       0.200529      0.874197   
1  Gradient_Boosting        0.057115       0.017198      0.996117   
2  Linear_Regression        0.587846       0.470732      0.588709   
3   Ridge_Regression        0.589912       0.474146      0.585813   
4                SVR        0.424490       0.293597      0.785535   
5     Neural_Network        2.378264       1.794047     -5.731956   

   RMSE_Calculated_AQI  MAE_Calculated_AQI  R²_Calculated_AQI   Avg_RMSE  
0             7.301220            1.613046 