In [1]:
import boto3
import pandas as pd
from io import StringIO
import os
from dotenv import load_dotenv
import numpy as np
load_dotenv()

True

In [2]:
# Your AWS credentials and bucket info
bucket_name = 'my-feature-store-data'
s3_key = 'pipeline-data/data.csv'  # Example: "pipeline-data/data.csv"

# Create an S3 client
s3 = boto3.client(
    's3',
    aws_access_key_id= os.getenv('AWS_ACCESS_KEY_ID'),
    aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
)

# Fetch the object from S3
response = s3.get_object(Bucket=bucket_name, Key=s3_key)

# Read the CSV content
csv_data = response['Body'].read().decode('utf-8')

# Convert to DataFrame
df = pd.read_csv(StringIO(csv_data))

# Done!
print(df.isnull().sum())

index                         0
aqi_index                     0
co                            0
no                            0
no2                           0
o3                            0
so2                           0
pm2_5                         0
pm10                          0
nh3                           0
temperature_2m                0
relative_humidity_2m          0
precipitation                 0
wind_speed_10m                0
wind_direction_10m            0
surface_pressure              0
dew_point_2m                  0
apparent_temperature          0
shortwave_radiation           0
et0_fao_evapotranspiration    0
year                          0
month                         0
day                           0
hour                          0
Calculated_AQI                0
dtype: int64


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split


# Step 4: Define target and features
# Step 4: Define targets and features
target_columns = ["aqi_index", "Calculated_AQI"]
targets = df[target_columns]
X = df.drop(columns=target_columns)

# Step 5: Final check for datetime columns
X = X.select_dtypes(exclude=["datetime64[ns]"])

# Step 6: Split
X_train, X_test, y_train, y_test = train_test_split(X, targets, test_size=0.2, random_state=42)


In [4]:
# Import necessary libraries
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import joblib
import numpy as np

In [5]:
# Import additional required libraries
from io import BytesIO
from sklearn.multioutput import MultiOutputRegressor

# ✅ Step 3: Define Models with MultiOutput capability
models = {
    "Random_Forest": RandomForestRegressor(n_estimators=300, max_depth=10, random_state=42),
    "Gradient_Boosting": MultiOutputRegressor(GradientBoostingRegressor(n_estimators=300, max_depth=3, random_state=42)),
    "Linear_Regression": LinearRegression(),
    "Ridge_Regression": Ridge(alpha=1.0),
    "SVR": MultiOutputRegressor(SVR()),
    "Neural_Network": MultiOutputRegressor(MLPRegressor(max_iter=200, random_state=42))
}

# -----------------------
# ✅ Step 4: Train and Evaluate
results = []
best_model = None
best_model_name = None
best_avg_rmse = float("inf")

for model_name, model in models.items():
    print(f"Training {model_name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Calculate metrics for each target
    model_results = {"Model": model_name}
    rmse_scores = []
    
    for i, col in enumerate(target_columns):
        rmse = np.sqrt(mean_squared_error(y_test.iloc[:, i], y_pred[:, i]))
        mae = mean_absolute_error(y_test.iloc[:, i], y_pred[:, i])
        r2 = r2_score(y_test.iloc[:, i], y_pred[:, i])
        
        rmse_scores.append(rmse)
        model_results[f"RMSE_{col}"] = rmse
        model_results[f"MAE_{col}"] = mae
        model_results[f"R²_{col}"] = r2
        
        print(f"  Target: {col}")
        print(f"    RMSE: {rmse}")
        print(f"    MAE: {mae}")
        print(f"    R²: {r2}")
    
    # Calculate average RMSE across all targets
    avg_rmse = np.mean(rmse_scores)
    model_results["Avg_RMSE"] = avg_rmse
    results.append(model_results)
    
    print(f"  Average RMSE: {avg_rmse}\n")
    
    if avg_rmse < best_avg_rmse:
        best_avg_rmse = avg_rmse
        best_model = model
        best_model_name = model_name

# -----------------------
# Step 5: Save Best Model Locally
print(f"\nBest model: {best_model_name} (Average RMSE = {best_avg_rmse:.2f})")
# joblib.dump(best_model, "best_model.pkl")


# -----------------------
# Step 7: Summary
results_df = pd.DataFrame(results)
print("\nSummary of Model Performance:")
print(results_df)

Training Random_Forest...
  Target: aqi_index
    RMSE: 0.323664506854938
    MAE: 0.1996290739600245
    R²: 0.8755245480051466
  Target: Calculated_AQI
    RMSE: 7.300486707695632
    MAE: 1.6121383883111136
    R²: 0.9949740847488451
  Average RMSE: 3.8120756072752853

Training Gradient_Boosting...
  Target: aqi_index
    RMSE: 0.049020969329610885
    MAE: 0.016204030706548928
    R²: 0.997144663171603
  Target: Calculated_AQI
    RMSE: 6.013866334993563
    MAE: 3.117307613737743
    R²: 0.9965894919863428
  Average RMSE: 3.031443652161587

Training Linear_Regression...
  Target: aqi_index
    RMSE: 0.5855752618949703
    MAE: 0.4690884869791563
    R²: 0.5925643745372393
  Target: Calculated_AQI
    RMSE: 84.23190155931523
    MAE: 62.0356708762222
    R²: 0.3309404991271254
  Average RMSE: 42.4087384106051

Training Ridge_Regression...
  Target: aqi_index
    RMSE: 0.5883819817701395
    MAE: 0.47297138652351756
    R²: 0.5886492560728201
  Target: Calculated_AQI
    RMSE: 84.53



In [6]:
import joblib
import json
import sklearn
import numpy as np
from io import BytesIO

S3_MODEL_KEY = "models/best_model.pkl"
S3_METADATA_KEY = "models/best_model_metadata.json"

def upload_model_to_s3(model, bucket_name, s3_client):
    # --- Save model to BytesIO buffer ---
    model_buffer = BytesIO()
    joblib.dump(model, model_buffer)
    model_buffer.seek(0)
    s3_client.upload_fileobj(model_buffer, Bucket=bucket_name, Key=S3_MODEL_KEY)
    print(f"✅ Model uploaded to s3://{bucket_name}/{S3_MODEL_KEY}")

    # --- Save version metadata ---
    metadata = {
        "sklearn_version": sklearn.__version__,
        "numpy_version": np.__version__,
        "model_type": type(model).__name__,
    }

    metadata_buffer = BytesIO()
    metadata_buffer.write(json.dumps(metadata).encode("utf-8"))
    metadata_buffer.seek(0)
    s3_client.upload_fileobj(metadata_buffer, Bucket=bucket_name, Key=S3_METADATA_KEY)
    print(f"✅ Metadata uploaded to s3://{bucket_name}/{S3_METADATA_KEY}")


In [7]:
upload_model_to_s3(best_model, bucket_name, s3)

✅ Model uploaded to s3://my-feature-store-data/models/best_model.pkl
✅ Metadata uploaded to s3://my-feature-store-data/models/best_model_metadata.json
