In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from io import BytesIO
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib
import boto3
import os
from dotenv import load_dotenv

# --- Step 1: Load Data from AWS S3 ---
# Populate the process environment from a local .env file so the AWS
# credentials never have to be written into the notebook itself.
load_dotenv()

# Where the input CSV lives. `s3_key` and `data_key` carry the same value; both
# names are kept because other cells may refer to either one.
bucket_name = 'my-feature-store-data'  # replace with your actual S3 bucket
s3_key = 'pipeline-data/data.csv'  # Example: "pipeline-data/data.csv"
data_key = 'pipeline-data/data.csv'  # replace with your actual file key

# Credentials are read from the environment; a missing variable yields None,
# which is handed to boto3 unchanged.
aws_credentials = {
    'aws_access_key_id': os.getenv('AWS_ACCESS_KEY_ID'),
    'aws_secret_access_key': os.getenv('AWS_SECRET_ACCESS_KEY'),
}
s3 = boto3.client('s3', **aws_credentials)

# Fetch the object and parse its streaming body straight into a DataFrame.
obj = s3.get_object(Bucket=bucket_name, Key=data_key)
csv_stream = obj['Body']
df = pd.read_csv(csv_stream)
# Column roles used throughout the notebook.
target_columns = ['aqi_index', 'Calculated_AQI']
date_columns = ['year', 'month', 'day', 'hour']

# Combine the calendar columns into one timestamp column (hour is not included).
df['date'] = pd.to_datetime(df[['year', 'month', 'day']])

# A row is only usable when both targets are present.
df = df.dropna(subset=target_columns)

# Everything that is neither a target nor the derived timestamp is a feature.
# NOTE(review): the recorded feature-importance output lists a column named
# `index`, so a row-index column appears to be part of the feature set —
# confirm that this is intended before changing it, since a stored model may
# expect the same columns.
non_feature_columns = set(target_columns) | {'date'}
features = [name for name in df.columns if name not in non_feature_columns]

X = df[features]
y = df[target_columns]

# Reproducible 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Step 2: Load the pre-trained multioutput model (or train a fallback) ---
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler

# Fit the scaler BEFORE deciding where the model comes from. Both the
# evaluation below and the forecasting step later in the notebook call
# `scaler.transform(...)`, so `scaler` / `X_test_scaled` must exist on the
# "loaded from S3" path as well. Previously they were only created inside the
# `except` branch, which made a fresh-kernel run fail with NameError whenever
# the S3 download succeeded.
# NOTE(review): this assumes the stored model was trained on features scaled
# the same way (StandardScaler fitted on this training split) — TODO confirm,
# and ideally persist the fitted scaler next to the model.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model_key = 'models/best_model.pkl'

# Option 1: Load from S3
try:
    model_buffer = BytesIO()
    s3.download_fileobj(Bucket=bucket_name, Key=model_key, Fileobj=model_buffer)
    model_buffer.seek(0)
    # SECURITY: joblib.load unpickles the payload, which can execute arbitrary
    # code. Only load artifacts from a bucket whose writers you trust.
    model = joblib.load(model_buffer)
    print("Loaded pre-trained model from S3")
except Exception as e:
    # Deliberate best-effort: any failure to fetch or deserialize the stored
    # model falls back to training a new one on the current data.
    print(f"Could not load model from S3: {e}")
    # Option 2: Train a new model
    print("Training a new model...")
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train_scaled, y_train)

# Evaluate the model on the held-out split
y_pred = model.predict(X_test_scaled)

# Report RMSE, MAE and R² separately for every target column.
# NOTE(review): the output recorded with this notebook shows a negative R² for
# both targets, i.e. the model scores worse than predicting the mean on this
# split — worth investigating before trusting the forecasts below.
print("Model Performance:")
for target_position, target_name in enumerate(target_columns):
    actual_values = y_test.iloc[:, target_position]
    predicted_values = y_pred[:, target_position]

    rmse = np.sqrt(mean_squared_error(actual_values, predicted_values))
    mae = mean_absolute_error(actual_values, predicted_values)
    r2 = r2_score(actual_values, predicted_values)

    report_lines = (
        f"  Target: {target_name}",
        f"    RMSE: {rmse}",
        f"    MAE: {mae}",
        f"    R²: {r2}",
    )
    for report_line in report_lines:
        print(report_line)

# --- Step 3: Predict AQI for Next 3 Days ---
# The forecast window starts at midnight tomorrow (local time) and covers
# three consecutive days.
today_midnight = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
start_date = today_midnight + timedelta(days=1)
future_dates = []
for day_offset in range(3):
    future_dates.append(start_date + timedelta(days=day_offset))

# One row per forecast day holding only the calendar features; every
# prediction is made for noon (hour 12).
future_features = pd.DataFrame(
    [(day.year, day.month, day.day, 12) for day in future_dates],
    columns=["year", "month", "day", "hour"],
)

# Fill the non-calendar features of the forecast rows with averages taken from
# historical data. The reference rows are chosen in this order:
#   1. the latest 72 rows of the reference month (April 2025),
#   2. otherwise every row from the same calendar month in any year,
#   3. otherwise the latest 72 rows of the whole dataset.
# NOTE(review): the reference month is hard-coded; when the notebook is run in
# a different month it should probably follow the forecast dates — confirm the
# intended behaviour before changing it.
REFERENCE_YEAR = 2025
REFERENCE_MONTH = 4
RECENT_WINDOW_ROWS = 72  # "72 hours (3 days)" — assumes one row per hour, TODO confirm

# Every branch sorts on the same keys. The last-resort fallback used to omit
# 'hour', so the rows kept by tail() depended on the file's row order inside a
# day rather than on the actual time of day.
chronological_keys = ['year', 'month', 'day', 'hour']

# Calendar features are already set on `future_features`; average the rest.
numeric_features = [col for col in features if col not in date_columns]

recent_data = df[(df['year'] == REFERENCE_YEAR) & (df['month'] == REFERENCE_MONTH)]

if len(recent_data) > 0:
    # Keep only the most recent entries of the reference month.
    recent_data = recent_data.sort_values(chronological_keys).tail(RECENT_WINDOW_ROWS)
    reference_rows = recent_data
else:
    # No data for the reference month: try the same month from any year.
    similar_season_data = df[df['month'] == REFERENCE_MONTH]
    if len(similar_season_data) > 0:
        reference_rows = similar_season_data
    else:
        # Fallback: most recent data from any month.
        reference_rows = df.sort_values(chronological_keys).tail(RECENT_WINDOW_ROWS)

# Broadcast each averaged value to all forecast rows.
reference_averages = reference_rows[numeric_features].mean().to_dict()
for feature, value in reference_averages.items():
    future_features[feature] = value

# Adjust some features to mimic day-to-day weather drift (simplified).
# Day 0 keeps the averaged baseline; every later day takes the previous day's
# value and scales it by (1 + u) with u drawn uniformly from [-5%, +5%].
#
# A dedicated, seeded generator is used so that Restart & Run All reproduces
# the same forecast inputs; the previous unseeded np.random.uniform call gave
# different inputs on every run.
# NOTE(review): the recorded output shows identical predictions for all three
# days, so these perturbations may be too small to move the model — confirm
# whether this simulation is useful at all.
WEATHER_FEATURES = [
    'temperature_2m',
    'relative_humidity_2m',
    'dew_point_2m',
    'wind_speed_10m',
    'wind_direction_10m',
    'surface_pressure',
]
MAX_DAILY_VARIATION = 0.05
variation_rng = np.random.default_rng(42)

for i in range(1, len(future_dates)):  # Skip the first day
    for feature in WEATHER_FEATURES:
        if feature in future_features.columns:
            variation = variation_rng.uniform(-MAX_DAILY_VARIATION, MAX_DAILY_VARIATION)
            future_features.loc[i, feature] = future_features.loc[i - 1, feature] * (1 + variation)

# Make sure the forecast frame has exactly the columns the model was fed during
# training, in the same order.
required_columns = X_train.columns.tolist()

# Any column still missing is filled with its training-set mean. (The earlier
# "else: fill with 0" branch could never run, because every required column is
# by construction a column of X_train, so it has been removed.)
missing_columns = [col for col in required_columns if col not in future_features.columns]
for col in missing_columns:
    future_features[col] = X_train[col].mean()

# Ensure columns are in the right order (also drops any extra columns).
future_features = future_features[required_columns]

# Apply the same scaling used for evaluation, then forecast both targets.
future_scaled = scaler.transform(future_features)
predictions = model.predict(future_scaled)

# Assemble a small, readable table: one row per forecast day, with column 0 of
# the predictions reported as AQI and column 1 as the calculated AQI.
forecast_day_labels = [day.strftime("%Y-%m-%d") for day in future_dates]
rounded_predictions = np.round(predictions, 2)
prediction_results = pd.DataFrame({
    "Date": forecast_day_labels,
    "Predicted_AQI": rounded_predictions[:, 0],
    "Predicted_Calculated_AQI": rounded_predictions[:, 1],
})

print("\nPredicted values for the next 3 days:", prediction_results, sep="\n")

# Optional: show which features drive the model.
# First decide which object exposes `feature_importances_` — the model itself,
# or (for wrappers that hold one estimator per target) its first estimator —
# then build and print the ranking once.
importance_source = None
importance_heading = None

if hasattr(model, 'feature_importances_'):
    importance_source = model
    importance_heading = "\nTop 10 most important features:"
elif hasattr(model, 'estimators_'):
    # For MultiOutputRegressor, access the first estimator's feature importances
    estimator = model.estimators_[0]
    if hasattr(estimator, 'feature_importances_'):
        importance_source = estimator
        importance_heading = "\nTop 10 most important features (from first target model):"

if importance_source is not None:
    importance_table = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': importance_source.feature_importances_,
    })
    feature_importance = importance_table.sort_values('Importance', ascending=False)

    print(importance_heading)
    print(feature_importance.head(10))

# --- Optional: Save retrained model back to S3 ---
# NOTE(review): this runs unconditionally. In the recorded run the model was
# loaded from S3 rather than retrained, yet it was still uploaded under the
# "retrained" key and the message below was printed — confirm whether the
# upload should be skipped in that case.
retrained_model_key = 'models/retrained_multioutput_model.pkl'

# Serialize into memory, rewind, and stream the bytes to the bucket.
model_buffer = BytesIO()
joblib.dump(model, model_buffer)
model_buffer.seek(0)
s3.upload_fileobj(Fileobj=model_buffer, Bucket=bucket_name, Key=retrained_model_key)
print("Retrained model saved to S3.")

# Persist the forecast table locally (path is relative to the working directory).
prediction_results_path = "prediction_results.pkl"
prediction_results.to_pickle(prediction_results_path)

Loaded pre-trained model from S3
Model Performance:
  Target: aqi_index
    RMSE: 2.0081223477374364
    MAE: 1.8101914399194654
    R²: -3.7995604949847666
  Target: Calculated_AQI
    RMSE: 124.77295563352484
    MAE: 78.74878941503074
    R²: -0.4680402598736384

Predicted values for the next 3 days:
         Date  Predicted_AQI  Predicted_Calculated_AQI
0  2025-04-29           2.31                    177.63
1  2025-04-30           2.31                    177.63
2  2025-05-01           2.31                    177.63

Top 10 most important features (from first target model):
                 Feature  Importance
6                  pm2_5    0.873170
7                   pm10    0.107621
4                     o3    0.016397
0                  index    0.001109
8                    nh3    0.000292
21                   day    0.000252
5                    so2    0.000236
3                    no2    0.000182
22                  hour    0.000169
10  relative_humidity_2m    0.000146




Retrained model saved to S3.


In [3]:
# Write the forecast table to a pickle file in the working directory.
# This cell relies on `prediction_results` created by the previous cell.
prediction_results_path = "prediction_results.pkl"
prediction_results.to_pickle(prediction_results_path)
