In [1]:
import boto3
import joblib
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from io import BytesIO
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
import os
from dotenv import load_dotenv

# Load environment variables (including the AWS credentials read below) from a .env file
load_dotenv()

# --- Step 1: Load Data from AWS S3 ---
# Location of the training CSV. Defined once here; later steps reuse
# `bucket_name` for the model objects as well.
bucket_name = 'my-feature-store-data'  # replace with your actual S3 bucket
data_key = 'pipeline-data/data.csv'  # replace with your actual file key

# Create an S3 client. Credentials are never hard-coded: they are taken from
# the environment (os.getenv returns None for a variable that is not set).
s3 = boto3.client(
    's3',
    aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
    aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
)

# Read CSV file from S3; the streaming response body is handed straight to pandas
obj = s3.get_object(Bucket=bucket_name, Key=data_key)
data = pd.read_csv(obj['Body'])

print("Data fetched successfully from AWS S3!")

# --- Step 2: Separate features and target ---
# "aqi_index" is the value to predict; every other column is used as a feature.
target = data["aqi_index"]
features = data.drop("aqi_index", axis=1)

# --- Step 3: Train-Test Split ---
# 80/20 split with a fixed seed so the partition is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# --- Step 4: Apply Feature Scaling ---
# The scaler learns its min/max from the training split only and is then
# applied unchanged to both splits, so no test-set statistics leak in.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Step 5: Load Best Model from S3 ---
# SECURITY: joblib.load unpickles the payload, which can execute arbitrary
# code. Only point this at a bucket/key whose contents you trust.
model_key = 'models/best_model.pkl'  # replace with your model path
model_bytes = s3.get_object(Bucket=bucket_name, Key=model_key)['Body'].read()
model = joblib.load(BytesIO(model_bytes))

print("Model loaded successfully from AWS S3!")

# --- Step 6: Retrain the Model ---
# Refit the loaded estimator on the scaled training split, then score it on
# the held-out test split.
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# RMSE is taken as sqrt(MSE) instead of mean_squared_error(..., squared=False):
# the `squared` keyword was deprecated in scikit-learn 1.4 and removed in 1.6,
# while this form works on every version. For the single target column used
# here the two are numerically identical.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Model performance on test dataset:")
print(f"  RMSE: {rmse}")
print(f"  MAE: {mae}")
print(f"  R²: {r2}")

# --- Step 7: Predict AQI for Next 3 Days ---
# One forecast instant per day, starting tomorrow, each at 12:00 local time.
# Every date_* feature below is derived from the same datetime so that
# date_hour/date_minute and date_timestamp describe the same instant
# (previously the timestamp was taken at midnight while date_hour said 12).
# NOTE(review): presumably the training rows also derive all date_* columns
# from a single datetime — confirm against the feature pipeline.
forecast_days = 3
forecast_hour = 12

start_date = (
    datetime.now().replace(hour=forecast_hour, minute=0, second=0, microsecond=0)
    + timedelta(days=1)
)
future_dates = [start_date + timedelta(days=i) for i in range(forecast_days)]

future_features = pd.DataFrame({
    "date_year": [d.year for d in future_dates],
    "date_month": [d.month for d in future_dates],
    "date_day": [d.day for d in future_dates],
    "date_hour": [d.hour for d in future_dates],
    "date_minute": [d.minute for d in future_dates],
    # Naive datetimes are interpreted as local time by .timestamp()
    "date_timestamp": [int(d.timestamp()) for d in future_dates],
})

# Align the forecast frame with the training schema in one step: columns the
# model was trained on but the forecast lacks are created and filled with 0,
# and the result follows the training column order.
# NOTE(review): 0 is only a placeholder for features the forecast cannot
# know; confirm it is a sensible value for those columns before trusting
# the predictions.
required_columns = list(X_train.columns)
future_features = future_features.reindex(columns=required_columns, fill_value=0)

# Apply the scaler fitted on the training split, then predict.
future_scaled = scaler.transform(future_features)
predicted_aqi = model.predict(future_scaled)

# One row per forecast day: ISO date string plus the prediction rounded to 2 decimals.
date_labels = [d.strftime("%Y-%m-%d") for d in future_dates]
rounded_aqi = np.round(predicted_aqi, 2)
prediction_results = pd.DataFrame({
    "Date": date_labels,
    "Predicted_AQI": rounded_aqi,
})

print("Predicted AQI for the next 3 days:")
print(prediction_results)

# --- Optional: Save retrained model back to S3 ---
# Serialize the refit estimator into an in-memory buffer and stream it to the
# bucket under its own key, leaving the originally loaded model object in
# place.
retrained_model_key = 'models/retrained_model.pkl'
with BytesIO() as model_buffer:
    joblib.dump(model, model_buffer)
    model_buffer.seek(0)  # rewind so the upload reads from the first byte
    s3.upload_fileobj(model_buffer, Bucket=bucket_name, Key=retrained_model_key)

print("Retrained model saved to S3.")


Data fetched successfully from AWS S3!
Model loaded successfully from AWS S3!




Model performance on test dataset:
  RMSE: 0.04267408896725668
  MAE: 0.005880291497932563
  R²: 0.9978325472035963
Predicted AQI for the next 3 days:
         Date  Predicted_AQI
0  2025-04-28            2.0
1  2025-04-29            2.0
2  2025-04-30            2.0
Retrained model saved to S3.


In [2]:
# Persist the `prediction_results` frame as a pickle in the working directory
predictions_path = "prediction_results.pkl"
prediction_results.to_pickle(predictions_path)
