In [None]:
import boto3
import pandas as pd
from io import StringIO
import os
from dotenv import load_dotenv
load_dotenv()

In [None]:
# Your AWS credentials and bucket info
bucket_name = 'my-feature-store-data'
s3_key = 'pipeline-data/data.csv'  # Example: "pipeline-data/data.csv"

# Create an S3 client
s3 = boto3.client(
    's3',
    aws_access_key_id= os.getenv('AWS_ACCESS_KEY_ID'),
    aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
)

# Fetch the object from S3
response = s3.get_object(Bucket=bucket_name, Key=s3_key)

# Read the CSV content
csv_data = response['Body'].read().decode('utf-8')

# Convert to DataFrame
df = pd.read_csv(StringIO(csv_data))

# Done!
print(df.isnull().sum())

index                         0
aqi_index                     0
aqi_timestamp                 0
co                            0
no                            0
no2                           0
o3                            0
so2                           0
pm2_5                         0
pm10                          0
nh3                           0
temperature_2m                0
relative_humidity_2m          0
precipitation                 0
wind_speed_10m                0
wind_direction_10m            0
surface_pressure              0
dew_point_2m                  0
apparent_temperature          0
shortwave_radiation           0
et0_fao_evapotranspiration    0
dtype: int64


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Assuming df is already loaded
# Step 1: Convert datetime columns
df["aqi_timestamp"] = pd.to_datetime(df["aqi_timestamp"], errors="coerce")


# Step 2: Extract datetime features
df["aqi_year"] = df["aqi_timestamp"].dt.year
df["aqi_month"] = df["aqi_timestamp"].dt.month
df["aqi_day"] = df["aqi_timestamp"].dt.day
df["aqi_hour"] = df["aqi_timestamp"].dt.hour
df["aqi_minute"] = df["aqi_timestamp"].dt.minute



# Step 3: Drop original datetime columns
df = df.drop(columns=["aqi_timestamp"], errors="ignore")

# Step 4: Define target and features
target = df["aqi_index"]
X = df.drop(columns=["aqi_index"])

# Step 5: Final check for datetime columns
X = X.select_dtypes(exclude=["datetime64[ns]"])

# Step 6: Split
X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.2, random_state=42)


In [4]:
# Import necessary libraries
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import joblib

In [5]:
# ✅ Step 3: Define Models
models = {
    "Random_Forest": RandomForestRegressor(n_estimators=300, max_depth=10, random_state=42),
    "Gradient_Boosting": GradientBoostingRegressor(n_estimators=300, max_depth=3, random_state=42),
    "Linear_Regression": LinearRegression(),
    "Ridge_Regression": Ridge(alpha=1.0),
    "Support_Vector_Regressor": SVR(),
    "Neural_Network": MLPRegressor(max_iter=200, random_state=42)
}

# -----------------------
# ✅ Step 4: Train and Evaluate
results = []
best_model = None
best_model_name = None
best_rmse = float("inf")

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    rmse = mean_squared_error(y_test, y_pred, squared=False)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results.append({
        "Model": model_name,
        "RMSE": rmse,
        "MAE": mae,
        "R²": r2
    })

    if rmse < best_rmse:
        best_rmse = rmse
        best_model = model
        best_model_name = model_name

    print(f"Model: {model_name}")
    print(f"  RMSE: {rmse}")
    print(f"  MAE: {mae}")
    print(f"  R²: {r2}\n")

# -----------------------
#  Step 5: Save Best Model Locally
print(f"\n Best model: {best_model_name} (RMSE = {best_rmse:.2f})")
joblib.dump(best_model, "best_model.pkl")

S3_KEY = "models/best_model.pkl"  # Example: "models/best_model.pkl"

# Upload
s3.upload_file("best_model.pkl", bucket_name, S3_KEY)
print(f"Model uploaded to s3://{bucket_name}/{S3_KEY}")

# -----------------------
# Step 7: Summary
results_df = pd.DataFrame(results)
print("\n Summary of Model Performance:")
print(results_df)



Model: Random_Forest
  RMSE: 0.042412424872746865
  MAE: 0.006712250331479037
  R²: 0.9979231555689491





Model: Gradient_Boosting
  RMSE: 0.04659467797520641
  MAE: 0.014019899770630643
  R²: 0.9974933691016691

Model: Linear_Regression
  RMSE: 0.5732028490884581
  MAE: 0.45661327275031893
  R²: 0.6206549334666698

Model: Ridge_Regression
  RMSE: 0.5750599692537701
  MAE: 0.4582451847284113
  R²: 0.6181928709995748





Model: Support_Vector_Regressor
  RMSE: 0.4289844237705251
  MAE: 0.2966944096660549
  R²: 0.7875284400550668





Model: Neural_Network
  RMSE: 2.6272656508629004
  MAE: 1.9501080885467714
  R²: -6.969402014984009


 Best model: Random_Forest (RMSE = 0.04)
Model uploaded to s3://my-feature-store-data/models/best_model.pkl

 Summary of Model Performance:
                      Model      RMSE       MAE        R²
0             Random_Forest  0.042412  0.006712  0.997923
1         Gradient_Boosting  0.046595  0.014020  0.997493
2         Linear_Regression  0.573203  0.456613  0.620655
3          Ridge_Regression  0.575060  0.458245  0.618193
4  Support_Vector_Regressor  0.428984  0.296694  0.787528
5            Neural_Network  2.627266  1.950108 -6.969402
