# Weather Model Training Notebook

This notebook implements the training pipeline for weather prediction models based on the methodology described in the project documentation.

## 1. Environment Setup and Library Loading

**Goal:** Import necessary libraries for data processing, model training, evaluation, and visualization.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib
import warnings

# Install a blanket "ignore" filter so library warnings do not clutter the
# notebook output. NOTE(review): this hides every warning category, including
# deprecation notices from pandas / scikit-learn / seaborn.
warnings.simplefilter('ignore')
print("Libraries imported successfully.")

## 2. Data Collection and Loading

**Goal:** Load the historical dataset.
**Source:** `../../data_collections/datasets/historical_data_hourly.csv`

In [None]:
# Load the hourly historical dataset from the repository-relative path
file_path = '../../data_collections/datasets/historical_data_hourly.csv'
df = pd.read_csv(file_path)

# Show the record count, the first rows, and the column summary
print(f"Total records: {len(df)}")
display(df.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None".
df.info()

## 3. Data Preprocessing

**Goal:** Clean, standardize, and prepare data for regression training.
**Steps:**
- Select features
- Format date/time
- Label Encode categorical variables (weather condition)

In [None]:
# Build a single datetime 'timestamp' column, either assembled from separate
# year/month/day/hour columns or parsed from an existing 'timestamp' column.
if {'year', 'month', 'day', 'hour'}.issubset(df.columns):
    df['timestamp'] = pd.to_datetime(df[['year', 'month', 'day', 'hour']])
elif 'timestamp' in df.columns:
    df['timestamp'] = pd.to_datetime(df['timestamp'])
else:
    # Later cells read df['timestamp'] unconditionally; fail here with a clear
    # message instead of an unexplained KeyError further down.
    raise KeyError(
        "Dataset needs either 'year', 'month', 'day', 'hour' columns or a 'timestamp' column"
    )

# Handle missing values (if any).
# NOTE: this drops a row when ANY column is missing, not only the columns used for training.
df = df.dropna()

# Rename columns to match the notebook's expected names
column_mapping = {
    'temp': 'temperature',
    'windspeed': 'wind_speed',
    'sealevelpressure': 'pressure'
}
df = df.rename(columns=column_mapping)

# Label Encoding for 'conditions'
le = LabelEncoder()

# Accept both the classic object dtype and dedicated string dtypes; comparing
# the dtype with the literal 'object' skips encoding silently when the column
# is stored with a string dtype.
if 'conditions' in df.columns and (
    pd.api.types.is_object_dtype(df['conditions'])
    or pd.api.types.is_string_dtype(df['conditions'])
):
    df['condition_encoded'] = le.fit_transform(df['conditions'])
    print("Encoded 'conditions' to 'condition_encoded'")

print("Data preprocessing complete. Columns:", df.columns.tolist())
display(df.head())

## 4. Model Training and Comparison

**Goal:** Train and compare four regression models.
**Models:** Linear Regression, Decision Tree, KNN, Random Forest.
**Target:** **Temperature** is used as the representative target for the model comparison; performance on the other targets is evaluated in Section 5.
**Overfitting Prevention:** Added Cross-Validation to ensure model stability.

In [None]:
# Derive the calendar features from the timestamp (overwrites any existing
# 'hour' / 'month' / 'day' columns with values taken from 'timestamp').
timestamp_parts = df['timestamp'].dt
df['hour'] = timestamp_parts.hour
df['month'] = timestamp_parts.month
df['day'] = timestamp_parts.day

# Feature matrix: calendar features, plus the encoded weather condition when
# the preprocessing step produced it.
feature_columns = ['hour', 'month', 'day']
if 'condition_encoded' in df.columns:
    feature_columns.append('condition_encoded')
X = df[feature_columns]

# Temperature serves as the representative target for the model comparison
y_temp = df['temperature']

def evaluate_models(X, y, test_size=0.2, random_state=42, cv=5):
    """Train four regressors on one hold-out split and tabulate their scores.

    Each model (Linear Regression, Decision Tree, KNN, Random Forest) is fit
    on the training part of a single train/test split and scored on the test
    part. In addition, an R2 cross-validation is run for each model on the
    full ``X`` / ``y`` (not only the training part).

    Args:
        X: Feature matrix.
        y: Target values aligned with ``X``.
        test_size: Fraction of rows held out for the test split.
        random_state: Seed used for the split and for the tree-based models.
        cv: Cross-validation setting forwarded to ``cross_val_score``
            (defaults to 5 folds, the value previously hard-coded).

    Returns:
        pandas.DataFrame with one row per model and the columns
        "Model", "MSE", "MAE", "RMSE", "R2 Score", "CV R2 Mean", "CV R2 Std".
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    models = {
        "Linear Regression": LinearRegression(),
        "Decision Tree": DecisionTreeRegressor(random_state=random_state),
        "KNN": KNeighborsRegressor(),
        "Random Forest": RandomForestRegressor(n_estimators=100, random_state=random_state),
    }

    results = []

    for name, model in models.items():
        # Hold-out evaluation
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_pred)

        # Cross-validated R2 on the full data set
        cv_scores = cross_val_score(model, X, y, cv=cv, scoring='r2')

        results.append({
            "Model": name,
            "MSE": mse,
            "MAE": mae,
            "RMSE": rmse,
            "R2 Score": r2,
            "CV R2 Mean": cv_scores.mean(),
            "CV R2 Std": cv_scores.std(),
        })

    return pd.DataFrame(results)

# Run the four-model comparison on the temperature target and show the table
print("Comparing models for Temperature prediction (with Cross-Validation)...")
comparison_results = evaluate_models(X=X, y=y_temp)
display(comparison_results)

## 5. Analysis of Results and Individual Parameter Performance

**Goal:** Evaluate the best model (Random Forest) on all individual parameters: Temperature, Humidity, Wind Speed, Pressure.
**Overfitting Prevention:** 
- Compare Training R2 vs Testing R2.
- Use Hyperparameter Tuning (GridSearchCV) to find optimal settings.

In [None]:
# Weather parameters that each get their own tuned Random Forest
targets = ['temperature', 'humidity', 'wind_speed', 'pressure']
rf_results = []
best_models = {}  # target name -> best estimator found by the grid search

# Hyperparameter Grid for Random Forest
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

for target in targets:
    print(f"\nOptimizing model for {target}...")
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Base model
    rf = RandomForestRegressor(random_state=42)

    # Grid Search (3-fold CV on the training split only, scored by R2)
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, scoring='r2')
    grid_search.fit(X_train, y_train)

    best_rf = grid_search.best_estimator_
    print(f"Best Parameters for {target}: {grid_search.best_params_}")

    # Keep the model for the saving and visualization cells
    best_models[target] = best_rf

    # Predictions on both splits so train and test R2 can be compared
    y_pred_test = best_rf.predict(X_test)
    y_pred_train = best_rf.predict(X_train)

    # Metrics
    mae = mean_absolute_error(y_test, y_pred_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
    r2_test = r2_score(y_test, y_pred_test)
    r2_train = r2_score(y_train, y_pred_train)

    # Check for overfitting: a train/test R2 gap of 0.1 or more is flagged
    overfitting_gap = r2_train - r2_test
    status = "Good" if overfitting_gap < 0.1 else "Potential Overfitting"

    rf_results.append({
        "Parameter": target.capitalize(),
        "MAE": mae,
        "RMSE": rmse,
        "Test R2": r2_test,
        "Train R2": r2_train,
        "Gap": overfitting_gap,
        "Status": status
    })

rf_results_df = pd.DataFrame(rf_results)
display(rf_results_df)

# Visualizing Performance
plt.figure(figsize=(12, 6))
sns.barplot(x='Parameter', y='Test R2', data=rf_results_df, palette='viridis')
plt.title('Optimized Random Forest Performance (Test R2 Score)')
# R2 can be negative (worse than predicting the mean). Extend the lower axis
# limit in that case so such bars stay visible instead of being clipped at 0.
r2_floor = min(0.0, rf_results_df['Test R2'].min())
if r2_floor < 0:
    r2_floor -= 0.05
plt.ylim(r2_floor, 1)
plt.show()

## 6. Saving the Best Model

**Goal:** Save the trained Random Forest models for backend use into a **single file**.

In [None]:
import os

# Target directory and file for the bundled models
save_dir = '../models'
filename = f"{save_dir}/weather_prediction_models.pkl"

# Create the directory if needed, then persist the whole
# {target name: fitted model} dictionary as one joblib file.
# NOTE(review): the fitted LabelEncoder for 'conditions' is not saved here —
# confirm how the backend reproduces 'condition_encoded' at inference time.
os.makedirs(save_dir, exist_ok=True)
joblib.dump(best_models, filename)

print(f"Saved all models to {filename}")

## 7. Visualization: Actual vs. Predicted

**Goal:** Visualize prediction accuracy for January 2020 using daily aggregates (falls back to the first available month if the dataset contains no January 2020 data).

In [None]:
# Filter data for January 2020
jan_2020 = df[(df['timestamp'].dt.year == 2020) & (df['timestamp'].dt.month == 1)].copy()
period_label = 'Jan 2020'  # shown in the plot titles

if len(jan_2020) == 0:
    print("No data found for Jan 2020. Using first month of available data for visualization.")
    # Fallback to first month in data
    first_date = df['timestamp'].min()
    jan_2020 = df[(df['timestamp'].dt.year == first_date.year) & (df['timestamp'].dt.month == first_date.month)].copy()
    # Keep the titles truthful about which month is actually plotted
    period_label = first_date.strftime('%b %Y')

# Aggregate daily. numeric_only=True restricts the mean to numeric columns:
# text columns such as 'conditions' cannot be averaged, and recent pandas
# versions raise a TypeError instead of silently dropping them.
daily_data = jan_2020.resample('D', on='timestamp').mean(numeric_only=True).dropna()

# Prepare X for daily data (needs same features as training)
daily_data['hour'] = 12  # represent each daily average by a mid-day hour
daily_data['month'] = daily_data.index.month
daily_data['day'] = daily_data.index.day

# 'condition_encoded' holds nominal label codes, so the daily value is the most
# frequent code of the day; a rounded mean could yield a code for a condition
# that never occurred on that day.
if 'condition_encoded' in daily_data.columns:
    daily_condition = jan_2020.resample('D', on='timestamp')['condition_encoded'].agg(
        lambda codes: codes.mode().iloc[0] if len(codes) else np.nan
    )
    daily_data['condition_encoded'] = daily_condition.reindex(daily_data.index).astype(int)

daily_features = ['hour', 'month', 'day']
if 'condition_encoded' in df.columns:
    daily_features.append('condition_encoded')
X_daily = daily_data[daily_features]

# Plotting: one subplot per target, sized from the targets list
fig, axes = plt.subplots(len(targets), 1, figsize=(12, 5 * len(targets)), squeeze=False)

for ax, target in zip(axes[:, 0], targets):
    # Predict using the saved best model
    model = best_models[target]
    y_pred_daily = model.predict(X_daily)

    ax.plot(daily_data.index, daily_data[target], label='Actual', color='blue', marker='o')
    ax.plot(daily_data.index, y_pred_daily, label='Predicted', color='red', linestyle='--', marker='x')

    ax.set_title(f'Actual vs Predicted {target.capitalize()} ({period_label})')
    ax.set_xlabel('Date')
    ax.set_ylabel(target.capitalize())
    ax.legend()
    ax.grid(True)

plt.tight_layout()
plt.show()

## 8. Visualization: Incremental Data Impact

**Goal:** Show how the R2 score changes as a larger fraction of the data is used.

In [None]:
data_fractions = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
r2_scores = []

# Same feature set as the training cells
curve_features = ['hour', 'month', 'day']
if 'condition_encoded' in df.columns:
    curve_features.append('condition_encoded')

# Target for this test: Temperature
y = df['temperature']

# Hold out ONE fixed test set. Scoring every fraction on the same rows makes
# the points of the curve comparable; re-splitting each subset would change
# the test set together with the training size.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    df[curve_features], y, test_size=0.2, random_state=42
)

for frac in data_fractions:
    # train_test_split already shuffled the rows, so the first n rows are a
    # random sample, and every larger fraction contains the smaller ones.
    n_rows = max(1, int(round(frac * len(X_train_full))))
    X_train = X_train_full.iloc[:n_rows]
    y_train = y_train_full.iloc[:n_rows]

    # Fixed, moderately sized forest so only the amount of data varies
    rf = RandomForestRegressor(n_estimators=50, max_depth=10, random_state=42)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    r2_scores.append(r2_score(y_test, y_pred))

plt.figure(figsize=(10, 6))
plt.plot(data_fractions, r2_scores, marker='o', color='green')
plt.title('Impact of Incremental Data on Model Performance (Temperature)')
plt.xlabel('Fraction of Training Data Used')
plt.ylabel('R2 Score')
plt.grid(True)
plt.show()