<a href="https://colab.research.google.com/github/BRUTE18/Aerial_Image_Segementation/blob/main/Untitled64.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
"""
Ola Bike Ride Request Forecast using Machine Learning
---------------------------------------------------
A comprehensive implementation of the research paper by Anunay Kumar and Utkarsh Raj
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import holidays
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
from statsmodels.tsa.arima.model import ARIMA
from prophet import Prophet
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Conv1D, MaxPooling1D, Flatten
import warnings
warnings.filterwarnings('ignore')

class OlaBikeForecast:
    """
    A class to implement the Ola Bike Ride Request Forecasting system
    as described in the research paper.
    """

    def __init__(self, cities=None):
        """
        Set up an empty forecasting system.

        Parameters:
        -----------
        cities : list, optional
            Cities covered by the analysis. A missing or empty value falls
            back to ["Bangalore", "Mumbai", "Hyderabad", "Pune", "Delhi"].
        """
        default_cities = ["Bangalore", "Mumbai", "Hyderabad", "Pune", "Delhi"]
        self.cities = cities if cities else default_cities

        # Start with empty model / preprocessor dictionaries.
        self.models = {}
        self.preprocessors = {}

        # Holiday calendar; preprocess_data() uses it for the is_holiday flag.
        self.india_holidays = holidays.India()

        # Datasets stay unset until load_data() or generate_dummy_data() runs.
        self.ride_data = None
        self.weather_data = None
        self.traffic_data = None
        self.event_data = None
        self.zone_info = None

    def load_data(self, ride_data_path, weather_data_path=None,
                event_data_path=None, traffic_data_path=None, zone_info_path=None):
        """
        Load all required datasets for the forecasting system.

        Parameters:
        -----------
        ride_data_path : str
            Path to the ride request data CSV.
        weather_data_path : str, optional
            Path to the weather data CSV.
        event_data_path : str, optional
            Path to the event data CSV.
        traffic_data_path : str, optional
            Path to the traffic data CSV.
        zone_info_path : str, optional
            Path to the zone information CSV.
        """
        print("Loading ride data...")
        self.ride_data = pd.read_csv(ride_data_path)
        self.ride_data['timestamp'] = pd.to_datetime(self.ride_data['timestamp'])

        if weather_data_path:
            print("Loading weather data...")
            self.weather_data = pd.read_csv(weather_data_path)
            self.weather_data['timestamp'] = pd.to_datetime(self.weather_data['timestamp'])

        if event_data_path:
            print("Loading event data...")
            self.event_data = pd.read_csv(event_data_path)
            self.event_data['event_date'] = pd.to_datetime(self.event_data['event_date'])

        if traffic_data_path:
            print("Loading traffic data...")
            self.traffic_data = pd.read_csv(traffic_data_path)
            self.traffic_data['timestamp'] = pd.to_datetime(self.traffic_data['timestamp'])

        if zone_info_path:
            print("Loading zone information...")
            self.zone_info = pd.read_csv(zone_info_path)

        print("Data loading complete!")

    def generate_dummy_data(self, start_date='2022-01-01', end_date='2023-06-30',
                           num_zones=50, seed=42):
        """
        Generate synthetic data for demonstration purposes when real data is not available.

        Parameters:
        -----------
        start_date : str
            Start date for the synthetic data.
        end_date : str
            End date for the synthetic data.
        num_zones : int
            Number of geographical zones to generate per city.
        seed : int
            Random seed for reproducibility.
        """
        np.random.seed(seed)
        print("Generating synthetic data for demonstration...")

        # Create date range
        date_range = pd.date_range(start=start_date, end=end_date, freq='30min')

        # Generate zone IDs for each city
        all_zones = []
        for city in self.cities:
            for i in range(1, num_zones + 1):
                all_zones.append(f"{city}_zone_{i}")

        # Create empty ride data dataframe
        ride_data = []

        # Generate synthetic patterns
        for zone in all_zones:
            city = zone.split('_')[0]
            zone_num = int(zone.split('_')[-1])

            # Different baseline demand for different cities and zones
            base_demand = np.random.randint(10, 40)

            # Zone type affects the pattern
            zone_type = np.random.choice(['residential', 'commercial', 'industrial', 'transportation_hub'], p=[0.5, 0.3, 0.1, 0.1])

            for timestamp in date_range:
                hour = timestamp.hour
                day_of_week = timestamp.dayofweek
                month = timestamp.month

                # Base demand for this time
                demand = base_demand

                # Hour of day effect (rush hours)
                if 8 <= hour <= 10:  # Morning rush
                    demand *= 1.5
                elif 17 <= hour <= 19:  # Evening rush
                    demand *= 1.7
                elif 0 <= hour <= 5:  # Late night
                    demand *= 0.3

                # Day of week effect
                if day_of_week >= 5:  # Weekend
                    if zone_type == 'commercial':
                        demand *= 1.2
                    elif zone_type == 'residential':
                        demand *= 0.8
                    elif zone_type == 'industrial':
                        demand *= 0.4

                # Monthly seasonality
                if month in [6, 7, 8]:  # Rainy season
                    demand *= 0.85
                elif month in [11, 12]:  # Festival season
                    demand *= 1.2

                # Add some randomness
                demand = int(demand * np.random.normal(1, 0.15))
                demand = max(0, demand)  # Ensure non-negative

                # Create ride entry
                ride_data.append({
                    'timestamp': timestamp,
                    'city': city,
                    'zone_id': zone,
                    'zone_type': zone_type,
                    'ride_requests': demand,
                    'completed_rides': int(demand * np.random.uniform(0.8, 0.95)),
                    'latitude': np.random.uniform(10, 30),
                    'longitude': np.random.uniform(70, 90)
                })

        # Convert to DataFrame
        self.ride_data = pd.DataFrame(ride_data)

        # Generate weather data
        weather_data = []
        for city in self.cities:
            for timestamp in pd.date_range(start=start_date, end=end_date, freq='1H'):
                month = timestamp.month

                # Seasonal temperature variations
                if month in [12, 1, 2]:  # Winter
                    temp_base = np.random.uniform(10, 20)
                elif month in [3, 4, 5]:  # Summer
                    temp_base = np.random.uniform(25, 40)
                elif month in [6, 7, 8, 9]:  # Monsoon
                    temp_base = np.random.uniform(20, 30)
                else:  # Autumn
                    temp_base = np.random.uniform(18, 28)

                # Daily temperature cycle
                hour = timestamp.hour
                if 0 <= hour <= 6:
                    temp_adj = -2
                elif 7 <= hour <= 10:
                    temp_adj = 0
                elif 11 <= hour <= 16:
                    temp_adj = 3
                else:
                    temp_adj = 0

                temperature = temp_base + temp_adj

                # Precipitation (more likely in monsoon)
                if month in [6, 7, 8, 9]:
                    precipitation_prob = 0.3
                else:
                    precipitation_prob = 0.05

                precipitation = 0
                weather_condition = 'Clear'

                if np.random.random() < precipitation_prob:
                    precipitation = np.random.exponential(5)
                    if precipitation > 10:
                        weather_condition = 'Heavy Rain'
                    elif precipitation > 2:
                        weather_condition = 'Light Rain'
                    else:
                        weather_condition = 'Drizzle'

                # Humidity tends to be higher with precipitation and in monsoon
                if precipitation > 0:
                    humidity = np.random.uniform(70, 95)
                elif month in [6, 7, 8, 9]:
                    humidity = np.random.uniform(60, 85)
                else:
                    humidity = np.random.uniform(30, 70)

                weather_data.append({
                    'timestamp': timestamp,
                    'city': city,
                    'temperature': temperature,
                    'precipitation': precipitation,
                    'humidity': humidity,
                    'wind_speed': np.random.uniform(0, 15),
                    'weather_condition': weather_condition
                })

        self.weather_data = pd.DataFrame(weather_data)

        # Generate event data
        event_data = []
        event_types = ['Concert', 'Sports Match', 'Festival', 'Conference', 'Exhibition']
        event_sizes = ['Small', 'Medium', 'Large']

        # Create ~500 events spread across the date range and cities
        for _ in range(500):
            event_date = pd.Timestamp(np.random.choice(date_range)).normalize()
            duration_hours = np.random.choice([3, 4, 6, 8, 12])
            city = np.random.choice(self.cities)
            zone_id = f"{city}_zone_{np.random.randint(1, num_zones + 1)}"

            event_data.append({
                'event_date': event_date,
                'start_time': pd.Timestamp(event_date) + pd.Timedelta(hours=np.random.randint(8, 20)),
                'duration_hours': duration_hours,
                'city': city,
                'zone_id': zone_id,
                'event_type': np.random.choice(event_types),
                'event_size': np.random.choice(event_sizes, p=[0.5, 0.3, 0.2]),
                'estimated_attendance': np.random.randint(100, 10000)
            })

        self.event_data = pd.DataFrame(event_data)

        # Generate zone information
        zone_info = []
        for city in self.cities:
            for i in range(1, num_zones + 1):
                zone_id = f"{city}_zone_{i}"
                zone_type = np.random.choice(['residential', 'commercial', 'industrial', 'transportation_hub'], p=[0.5, 0.3, 0.1, 0.1])

                # Population density varies by zone type
                if zone_type == 'residential':
                    pop_density = np.random.uniform(5000, 20000)
                elif zone_type == 'commercial':
                    pop_density = np.random.uniform(1000, 8000)
                elif zone_type == 'industrial':
                    pop_density = np.random.uniform(500, 3000)
                else:  # transportation hub
                    pop_density = np.random.uniform(2000, 10000)

                zone_info.append({
                    'zone_id': zone_id,
                    'city': city,
                    'zone_type': zone_type,
                    'area_sqkm': np.random.uniform(0.8, 1.2),
                    'population_density': pop_density,
                    'poi_density': np.random.uniform(10, 100),
                    'avg_income_level': np.random.choice(['low', 'medium', 'high']),
                    'has_metro_station': np.random.choice([True, False], p=[0.3, 0.7]),
                    'has_train_station': np.random.choice([True, False], p=[0.2, 0.8]),
                    'has_bus_terminal': np.random.choice([True, False], p=[0.4, 0.6]),
                    'center_latitude': np.random.uniform(10, 30),
                    'center_longitude': np.random.uniform(70, 90)
                })

        self.zone_info = pd.DataFrame(zone_info)

        # Generate traffic data
        traffic_data = []
        for city in self.cities:
            for timestamp in pd.date_range(start=start_date, end=end_date, freq='1H'):
                hour = timestamp.hour
                day_of_week = timestamp.dayofweek

                # Base congestion level varies by time
                if 8 <= hour <= 10 or 17 <= hour <= 19:  # Rush hours
                    base_congestion = np.random.uniform(0.6, 0.9)
                elif 0 <= hour <= 5:  # Late night
                    base_congestion = np.random.uniform(0.1, 0.3)
                else:
                    base_congestion = np.random.uniform(0.3, 0.6)

                # Weekday vs weekend
                if day_of_week >= 5:  # Weekend
                    base_congestion *= 0.7

                traffic_data.append({
                    'timestamp': timestamp,
                    'city': city,
                    'congestion_level': base_congestion,
                    'avg_speed_kmph': 60 * (1 - base_congestion) + 10  # Speed decreases with congestion
                })

        self.traffic_data = pd.DataFrame(traffic_data)

        print("Synthetic data generation complete!")

    def preprocess_data(self, time_interval='30min', test_size=0.2):
        """
        Preprocess the data to create features for forecasting.

        Parameters:
        -----------
        time_interval : str
            Time interval for aggregating ride requests.
        test_size : float
            Fraction of data to use for testing.

        Returns:
        --------
        dict
            Dictionary containing training and testing DataFrames for each city.
        """
        print("Preprocessing data...")

        # Ensure ride data is sorted by timestamp
        self.ride_data = self.ride_data.sort_values(['city', 'zone_id', 'timestamp'])

        # Aggregate ride requests at the specified interval
        agg_df = self.ride_data.set_index('timestamp').groupby([
            pd.Grouper(freq=time_interval), 'city', 'zone_id'
        ])['ride_requests'].sum().reset_index()

        # Create temporal features
        agg_df['hour'] = agg_df['timestamp'].dt.hour
        agg_df['day_of_week'] = agg_df['timestamp'].dt.dayofweek
        agg_df['day_of_month'] = agg_df['timestamp'].dt.day
        agg_df['month'] = agg_df['timestamp'].dt.month
        agg_df['year'] = agg_df['timestamp'].dt.year
        agg_df['is_weekend'] = agg_df['day_of_week'].apply(lambda x: 1 if x >= 5 else 0)

        # Add holiday indicators
        agg_df['is_holiday'] = agg_df['timestamp'].dt.date.apply(
            lambda x: 1 if x in self.india_holidays else 0
        )

        # Merge with zone information if available
        if self.zone_info is not None:
            agg_df = pd.merge(agg_df, self.zone_info, on=['city', 'zone_id'], how='left')

        # Process and merge weather data if available
        if self.weather_data is not None:
            # Prepare weather data
            weather_df = self.weather_data.copy()

            # Join weather data to ride data based on nearest timestamp and city
            agg_df = pd.merge_asof(
                agg_df.sort_values('timestamp'),
                weather_df.sort_values('timestamp')[['timestamp', 'city', 'temperature', 'precipitation', 'humidity', 'wind_speed', 'weather_condition']],
                on='timestamp',
                by='city',
                direction='nearest'
            )

            # One-hot encode weather conditions
            agg_df = pd.get_dummies(agg_df, columns=['weather_condition'], prefix='weather')

        # Process and merge event data if available
        if self.event_data is not None:
            # Prepare event features for each timestamp and zone
            event_features = []

            for _, row in agg_df.iterrows():
                city = row['city']
                zone_id = row['zone_id']
                ts = row['timestamp']

                # Find events happening in this zone around this time
                relevant_events = self.event_data[
                    (self.event_data['city'] == city) &
                    (self.event_data['zone_id'] == zone_id) &
                    (ts >= self.event_data['start_time']) &
                    (ts <= self.event_data['start_time'] + pd.to_timedelta(self.event_data['duration_hours'], unit='h'))
                ]

                has_event = len(relevant_events) > 0
                event_size = 'None'
                estimated_attendance = 0

                if has_event:
                    # If multiple events, take the largest one
                    largest_event = relevant_events.loc[relevant_events['estimated_attendance'].idxmax()] if len(relevant_events) > 0 else None
                    if largest_event is not None:
                        event_size = largest_event['event_size']
                        estimated_attendance = largest_event['estimated_attendance']

                event_features.append({
                    'has_event': int(has_event),
                    'event_size_small': 1 if event_size == 'Small' else 0,
                    'event_size_medium': 1 if event_size == 'Medium' else 0,
                    'event_size_large': 1 if event_size == 'Large' else 0,
                    'estimated_attendance': estimated_attendance
                })

            event_df = pd.DataFrame(event_features)
            agg_df = pd.concat([agg_df, event_df], axis=1)

        # Process and merge traffic data if available
        if self.traffic_data is not None:
            # Join traffic data to ride data based on nearest timestamp and city
            agg_df = pd.merge_asof(
                agg_df.sort_values('timestamp'),
                self.traffic_data.sort_values('timestamp')[['timestamp', 'city', 'congestion_level', 'avg_speed_kmph']],
                on='timestamp',
                by='city',
                direction='nearest'
            )

        # Create lag features (previous hours/days demand)
        for lag in [1, 2, 3, 24, 48, 168]:  # 1hr, 2hr, 3hr, 1day, 2day, 1week
            agg_df[f'lag_{lag}'] = agg_df.groupby(['city', 'zone_id'])['ride_requests'].shift(lag)

        # Create rolling statistics
        for window in [3, 24, 168]:  # 3hr, 1day, 1week
            # Rolling mean
            agg_df[f'rolling_mean_{window}'] = agg_df.groupby(['city', 'zone_id'])['ride_requests'].transform(
                lambda x: x.shift(1).rolling(window=window, min_periods=1).mean()
            )
            # Rolling standard deviation
            agg_df[f'rolling_std_{window}'] = agg_df.groupby(['city', 'zone_id'])['ride_requests'].transform(
                lambda x: x.shift(1).rolling(window=window, min_periods=1).std()
            )

        # Drop rows with NaN values created by lagging
        agg_df = agg_df.dropna()

        # Prepare city-specific datasets
        datasets = {}

        for city in self.cities:
            city_df = agg_df[agg_df['city'] == city].copy()

            if len(city_df) == 0:
                print(f"No data available for {city}, skipping...")
                continue

            # Determine the split point
            split_idx = int(len(city_df) * (1 - test_size))

            # Split into training and testing sets (keeping time order)
            train_df = city_df.iloc[:split_idx]
            test_df = city_df.iloc[split_idx:]

            datasets[city] = {
                'train': train_df,
                'test': test_df,
                'all': city_df
            }

            print(f"Processed {city}: {len(train_df)} training samples, {len(test_df)} testing samples")

        print("Data preprocessing complete!")
        return datasets

    def build_feature_pipeline(self, df):
        """
        Build a preprocessing pipeline for the features.

        Numeric columns are standardised and categorical columns are
        one-hot encoded. The target ('ride_requests') and the identifier
        columns ('timestamp', 'city', 'zone_id') are never used as features.

        Parameters:
        -----------
        df : pandas.DataFrame
            DataFrame containing the features.

        Returns:
        --------
        sklearn.compose.ColumnTransformer
            Unfitted preprocessing pipeline for transforming features.
        """
        # Columns that are the target or identifiers rather than features.
        non_feature_columns = {'ride_requests', 'timestamp', 'city', 'zone_id'}

        # Select every numeric width, not only int64/float64: pandas >= 2
        # returns int32 from the .dt accessors (hour, month, ...) and older
        # pandas returns uint8 from get_dummies, and both would otherwise be
        # silently dropped by the ColumnTransformer.
        numeric_features = [
            column for column in df.select_dtypes(include='number').columns
            if column not in non_feature_columns
        ]
        categorical_features = [
            column for column in df.select_dtypes(include=['object', 'bool', 'category']).columns
            if column not in non_feature_columns
        ]

        # Create preprocessing steps
        numeric_transformer = Pipeline(steps=[
            ('scaler', StandardScaler())
        ])

        categorical_transformer = Pipeline(steps=[
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        # Create column transformer
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)
            ]
        )

        return preprocessor

    def train_arima_model(self, train_df, zone_id, order=(5,1,0)):
        """
        Fit an ARIMA model on the ride requests of one zone.

        Parameters:
        -----------
        train_df : pandas.DataFrame
            Training data; must contain 'zone_id' and 'ride_requests'.
        zone_id : str
            Zone whose rows are used for fitting.
        order : tuple
            ARIMA order parameters (p,d,q).

        Returns:
        --------
        model_fit : fitted ARIMA results
            The object returned by ``ARIMA(...).fit()``.
        """
        # Keep only this zone's demand series, in the row order of train_df.
        belongs_to_zone = train_df['zone_id'] == zone_id
        demand_series = train_df[belongs_to_zone]['ride_requests']

        return ARIMA(demand_series, order=order).fit()

    def train_prophet_model(self, train_df, zone_id):
        """
        Fit a Prophet model on the ride requests of one zone.

        Parameters:
        -----------
        train_df : pandas.DataFrame
            Training data; must contain 'zone_id', 'timestamp' and
            'ride_requests'.
        zone_id : str
            Zone whose rows are used for fitting.

        Returns:
        --------
        model : Prophet model
            The fitted Prophet instance.
        """
        # Prophet expects the time column to be called 'ds' and the value 'y'.
        prophet_column_names = {'timestamp': 'ds', 'ride_requests': 'y'}

        belongs_to_zone = train_df['zone_id'] == zone_id
        zone_history = train_df[belongs_to_zone][['timestamp', 'ride_requests']]
        prophet_df = zone_history.rename(columns=prophet_column_names)

        prophet_settings = {
            'interval_width': 0.95,
            'daily_seasonality': True,
            'weekly_seasonality': True,
        }
        model = Prophet(**prophet_settings)
        model.fit(prophet_df)

        return model

    def train_xgboost_model(self, train_df, preprocessor, target_col='ride_requests'):
        """
        Train an XGBoost model.

        The preprocessor is fitted on the training features as a side
        effect, so the same object can later transform test data.

        Parameters:
        -----------
        train_df : pandas.DataFrame
            Training data.
        preprocessor : sklearn.compose.ColumnTransformer
            Feature preprocessing pipeline. It must not reference
            `target_col`, because that column is removed from the features.
        target_col : str
            Target column name.

        Returns:
        --------
        model : XGBoost model
            Trained XGBoost model.
        """
        # Columns that must never reach the model as features. The target is
        # added explicitly so a non-default target_col cannot leak into X.
        non_feature_columns = ['ride_requests', 'timestamp', 'city', 'zone_id']
        if target_col not in non_feature_columns:
            non_feature_columns.append(target_col)

        # Prepare the data
        X = train_df.drop(non_feature_columns, axis=1)
        y = train_df[target_col]

        # Preprocess features
        X_processed = preprocessor.fit_transform(X)

        # Train XGBoost model
        model = xgb.XGBRegressor(
            n_estimators=200,
            learning_rate=0.1,
            max_depth=7,
            min_child_weight=1,
            subsample=0.8,
            colsample_bytree=0.8,
            objective='reg:squarederror',
            random_state=42
        )
        model.fit(X_processed, y)

        return model

    def train_random_forest_model(self, train_df, preprocessor, target_col='ride_requests'):
        """
        Train a Random Forest model.

        The preprocessor is fitted on the training features as a side
        effect, so the same object can later transform test data.

        Parameters:
        -----------
        train_df : pandas.DataFrame
            Training data.
        preprocessor : sklearn.compose.ColumnTransformer
            Feature preprocessing pipeline. It must not reference
            `target_col`, because that column is removed from the features.
        target_col : str
            Target column name.

        Returns:
        --------
        model : RandomForest model
            Trained Random Forest model.
        """
        # Columns that must never reach the model as features. The target is
        # added explicitly so a non-default target_col cannot leak into X.
        non_feature_columns = ['ride_requests', 'timestamp', 'city', 'zone_id']
        if target_col not in non_feature_columns:
            non_feature_columns.append(target_col)

        # Prepare the data
        X = train_df.drop(non_feature_columns, axis=1)
        y = train_df[target_col]

        # Preprocess features
        X_processed = preprocessor.fit_transform(X)

        # Train Random Forest model
        model = RandomForestRegressor(
            n_estimators=100,
            max_depth=15,
            min_samples_split=5,
            min_samples_leaf=2,
            random_state=42,
            n_jobs=-1
        )
        model.fit(X_processed, y)

        return model

    def train_lstm_model(self, train_df, zone_ids, sequence_length=24):
        """
        Train an LSTM model for time series forecasting.

        For every zone the selected features are standardised with a
        zone-specific scaler and cut into sliding windows of
        `sequence_length` rows; the model learns to predict the scaled
        'ride_requests' value of the row following each window.

        Parameters:
        -----------
        train_df : pandas.DataFrame
            Training data.
        zone_ids : list
            List of zone IDs to include in training.
        sequence_length : int
            Number of time steps to use for each sequence.

        Returns:
        --------
        model : Keras LSTM model
            Trained LSTM model.
        scalers : dict
            Dictionary of scalers for each zone.

        Raises:
        -------
        ValueError
            If no zone has more than `sequence_length` rows, so no training
            sequence can be built.
        """
        base_features = ['hour', 'day_of_week', 'is_weekend', 'is_holiday', 'ride_requests']
        # Column position of the target inside the feature matrix. Weather
        # columns are appended after it, so the target is NOT the last column.
        target_index = base_features.index('ride_requests')

        X_sequences = []
        y_values = []
        scalers = {}

        for zone_id in zone_ids:
            # Get zone-specific data
            zone_data = train_df[train_df['zone_id'] == zone_id].sort_values('timestamp')

            # Select features: the base set plus any one-hot weather columns
            weather_features = [col for col in zone_data.columns if col.startswith('weather_')]
            features = base_features + weather_features

            # Extract feature matrix as floats (weather dummies may be bool)
            data = zone_data[features].to_numpy(dtype=float)

            # Scale the data
            scaler = StandardScaler()
            scaled_data = scaler.fit_transform(data)
            scalers[zone_id] = scaler

            # Create sequences
            for i in range(len(scaled_data) - sequence_length):
                X_sequences.append(scaled_data[i:i+sequence_length, :])
                y_values.append(scaled_data[i+sequence_length, target_index])  # Predict ride_requests

        if not X_sequences:
            raise ValueError(
                "No training sequences could be built; every zone needs more than "
                f"{sequence_length} rows."
            )

        # Convert to numpy arrays
        X = np.array(X_sequences)
        y = np.array(y_values)

        # Build LSTM model
        model = Sequential([
            LSTM(128, input_shape=(sequence_length, X.shape[2]), return_sequences=True),
            Dropout(0.2),
            LSTM(64),
            Dropout(0.2),
            Dense(32, activation='relu'),
            Dense(1)
        ])

        # Compile the model
        model.compile(optimizer='adam', loss='mse')

        # Train the model
        model.fit(X, y, epochs=20, batch_size=32, validation_split=0.2, verbose=1)

        return model, scalers

    def train_tcn_model(self, train_df, zone_ids, sequence_length=24):
        """
        Train a Temporal Convolutional Network (TCN) model (truncated copy).

        NOTE(review): this definition is incomplete. It ends right after
        creating a scaler inside the zone loop: no sequences are built, no
        model is created or trained, and the function returns None rather
        than the (model, scalers) pair described below. A complete
        ``train_tcn_model`` appears later in the file; presumably this copy
        is a leftover from a truncated notebook cell and should be removed
        - confirm.

        Parameters:
        -----------
        train_df : pandas.DataFrame
            Training data.
        zone_ids : list
            List of zone IDs to include in training.
        sequence_length : int
            Number of time steps to use for each sequence (unused in this
            truncated copy).

        Returns:
        --------
        model : Keras TCN model
            Trained TCN model (intended; not returned by this copy).
        scalers : dict
            Dictionary of scalers for each zone (intended; not returned by
            this copy).
        """
        # Prepare data (similar to LSTM preparation)
        X_sequences = []
        y_values = []
        scalers = {}

        for zone_id in zone_ids:
            # Get zone-specific data
            zone_data = train_df[train_df['zone_id'] == zone_id].sort_values('timestamp')

            # Select features
            features = ['hour', 'day_of_week', 'is_weekend', 'is_holiday', 'ride_requests']
            weather_features = [col for col in zone_data.columns if col.startswith('weather_')]
            features.extend(weather_features)

            # Extract feature matrix
            data = zone_data[features].values

            # Scale the data
            # NOTE(review): the scaler is created but never fitted or stored;
            # the definition stops here.
            scaler = StandardScaler()

In [4]:
    def train_tcn_model(self, train_df, zone_ids, sequence_length=24):
        """
        Train a Temporal Convolutional Network (TCN) model.

        For every zone the selected features are standardised with a
        zone-specific scaler and cut into sliding windows of
        `sequence_length` rows; the model learns to predict the scaled
        'ride_requests' value of the row following each window.

        Parameters:
        -----------
        train_df : pandas.DataFrame
            Training data.
        zone_ids : list
            List of zone IDs to include in training.
        sequence_length : int
            Number of time steps to use for each sequence.

        Returns:
        --------
        model : Keras TCN model
            Trained TCN model.
        scalers : dict
            Dictionary of scalers for each zone.

        Raises:
        -------
        ValueError
            If no zone has more than `sequence_length` rows, so no training
            sequence can be built.
        """
        from tensorflow.keras import Input, Model
        from tensorflow.keras.layers import Dense, Dropout
        from tcn import TCN

        base_features = ['hour', 'day_of_week', 'is_weekend', 'is_holiday', 'ride_requests']
        # Column position of the target inside the feature matrix. Weather
        # columns are appended after it, so the target is NOT the last column.
        target_index = base_features.index('ride_requests')

        # Prepare data
        X_sequences = []
        y_values = []
        scalers = {}

        for zone_id in zone_ids:
            # Get zone-specific data
            zone_data = train_df[train_df['zone_id'] == zone_id].sort_values('timestamp')

            # Select features: the base set plus any one-hot weather columns
            weather_features = [col for col in zone_data.columns if col.startswith('weather_')]
            features = base_features + weather_features

            # Extract feature matrix as floats (weather dummies may be bool)
            data = zone_data[features].to_numpy(dtype=float)

            # Scale the data
            scaler = StandardScaler()
            scaled_data = scaler.fit_transform(data)
            scalers[zone_id] = scaler

            # Create sequences
            for i in range(len(scaled_data) - sequence_length):
                X_sequences.append(scaled_data[i:i+sequence_length, :])
                y_values.append(scaled_data[i+sequence_length, target_index])  # Predict ride_requests

        if not X_sequences:
            raise ValueError(
                "No training sequences could be built; every zone needs more than "
                f"{sequence_length} rows."
            )

        # Convert to numpy arrays
        X = np.array(X_sequences)
        y = np.array(y_values)

        # Build TCN model
        inputs = Input(shape=(sequence_length, X.shape[2]))
        tcn_layer = TCN(nb_filters=64, kernel_size=3, nb_stacks=2,
                       dilations=[1, 2, 4, 8], return_sequences=False,
                       activation='relu')(inputs)
        dropout = Dropout(0.2)(tcn_layer)
        outputs = Dense(1)(dropout)

        model = Model(inputs, outputs)

        # Compile the model
        model.compile(optimizer='adam', loss='mse')

        # Train the model
        model.fit(X, y, epochs=30, batch_size=64, validation_split=0.2, verbose=1)

        return model, scalers

    def create_ensemble(self, base_models, meta_model=GradientBoostingRegressor()):
        """
        Create an ensemble model using predictions from base models.

        Parameters:
        -----------
        base_models : dict
            Dictionary of trained base models
        meta_model : sklearn estimator
            Meta-learner model (default: GradientBoostingRegressor)
        """
        self.ensemble_meta_model = meta_model
        self.base_models = base_models

    def predict_ensemble(self, X):
        """
        Make predictions using the ensemble model.

        Parameters:
        -----------
        X : pandas.DataFrame
            Input features

        Returns:
        --------
        predictions : numpy array
            Ensemble predictions
        """
        # Generate base model predictions
        meta_features = []
        for model_name, model in self.base_models.items():
            if model_name == 'prophet':
                preds = self._predict_prophet(X, model)
            elif model_name == 'arima':
                preds = self._predict_arima(X, model)
            else:
                preds = model.predict(X)
            meta_features.append(preds)

        # Stack predictions horizontally
        meta_features = np.column_stack(meta_features)

        # Make final prediction
        return self.ensemble_meta_model.predict(meta_features)

    def evaluate_model(self, model, test_df, preprocessor=None, model_type='ml'):
        """
        Evaluate a trained model on test data.

        Parameters:
        -----------
        model : trained model
            The model to evaluate. For 'arima' and 'prophet' this is indexed
            by zone_id and yields that zone's fitted model.
        test_df : pandas.DataFrame
            Test dataset. Must contain 'ride_requests', 'timestamp', 'city'
            and 'zone_id' columns.
        preprocessor : sklearn preprocessor
            Feature preprocessor for ML models. When omitted, the raw feature
            columns are passed to the model unchanged.
        model_type : str
            Type of model ('arima', 'prophet', 'lstm', 'tcn', 'ml')

        Returns:
        --------
        dict
            Dictionary of evaluation metrics with keys 'MAE', 'RMSE' and
            'MAPE'. 'MAPE' is whatever sklearn's
            mean_absolute_percentage_error returns, i.e. a fraction rather
            than a value already multiplied by 100.
        """
        y_test = test_df['ride_requests']

        if model_type in ['arima', 'prophet']:
            # Time series models require zone-by-zone evaluation. Actuals are
            # collected in the same zone/time order as the predictions so the
            # two sequences stay aligned even when test_df interleaves zones.
            predictions = []
            actuals = []
            for zone_id in test_df['zone_id'].unique():
                zone_data = (
                    test_df[test_df['zone_id'] == zone_id]
                    .sort_values('timestamp')
                )
                n_steps = len(zone_data)
                if model_type == 'arima':
                    # Forecast one value per test row. The previous
                    # predict(start=n, end=n) call produced a single value.
                    # NOTE(review): assumes the test period starts right
                    # after the data the ARIMA model was fitted on - confirm.
                    pred = model[zone_id].forecast(steps=n_steps)
                else:
                    future = model[zone_id].make_future_dataframe(
                        periods=n_steps,
                        freq='30min'
                    )
                    # The future frame also contains the history; keep only
                    # the trailing rows that correspond to the test period.
                    pred = model[zone_id].predict(future)['yhat'].iloc[-n_steps:]
                predictions.extend(np.asarray(pred, dtype=float).tolist())
                actuals.extend(zone_data['ride_requests'].tolist())
            y_test = np.asarray(actuals)
        elif model_type in ['lstm', 'tcn']:
            # Sequence models require sequence generation
            predictions = self._predict_sequences(model, test_df, model_type)
        else:
            X_test = test_df.drop(
                ['ride_requests', 'timestamp', 'city', 'zone_id'], axis=1
            )
            # Without a preprocessor the features are used as-is; previously
            # this path failed with a NameError on X_processed.
            if preprocessor is None:
                X_processed = X_test
            else:
                X_processed = preprocessor.transform(X_test)
            predictions = model.predict(X_processed)

        # Calculate metrics
        metrics = {
            'MAE': mean_absolute_error(y_test, predictions),
            'RMSE': np.sqrt(mean_squared_error(y_test, predictions)),
            'MAPE': mean_absolute_percentage_error(y_test, predictions)
        }

        return metrics

    def run_full_pipeline(self, cities=None, horizon='3h'):
        """
        Execute complete forecasting pipeline.

        For every requested city this trains the per-zone ARIMA and Prophet
        models, the XGBoost and Random Forest models, and the LSTM and TCN
        models, registers an ensemble, and prints XGBoost and ensemble
        metrics computed on that city's test split. Trained models are stored
        in self.models[city].

        Parameters:
        -----------
        cities : list
            List of cities to process. Falls back to self.cities when omitted
            or empty.
        horizon : str
            Forecast horizon (default: 3 hours). Accepted for interface
            compatibility; it is not read anywhere in this method.
        """
        # Load and preprocess data. getattr() guards against instances on
        # which ride_data was never assigned, which would otherwise raise
        # AttributeError before any data could be generated.
        if getattr(self, 'ride_data', None) is None:
            self.generate_dummy_data()

        processed_data = self.preprocess_data()

        # Train models for each city
        for city in cities or self.cities:
            print(f"\nTraining models for {city}")
            city_data = processed_data[city]['train']

            # Initialize model storage
            self.models[city] = {
                'arima': {},
                'prophet': {},
                'xgb': None,
                'rf': None,
                'lstm': None,
                'tcn': None,
                'ensemble': None
            }

            # Get unique zones
            zones = city_data['zone_id'].unique()

            # Train ARIMA models
            print("Training ARIMA models...")
            for zone in zones:
                self.models[city]['arima'][zone] = self.train_arima_model(city_data, zone)

            # Train Prophet models
            print("Training Prophet models...")
            for zone in zones:
                self.models[city]['prophet'][zone] = self.train_prophet_model(city_data, zone)

            # Train ML models (both share one feature preprocessor)
            print("Training XGBoost...")
            preprocessor = self.build_feature_pipeline(city_data)
            self.models[city]['xgb'] = self.train_xgboost_model(city_data, preprocessor)

            print("Training Random Forest...")
            self.models[city]['rf'] = self.train_random_forest_model(city_data, preprocessor)

            # Train DL models.
            # NOTE(review): the per-zone scalers returned alongside each
            # network are not stored anywhere - confirm whether later
            # prediction code needs them.
            print("Training LSTM...")
            self.models[city]['lstm'], lstm_scalers = self.train_lstm_model(city_data, zones)

            print("Training TCN...")
            self.models[city]['tcn'], tcn_scalers = self.train_tcn_model(city_data, zones)

            # Create ensemble
            print("Creating ensemble...")
            base_models = {
                'xgb': self.models[city]['xgb'],
                'rf': self.models[city]['rf'],
                'lstm': self.models[city]['lstm'],
                'tcn': self.models[city]['tcn']
            }
            self.create_ensemble(base_models)

            # Evaluate models
            print("\nEvaluating models...")
            test_df = processed_data[city]['test']

            # Evaluate XGBoost
            xgb_metrics = self.evaluate_model(
                self.models[city]['xgb'],
                test_df,
                preprocessor
            )

            # Evaluate Ensemble
            ensemble_metrics = self.evaluate_ensemble(test_df, preprocessor)

            # sklearn reports MAPE as a fraction, so scale by 100 before
            # printing it next to a '%' sign.
            xgb_mape_pct = xgb_metrics['MAPE'] * 100
            ensemble_mape_pct = ensemble_metrics['MAPE'] * 100

            print(f"\n{city} Results:")
            print(f"XGBoost - MAE: {xgb_metrics['MAE']:.2f}, MAPE: {xgb_mape_pct:.2f}%")
            print(f"Ensemble - MAE: {ensemble_metrics['MAE']:.2f}, MAPE: {ensemble_mape_pct:.2f}%")

    def _predict_sequences(self, model, test_df, model_type):
        """Helper method for sequence model predictions"""
        predictions = []
        for zone_id in test_df['zone_id'].unique():
            zone_data = test_df[test_df['zone_id'] == zone_id].sort_values('timestamp')
            # Implement sequence prediction logic here
            # ... (omitted for brevity)
        return np.array(predictions)

    def evaluate_ensemble(self, test_df, preprocessor):
        """
        Evaluate ensemble model performance.

        The test features are transformed with `preprocessor`, every model in
        self.base_models predicts on the transformed matrix, the predictions
        are stacked as columns, and self.ensemble_meta_model turns them into
        the final predictions that are scored against 'ride_requests'.

        Parameters:
        -----------
        test_df : pandas.DataFrame
            Test dataset containing 'ride_requests', 'timestamp', 'city' and
            'zone_id' columns
        preprocessor : sklearn preprocessor
            Feature preprocessor exposing .transform()

        Returns:
        --------
        dict
            Dictionary with 'MAE', 'RMSE' and 'MAPE'. 'MAPE' is the value
            returned by sklearn's mean_absolute_percentage_error (a fraction).
        """
        y_true = test_df['ride_requests']
        X_test = test_df.drop(['ride_requests', 'timestamp', 'city', 'zone_id'], axis=1)
        X_processed = preprocessor.transform(X_test)

        # Get base model predictions. The previous Keras/non-Keras branches
        # were identical, so every model is queried the same way.
        # NOTE(review): run_full_pipeline also registers the LSTM/TCN networks
        # here, and they are fed this same tabular matrix - confirm they
        # accept it.
        base_preds = [
            model.predict(X_processed) for model in self.base_models.values()
        ]

        # Combine predictions
        meta_features = np.column_stack(base_preds)
        final_preds = self.ensemble_meta_model.predict(meta_features)

        # Calculate metrics
        return {
            'MAE': mean_absolute_error(y_true, final_preds),
            'RMSE': np.sqrt(mean_squared_error(y_true, final_preds)),
            'MAPE': mean_absolute_percentage_error(y_true, final_preds)
        }

# Example usage: runs only when the file is executed as a script
if __name__ == "__main__":
    forecast_system = OlaBikeForecast()

    # Populate the system with synthetic ride data
    forecast_system.generate_dummy_data()

    # Train and evaluate every model for Bangalore only
    forecast_system.run_full_pipeline(cities=["Bangalore"])

Generating synthetic data for demonstration...
Synthetic data generation complete!


AttributeError: 'OlaBikeForecast' object has no attribute 'run_full_pipeline'