In [None]:
# import tensorflow as tf
# print(tf.__version__)  # Should output without errors


: 

In [None]:
# !pip install tensorflow==2.13
# !pip install tensorflow-gpu


In [8]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

In [9]:
# Load the district-level CSV and keep only the rows whose indicator_type is
# the combined total.
data = pd.read_csv("../data/HMIS_DATA_CORRECTED_17_21/mh_dist17_21_with_IDs_date_correction.csv")
# .copy() makes the filtered frame independent of the raw one, so the column
# assignment below writes to a real frame instead of a filtered view
# (avoids pandas' SettingWithCopyWarning).
data = data.loc[data['indicator_type'] == 'Total [(A+B) or (C+D)]'].copy()
data['date'] = pd.to_datetime(data['date'])
# Setting a datetime64 column as the index already yields a DatetimeIndex,
# so no further conversion of the index is needed.
data = data.set_index('date')

In [10]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

def _build_lag_frame(series, max_lags):
    """Return a frame with target 'y' and columns lag_1..lag_{max_lags}; rows containing NaN are dropped."""
    df = pd.DataFrame({'y': series})
    for lag in range(1, max_lags + 1):
        df[f'lag_{lag}'] = df['y'].shift(lag)
    return df.dropna()


def _undo_first_difference(original_series, test_index, diff_forecast):
    """Turn first-difference forecasts for test_index back into levels; also return the actual levels."""
    # diff() is positional (row p minus row p-1), so the level that anchors the
    # first test forecast is the observation stored immediately before it.
    first_pos = original_series.index.get_loc(test_index[0])
    if not isinstance(first_pos, (int, np.integer)):
        # get_loc returns a slice/mask when the label is not unique.
        raise ValueError("Duplicate index labels; cannot invert differencing")
    if first_pos < 1:
        raise ValueError("No observation precedes the test window")
    anchor = original_series.iloc[first_pos - 1]
    levels = np.cumsum(diff_forecast) + anchor
    # Label-based lookup: test_index holds index labels, not positions.
    actuals = original_series.loc[test_index].values
    return levels, actuals


def lstm_regression_lags_only(
    series,
    district_name,
    max_lags=12,
    differencing=True,
    epochs=100,
    batch_size=32,
    verbose=0
):
    """
    Fit a single-layer LSTM on lagged values of `series` and evaluate it on a
    chronological hold-out (last 20% of the lagged rows).

    Parameters
    ----------
    series : pandas.Series
        Univariate series to forecast. Its index is used as the 'date' column
        of the forecast table and as the x-axis of the plot.
    district_name : str
        Label used in messages, in the output table and in the output file names.
    max_lags : int
        Number of lagged values (lag_1..lag_{max_lags}) used as features.
    differencing : bool
        If True, run an ADF test and first-difference the series when the
        p-value exceeds 0.05; forecasts are then converted back to levels.
    epochs, batch_size, verbose
        Passed to Keras `fit` (and `verbose` to `predict`).

    Returns
    -------
    (dict, pandas.DataFrame) on success: a metrics dict with keys 'district',
    'rmse', 'epochs', 'batch_size', 'training_loss', 'validation_loss', and a
    frame with columns 'district', 'date', 'actual', 'forecast'.
    (None, None) on any error, after printing a skip message, so a caller
    looping over many series keeps going.

    Side effects
    ------------
    Creates the 'LSTM_Forecasts' directory if needed and writes
    '<district_name>_forecast.csv' and '<district_name>_forecast_plot.png' into it.

    Notes
    -----
    When differencing is applied, level forecasts are the cumulative sum of the
    predicted differences added to the last observed level before the test
    window, so level errors accumulate over the test horizon.
    """
    try:
        # 0. Initial checks
        if len(series) < max_lags + 24:  # Minimum 24 points after lag creation
            raise ValueError(f"Series too short ({len(series)} points) for {max_lags} lags")

        # 1. Directory setup
        os.makedirs('LSTM_Forecasts', exist_ok=True)

        # 2. Stationarity handling
        original_series = series.copy()
        d = 0
        if differencing:
            adf_result = adfuller(series.dropna())
            if adf_result[1] > 0.05:  # p-value: cannot reject a unit root
                d = 1
                series = series.diff().dropna()
                if len(series) < 10:
                    raise ValueError("Insufficient data after differencing")

        # 3. Create lag features
        df = _build_lag_frame(series, max_lags)

        # 4. Temporal split with index preservation
        train_size = int(len(df) * 0.8)
        if train_size < 10 or (len(df) - train_size) < 2:
            raise ValueError("Insufficient train/test split")

        train = df.iloc[:train_size]
        test = df.iloc[train_size:]
        test_series = test['y']  # Preserve for index

        # 5. Prepare numpy arrays
        X_train = train.drop(columns=['y']).values
        y_train = train['y'].values.reshape(-1, 1)
        X_test = test.drop(columns=['y']).values

        # 6. Scaling (scalers are fitted on the training rows only)
        scaler_X = MinMaxScaler()
        scaler_y = MinMaxScaler()

        X_train_scaled = scaler_X.fit_transform(X_train)
        X_test_scaled = scaler_X.transform(X_test)
        y_train_scaled = scaler_y.fit_transform(y_train)

        # 7. Reshape for LSTM [samples, timesteps, features]
        X_train_scaled = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))
        X_test_scaled = X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))

        # 8. Build model
        model = Sequential()
        model.add(LSTM(50, activation='relu', input_shape=(1, max_lags)))
        model.add(Dense(1))
        model.compile(optimizer='adam', loss='mse')

        # 9. Train with validation (no shuffling, to keep temporal order)
        history = model.fit(
            X_train_scaled,
            y_train_scaled,
            epochs=epochs,
            batch_size=batch_size,
            verbose=verbose,
            validation_split=0.2,
            shuffle=False
        )

        # 10. Predict and inverse scaling
        y_pred_scaled = model.predict(X_test_scaled, verbose=verbose)
        y_pred = scaler_y.inverse_transform(y_pred_scaled).flatten()

        # 11. Inverse differencing
        if d == 1:
            y_pred, y_test_values = _undo_first_difference(
                original_series, test.index, y_pred
            )
        else:
            y_test_values = test_series.values

        # 12. Create output (preserve original indices)
        forecast_df = pd.DataFrame({
            'district': district_name,
            'date': test_series.index,
            'actual': y_test_values,
            'forecast': y_pred
        })

        # 13. Calculate metrics
        rmse = np.sqrt(mean_squared_error(forecast_df['actual'], forecast_df['forecast']))

        # 14. Save results
        forecast_df.to_csv(
            os.path.join('LSTM_Forecasts', f'{district_name}_forecast.csv'),
            index=False
        )

        # 15. Plotting; the figure is closed even if drawing or saving fails,
        # so a failing district does not leak an open figure.
        fig, ax = plt.subplots(figsize=(12, 6))
        try:
            ax.plot(original_series.index, original_series, label='Original')
            ax.plot(forecast_df['date'], forecast_df['forecast'], label='LSTM Forecast')
            ax.set_title(f'{district_name} Forecast (RMSE: {rmse:.2f})')
            ax.legend()
            fig.savefig(os.path.join('LSTM_Forecasts', f'{district_name}_forecast_plot.png'))
        finally:
            plt.close(fig)

        return {
            'district': district_name,
            'rmse': rmse,
            'epochs': epochs,
            'batch_size': batch_size,
            'training_loss': history.history['loss'],
            'validation_loss': history.history['val_loss']
        }, forecast_df

    except Exception as e:
        # Deliberate best-effort: report and skip so batch runs continue.
        print(f"⚠️ Skipping {district_name}: {str(e)}")
        return None, None


In [11]:
# Distinct values of the 'district' column; one forecasting run is made per value.
districts = data['district'].unique()
# NOTE(review): rmse_values is not read or appended to in the cells visible here —
# confirm whether it is still needed.
rmse_values = []

# Usage Example


def run_for_each_district():
    """
    Run the lag-based LSTM forecast for every entry of the module-level
    `districts` array and collect the metrics.

    For each district the 'I1' column of the module-level `data` frame is
    selected, put on a month-start ('MS') frequency and passed to
    `lstm_regression_lags_only` with max_lags=6, epochs=150, batch_size=64.
    Metrics and the forecast table are printed for each successful district.

    Returns
    -------
    dict
        Maps district name -> metrics dict returned by
        `lstm_regression_lags_only`. Districts that the forecaster skipped
        (it returned None) are omitted.
    """
    results = {}

    for district in districts:
        district_data = data[data['district'] == district]
        # asfreq inserts NaN rows for any missing months.
        ts = district_data["I1"].asfreq('MS')

        # Use per-iteration names so the accumulator above is not overwritten.
        metrics, forecast = lstm_regression_lags_only(
            series=ts,
            district_name=district,
            max_lags=6,
            epochs=150,
            batch_size=64
        )

        if metrics is None:
            # The forecaster has already printed why this district was skipped.
            continue

        results[district] = metrics

        # Show results
        print("=== Metrics ===")
        print(metrics)
        print("\n=== Forecast Data ===")
        print(forecast)

    return results
# Kick off the per-district forecasting run.
run_for_each_district()

=== Metrics ===
{'district': 'AHMEDNAGAR', 'rmse': 507.9592158801404, 'epochs': 150, 'batch_size': 64, 'training_loss': [0.35049962997436523, 0.34354767203330994, 0.33667483925819397, 0.329849511384964, 0.3230341672897339, 0.3162636160850525, 0.30954596400260925, 0.3028698265552521, 0.2962683439254761, 0.28972986340522766, 0.283244252204895, 0.2768010199069977, 0.27043673396110535, 0.2641189694404602, 0.25781962275505066, 0.2515849173069, 0.2454356849193573, 0.23937299847602844, 0.2333759367465973, 0.22747524082660675, 0.22165805101394653, 0.21592122316360474, 0.2102808654308319, 0.20472441613674164, 0.19924041628837585, 0.19385720789432526, 0.18856488168239594, 0.18336078524589539, 0.17827358841896057, 0.1732843965291977, 0.1684088408946991, 0.16365788877010345, 0.15903513133525848, 0.15455707907676697, 0.15019425749778748, 0.1459663063287735, 0.14189723134040833, 0.13799460232257843, 0.13425663113594055, 0.13067875802516937, 0.12724480032920837, 0.12398853898048401, 0.120914652943611