# In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Normalise the timestamp columns of the already-loaded dataset.
# NOTE(review): `df` is never assigned in the visible code even though this
# section was headed "Load dataset" - presumably it is read in an earlier
# cell; confirm before running this file on its own.
df['Datetime'] = pd.to_datetime(df['Datetime'])
# Calendar day of each reading (time of day dropped), converted back to a
# datetime so the `.dt` accessors used during feature engineering work.
df['date'] = pd.to_datetime(df['Datetime'].dt.date)

# Feature Engineering: Creating lag features and trend correction
def create_features(df, lags=60, window=14):
    """Build lag, rolling and calendar features for a PM2.5 series.

    Every feature of a row is derived from *earlier* rows only (plus the
    row's own date), so the row's ``PM2.5`` value can be used as a
    prediction target without leaking into its own features.

    Args:
        df: Frame with a datetime-like ``date`` column and a numeric
            ``PM2.5`` column. The caller's frame is not modified.
        lags: Number of lag columns to create (``lag_1`` .. ``lag_<lags>``).
        window: Number of preceding rows used for ``rolling_mean`` and
            ``rolling_std``.

    Returns:
        A new frame sorted by ``date`` with the extra columns ``lag_*``,
        ``rolling_mean``, ``rolling_std``, ``month`` and ``dayofweek``.
        Rows containing any NaN (the warm-up rows at the start) are dropped.
    """
    df = df.sort_values(by=['date'])
    target = df['PM2.5']
    for lag in range(1, lags + 1):
        df[f'lag_{lag}'] = target.shift(lag)
    # Shift by one row before rolling: a window that includes the current
    # value lets a model reconstruct the target exactly from
    # rolling_mean and the lag columns (target leakage).
    history = target.shift(1).rolling(window=window, min_periods=1)
    df['rolling_mean'] = history.mean()
    df['rolling_std'] = history.std()
    df['month'] = df['date'].dt.month
    df['dayofweek'] = df['date'].dt.dayofweek
    return df.dropna()

# Prepare dataset for LSTM
def create_lstm_sequences(data, seq_length):
    """Cut a 2-D array into overlapping windows with next-row targets.

    Window ``k`` is ``data[k:k + seq_length]`` and its target is column 0
    of the row immediately after that window. One pair is produced for
    every start offset in ``range(len(data) - seq_length)``.

    Args:
        data: Array supporting NumPy-style 2-D indexing (rows are time steps).
        seq_length: Number of consecutive rows per window.

    Returns:
        Tuple ``(windows, targets)`` of NumPy arrays. Both are empty when
        ``data`` has no more than ``seq_length`` rows.
    """
    window_starts = range(len(data) - seq_length)
    windows = [data[start:start + seq_length] for start in window_starts]
    # Column 0 is what this helper treats as the PM2.5 target; the caller is
    # responsible for putting the right series in that column.
    targets = [data[start + seq_length, 0] for start in window_starts]
    return np.array(windows), np.array(targets)

# Function to train models and make predictions
def train_and_predict_hybrid(city, future_date):
    """Train XGBoost and LSTM models for one city and average their estimates.

    Reads the module-level ``df`` (columns ``City``, ``date``, ``PM2.5``),
    aggregates it to one mean PM2.5 value per date, builds features with
    ``create_features``, fits both models on the chronologically first 80%
    of the rows and shows a matplotlib figure comparing their predictions
    with the held-out 20%.

    Args:
        city: Value matched exactly against ``df['City']``.
        future_date: Anything ``pd.to_datetime`` accepts. It is validated
            and echoed in the result only.

    Returns:
        Dict with the keys ``"City"``, ``"Future Date"`` (``YYYY-MM-DD``)
        and ``"Predicted PM2.5"`` (float rounded to two decimals).

    Raises:
        ValueError: If the city has no rows, or too little history to build
            at least one LSTM training window.

    NOTE(review): ``future_date`` never influences the features, so the
    returned estimate is the models' output for the most recent feature
    row(s) regardless of the date requested. A real multi-day forecast
    would need the features rolled forward to that date.
    """
    SEQ_LENGTH = 60

    # Validate the cheap inputs before spending time on training.
    future_date = pd.to_datetime(future_date)

    df_city = df[df['City'] == city].copy()
    if df_city.empty:
        raise ValueError(f"No data found for city {city!r}")
    df_city = df_city.groupby('date')[['PM2.5']].mean().reset_index()
    df_city = create_features(df_city)
    if len(df_city) <= SEQ_LENGTH:
        raise ValueError(
            f"Not enough history for city {city!r}: {len(df_city)} usable "
            f"days after feature engineering, need more than {SEQ_LENGTH}"
        )

    # Chronological split (no shuffling) so the test set is strictly later.
    X = df_city.drop(columns=['date', 'PM2.5'])
    y = df_city['PM2.5']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    if len(X_train) <= SEQ_LENGTH:
        raise ValueError(
            f"Not enough history for city {city!r}: {len(X_train)} training "
            f"days, need more than {SEQ_LENGTH} to build an LSTM window"
        )

    # Scalers are fitted on the training split only.
    scaler_x = MinMaxScaler()
    scaler_y = MinMaxScaler()
    X_train_scaled = scaler_x.fit_transform(X_train)
    X_test_scaled = scaler_x.transform(X_test)
    y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1))

    # XGBoost works row-wise: features of a row -> PM2.5 of that row.
    xgb_model = xgb.XGBRegressor(n_estimators=500, max_depth=10, learning_rate=0.05, colsample_bytree=0.8, subsample=0.8, random_state=42)
    xgb_model.fit(X_train_scaled, y_train)

    # LSTM windows. Only the windows from create_lstm_sequences are used:
    # its targets come from feature column 0 (lag_1 in scaler_x units), which
    # does not match the scaler_y inverse transform applied to predictions.
    # Each window is paired with the scaler_y-scaled PM2.5 of its final row,
    # so the LSTM and XGBoost estimate the same day.
    X_lstm, _ = create_lstm_sequences(X_train_scaled, SEQ_LENGTH)
    first_target = SEQ_LENGTH - 1
    y_lstm = y_train_scaled[first_target:first_target + len(X_lstm), 0]
    X_lstm_test, _ = create_lstm_sequences(X_test_scaled, SEQ_LENGTH)

    lstm_model = Sequential([
        LSTM(50, activation='relu', input_shape=(SEQ_LENGTH, X_lstm.shape[2])),
        Dense(1)
    ])
    lstm_model.compile(optimizer='adam', loss='mse')
    lstm_model.fit(X_lstm, y_lstm, epochs=50, batch_size=16, verbose=1)

    # Estimate from the most recent feature rows: XGBoost sees the last row,
    # the LSTM sees the last SEQ_LENGTH rows as a single window.
    latest_scaled = scaler_x.transform(X.iloc[-SEQ_LENGTH:])
    xgb_future_pred = xgb_model.predict(latest_scaled[-1:])[0]
    lstm_scaled_pred = lstm_model.predict(latest_scaled[np.newaxis, ...])[0][0]
    lstm_future_pred = scaler_y.inverse_transform(np.array([[lstm_scaled_pred]]))[0][0]

    # Combine predictions (hybrid approach): plain average in PM2.5 units.
    hybrid_future_pred = (xgb_future_pred + lstm_future_pred) / 2

    # Visualization
    plt.figure(figsize=(12, 6))
    plt.plot(y_test.values, label='Actual PM2.5', color='black')
    plt.plot(xgb_model.predict(X_test_scaled), label='XGBoost Predictions', linestyle='dashed', color='blue')
    if len(X_lstm_test):
        lstm_test_pred = scaler_y.inverse_transform(lstm_model.predict(X_lstm_test).reshape(-1, 1))
        # Window k ends at test row k + SEQ_LENGTH - 1, so offset the curve
        # to line it up with the actual values.
        lstm_days = np.arange(first_target, first_target + len(lstm_test_pred))
        plt.plot(lstm_days, lstm_test_pred, label='LSTM Predictions', linestyle='dashed', color='red')
    plt.legend()
    plt.title(f'PM2.5 Predictions vs Actual for {city}')
    plt.xlabel('Days')
    plt.ylabel('PM2.5')
    plt.grid()
    plt.show()

    return {
        "City": city,
        "Future Date": future_date.strftime('%Y-%m-%d'),
        # Plain float so the result prints and serialises cleanly.
        "Predicted PM2.5": round(float(hybrid_future_pred), 2)
    }

# Example usage
if __name__ == "__main__":
    # Strip stray whitespace: the city is matched exactly against the data,
    # so a trailing space typed at the prompt would otherwise find no rows.
    city = input("Enter city name: ").strip()
    future_date = input("Enter future date (YYYY-MM-DD): ").strip()
    result = train_and_predict_hybrid(city, future_date)
    print(result)
