In [None]:

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from boruta import BorutaPy
import matplotlib.pyplot as plt
from datetime import timedelta
import json
import os
import matplotlib.dates as mdates
import random

# SMAPE calculation function
def smape(yTrue, yPred):
    """Return the symmetric mean absolute percentage error on a 0-200 scale.

    Each term is ``200 * |yPred - yTrue| / (|yTrue| + |yPred|)`` and the
    result is the mean over all terms.

    Args:
        yTrue: Observed values (array-like).
        yPred: Predicted values (array-like, broadcastable against ``yTrue``).

    Returns:
        The mean SMAPE as a NumPy float.
    """
    yTrue = np.asarray(yTrue, dtype=float)
    yPred = np.asarray(yPred, dtype=float)

    abs_error = np.abs(yPred - yTrue)
    denominator = np.abs(yTrue) + np.abs(yPred)

    # A zero denominator means both values are zero, i.e. a perfect
    # prediction. Count it as 0 error instead of letting 0/0 produce NaN and
    # poison the mean.
    ratio = np.divide(
        abs_error,
        denominator,
        out=np.zeros_like(abs_error),
        where=denominator != 0,
    )
    return np.mean(200 * ratio)

# Exponential Smoothing
def exponential_smoothing(series, alpha):
    """Apply simple exponential smoothing to a sequence.

    The first output equals the first observation; every later output is
    ``alpha * series[n] + (1 - alpha) * previous_output``.

    Args:
        series: Indexable sequence of numbers (e.g. a 1-D NumPy array).
        alpha: Smoothing factor applied to the current observation.

    Returns:
        A NumPy array with one smoothed value per input value. An empty
        input yields an empty array.
    """
    # Nothing to smooth; avoid the IndexError that series[0] would raise.
    if len(series) == 0:
        return np.array([])

    result = [series[0]]
    for n in range(1, len(series)):
        result.append(alpha * series[n] + (1 - alpha) * result[n - 1])
    return np.array(result)

# Double Exponential Smoothing
def double_exponential_smoothing(series, alpha, beta):
    """Apply double (level + trend) exponential smoothing to a sequence.

    The level starts at the first observation and the trend at the difference
    between the first two observations. The first output is the first
    observation itself; each later output is the updated ``level + trend``.

    Args:
        series: Indexable sequence of numbers (e.g. a 1-D NumPy array).
        alpha: Smoothing factor for the level.
        beta: Smoothing factor for the trend.

    Returns:
        A NumPy array with one value per input value. A series with fewer
        than two points has no trend to estimate and is returned as a copy.
    """
    # The initial trend needs two observations; with fewer, series[1] (or
    # series[0]) would raise IndexError, so hand the input back unchanged.
    if len(series) < 2:
        return np.array(series)

    result = [series[0]]
    level, trend = series[0], series[1] - series[0]
    for n in range(1, len(series)):
        value = series[n]
        last_level, level = level, alpha * value + (1 - alpha) * (level + trend)
        trend = beta * (level - last_level) + (1 - beta) * trend
        result.append(level + trend)
    return np.array(result)

# Prepare multivariate data for LSTM input
def prepare_multivariate_data(data, n_input, n_features):
    """Slice a 2-D array into sliding windows and next-step targets.

    Window ``k`` holds rows ``k`` to ``k + n_input - 1`` (all columns); its
    target is the last column of row ``k + n_input``.

    Args:
        data: 2-D array indexed as ``data[row, column]``.
        n_input: Number of rows per window.
        n_features: Accepted for interface compatibility; not used here.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays built from the windows and targets.
    """
    window_count = len(data) - n_input
    windows = [data[start:start + n_input, :] for start in range(window_count)]
    targets = [data[start + n_input, -1] for start in range(window_count)]
    return np.array(windows), np.array(targets)

# Build the LSTM Model with Monte Carlo Dropout
def build_mc_dropout_model(n_input, n_features, layer, unit, dropout_rate, activation='relu', optimizer='adam'):
    """Build and compile a stacked LSTM regressor with Dropout after each LSTM.

    The Dropout layers are ordinary Keras ``Dropout`` layers; they only stay
    active at prediction time when the model is called with ``training=True``.

    Args:
        n_input: Number of time steps per input window.
        n_features: Number of features per time step.
        layer: Number of LSTM layers requested; at least one is always added.
        unit: Sequence of LSTM widths. Layers beyond its length reuse the
            last entry.
        dropout_rate: Rate passed to every ``Dropout`` layer.
        activation: Activation passed to every LSTM layer.
        optimizer: Optimizer passed to ``compile``.

    Returns:
        The compiled ``Sequential`` model (MSE loss, MAE metric).
    """
    network = Sequential()
    network.add(Input(shape=(n_input, n_features)))

    # Index 0 is always built, even when `layer` is below 1.
    for depth in [0, *range(1, layer)]:
        width = unit[0] if depth == 0 else unit[min(depth, len(unit) - 1)]
        # Every LSTM except the last must emit the full sequence so the next
        # LSTM receives 3-D input.
        emits_sequence = depth < layer - 1
        network.add(LSTM(width, activation=activation, return_sequences=emits_sequence))
        network.add(Dropout(dropout_rate))

    network.add(Dense(1))
    network.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
    return network

# Monte Carlo Dropout prediction function
def mc_dropout_predict(model, X, n_iter=100):
    """Run repeated stochastic forward passes and summarise them.

    Args:
        model: Callable invoked as ``model(X, training=True)``.
        X: Input batch forwarded unchanged to ``model``.
        n_iter: Number of forward passes to draw.

    Returns:
        Tuple ``(mean, std)`` taken across the passes (axis 0 of the stacked
        outputs).
    """
    draws = []
    for _ in range(n_iter):
        # training=True is passed so that dropout stays active during inference.
        draws.append(model(X, training=True))
    stacked = np.array(draws)
    return stacked.mean(axis=0), stacked.std(axis=0)

# Function to generate future dates
def generate_future_dates(start_date, periods):
    """Return ``periods`` monthly dates that follow ``start_date``.

    The first returned date is one calendar month after ``start_date``, so
    ``start_date`` itself is never included and a caller may prepend it
    without creating a duplicate point. Calendar-month offsets are used
    instead of fixed 30-day steps, so the dates stay aligned with a monthly
    index over long horizons.

    Args:
        start_date: Last known date (e.g. a ``pandas.Timestamp``).
        periods: Number of future dates to generate.

    Returns:
        A list of ``periods`` dates, one calendar month apart.
    """
    return [start_date + pd.DateOffset(months=step) for step in range(1, periods + 1)]

# Boruta feature selection
def boruta_feature_selection(data, target_variable, features):
    """Select up to three features that Boruta marks as supported.

    A ``BorutaPy`` selector wrapping a random forest regressor is fitted on
    ``data[features]`` against ``data[target_variable]``.

    Args:
        data: Table indexed by column name (a DataFrame in this script).
        target_variable: Name of the column to predict.
        features: List of candidate feature column names.

    Returns:
        The first three supported feature names, in the order they appear in
        ``features``; fewer (possibly an empty list) when fewer are supported.
    """
    predictors = data[features].values
    response = data[target_variable].values

    forest = RandomForestRegressor(n_estimators=100, random_state=42)
    selector = BorutaPy(forest, n_estimators='auto', random_state=42)
    selector.fit(predictors, response)

    supported = [
        name for position, name in enumerate(features)
        if selector.support_[position]
    ]

    # Slicing caps the selection at three and naturally handles 0-2 matches.
    return supported[:3]

# Function to run seed-based forecasting and plot results
def _inverse_target_column(scaler, scaled_values, n_features):
    """Map scaled target values back to the original scale via ``scaler``."""
    # The scaler expects full-width rows; the other columns are zero-filled
    # placeholders and only the last (target) column is read back.
    padded = np.zeros((len(scaled_values), n_features))
    padded[:, -1] = scaled_values
    return scaler.inverse_transform(padded)[:, -1]


def run_seed_forecasting(target_variable, selected_data, scaler, model, forecast_horizon, data, last_date, output_plot_dir, seed=None):
    """Forecast recursively with MC dropout, then plot and save the result.

    Starting from the last ``model.input_shape[1]`` rows of ``selected_data``,
    the model predicts one step at a time and each prediction is fed back as
    the target (last column) of the next input row. The mean of the Monte
    Carlo passes is the forecast and ``mean +/- 1.96 * std`` the plotted band.

    Args:
        target_variable: Column of ``data`` being forecast; also used in the
            plot title and file name.
        selected_data: 2-D array in the scale the model was trained on, with
            the target in the last column.
        scaler: Fitted scaler exposing ``inverse_transform`` for rows of
            ``model.input_shape[2]`` columns.
        model: Trained model; its ``input_shape`` supplies the window length
            and feature count.
        forecast_horizon: Number of steps to forecast.
        data: DataFrame holding the observed ``target_variable`` series.
        last_date: Date of the last observation; the forecast curve is
            anchored there.
        output_plot_dir: Directory the PNG is written to.
        seed: Optional seed applied to NumPy and TensorFlow before sampling.

    Returns:
        None. The figure is saved, shown, and closed.
    """
    if seed is not None:
        np.random.seed(seed)
        tf.random.set_seed(seed)

    n_steps, n_feats = model.input_shape[1], model.input_shape[2]

    # Private copy so the rolling window never aliases the caller's array.
    last_sequence = np.array(selected_data[-n_steps:])
    forecasts, lower_bounds, upper_bounds = [], [], []

    for _ in range(forecast_horizon):
        next_prediction_mean, next_prediction_std = mc_dropout_predict(
            model, last_sequence.reshape(1, n_steps, n_feats)
        )
        step_mean = next_prediction_mean[0, 0]
        step_std = next_prediction_std[0, 0]

        forecasts.append(step_mean)
        lower_bounds.append(step_mean - 1.96 * step_std)
        upper_bounds.append(step_mean + 1.96 * step_std)

        # Slide the window forward. Future exogenous features are unknown, so
        # the latest known values are carried forward; rolling the array
        # instead would wrap the OLDEST row's features into the newest slot.
        next_row = last_sequence[-1].copy()
        next_row[-1] = step_mean
        last_sequence = np.vstack([last_sequence[1:], next_row])

    forecasts_inv = _inverse_target_column(scaler, forecasts, n_feats)
    lower_bounds_inv = _inverse_target_column(scaler, lower_bounds, n_feats)
    upper_bounds_inv = _inverse_target_column(scaler, upper_bounds, n_feats)

    future_dates = generate_future_dates(last_date, forecast_horizon)
    future_dates = [last_date] + future_dates

    # Anchor forecast and band at the last observed value so the curves join
    # the historical line without a gap.
    last_observed = data[target_variable].iloc[-1]
    seamless_forecast = np.insert(forecasts_inv, 0, last_observed)
    lower_bounds_inv = np.insert(lower_bounds_inv, 0, last_observed)
    upper_bounds_inv = np.insert(upper_bounds_inv, 0, last_observed)

    plt.figure(figsize=(15, 8))
    plt.plot(data.index, data[target_variable], label='Data', color='blue', linestyle='-')
    plt.plot(future_dates, seamless_forecast, label=f'Prediction (Seed {seed})' if seed is not None else 'Prediction', color='red', linestyle='--')
    plt.fill_between(future_dates, lower_bounds_inv, upper_bounds_inv, color='green', alpha=0.2, label='95% Confidence')

    plt.title(f'{target_variable} - (M) - Seed {seed}' if seed is not None else f'{target_variable} - (M)')
    plt.xlabel('Year')
    plt.ylabel('Incident Count')
    plt.legend()

    # One tick per year on the date axis.
    plt.gca().xaxis.set_major_locator(mdates.YearLocator())
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
    plt.gcf().autofmt_xdate()

    plot_name = f'{target_variable}_forecast_seed_{seed}.png' if seed is not None else f'{target_variable}_forecast.png'
    plot_path = os.path.join(output_plot_dir, plot_name)
    plt.savefig(plot_path)
    plt.show()
    plt.close()

    print(f"Forecast plot for {target_variable} with seed {seed} saved to {plot_path}.")

# Main execution
if __name__ == "__main__":
    # Load the dataset and index it by its 'Date' column, parsed as
    # abbreviated month name plus two-digit year ('%b-%y').
    data = pd.read_csv('FinalDataset.csv')
    data['Date'] = pd.to_datetime(data['Date'], format='%b-%y')
    data.set_index('Date', inplace=True)

    # Target columns: one forecast is produced per entry.
    attacks = ['Phishing-ALL', 'Ransomware-ALL', 'Password Attack-ALL', 'SQL Injection-ALL',
        'Account Hijacking-ALL', 'Defacement-ALL', 'Trojan-ALL', 'Vulnerability-ALL', 'Zero-day-ALL',
        'Malware-ALL', 'Advanced persistent threat-ALL', 'XSS-ALL', 'Data Breach-ALL',
        'Disinformation/Misinformation-ALL', 'Targeted Attack-ALL', 'Adware-ALL', 'Brute Force Attack-ALL',
        'Malvertising-ALL', 'Backdoor-ALL', 'Botnet-ALL', 'Cryptojacking-ALL', 'Worms-ALL', 'Spyware-ALL', 'DDoS-ALL',]

    # Candidate predictor columns offered to Boruta.
    features = [
        # Economic Data
        'GDP-ARE', 'GDP-AUS', 'GDP-AUT', 'GDP-BRA', 'GDP-CAN', 'GDP-CHE', 'GDP-CHN', 'GDP-DEU', 'GDP-EGY', 'GDP-ESP',
        'GDP-FIN', 'GDP-FRA', 'GDP-GBR', 'GDP-IND', 'GDP-IRL', 'GDP-IRN', 'GDP-ISR', 'GDP-ITA', 'GDP-JPN', 'GDP-KOR',
        'GDP-MEX', 'GDP-MYS', 'GDP-NLD', 'GDP-NOR', 'GDP-PAK', 'GDP-PRT', 'GDP-PSE', 'GDP-RUS', 'GDP-SAU', 'GDP-SWE',
        'GDP-TUR', 'GDP-UKR', 'GDP-USA',

        # Social Media and Internet Data
        'Internet Users (Millions)', 'Facebook Users (M)', 'Instagram Users (M)', 'Twitter Users (M)', 'LinkedIn Users (M)', 'Email Users (M)',

        # Holidays Data
        'Holidays',

        # Mentions
        'Mentions-DDoS', 'Mentions-Phishing', 'Mentions-Ransomware', 'Mentions-Password Attack', 'Mentions-SQL Injection',
        'Mentions-Account Hijacking', 'Mentions-Defacement', 'Mentions-Trojan', 'Mentions-Vulnerability', 'Mentions-Zero-day',
        'Mentions-Advanced persistent threat', 'Mentions-XSS', 'Mentions-Malware', 'Mentions-Data Breach', 'Mentions-Disinformation/Misinformation',
        'Mentions-Targeted Attack', 'Mentions-Adware', 'Mentions-Brute Force Attack', 'Mentions-Malvertising', 'Mentions-Backdoor',
        'Mentions-Botnet', 'Mentions-Cryptojacking', 'Mentions-Worms', 'Mentions-Spyware', 'Mentions-MITM', 'Mentions-DNS Spoofing',
        'Mentions-Pegasus Spyware', 'Mentions-CoolWebSearch Spyware', 'Mentions-Gator GAIN Spyware', 'Mentions-180search Assistant Spyware',
        'Mentions-Transponder vx2 Spyware', 'Mentions-WannaCry Ransomware', 'Mentions-Colonial Pipeline Ransomware', 'Mentions-Cryptolocker',
        'Mentions-Dropper', 'Mentions-Wiper', 'Mentions-Pharming', 'Mentions-Insider Threat', 'Mentions-Drive-by', 'Mentions-Rootkit',
        'Mentions-Adversarial Attack', 'Mentions-Data Poisoning', 'Mentions-Deepfake', 'Mentions-Deeplocker', 'Mentions-Supply Chain',
        'Mentions-IoT Device Attack', 'Mentions-Keylogger', 'Mentions-DNS Tunneling', 'Mentions-Session Hijacking', 'Mentions-URL manipulation',
        'Mentions-Unknown'

    ]

    output_plot_dir = 'Forecast_plot_Multivariate'
    os.makedirs(output_plot_dir, exist_ok=True)

    for target_variable in attacks:
        print(f"\nProcessing {target_variable}")

        # Perform Boruta feature selection
        selected_features = boruta_feature_selection(data, target_variable, features)

        if not selected_features:
            print(f"No significant features found for {target_variable}. Skipping this target.")
            continue

        print(f"Selected features for {target_variable}: {selected_features}")

        # The target goes last: downstream code reads and writes it as
        # column -1.
        selected_features = selected_features + [target_variable]

        # Take an explicit, writable float copy. With `.values`, a selection
        # made only of integer columns yields an integer array, and the
        # smoothed values written into the last column below would be
        # silently truncated (the array may also be read-only under pandas
        # copy-on-write).
        selected_data = data[selected_features].to_numpy(dtype=float, copy=True)

        # Load best parameters
        with open(f'best_params_files_25M/{target_variable}_best_params.json', 'r') as f:
            best_params = json.load(f)['Best Parameters']

        # Smooth the target column in place; any other 'smoothing_method'
        # value leaves it unchanged.
        if best_params['smoothing_method'] == 'exponential':
            selected_data[:, -1] = exponential_smoothing(selected_data[:, -1], best_params['alpha'])
        elif best_params['smoothing_method'] == 'double_exponential':
            selected_data[:, -1] = double_exponential_smoothing(selected_data[:, -1], best_params['alpha'], best_params['beta'])

        # Scale the data
        scaler = MinMaxScaler()
        scaled_data = scaler.fit_transform(selected_data)

        # Prepare data for LSTM
        n_input = best_params['n_input']
        n_features = len(selected_features)
        X, y = prepare_multivariate_data(scaled_data, n_input, n_features)

        # Train on all windows except the last n_input ones.
        X_train = X[:-n_input]
        y_train = y[:-n_input]

        # Build and train the model
        model = build_mc_dropout_model(n_input, n_features, best_params['layer'], best_params['units'], best_params['dropout_rate'])
        model.fit(X_train, y_train, epochs=best_params['n_epochs'], batch_size=32, verbose=0)

        # Generate forecast for the next 3 years (36 months)
        forecast_horizon = 36
        last_date = data.index[-1]

        # Run original forecast (no seed)
        run_seed_forecasting(target_variable, scaled_data, scaler, model, forecast_horizon, data, last_date, output_plot_dir)

        # Re-run the forecast with fixed seeds; the same trained model is
        # reused, only the random state set before sampling differs.
        for seed in [1, 2, 3]:
            run_seed_forecasting(target_variable, scaled_data, scaler, model, forecast_horizon, data, last_date, output_plot_dir, seed=seed)

    print("Forecasting completed for all attack types.")