# In [None]:
import numpy as np
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.fft import fft
from scipy.stats import kurtosis, skew
from scipy.ndimage import gaussian_filter1d


def preprocess_data(file_path, appliance_type):
    """Load one appliance CSV and return model-ready, min-max scaled features.

    The CSV must provide a ``time`` column and a ``kWh.mean_value`` column.
    Timestamps are parsed, stripped of any timezone and used as the index.
    Gaps in the energy column are interpolated, then forward/backward filled.
    Five calendar features are appended in this order: ``day_of_week``,
    ``hour``, ``is_weekend`` (day_of_week >= 5), ``month`` and ``season``
    (1 = Dec-Feb, 2 = Mar-May, 3 = Jun-Aug, 4 = Sep-Nov).

    Args:
        file_path: Path of the CSV file to read.
        appliance_type: Accepted for interface compatibility; not used here.

    Returns:
        A dict with keys ``'data'`` (the processed DataFrame),
        ``'energy_scaler'`` (MinMaxScaler fitted on the energy column) and
        ``'time_scaler'`` (MinMaxScaler fitted on the five calendar features).
    """
    energy_col = 'kWh.mean_value'
    time_features = ['day_of_week', 'hour', 'is_weekend', 'month', 'season']

    frame = pd.read_csv(file_path)
    frame['time'] = pd.to_datetime(frame['time']).dt.tz_localize(None)
    frame = frame.set_index('time')

    # Interpolate interior gaps first; ffill/bfill then cover leading and
    # trailing gaps that interpolation leaves behind.
    interpolated = frame[energy_col].interpolate()
    frame[energy_col] = interpolated.ffill().bfill()

    # Calendar features derived from the timestamp index. The insertion order
    # fixes the column positions seen by the model.
    stamps = frame.index
    frame['day_of_week'] = stamps.dayofweek
    frame['hour'] = stamps.hour
    frame['is_weekend'] = (frame['day_of_week'] >= 5).astype(int)
    frame['month'] = stamps.month
    frame['season'] = (frame['month'] % 12 + 3) // 3

    energy_scaler = MinMaxScaler()
    scaled_energy = energy_scaler.fit_transform(frame[[energy_col]])
    frame[energy_col] = scaled_energy.astype(np.float32)

    time_scaler = MinMaxScaler()
    scaled_time = time_scaler.fit_transform(frame[time_features])
    frame[time_features] = scaled_time.astype(np.float32)

    return dict(data=frame, energy_scaler=energy_scaler, time_scaler=time_scaler)




# Seed NumPy's and TensorFlow's global random generators with the same value.
_SEED = 48
np.random.seed(_SEED)
tf.random.set_seed(_SEED)



def prepare_sequences(data, seq_len=6):
    """Slice *data* into overlapping windows of ``seq_len`` consecutive rows.

    Window ``i`` holds rows ``i .. i + seq_len - 1``; consecutive windows are
    shifted by one row.

    Args:
        data: Row-oriented numeric data (DataFrame, Series or array-like).
        seq_len: Number of consecutive rows per window. Must be positive.

    Returns:
        A new float32 array of shape ``(n_windows, seq_len, *row_shape)``
        where ``n_windows = len(data) - seq_len + 1``. When *data* has fewer
        than ``seq_len`` rows the result has zero windows but keeps the same
        trailing shape, so downstream shape logic does not break.

    Raises:
        ValueError: If ``seq_len`` is not positive.
    """
    if seq_len <= 0:
        raise ValueError(f"seq_len must be positive, got {seq_len}")

    # Convert once instead of slicing the pandas object for every window.
    values = np.asarray(data, dtype=np.float32)
    if values.shape[0] < seq_len:
        return np.empty((0, seq_len) + values.shape[1:], dtype=np.float32)

    # sliding_window_view appends the window axis last; move it next to the
    # batch axis so each window reads (time step, feature).
    windows = np.lib.stride_tricks.sliding_window_view(values, seq_len, axis=0)
    # The view is read-only and aliases `values`; hand back an owned copy.
    return np.moveaxis(windows, -1, 1).copy()

class MultiScaleAttention(tf.keras.layers.Layer):
    """Multi-head self-attention block with a residual (skip) connection.

    The layer attends from ``inputs`` to ``inputs`` and adds the attention
    output back onto ``inputs``.

    Args:
        hidden_dim: Size of each attention head's query/key projection
            (forwarded as ``key_dim``).
        num_heads: Number of attention heads. Defaults to 4.
        **kwargs: Standard Keras ``Layer`` arguments such as ``name``.
    """

    def __init__(self, hidden_dim, num_heads=4, **kwargs):
        super().__init__(**kwargs)
        self.hidden_dim = hidden_dim
        self.num_heads = num_heads
        self.attention = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=hidden_dim
        )

    def call(self, inputs):
        """Return ``inputs`` plus self-attention computed over ``inputs``."""
        attention_output = self.attention(inputs, inputs)
        return inputs + attention_output

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({'hidden_dim': self.hidden_dim, 'num_heads': self.num_heads})
        return config

class TimeGAN(tf.keras.Model):
    """GAN-style model that synthesises windows of appliance energy readings.

    Bundles five sub-networks (generator, discriminator, embedder, recovery
    and a per-time-step "peak" scorer) and a custom ``train_step`` that
    combines adversarial, embedding, reconstruction and appliance-specific
    losses. ``compile(optimizer=...)`` must be called before ``train_step``.

    Args:
        seq_len: Number of time steps per window.
        n_features: Number of features per time step.
        hidden_dim: Units per LSTM direction and width of the dense layers.
        appliance_type: Chooses the appliance-specific loss mix. Recognised
            values are ``'PC'``, ``'water'``, ``'microwave'``, ``'coffee'``
            and ``'printer'``; any other value uses the default mix.
    """

    # Feature positions inside a window. NOTE(review): this assumes the energy
    # reading is column 0, followed by day_of_week and hour, which holds when
    # the source CSV contains only `time` and `kWh.mean_value` -- confirm.
    ENERGY_IDX = 0
    DAY_OF_WEEK_IDX = 1
    HOUR_IDX = 2

    def __init__(self, seq_len, n_features, hidden_dim, appliance_type):
        super().__init__()
        self.seq_len = seq_len
        self.n_features = n_features
        self.hidden_dim = hidden_dim
        self.appliance_type = appliance_type

        self.generator = self.build_generator()
        self.discriminator = self.build_discriminator()
        self.embedder = self.build_embedder()
        self.recovery = self.build_recovery()
        self.peak_generator = self.build_peak_generator()

        # Multiplier on the sparsity penalty; adapted after each PC step.
        self.sparsity_weight = tf.Variable(1.0, trainable=False)

    def build_generator(self):
        """Noise window -> synthetic window (BiLSTM layers with attention)."""
        inputs = tf.keras.Input(shape=(self.seq_len, self.n_features))
        x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(self.hidden_dim, return_sequences=True))(inputs)
        x = MultiScaleAttention(self.hidden_dim)(x)
        x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(self.hidden_dim, return_sequences=True))(x)
        x = MultiScaleAttention(self.hidden_dim)(x)
        x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(self.hidden_dim, return_sequences=True))(x)
        outputs = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(self.n_features))(x)
        return tf.keras.Model(inputs=inputs, outputs=outputs)

    def build_discriminator(self):
        """Window -> single sigmoid probability that the window is real."""
        inputs = tf.keras.Input(shape=(self.seq_len, self.n_features))
        x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(self.hidden_dim, return_sequences=True))(inputs)
        x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(self.hidden_dim))(x)
        outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
        return tf.keras.Model(inputs=inputs, outputs=outputs)

    def build_embedder(self):
        """Window -> one ``hidden_dim``-sized embedding vector."""
        return tf.keras.Sequential([
            # Declare the input on the Sequential itself; an `input_shape`
            # passed to an LSTM wrapped in Bidirectional has no effect.
            tf.keras.Input(shape=(self.seq_len, self.n_features)),
            tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(self.hidden_dim, return_sequences=True)),
            tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(self.hidden_dim, return_sequences=False)),
            tf.keras.layers.Dense(self.hidden_dim)
        ])

    def build_recovery(self):
        """Embedding vector -> reconstructed window of ``seq_len`` steps."""
        return tf.keras.Sequential([
            tf.keras.layers.RepeatVector(self.seq_len),
            tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(self.hidden_dim, return_sequences=True)),
            tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(self.n_features))
        ])

    def build_peak_generator(self):
        """Window -> one sigmoid score per time step."""
        inputs = tf.keras.Input(shape=(self.seq_len, self.n_features))
        x = tf.keras.layers.Dense(self.hidden_dim, activation='relu')(inputs)
        x = tf.keras.layers.Dense(1, activation='sigmoid')(x)
        return tf.keras.Model(inputs=inputs, outputs=x)

    def _appliance_terms(self, real_data, fake_data, pattern_loss, peak_loss):
        """Return ``(weighted_terms, peak_loss, sparsity_loss)`` for this appliance.

        ``weighted_terms`` is the appliance-specific part of the generator
        objective. ``peak_loss`` is returned because the coffee and printer
        mixes replace the generic peak loss with their own definition.
        ``sparsity_loss`` is ``None`` for mixes without a sparsity penalty.
        """
        kind = self.appliance_type
        fake_energy = fake_data[:, :, self.ENERGY_IDX]
        real_energy = real_data[:, :, self.ENERGY_IDX]

        if kind == 'PC':
            sparsity_loss = tf.reduce_mean(tf.maximum(fake_energy - 0.2, 0))
            terms = 30 * pattern_loss + self.sparsity_weight * 80 * sparsity_loss + 75 * peak_loss
            return terms, peak_loss, sparsity_loss

        if kind == 'water':
            sparsity_loss = tf.reduce_mean(tf.maximum(fake_energy - 0.05, 0))
            terms = 45 * pattern_loss + self.sparsity_weight * sparsity_loss + 30 * peak_loss
            return terms, peak_loss, sparsity_loss

        if kind == 'microwave':
            smoothness_loss = tf.reduce_mean(tf.abs(fake_energy[:, 1:] - fake_energy[:, :-1]))
            terms = 110 * pattern_loss + 70 * smoothness_loss + 35 * peak_loss
            return terms, peak_loss, None

        if kind == 'coffee':
            # The time features arrive min-max scaled to [0, 1]. Assuming the
            # training data covers every weekday and hour, day_of_week was
            # divided by 6 and hour by 23; undo that before comparing with
            # calendar values (comparing the scaled value with 4 would mark
            # every sample as a weekday).
            day_of_week = tf.round(real_data[:, :, self.DAY_OF_WEEK_IDX] * 6.0)
            hour = tf.round(real_data[:, :, self.HOUR_IDX] * 23.0)

            weekday_mask = tf.cast(tf.less(day_of_week, 4), tf.float32)  # Monday to Thursday
            weekend_mask = 1 - weekday_mask  # Friday to Sunday
            morning_peak_mask = tf.cast(tf.logical_and(tf.greater_equal(hour, 3), tf.less(hour, 7)), tf.float32)
            afternoon_peak_mask = tf.cast(tf.logical_and(tf.greater_equal(hour, 13), tf.less(hour, 15)), tf.float32)

            # NOTE(review): the generator only sees noise, so these masks
            # (taken from the real batch) are applied to fake windows that
            # were not generated for those timestamps.
            weekday_loss = tf.reduce_mean(tf.abs(0.357 - fake_energy * weekday_mask))
            weekend_loss = tf.reduce_mean(tf.abs(0.032 - fake_energy * weekend_mask))
            peak_loss = tf.reduce_mean(tf.maximum(0.5 - fake_energy * (morning_peak_mask + afternoon_peak_mask), 0))

            terms = 30 * pattern_loss + 100 * weekday_loss + 200 * weekend_loss + 150 * peak_loss
            return terms, peak_loss, None

        if kind == 'printer':
            peak_loss = tf.reduce_mean(tf.abs(tf.reduce_max(fake_energy, axis=1) - tf.reduce_max(real_energy, axis=1)))
            variability_loss = tf.abs(tf.math.reduce_std(fake_energy) - tf.math.reduce_std(real_energy))
            terms = 30 * pattern_loss + 50 * peak_loss + 20 * variability_loss
            return terms, peak_loss, None

        # Default mix for every other appliance type.
        smoothness_loss = tf.reduce_mean(tf.abs(fake_energy[:, 1:] - fake_energy[:, :-1]))
        terms = 110 * pattern_loss + 15 * smoothness_loss + 50 * peak_loss
        return terms, peak_loss, None

    @tf.function
    def train_step(self, real_data):
        """Run one optimisation step on a batch of real windows.

        Args:
            real_data: Float batch shaped ``(batch, seq_len, n_features)``.

        Returns:
            Tuple of scalar tensors
            ``(g_loss, d_loss, e_loss, r_loss, pattern_loss, peak_loss)``.
        """
        batch_size = tf.shape(real_data)[0]
        random_noise = tf.random.normal([batch_size, self.seq_len, self.n_features], dtype=tf.float32)

        # Persistent because two separate gradients are taken from it below.
        with tf.GradientTape(persistent=True) as tape:
            fake_data = self.generator(random_noise, training=True)

            real_output = self.discriminator(real_data, training=True)
            fake_output = self.discriminator(fake_data, training=True)

            real_embedded = self.embedder(real_data, training=True)
            fake_embedded = self.embedder(fake_data, training=True)

            recovered_data = self.recovery(real_embedded, training=True)

            peak_real = self.peak_generator(real_data, training=True)
            peak_fake = self.peak_generator(fake_data, training=True)

            # The 1e-8 offsets keep the logarithms finite at 0.
            g_loss = -tf.reduce_mean(tf.math.log(fake_output + 1e-8))
            d_loss = -tf.reduce_mean(tf.math.log(real_output + 1e-8) + tf.math.log(1. - fake_output + 1e-8))
            e_loss = tf.reduce_mean(tf.square(real_embedded - fake_embedded))
            r_loss = tf.reduce_mean(tf.abs(recovered_data - real_data))

            fake_energy = fake_data[:, :, self.ENERGY_IDX]
            real_energy = real_data[:, :, self.ENERGY_IDX]
            fake_dow = fake_data[:, :, self.DAY_OF_WEEK_IDX]
            real_dow = real_data[:, :, self.DAY_OF_WEEK_IDX]

            # Statistics of the fake batch that should match the real batch.
            daily_pattern_loss = tf.reduce_mean(tf.abs(tf.reduce_mean(fake_energy, axis=0) - tf.reduce_mean(real_energy, axis=0)))
            weekly_pattern_loss = tf.reduce_mean(tf.abs(tf.reduce_mean(fake_dow, axis=1) - tf.reduce_mean(real_dow, axis=1)))
            variability_loss = tf.abs(tf.math.reduce_std(fake_energy) - tf.math.reduce_std(real_energy))
            scale_loss = tf.abs(tf.reduce_max(fake_energy) - tf.reduce_max(real_energy))
            pattern_loss = daily_pattern_loss + weekly_pattern_loss + variability_loss + scale_loss

            peak_loss = tf.reduce_mean(tf.abs(peak_real - peak_fake))

            appliance_terms, peak_loss, sparsity_loss = self._appliance_terms(
                real_data, fake_data, pattern_loss, peak_loss
            )

            # Objective for everything except the discriminator. d_loss is
            # deliberately excluded: minimising it w.r.t. the generator would
            # push the generator towards being detected as fake.
            generator_objective = g_loss + e_loss + r_loss + appliance_terms

        generator_side_vars = self.generator.trainable_variables + self.embedder.trainable_variables + \
                              self.recovery.trainable_variables + self.peak_generator.trainable_variables
        discriminator_vars = self.discriminator.trainable_variables

        # Each side only follows its own loss; the discriminator must not be
        # trained to minimise g_loss (i.e. to accept fakes).
        generator_side_grads = tape.gradient(generator_objective, generator_side_vars)
        discriminator_grads = tape.gradient(d_loss, discriminator_vars)
        del tape  # release the resources held by the persistent tape

        # clip_by_global_norm ignores None entries (e.g. the peak scorer in the
        # coffee/printer mixes, where the generic peak loss is replaced).
        generator_side_grads, _ = tf.clip_by_global_norm(generator_side_grads, 1.0)
        discriminator_grads, _ = tf.clip_by_global_norm(discriminator_grads, 1.0)

        # One apply_gradients call so the optimizer sees every variable at once.
        self.optimizer.apply_gradients(zip(
            list(generator_side_grads) + list(discriminator_grads),
            list(generator_side_vars) + list(discriminator_vars),
        ))

        if self.appliance_type == 'PC':
            # Raise the sparsity weight quickly (capped at 10) while the fake
            # energy stays above the threshold; otherwise decay it (floor 0.1).
            raised = tf.minimum(self.sparsity_weight * 5, 10.0)
            decayed = tf.maximum(self.sparsity_weight * 0.9, 0.1)
            self.sparsity_weight.assign(tf.where(sparsity_loss > 0.1, raised, decayed))

        return g_loss, d_loss, e_loss, r_loss, pattern_loss, peak_loss

def post_process_appliance(synthetic_df, appliance_type, energy_scaler, original_data):
    """Clamp synthetic readings at zero and apply the appliance-specific tweak.

    The ``kWh.mean_value`` column of *synthetic_df* is modified in place.

    Args:
        synthetic_df: DataFrame with a ``kWh.mean_value`` column; for
            ``'coffee'`` its index is forwarded as the timestamps.
        appliance_type: ``'printer'``, ``'water'``, ``'coffee'`` or
            ``'microwave'`` select a dedicated post-processor; any other
            value only gets the zero floor.
        energy_scaler: Accepted for interface compatibility; not used here.
        original_data: Accepted for interface compatibility; not used here.

    Returns:
        The same *synthetic_df* object that was passed in.
    """
    column = 'kWh.mean_value'

    # Floor negative values at zero before any appliance-specific step.
    synthetic_df[column] = np.maximum(synthetic_df[column], 0)

    if appliance_type not in ('printer', 'water', 'coffee', 'microwave'):
        return synthetic_df

    readings = synthetic_df[column].values
    if appliance_type == 'printer':
        adjusted = custom_printer_post_process(readings)
    elif appliance_type == 'water':
        adjusted = custom_water_post_process(readings)
    elif appliance_type == 'coffee':
        adjusted = custom_coffee_post_process(readings, synthetic_df.index)
    else:
        adjusted = custom_microwave_post_process(readings)

    synthetic_df[column] = adjusted
    return synthetic_df







def custom_microwave_post_process(data):
    """Reshape a microwave trace: damp most samples and inject random spikes.

    Works on a float32 copy of *data*, so the caller's array is untouched.
    Randomness comes from NumPy's global generator.

    Args:
        data: NumPy array of readings.

    Returns:
        A float32 array of the same shape, clipped to ``[0, 1]``.
    """
    values = data.astype(np.float32)

    # Bottom decile: scale each sample by its own Beta(0.6, 0.95) draw.
    low_cut = np.percentile(values, 10)
    is_low = values < low_cut
    beta_factors = np.random.beta(0.6, 0.95, size=is_low.sum())
    values[is_low] *= beta_factors.astype(np.float32)

    # Samples above the 95th percentile (measured after the step above) keep
    # 95 % of their value; everything else is cut to 45 %.
    high_cut = np.percentile(values, 95)
    is_peak = values > high_cut
    values[is_peak] *= 0.95
    values[~is_peak] *= 0.45

    # Overwrite roughly 7 % of the samples with a spike drawn from [0.65, 0.95).
    is_spike = np.random.random(values.shape) < 0.07
    spike_heights = np.random.uniform(0.65, 0.95, size=is_spike.sum())
    values[is_spike] = spike_heights.astype(np.float32)

    return np.clip(values, 0, 1)




def custom_water_post_process(data):
    """Floor readings at zero and jitter each by a random factor in [0.6, 1.2).

    Args:
        data: NumPy array of readings.

    Returns:
        A new array of the same shape; the factors come from NumPy's global
        random generator.
    """
    floored = np.clip(data, 0, None)  # no upper bound, only remove negatives
    jitter = np.random.uniform(0.6, 1.2, size=floored.shape)
    return floored * jitter

def custom_printer_post_process(data):
    """Build a spiky printer-style trace on top of the input's minimum value.

    Only the minimum of *data* is used: the output starts as a flat line at
    that baseline, receives random spikes and noise from NumPy's global
    random generator, and is finally smoothed with a narrow Gaussian.

    Args:
        data: Array-like of readings. Non-float input is converted to
            float64 so the fractional spikes and noise are not truncated.

    Returns:
        A new float array with the same shape as *data* (empty input yields
        an empty array).
    """
    values = np.asarray(data)
    if not np.issubdtype(values.dtype, np.floating):
        # np.full_like would otherwise inherit an integer dtype and the
        # in-place float additions below would fail.
        values = values.astype(np.float64)
    if values.size == 0:
        return values.copy()

    baseline = np.min(values)
    trace = np.full_like(values, baseline)

    # Small spikes (0.01-0.05) on about 5 % of the samples.
    small_spike_prob = 0.05
    small_spike_mask = np.random.random(trace.shape) < small_spike_prob
    trace[small_spike_mask] = np.random.uniform(0.01, 0.05, size=small_spike_mask.sum())

    # Large spikes (0.5-1.05) on about 25 % of the samples; where both masks
    # hit, the large spike overwrites the small one.
    large_spike_prob = 0.25
    large_spike_mask = np.random.random(trace.shape) < large_spike_prob
    trace[large_spike_mask] = np.random.uniform(0.5, 1.05, size=large_spike_mask.sum())

    # Lift every remaining sample above the baseline by 0.15-0.25.
    non_spike_mask = ~(small_spike_mask | large_spike_mask)
    trace[non_spike_mask] += np.random.uniform(0.15, 0.25, size=non_spike_mask.sum())

    # Light smoothing so spikes bleed slightly into neighbouring samples.
    return gaussian_filter1d(trace, sigma=0.6)



def custom_coffee_post_process(data, dates):
    """Replace a coffee-machine trace with a weekday/weekend usage pattern.

    Every sample is redrawn from a normal distribution chosen by its day of
    week, then scaled by time-of-day factors and clipped to ``[0, 1]``. The
    incoming values themselves are not used, only their shape and dtype.
    Randomness comes from NumPy's global random generator.

    Args:
        data: Array-like of readings, one per entry of *dates*. It is copied,
            so the caller's array is never modified (and read-only arrays
            are accepted).
        dates: ``pandas.DatetimeIndex`` aligned with *data*.

    Returns:
        A new float array with the same shape as *data*.
    """
    values = np.array(data, copy=True)
    if not np.issubdtype(values.dtype, np.floating):
        values = values.astype(np.float64)

    weekday_mask = np.asarray(dates.dayofweek < 4)  # Monday to Thursday
    weekend_mask = ~weekday_mask  # Friday to Sunday

    # Fractional hour of day, e.g. 13.5 for 13:30.
    hour = dates.hour + dates.minute / 60

    # Base level: around 0.357 on weekdays, essentially zero otherwise.
    values[weekday_mask] = np.random.normal(0.357, 0.1, size=weekday_mask.sum())
    values[weekend_mask] = np.random.normal(0.0, 0.01, size=weekend_mask.sum())

    # Morning peak (03:00 to 07:00).
    morning_peak = np.asarray((hour >= 3) & (hour < 7))
    values[morning_peak] *= np.random.uniform(1.2, 2.2, size=morning_peak.sum())

    # Afternoon peak (13:00 to 15:00).
    afternoon_peak = np.asarray((hour >= 13) & (hour < 15))
    values[afternoon_peak] *= np.random.uniform(1.2, 2.2, size=afternoon_peak.sum())

    # Damping windows 04:00-06:00 and 17:00-19:00. The first one lies inside
    # the morning peak, so those samples receive both factors.
    low_usage = np.asarray(((hour >= 4) & (hour < 6)) | ((hour >= 17) & (hour < 19)))
    values[low_usage] *= np.random.uniform(0.5, 0.8, size=low_usage.sum())

    # Negative draws become zero; peaks are capped at 1.
    return np.clip(values, 0, 1)


    
# Number of training epochs for each appliance category.
epoch_settings = dict(
    type1=300,
    PC=300,
    microwave=300,
    water=300,
    printer=300,
    coffee=300,
)


def save_to_csv(df, filename):
    """Write *df* to *filename* with timestamps rendered as ``...+04:00`` strings.

    The index is formatted as ``YYYY-MM-DD HH:MM:SS+04:00``; the offset is a
    literal suffix and no timezone conversion takes place. The caller's
    DataFrame is left untouched.

    Args:
        df: DataFrame with a datetime-like index (must support ``strftime``).
        filename: Destination path passed to ``DataFrame.to_csv``.
    """
    stamped_index = df.index.strftime('%Y-%m-%d %H:%M:%S+04:00')
    # set_axis returns a new object, so df keeps its original datetime index.
    df.set_axis(stamped_index, axis=0).to_csv(filename)



if __name__ == "__main__":
    # Appliance category -> CSV files trained with that category's loss mix.
    appliances = {
            'type1': ['Fridge-Energy_Filled.csv', 'Kettle-Energy_Filled.csv'],
            'PC': ['PC1-Energy_Filled.csv'],
            'microwave': ['Microwave-Energy_Filled.csv'],
            'printer': ['Printer-Energy_Filled.csv'],
            'water':['Water-Energy_Filled.csv'],
            'coffee':['Coffee-Energy_Filled.csv']
        }

    # Hyper-parameters shared by every appliance. seq_len is defined once so
    # that window preparation, the model and sampling cannot drift apart.
    seq_len = 6
    hidden_dim = 256
    learning_rate = 0.0001
    batch_size = 128

    for appliance_type, file_list in appliances.items():
        for file_name in file_list:
            file_path = f"/kaggle/input/filled/{file_name}"
            appliance_name = file_name.split('-')[0]

            preprocessed = preprocess_data(file_path, appliance_type)
            data = preprocessed['data']
            energy_scaler = preprocessed['energy_scaler']
            time_scaler = preprocessed['time_scaler']

            X = prepare_sequences(data, seq_len)
            if len(X) == 0:
                raise ValueError(f"{file_name} has fewer than {seq_len} rows; cannot build training windows.")

            n_features = data.shape[1]

            timegan = TimeGAN(seq_len, n_features, hidden_dim, appliance_type)
            timegan.compile(optimizer=tf.keras.optimizers.Adam(learning_rate))

            epochs = epoch_settings[appliance_type]

            # At least one step per epoch, even when there are fewer windows
            # than one batch (batches are sampled with replacement anyway).
            steps_per_epoch = max(1, len(X) // batch_size)

            for epoch in range(epochs):
                g_losses, d_losses, e_losses, r_losses, p_losses, peak_losses = [], [], [], [], [], []
                for step in range(steps_per_epoch):
                    idx = np.random.randint(0, len(X), batch_size)
                    batch_data = X[idx]
                    g_loss, d_loss, e_loss, r_loss, p_loss, peak_loss = timegan.train_step(batch_data)

                    # NOTE(review): train_step has already applied its update
                    # at this point; "skipping" only keeps the NaN out of the
                    # logged averages.
                    if np.isnan([g_loss, d_loss, e_loss, r_loss, p_loss, peak_loss]).any():
                        print(f"NaN detected at epoch {epoch}, step {step}. Skipping this batch.")
                        continue

                    g_losses.append(g_loss)
                    d_losses.append(d_loss)
                    e_losses.append(e_loss)
                    r_losses.append(r_loss)
                    p_losses.append(p_loss)
                    peak_losses.append(peak_loss)

                if (epoch + 1) % 50 == 0:
                    print(f'Epoch {epoch+1}, G_loss: {np.mean(g_losses):.4f}, D_loss: {np.mean(d_losses):.4f}, '
                          f'E_loss: {np.mean(e_losses):.4f}, R_loss: {np.mean(r_losses):.4f}, '
                          f'P_loss: {np.mean(p_losses):.4f}, Peak_loss: {np.mean(peak_losses):.4f}')

            # Generate synthetic data up to 31-12-2024. The sample count keeps
            # roughly the original sampling density over the new period, but
            # never drops below the number of original rows.
            end_date = pd.Timestamp('2024-12-31 23:59:59')
            original_duration = data.index[-1] - data.index[0]
            synthetic_duration = end_date - data.index[-1]
            n_samples = max(len(data), int(len(data) * (synthetic_duration / original_duration)))

            random_noise = tf.random.normal([n_samples, seq_len, n_features], dtype=tf.float32)
            synthetic_data = timegan.generator(random_noise, training=False).numpy()

            # Evenly spaced timestamps from just after the last real sample.
            synthetic_dates = pd.date_range(start=data.index[-1] + pd.Timedelta(seconds=1),
                                            end=end_date,
                                            periods=n_samples)

            # Only the first time step of every generated window is kept.
            synthetic_df = pd.DataFrame(synthetic_data[:, 0, :],
                                        columns=data.columns,
                                        index=synthetic_dates)

            # Inverse transform the data
            synthetic_df['kWh.mean_value'] = energy_scaler.inverse_transform(synthetic_df[['kWh.mean_value']])
            synthetic_df[['day_of_week', 'hour', 'is_weekend', 'month', 'season']] = time_scaler.inverse_transform(synthetic_df[['day_of_week', 'hour', 'is_weekend', 'month', 'season']])

            # Appliance-specific post-processing
            original_data = pd.DataFrame(energy_scaler.inverse_transform(data[['kWh.mean_value']]),
                                         columns=['kWh.mean_value'],
                                         index=data.index)
            synthetic_df = post_process_appliance(synthetic_df, appliance_type, energy_scaler, original_data)

            # Combine original and synthetic data
            combined_df = pd.concat([original_data, synthetic_df[['kWh.mean_value']]])

            # Plot combined data
            plt.figure(figsize=(20, 8))
            plt.plot(combined_df.index, combined_df['kWh.mean_value'], label='Combined Data', alpha=0.7)
            plt.axvline(x=data.index[-1], color='r', linestyle='--', label='Original Data End')
            plt.title(f'Combined Original and Synthetic Energy Consumption Data - {appliance_name}')
            plt.xlabel('Date')
            plt.ylabel('kWh')
            plt.legend()
            plt.tight_layout()
            plt.savefig(f'{appliance_name}_combined_data.png')
            plt.show()
            plt.close()  # release the figure; several are created per run

            # Save the combined (original + synthetic) series.
            output_file = f'{appliance_name}_augmented_data.csv'
            save_to_csv(combined_df, output_file)
            print(f"Combined data saved to {output_file}")

            # Original vs. Synthetic Plot: the first len(original_data)
            # synthetic samples are drawn at their own (later) timestamps.
            plt.figure(figsize=(14, 6))
            plt.plot(original_data.index, original_data['kWh.mean_value'], label='Original Data')
            plt.plot(synthetic_df.index[:len(original_data)], synthetic_df['kWh.mean_value'][:len(original_data)], label='Synthetic Data', linestyle='--')
            plt.xlabel('Date')
            plt.ylabel('kWh')
            plt.title(f'Original vs. Synthetic Energy Consumption - {appliance_name}')
            plt.legend()
            plt.savefig(f'{appliance_name}_original_vs_synthetic.png')
            plt.show()
            plt.close()

            print(f"Completed processing for {appliance_name}\n")

    print("All appliances processed successfully.")