In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [41]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from scipy.stats import wasserstein_distance
from scipy.spatial.distance import euclidean
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/IOT PE 6th semester/Till 5th meet/train.csv")
df["timestamp"] = pd.to_datetime(df["timestamp"])
df["hour"] = df["timestamp"].dt.hour
df["day"] = df["timestamp"].dt.day
df["month"] = df["timestamp"].dt.month
df["year"] = df["timestamp"].dt.year

# Select features and target
features = ["building_id", "meter", "hour", "day", "month", "year"]
target = "meter_reading"

# Normalize data
scaler_x = StandardScaler()
scaler_y = StandardScaler()
x_data = scaler_x.fit_transform(df[features].values)
y_data = scaler_y.fit_transform(df[[target]].values)

# Convert to tensors
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
x_tensor = torch.FloatTensor(x_data).to(device)
y_tensor = torch.FloatTensor(y_data).to(device)

# Define Generator
class Generator(nn.Module):
    def __init__(self, input_dim, condition_dim, output_dim):
        super(Generator, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim + condition_dim, 512),
            nn.LeakyReLU(0.2),
            nn.Linear(512, 512),
            nn.BatchNorm1d(512),
            nn.LeakyReLU(0.2),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.LeakyReLU(0.2),
            nn.Linear(256, output_dim)
        )

    def forward(self, noise, condition):
        x = torch.cat([noise, condition], dim=1)
        return self.model(x)

# Define Discriminator
class Discriminator(nn.Module):
    def __init__(self, input_dim, condition_dim):
        super(Discriminator, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim + condition_dim, 512),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),
            nn.Linear(256, 1)
        )

    def forward(self, data, condition):
        x = torch.cat([data, condition], dim=1)
        return self.model(x)

# Gradient penalty
def gradient_penalty(discriminator, real_data, fake_data, condition):
    alpha = torch.rand(real_data.size(0), 1, device=device)
    interpolates = (alpha * real_data + (1 - alpha) * fake_data).requires_grad_(True)
    d_interpolates = discriminator(interpolates, condition)
    gradients = torch.autograd.grad(outputs=d_interpolates, inputs=interpolates,
                                    grad_outputs=torch.ones_like(d_interpolates),
                                    create_graph=True, retain_graph=True, only_inputs=True)[0]
    return ((gradients.norm(2, dim=1) - 1) ** 2).mean()

# Training function
def train_gan(epochs=5000, batch_size=1024, lambda_gp=15):
    input_dim, condition_dim, output_dim = 1, x_tensor.shape[1], 1
    generator = Generator(input_dim, condition_dim, output_dim).to(device)
    discriminator = Discriminator(input_dim, condition_dim).to(device)

    optimizer_G = optim.Adam(generator.parameters(), lr=0.0001, betas=(0.5, 0.9))
    optimizer_D = optim.Adam(discriminator.parameters(), lr=0.0001, betas=(0.5, 0.9))

    for epoch in range(epochs):
        for _ in range(7):  # More D updates per G update
            idx = np.random.randint(0, x_tensor.shape[0], batch_size)
            real_x, real_y = x_tensor[idx], y_tensor[idx]
            noise = torch.randn(batch_size, input_dim, device=device)
            fake_y = generator(noise, real_x).detach()

            optimizer_D.zero_grad()
            real_pred = discriminator(real_y, real_x) + 0.01 * torch.randn_like(real_y)  # Add noise to real data
            fake_pred = discriminator(fake_y, real_x)
            gp = gradient_penalty(discriminator, real_y, fake_y, real_x)
            d_loss = -torch.mean(real_pred) + torch.mean(fake_pred) + lambda_gp * gp
            d_loss.backward()
            optimizer_D.step()

        optimizer_G.zero_grad()
        noise = torch.randn(batch_size, input_dim, device=device)
        fake_y = generator(noise, real_x)
        fake_pred = discriminator(fake_y, real_x)
        g_loss = -torch.mean(fake_pred)
        g_loss.backward()
        optimizer_G.step()

        if epoch % 500 == 0:
            print(f"Epoch {epoch}, D Loss: {d_loss.item():.4f}, G Loss: {g_loss.item():.4f}")

    return generator

# Train GAN
generator = train_gan()

# Generate synthetic data
num_samples = len(x_tensor)
synthetic_y = generator(torch.randn(num_samples, 1, device=device), x_tensor).detach().cpu().numpy()
synthetic_y = scaler_y.inverse_transform(synthetic_y)

# Evaluation
real_values = y_tensor.cpu().numpy().flatten()
synthetic_values = synthetic_y.flatten()

print(f"Wasserstein Distance: {wasserstein_distance(real_values, synthetic_values):.4f}")
print(f"Frechet Distance: {euclidean([real_values.mean()], [synthetic_values.mean()]):.4f}")
print(f"Mean Absolute Error (MAE): {mean_absolute_error(real_values, synthetic_values):.4f}")
print(f"Mean Squared Error (MSE): {mean_squared_error(real_values, synthetic_values):.4f}")

# KDE plot
plt.figure(figsize=(10, 5))
sns.kdeplot(real_values, label="Real Data", fill=True, alpha=0.5, warn_singular=False)
sns.kdeplot(synthetic_values, label="Synthetic Data", fill=True, alpha=0.5, warn_singular=False)
plt.xlabel("Meter Reading")
plt.ylabel("Density")
plt.legend()
plt.title("Comparison of Real and Synthetic Meter Readings")
plt.grid(True, linestyle='--', alpha=0.6)
plt.show()

Epoch 0, D Loss: 8.6676, G Loss: 0.1803
Epoch 500, D Loss: 4.6302, G Loss: -5.0625
Epoch 1000, D Loss: 9.2659, G Loss: -6.1544
Epoch 1500, D Loss: 13.0292, G Loss: -4.6330
Epoch 2000, D Loss: -18.2859, G Loss: 31.8388
Epoch 2500, D Loss: -9.6847, G Loss: 14.1566
Epoch 3000, D Loss: -4.4446, G Loss: 3.8311
Epoch 3500, D Loss: -1.2415, G Loss: -4.1905


In [39]:
from scipy.stats import ks_2samp

# KS Test Evaluation
def ks_test_evaluation(real_data, synthetic_data):
    real_values = real_data.cpu().numpy().flatten()
    synthetic_values = synthetic_data.flatten()
    ks_stat, p_value = ks_2samp(real_values, synthetic_values)
    print(f"KS Statistic: {ks_stat:.4f}, P-value: {p_value:.4f}")
    if p_value > 0.05:
        print("The synthetic data follows a similar distribution to the real data (fail to reject H0).")
    else:
        print("The synthetic data significantly differs from the real data (reject H0).")

# Perform KS test
ks_test_evaluation(y_tensor, synthetic_y)

KS Statistic: 0.9912, P-value: 0.0000
The synthetic data significantly differs from the real data (reject H0).
