In [11]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Load the data
train_data = pd.read_csv(r"C:\project\Anamoly Detection\Train.csv", encoding="latin-1")
test_data = pd.read_csv(r"C:\project\Anamoly Detection\Test.csv", encoding='latin-1')

# Data Cleaning
train_data['maxUE_UL+DL'] = train_data['maxUE_UL+DL'].replace('#¡VALOR!', np.nan)
train_data['maxUE_UL+DL'] = pd.to_numeric(train_data['maxUE_UL+DL'], errors='coerce')
train_data['maxUE_UL+DL'] = train_data['maxUE_UL+DL'].fillna(train_data['maxUE_UL+DL'].median())
train_data['maxUE_UL+DL'] = train_data['maxUE_UL+DL'].astype('int64')

# Fill missing values in the training and testing data
train_data.fillna(train_data.median(), inplace=True)
test_data.fillna(test_data.median(), inplace=True)

# Encode categorical variables
le = LabelEncoder()
train_data['CellName_encoded'] = le.fit_transform(train_data['CellName'])
test_data['CellName_encoded'] = le.transform(test_data['CellName'])

# Convert 'Time' to datetime and extract features
train_data['Time'] = pd.to_datetime(train_data['Time'], format='%H:%M')
test_data['Time'] = pd.to_datetime(test_data['Time'], format='%H:%M')

train_data['Hour'] = train_data['Time'].dt.hour
train_data['DayOfWeek'] = train_data['Time'].dt.dayofweek
test_data['Hour'] = test_data['Time'].dt.hour
test_data['DayOfWeek'] = test_data['Time'].dt.dayofweek

# Define the features and labels
features = train_data.select_dtypes(include=['float64', 'int64']).columns.tolist()
features.remove('Unusual')
features += ['CellName_encoded', 'Hour', 'DayOfWeek']

X_train = train_data[features]
y_train = train_data['Unusual']
X_test = test_data[features]

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the Generator model
def build_generator(input_dim, output_dim):
    model = tf.keras.Sequential([
        layers.Dense(512, input_dim=input_dim, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        layers.Dense(1024, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        layers.Dense(2048, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        layers.Dense(output_dim, activation='sigmoid')  #sigmoid for scaled output
    ])
    return model


# Define the Discriminator model with LeakyReLU and L2 regularization
def build_discriminator(input_shape):
    model = tf.keras.Sequential([
        layers.Dense(1024, input_shape=input_shape, kernel_regularizer=tf.keras.regularizers.l2(0.01)),
        layers.LeakyReLU(alpha=0.2),
        layers.Dropout(0.4),
        layers.Dense(512, kernel_regularizer=tf.keras.regularizers.l2(0.01)),
        layers.LeakyReLU(alpha=0.2),
        layers.Dropout(0.4),
        layers.Dense(256, kernel_regularizer=tf.keras.regularizers.l2(0.01)),
        layers.LeakyReLU(alpha=0.2),
        layers.Dropout(0.4),
        layers.Dense(1, activation='sigmoid')  # Sigmoid for binary classification
    ])
    return model

# Build the models
discriminator = build_discriminator((X_train_scaled.shape[1],))
discriminator.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0002),
                      loss='binary_crossentropy',
                      metrics=['accuracy'])

generator = build_generator(input_dim=100, output_dim=X_train_scaled.shape[1])

# Combined GAN model (Generator + Discriminator)
discriminator.trainable = False
gan_input = layers.Input(shape=(100,))
generated_data = generator(gan_input)
gan_output = discriminator(generated_data)
gan = tf.keras.models.Model(gan_input, gan_output)

gan.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0002), loss='binary_crossentropy')

# Train the GAN
def train_gan(gan, generator, discriminator, X_train, epochs=200, batch_size=64):
    for epoch in range(epochs):
        # Train Discriminator
        idx = np.random.randint(0, X_train.shape[0], batch_size)
        real_data = X_train[idx]
        noise = np.random.normal(0, 1, (batch_size, 100))
        fake_data = generator.predict(noise)
        
        d_loss_real = discriminator.train_on_batch(real_data, np.ones((batch_size, 1)))
        d_loss_fake = discriminator.train_on_batch(fake_data, np.zeros((batch_size, 1)))
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
        
        # Train Generator
        noise = np.random.normal(0, 1, (batch_size, 100))
        valid_y = np.ones((batch_size, 1))
        g_loss = gan.train_on_batch(noise, valid_y)
        
        # Print the progress
        if epoch % 10 == 0:
            print(f"{epoch}/{epochs} | Discriminator Loss: {d_loss[0]:.4f}, Accuracy: {d_loss[1]:.4f} | Generator Loss: {g_loss:.4f}")

# Start training the GAN
train_gan(gan, generator, discriminator, X_train_scaled, epochs=1000, batch_size=64)

# Use the trained discriminator for anomaly detection
threshold = 0.5  # You can adjust this threshold as needed
predictions = discriminator.predict(X_test_scaled)
anomalies = predictions < threshold

# Output the results
anomalous_data = test_data[anomalies.flatten()]
print("Detected anomalies:")
print(anomalous_data)


  train_data.fillna(train_data.median(), inplace=True)
  test_data.fillna(test_data.median(), inplace=True)


0/1000 | Discriminator Loss: 11.2152, Accuracy: 0.5391 | Generator Loss: 11.1111
10/1000 | Discriminator Loss: 9.6664, Accuracy: 0.9141 | Generator Loss: 10.0951
20/1000 | Discriminator Loss: 8.3627, Accuracy: 0.9297 | Generator Loss: 9.1749
30/1000 | Discriminator Loss: 7.1995, Accuracy: 0.9844 | Generator Loss: 8.2759
40/1000 | Discriminator Loss: 6.2222, Accuracy: 1.0000 | Generator Loss: 7.6970
50/1000 | Discriminator Loss: 5.4128, Accuracy: 0.9688 | Generator Loss: 7.1162
60/1000 | Discriminator Loss: 4.6928, Accuracy: 0.9688 | Generator Loss: 6.4014
70/1000 | Discriminator Loss: 4.0694, Accuracy: 0.9766 | Generator Loss: 5.9969
80/1000 | Discriminator Loss: 3.5045, Accuracy: 1.0000 | Generator Loss: 6.2530
90/1000 | Discriminator Loss: 3.0900, Accuracy: 1.0000 | Generator Loss: 5.3191
100/1000 | Discriminator Loss: 2.7244, Accuracy: 0.9922 | Generator Loss: 5.4041
110/1000 | Discriminator Loss: 2.3694, Accuracy: 0.9766 | Generator Loss: 5.0354
120/1000 | Discriminator Loss: 2.170