In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [None]:
# Read the train and test CSV files; both are decoded as latin-1.
train_data = pd.read_csv(
    r"/kaggle/input/cellular-anomaly/ML-MATT-CompetitionQT1920_train.csv",
    encoding="latin-1",
)
test_data = pd.read_csv(
    r"/kaggle/input/cellular-anomaly/ML-MATT-CompetitionQT1920_test.csv",
    encoding='latin-1',
)

In [None]:
# Show the training frame as this cell's output (bare last expression).
train_data

In [None]:
# Print the summary of the training frame (columns, non-null counts, dtypes).
train_data.info()

In [None]:
# Count the rows of 'maxUE_UL+DL' that hold the '#¡VALOR!' marker (prints 0 when there are none).
print((train_data['maxUE_UL+DL'] == '#¡VALOR!').sum())

In [None]:
# Coerce 'maxUE_UL+DL' to numeric in BOTH frames.
# pd.to_numeric(errors='coerce') turns the '#¡VALOR!' marker (and any other
# non-numeric text) into NaN, so a separate replace step is not needed.
# The test frame gets the same treatment because X_test is later selected with
# the training numeric column list, which contains this column; leaving it as
# object dtype in the test frame would break scaling / tensor conversion.
for frame in (train_data, test_data):
    frame['maxUE_UL+DL'] = pd.to_numeric(frame['maxUE_UL+DL'], errors='coerce')

# Check the number of NaN values in the training column
print(train_data['maxUE_UL+DL'].isna().sum())

In [None]:
# Impute the missing 'maxUE_UL+DL' entries with the column median, then store
# the column as int64 (the cast drops any fractional part of the median).
median_value = train_data['maxUE_UL+DL'].median()
train_data['maxUE_UL+DL'] = (
    train_data['maxUE_UL+DL']
    .fillna(median_value)
    .astype('int64')
)

# Confirm the resulting dtype
print(train_data['maxUE_UL+DL'].dtype)

In [None]:
# Print the summary of the test frame (columns, non-null counts, dtypes).
test_data.info()

In [None]:
# Per-column count of missing values in the training frame.
print(train_data.isnull().sum())

In [None]:
# Per-column count of missing values in the test frame.
print(test_data.isnull().sum())

In [None]:
# Names of the float64 / int64 columns of the training frame.
numerical_columns = train_data.select_dtypes(
    include=['float64', 'int64'],
).columns

numerical_columns

In [None]:
# Identify numerical columns (the 'Unusual' label is dropped from the training
# list so it is never imputed or scaled).
train_numerical_columns = train_data.select_dtypes(include=['float64', 'int64']).columns.drop('Unusual')
test_numerical_columns = test_data.select_dtypes(include=['float64', 'int64']).columns

# Impute BOTH frames with the medians computed on the training data.
# Using the test frame's own medians would make the preprocessing depend on
# test statistics and be inconsistent with the scaler, which is fitted on the
# training frame only. fillna with a Series fills column-by-column by label, so
# a test column with no training median is left untouched.
train_medians = train_data[train_numerical_columns].median()
train_data[train_numerical_columns] = train_data[train_numerical_columns].fillna(train_medians)
test_data[test_numerical_columns] = test_data[test_numerical_columns].fillna(train_medians)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode 'CellName' as integer ids.
# The encoder is fitted on the union of the train and test names so that a cell
# appearing only in the test file does not make le.transform raise ValueError
# (unseen label). When every test name also occurs in train, the resulting
# codes are the same as fitting on train alone.
le = LabelEncoder()
le.fit(pd.concat([train_data['CellName'], test_data['CellName']], ignore_index=True))
train_data['CellName_encoded'] = le.transform(train_data['CellName'])
test_data['CellName_encoded'] = le.transform(test_data['CellName'])

# Convert 'Time' to datetime and extract features
train_data['Time'] = pd.to_datetime(train_data['Time'], format='%H:%M')
test_data['Time'] = pd.to_datetime(test_data['Time'], format='%H:%M')

train_data['Hour'] = train_data['Time'].dt.hour
test_data['Hour'] = test_data['Time'].dt.hour

# NOTE(review): 'Time' is parsed with format '%H:%M', i.e. without a date, so
# every row receives the same default date and 'DayOfWeek' is identical for all
# rows (it carries no information). The column is still created because the
# feature list built later references it by name.
train_data['DayOfWeek'] = train_data['Time'].dt.dayofweek
test_data['DayOfWeek'] = test_data['Time'].dt.dayofweek

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the numeric features: fit on the training frame only, then apply
# the fitted scaler to both frames.
# The test frame is transformed with the SAME column list the scaler was fitted
# on (train_numerical_columns). Those are also the columns later selected into
# X_test, so using a separately derived test column list could either raise
# (feature mismatch) or leave model inputs unscaled.
scaler = StandardScaler()
train_data[train_numerical_columns] = scaler.fit_transform(train_data[train_numerical_columns])
test_data[train_numerical_columns] = scaler.transform(test_data[train_numerical_columns])

In [None]:
# Model inputs: the training numeric columns plus the three engineered columns.
features = [*train_numerical_columns, 'CellName_encoded', 'Hour', 'DayOfWeek']

# The same feature list is selected from both frames; the label comes from 'Unusual'.
X_train = train_data.loc[:, features]
X_test = test_data.loc[:, features]
y_train = train_data['Unusual']

In [None]:
# usual_count: number of training rows whose 'Unusual' label is 0.
# The label is stored in train_data['Unusual']; no 'Anomaly' column is created
# anywhere in this notebook, so indexing test_data['Anomaly'] would raise KeyError.
(train_data['Unusual'] == 0).sum()

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# Turn the feature frames into float32 tensors.
X_train_tensor = torch.as_tensor(X_train.to_numpy(), dtype=torch.float32)
X_test_tensor = torch.as_tensor(X_test.to_numpy(), dtype=torch.float32)

# Wrap the tensors in datasets (inputs only, no labels) and batch them:
# the training loader is shuffled, the test loader keeps row order.
train_dataset = TensorDataset(X_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

test_dataset = TensorDataset(X_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [None]:
import torch.nn as nn

class GANomaly(nn.Module):
    """Encoder / decoder / discriminator network for tabular anomaly scoring.

    The encoder compresses an ``input_dim``-wide row into an 8-dimensional
    latent vector, the decoder reconstructs the row from that vector, and the
    discriminator maps a row to a single sigmoid output in [0, 1].

    Args:
        input_dim: Number of input features per sample.
    """

    # The constructor must be spelled ``__init__`` (double underscores).
    # With ``_init_`` Python never runs it, so no layers are created and
    # ``GANomaly(input_dim)`` fails.
    def __init__(self, input_dim):
        super().__init__()

        # input_dim -> 64 -> 32 -> 16 -> 8 (latent)
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 8)
        )

        # 8 (latent) -> 16 -> 32 -> 64 -> input_dim
        self.decoder = nn.Sequential(
            nn.Linear(8, 16),
            nn.ReLU(),
            nn.Linear(16, 32),
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, input_dim)
        )

        # input_dim -> 64 -> 32 -> 16 -> 1, squashed to [0, 1] by the sigmoid
        self.discriminator = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Encode, reconstruct and score a batch.

        Args:
            x: Tensor whose last dimension is ``input_dim``.

        Returns:
            Tuple ``(x_hat, y, z)``: the reconstruction of ``x``, the
            discriminator output for that reconstruction, and the latent code.
        """
        z = self.encoder(x)
        x_hat = self.decoder(z)
        y = self.discriminator(x_hat)
        return x_hat, y, z

# Build the network with one input unit per feature column of X_train.
_, input_dim = X_train.shape
model = GANomaly(input_dim)


In [None]:
import torch.optim as optim

# Define loss functions and optimizers
criterion_mse = nn.MSELoss()
criterion_bce = nn.BCELoss()

# Two optimizers are needed for the adversarial term to mean anything.
# With a single optimizer over all parameters the discriminator is only ever
# trained to answer "1" on reconstructions and never sees a real sample, so it
# collapses to a constant and the adversarial loss contributes nothing.
# `optimizer` updates the generator (encoder + decoder); `optimizer_d` updates
# the discriminator.
generator_params = list(model.encoder.parameters()) + list(model.decoder.parameters())
optimizer = optim.Adam(generator_params, lr=0.001)
optimizer_d = optim.Adam(model.discriminator.parameters(), lr=0.001)

# Training loop
num_epochs = 50
model.train()
for epoch in range(num_epochs):
    running_loss = 0.0  # sum of per-sample generator loss over the epoch
    seen = 0            # number of samples processed in the epoch
    for batch in train_loader:
        inputs = batch[0]

        # --- Discriminator step: real rows -> 1, reconstructions -> 0 ---
        # The reconstruction is computed without gradients so this step only
        # touches discriminator weights.
        with torch.no_grad():
            fake = model.decoder(model.encoder(inputs))
        d_real = model.discriminator(inputs)
        d_fake = model.discriminator(fake)
        loss_d = (
            criterion_bce(d_real, torch.ones_like(d_real))
            + criterion_bce(d_fake, torch.zeros_like(d_fake))
        )
        optimizer_d.zero_grad()
        loss_d.backward()
        optimizer_d.step()

        # --- Generator step: reconstruct well and fool the discriminator ---
        x_hat, y, z = model(inputs)
        loss_con = criterion_mse(inputs, x_hat)             # reconstruction error
        loss_enc = criterion_mse(z, model.encoder(x_hat))   # latent consistency
        loss_adv = criterion_bce(y, torch.ones_like(y))     # discriminator should say "real"

        loss = loss_con + loss_enc + loss_adv

        # Only encoder/decoder weights are stepped here; gradients that reach the
        # discriminator are cleared by optimizer_d.zero_grad() on the next batch.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * inputs.size(0)
        seen += inputs.size(0)

    # Report the epoch mean rather than the loss of whichever batch came last.
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / max(seen, 1):.4f}")


In [None]:
import numpy as np

# Score the test rows: the anomaly score of a row is its mean squared
# reconstruction error across features.
model.eval()
with torch.no_grad():
    per_batch_scores = [
        torch.mean((xb - model(xb)[0]) ** 2, dim=1).numpy()
        for (xb,) in test_loader
    ]

# Flatten the per-batch arrays into one list, in loader (row) order.
anomaly_scores = [score for chunk in per_batch_scores for score in chunk]

# Map scores into [0, 1): 0 for a perfect reconstruction, approaching 1 as the error grows.
anomaly_probs = 1 - np.exp(-np.array(anomaly_scores))

# Attach both columns to the test frame
test_data['GANomaly_Score'] = anomaly_scores
test_data['GANomaly_Probability'] = anomaly_probs


# Ten rows with the highest anomaly probability
ranked = test_data.sort_values(by='GANomaly_Probability', ascending=False)
top_anomalies = ranked.iloc[:10]

# Columns shown in the report: identifiers, the probability, then the numeric test columns
display_columns = ['Time', 'CellName', 'GANomaly_Probability', *test_numerical_columns]

# Function to format float values
def format_float(x):
    """Format floating-point values with four decimals; pass anything else through.

    Args:
        x: Any value.

    Returns:
        ``f"{x:.4f}"`` when ``x`` is a Python ``float`` or any NumPy floating
        scalar (float16/32/64, ...); otherwise ``x`` unchanged.
    """
    # np.floating covers every NumPy float width; np.float64 alone was redundant
    # (it already subclasses float) and left np.float32 values unformatted.
    if isinstance(x, (float, np.floating)):
        return f"{x:.4f}"
    return x

# Display top anomalies with formatted values.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
# fall back to applymap on older pandas where DataFrame.map does not exist.
top_view = top_anomalies[display_columns]
format_cells = top_view.map if hasattr(top_view, "map") else top_view.applymap
print(format_cells(format_float))

# Calculate and display average values for comparison
avg_values = test_data[test_numerical_columns].mean()
print("\nAverage values for comparison:")
print(avg_values.apply(format_float))