In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
import numpy as np
from scipy.stats import ks_2samp

In [2]:
# Neural network architecture
class HaloToGalaxyModel(nn.Module):
    """Single-hidden-layer perceptron that outputs a probability vector.

    The network is ``Linear -> ReLU -> Linear -> softmax``; the softmax is
    taken over dimension 1, so the input is expected to be a 2-D batch of
    shape ``(batch, input_size)`` and each output row sums to one.

    Args:
        input_size: number of input features per sample.
        output_size: number of output classes (width of the probability vector).
        hidden_dim: number of units in the hidden layer.
    """

    def __init__(self, input_size=4, output_size=50, hidden_dim=64):
        super().__init__()
        # Layers are created in this order so that seeded initialisation
        # and state_dict keys ("fc1.*", "fc2.*") stay stable.
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_dim)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=output_size)

    def forward(self, x):
        """Return per-class probabilities for a batch ``x``."""
        hidden = F.relu(self.fc1(x))
        logits = self.fc2(hidden)
        # Normalise across the class dimension.
        return F.softmax(logits, dim=1)

# Helper that loads the features and the target from a CSV file
def load_data_from_csv(file_path):
    """Read a CSV and split it into a feature matrix and one target column.

    Args:
        file_path: path (or file-like object) handed straight to
            ``pandas.read_csv``.

    Returns:
        A tuple ``(X, y)`` where ``X`` is a 2-D array built from the columns
        at positions 5 through 9 (five columns) and ``y`` is a 1-D array taken
        from the column at position 12.
    """
    feature_columns = slice(5, 10)
    # Only a single target column is loaded: the intent is that each model is
    # trained on exactly one target (a wider slice such as 12:16 would load
    # several targets at once).
    target_column = 12

    table = pd.read_csv(file_path)
    features = table.iloc[:, feature_columns].values
    target = table.iloc[:, target_column].values
    return features, target


class customLossYan(nn.Module):
    """Multi-level pinball (quantile) loss on one-hot labels vs. probabilities.

    For every level ``q`` in ``quantiles`` the element-wise pinball term
    ``max((q - 1) * e, q * e)`` is computed, where ``e`` is the difference
    between the one-hot encoded label and the predicted probability. The
    terms are summed over all levels and averaged over every
    ``(sample, class)`` element.

    Args:
        quantiles: iterable of scalar levels ``q``.
            NOTE(review): the pinball loss is normally defined for
            ``0 <= q <= 1``; a level outside that range makes some terms
            negative. Confirm what callers pass in.
    """

    def __init__(self, quantiles):
        super().__init__()
        self.quantiles = quantiles

    def forward(self, y_true, y_pred):
        """Compute the loss.

        Note the argument order is ``(y_true, y_pred)``, i.e. target first,
        which is the reverse of the usual PyTorch ``(input, target)`` order.

        Args:
            y_true: integer (long) class indices, one per sample; each index
                must be smaller than ``y_pred.size(1)``.
            y_pred: 2-D tensor of per-class scores/probabilities,
                ``(batch, num_classes)``.

        Returns:
            A scalar tensor with the mean summed pinball loss.
        """
        y_true_one_hot = F.one_hot(y_true, num_classes=y_pred.size(1)).float()
        # The residual does not depend on the level, so compute it once
        # instead of once per quantile.
        errors = y_true_one_hot - y_pred
        per_level = [
            torch.max((q - 1) * errors, q * errors)
            for q in self.quantiles
        ]
        # Stack levels on a new trailing axis, add them up, then average
        # over every (sample, class) entry.
        return torch.mean(torch.sum(torch.stack(per_level, dim=2), dim=2))


def ks_test_metric(y_true, y_pred):
    """Two-sample Kolmogorov-Smirnov test between true and predicted labels.

    The predicted label of each sample is the arg-max over dimension 1 of
    ``y_pred``. Both label vectors are moved to the CPU, converted to NumPy
    arrays and compared with ``scipy.stats.ks_2samp``.

    Args:
        y_true: 1-D tensor of true class indices.
        y_pred: 2-D tensor of per-class scores, ``(batch, num_classes)``.

    Returns:
        Tuple ``(ks_statistic, p_value)``.
    """
    observed_labels = y_true.cpu().numpy()
    predicted_labels = y_pred.argmax(dim=1).cpu().numpy()

    result = ks_2samp(observed_labels, predicted_labels)
    return result.statistic, result.pvalue

In [3]:
# Load the feature matrix (X) and the target column (y) from the dataset CSV.
file_path = 'datasetcompleto.csv'  
X, y = load_data_from_csv(file_path)

In [4]:
k = 50  # number of bins the target is split into; 50 is the value given in the paper
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [5]:
# Turn the continuous target into class labels for the classification setup.
# `bins` holds k + 1 equally spaced edges spanning [y.min(), y.max()].
bins = np.linspace(y.min(), y.max(), k + 1)
# Digitize against the k - 1 interior edges only, so every label falls in
# 0..k-1. Digitizing against all edges and subtracting one would give the
# sample equal to y.max() the label k, which is outside the k classes the
# model outputs and F.one_hot accepts.
y_binned = np.digitize(y, bins[1:-1])
# NOTE: X and y are rebound from NumPy arrays to tensors on `device`, so this
# cell is not idempotent; re-run the data-loading cell before re-running it.
X = torch.tensor(X, dtype=torch.float32).to(device)
y = torch.tensor(y_binned, dtype=torch.long).to(device)

In [6]:
# Hold out 40% of the samples for testing, then 20% of the remainder for
# validation (48% train / 12% validation / 40% test overall).
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)


hidden_dim = 100  # width of the hidden layer
num_epochs = 1000
early_stop_patience = 20  # epochs without validation improvement before stopping
best_val_loss = float('inf')
epochs_no_improve = 0

# The split above is seeded, but the weight initialisation was not; seed
# PyTorch as well so a fresh "Restart & Run All" reproduces the same model.
torch.manual_seed(42)
model = HaloToGalaxyModel(X.shape[1], k, hidden_dim).to(device)
# NOTE(review): `bins` (the target bin edges) is passed as the loss's
# `quantiles` argument; confirm that bin edges are the intended quantile
# levels, since a pinball loss normally expects levels in [0, 1].
criterion = customLossYan(bins)  
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-5)

In [7]:
# Full-batch training with early stopping on the validation loss.
for epoch in range(num_epochs):
    # One optimisation step over the whole training set.
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train)
    loss = criterion(y_train, outputs)  # note: the criterion takes (target, prediction)
    loss.backward()
    optimizer.step()

    # Validation pass without gradient tracking.
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val)
        val_loss = criterion(y_val, val_outputs)

    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}')

    # Early stopping: checkpoint on every improvement, stop after
    # `early_stop_patience` consecutive epochs without one.
    current_val_loss = val_loss.item()
    if current_val_loss < best_val_loss:
        # Keep a plain float rather than a tensor living on the device.
        best_val_loss = current_val_loss
        epochs_no_improve = 0
        torch.save(model.state_dict(), 'halo_to_galaxy_model.pth')
    else:
        epochs_no_improve += 1
        # `>=` so that a counter left over from an earlier run of this cell
        # cannot step past the threshold and disable early stopping.
        if epochs_no_improve >= early_stop_patience:
            print('Early stopping!')
            break

# The best-validation weights were already written inside the loop. Saving
# unconditionally here would overwrite them with the last-epoch weights and
# defeat early stopping, so only save when no checkpoint was ever written
# (e.g. the validation loss never became finite).
if best_val_loss == float('inf'):
    torch.save(model.state_dict(), 'halo_to_galaxy_model.pth')


Epoch [1/1000], Loss: 0.9965, Val Loss: 0.9795
Epoch [2/1000], Loss: 0.9791, Val Loss: 0.9718
Epoch [3/1000], Loss: 0.9716, Val Loss: 0.9651
Epoch [4/1000], Loss: 0.9652, Val Loss: 0.9615
Epoch [5/1000], Loss: 0.9618, Val Loss: 0.9602
Epoch [6/1000], Loss: 0.9604, Val Loss: 0.9595
Epoch [7/1000], Loss: 0.9596, Val Loss: 0.9590
Epoch [8/1000], Loss: 0.9589, Val Loss: 0.9583
Epoch [9/1000], Loss: 0.9580, Val Loss: 0.9573
Epoch [10/1000], Loss: 0.9568, Val Loss: 0.9560
Epoch [11/1000], Loss: 0.9551, Val Loss: 0.9545
Epoch [12/1000], Loss: 0.9535, Val Loss: 0.9539
Epoch [13/1000], Loss: 0.9528, Val Loss: 0.9538
Epoch [14/1000], Loss: 0.9526, Val Loss: 0.9537
Epoch [15/1000], Loss: 0.9526, Val Loss: 0.9537
Epoch [16/1000], Loss: 0.9526, Val Loss: 0.9537
Epoch [17/1000], Loss: 0.9526, Val Loss: 0.9537
Epoch [18/1000], Loss: 0.9526, Val Loss: 0.9537
Epoch [19/1000], Loss: 0.9526, Val Loss: 0.9537
Epoch [20/1000], Loss: 0.9526, Val Loss: 0.9537
Epoch [21/1000], Loss: 0.9526, Val Loss: 0.9537
E

In [8]:
# Restore the saved checkpoint. `map_location` remaps the stored tensors onto
# the current device, so a file written on a CUDA machine also loads when
# only the CPU is available.
model.load_state_dict(torch.load('halo_to_galaxy_model.pth', map_location=device))

model.eval()
with torch.no_grad():
    outputs = model(X_test)
    # Predicted class = index of the highest probability in each row.
    predicted = outputs.argmax(dim=1)
    accuracy = (predicted == y_test).sum().item() / y_test.size(0)
    print(f'Accuracy: {accuracy * 100:.2f}%')

    # KS test between the true and the predicted label distributions
    ks_statistic, p_value = ks_test_metric(y_test, outputs)
    print(f'KS Statistic: {ks_statistic:.4f}, P-value: {p_value:.4f}')

Accuracy: 6.60%
KS Statistic: 0.9340, P-value: 0.0000
