In [1]:
#Importacion de librerias necesarias
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, Normalizer, LabelEncoder
from sklearn.compose import ColumnTransformer

#ajustes generales
torch.backends.cudnn.deterministic = True
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
#Filtrado de datos

#Lectura del dataset ya descargador y descomprimido
df = pd.read_csv('./Datasets/meteorite-landings.csv')

#Filtrar años validos (860 dC - 2016 dC)
df = df[(df['year'] >= 860) & (df['year'] <= 2016)]

#Filtrar coordenadas validas
df = df[(df['reclong'].between(-180, 180)) & ((df['reclat'] != 0) | (df['reclong'] != 0))]

#Eliminar filas con datos nulos en las columnas de interes y reseterar indices
df = df.dropna(subset=['year', 'reclat', 'reclong', 'mass', 'recclass'])  
df.reset_index(drop=True, inplace=True)

#Mostrar resultados
print(f'Dimensiones finales {df.shape}')
df.head()

Dimensiones finales (31929, 10)


Unnamed: 0,name,id,nametype,recclass,mass,fall,year,reclat,reclong,GeoLocation
0,Aachen,1,Valid,L5,21.0,Fell,1880.0,50.775,6.08333,"(50.775000, 6.083330)"
1,Aarhus,2,Valid,H6,720.0,Fell,1951.0,56.18333,10.23333,"(56.183330, 10.233330)"
2,Abee,6,Valid,EH4,107000.0,Fell,1952.0,54.21667,-113.0,"(54.216670, -113.000000)"
3,Acapulco,10,Valid,Acapulcoite,1914.0,Fell,1976.0,16.88333,-99.9,"(16.883330, -99.900000)"
4,Achiras,370,Valid,L6,780.0,Fell,1902.0,-33.16667,-64.95,"(-33.166670, -64.950000)"


In [3]:
#Preparacion de datos, para el modelo de clasificación

#Seleccionar caracteristicas y objetivo (útiles para el modelo)
features = df[['nametype', 'fall', 'mass', 'reclat', 'reclong', 'year']]
targets = df['recclass']

#type a one-hot, mass a min-max scaling, fall a one-hot, year a estadarizacion, reclat y reclonng normalizacion
cathegorical_features = ['nametype', 'fall']
numerical_features = ['mass', 'year']
coord_features = ['reclat', 'reclong']

#Crear el transformador de columnas
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), cathegorical_features),
        ('num', MinMaxScaler(), numerical_features),
        ('coord', Normalizer(), coord_features)
    ])
#codificar etiquetas de salida
label_encoder = LabelEncoder()

#Aplicar las transformaciones a las caracteristicas y las etiquetas
X = preprocessor.fit_transform(features)
y = label_encoder.fit_transform(targets)
print(f'Caracteristicas transformadas: {X.shape}, Objetivos: {y.shape}')

Caracteristicas transformadas: (31929, 8), Objetivos: (31929,)


In [4]:
#Dividir los datos en datos de entrenamiento, validacion y testeo
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)
print(f'Train set: {X_train.shape}, Validation set: {X_val.shape}, Test set: {X_test.shape}')

Train set: (19157, 8), Validation set: (6386, 8), Test set: (6386, 8)


In [5]:
#Creacion de los datasets y dataloaders de pytorch
class MeteoriteDatset(torch.utils.data.Dataset):
    def __init__(self, features, labels):
        self.features = torch.tensor(features, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]
    
train_dataset = MeteoriteDatset(X_train, y_train)
val_dataset = MeteoriteDatset(X_val, y_val)
test_dataset = MeteoriteDatset(X_test, y_test)

#Creacion de los dataloaders
train_dl = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dl = torch.utils.data.DataLoader(val_dataset, batch_size=32, shuffle=False)
test_dl = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)

In [6]:
#Creación de un modelo de red neuronal
class MeteiriteModule(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.l1 = nn.Linear(input_dim, 128)
        self.a1 = nn.ReLU()
        self.l2 = nn.Linear(128, 64)
        self.a2 = nn.ReLU()
        self.l3 = nn.Linear(64, output_dim)

        self.modules_list = [self.l1, self.a1, self.l2, self.a2, self.l3]
    
    def forward(self, x):
        for mod in self.modules_list:
            x = mod(x)
        return x
    
    def predict(self, x):
        self.eval()
        with torch.no_grad():
            logits = self.forward(x)
            preds = torch.argmax(logits, dim=1) #no es necesario softmax
        return preds


input_dim = X_train.shape[1]
output_dim = len(set(y)) #numero de clases únicas

model = MeteiriteModule(input_dim, output_dim)
model

MeteiriteModule(
  (l1): Linear(in_features=8, out_features=128, bias=True)
  (a1): ReLU()
  (l2): Linear(in_features=128, out_features=64, bias=True)
  (a2): ReLU()
  (l3): Linear(in_features=64, out_features=392, bias=True)
)

In [None]:
#Definir optimizador y funcion de perdida
loss_fn = nn.CrossEntropyLoss() #aplica log_softmax y nll_loss
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

In [None]:
#Funcion de entrenamiento y validacion
model.to(DEVICE)
def train_model(model, train_dl, val_dl, loss_fn, optimizer, epochs=120):
    train_accuracies, val_accuracies = [], []
    train_losses, val_losses = [], []

    for epoch in range(1, epochs + 1):
        # ----- Entrenamiento -----
        model.train()
        epoch_loss = 0
        correct = 0
        total = 0

        for x_batch, y_batch in train_dl:
            x_batch, y_batch = x_batch.to(DEVICE), y_batch.to(DEVICE)

            optimizer.zero_grad() #limpiar gradientes
            logits = model(x_batch) #forward pass
            loss = loss_fn(logits, y_batch) #calcular perdida
            loss.backward() #backward pass
            optimizer.step() #actualizar pesos

            #Acumular perdida y aciertos
            epoch_loss += loss.item() * x_batch.size(0)
            correct += (logits.argmax(1) == y_batch).sum().item() #aciertos
            total += y_batch.size(0)

        train_losses.append(epoch_loss / total)
        train_accuracies.append(correct / total)

        # ----- Validación -----
        model.eval()
        val_loss = 0
        val_correct = 0
        val_total = 0

        with torch.no_grad(): #no calcular gradientes
            for x_batch, y_batch in val_dl:
                x_batch, y_batch = x_batch.to(DEVICE), y_batch.to(DEVICE)

                logits = model(x_batch)
                loss = loss_fn(logits, y_batch)

                val_loss += loss.item() * x_batch.size(0)
                val_correct += (logits.argmax(1) == y_batch).sum().item()
                val_total += y_batch.size(0)

        val_losses.append(val_loss / val_total)
        val_accuracies.append(val_correct / val_total)

        # ----- Print cada 20 epochs -----
        if epoch % 20 == 0:
            print(f"Epoch {epoch}/{epochs} | "
                  f"Train Loss: {train_losses[-1]:.4f}, Train Acc: {train_accuracies[-1]:.4f} | "
                  f"Val Loss: {val_losses[-1]:.4f}, Val Acc: {val_accuracies[-1]:.4f}")

    return train_losses, train_accuracies, val_losses, val_accuracies

#Entrenar el modelo
train_losses, train_accuracies, val_losses, val_accuracies = train_model(
    model, train_dl, val_dl, loss_fn, optimizer, epochs=200)

Epoch 10/200 | Train Loss: 2.7242, Train Acc: 0.2286 | Val Loss: 2.8683, Val Acc: 0.2424
Epoch 20/200 | Train Loss: 2.6806, Train Acc: 0.2365 | Val Loss: 2.8884, Val Acc: 0.2418
Epoch 30/200 | Train Loss: 2.6420, Train Acc: 0.2433 | Val Loss: 2.9378, Val Acc: 0.2513
Epoch 40/200 | Train Loss: 2.5984, Train Acc: 0.2674 | Val Loss: 2.9356, Val Acc: 0.2197
Epoch 50/200 | Train Loss: 2.5602, Train Acc: 0.2879 | Val Loss: 2.9435, Val Acc: 0.2891
Epoch 60/200 | Train Loss: 2.5325, Train Acc: 0.2905 | Val Loss: 2.9532, Val Acc: 0.2510
Epoch 70/200 | Train Loss: 2.5118, Train Acc: 0.2959 | Val Loss: 2.9879, Val Acc: 0.2919
Epoch 80/200 | Train Loss: 2.4979, Train Acc: 0.2955 | Val Loss: 2.9666, Val Acc: 0.2922
Epoch 90/200 | Train Loss: 2.4951, Train Acc: 0.2961 | Val Loss: 3.0130, Val Acc: 0.2933
Epoch 100/200 | Train Loss: 2.4734, Train Acc: 0.3044 | Val Loss: 3.0135, Val Acc: 0.2953
Epoch 110/200 | Train Loss: 2.4674, Train Acc: 0.3058 | Val Loss: 3.0390, Val Acc: 0.3019
Epoch 120/200 | Tra

In [9]:
#Testear el modelo
model.eval()
test_correct = 0
test_total = 0

with torch.no_grad():
    for x_batch, y_batch in test_dl:
        x_batch, y_batch = x_batch.to(DEVICE), y_batch.to(DEVICE)
        
        logits = model(x_batch)
        test_correct += (logits.argmax(1) == y_batch).sum().item()
        test_total += y_batch.size(0)

test_accuracy = test_correct / test_total
print(f"\nTest Accuracy: {test_accuracy:.4f}")


Test Accuracy: 0.2964
