In [1]:
import gzip
import requests
from io import BytesIO, StringIO
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim

import mlflow
import mlflow.pytorch

import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_curve, auc

In [2]:
GITHUB_CLEAN_URL = "https://raw.githubusercontent.com/Bootcamp-IA-P4/project-x-nlp-team-3/feature/models/Data/fusion30.csv"

def load_comments_data_from_github(url):
    """
    Downloading and processing comments data from GitHub repository.
    """
    print("🔗 Downloading data from GitHub...")

    try:
        response = requests.get(url)
        response.raise_for_status()

        print("📊 Reading CSV file...")

        df = pd.read_csv(StringIO(response.text), sep=';')

        print("✅ Data downloaded successfully!")
        return df

    except Exception as e:
        print(f"❌ Error while downloading data: {e}")
        return None

# Creating dataframe from GitHub URL
df = load_comments_data_from_github(GITHUB_CLEAN_URL)

🔗 Downloading data from GitHub...
📊 Reading CSV file...
✅ Data downloaded successfully!


Definir la red neuronal

In [3]:
# Procesar el texto usando TF-IDF
vectorizer = TfidfVectorizer(max_features=1000)  # Limitamos a 1000 características
X = vectorizer.fit_transform(df['text']).toarray()
y = df['label']

# Normalizar los datos
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Dividir en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Convertir a tensores de PyTorch
X_train = torch.FloatTensor(X_train)
X_test = torch.FloatTensor(X_test)
y_train = torch.LongTensor(y_train.values)
y_test = torch.LongTensor(y_test.values)

In [4]:
# Definir la red neuronal
class SimpleNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, dropout_rate=0.2):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim//2)
        self.fc3 = nn.Linear(hidden_dim//2, output_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.batch_norm1 = nn.BatchNorm1d(hidden_dim)
        self.batch_norm2 = nn.BatchNorm1d(hidden_dim//2)

    def forward(self, x):
        x = self.fc1(x)
        x = self.batch_norm1(x)
        x = self.relu(x)
        x = self.dropout(x)
        
        x = self.fc2(x)
        x = self.batch_norm2(x)
        x = self.relu(x)
        x = self.dropout(x)
        
        x = self.fc3(x)
        return x

In [5]:
# Configurar hiperparámetros
input_dim = X_train.shape[1]
hidden_dim = 128
output_dim = len(torch.unique(y_train))
epochs = 50
batch_size = 32
learning_rate = 0.001
dropout_rate = 0.2

In [6]:
# Crear data loaders
from torch.utils.data import TensorDataset, DataLoader
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Configurar MLflow antes de iniciar el experimento
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("Neural_Network_Model")

<Experiment: artifact_location='mlflow-artifacts:/919852534155930550', creation_time=1751878783567, experiment_id='919852534155930550', last_update_time=1751878783567, lifecycle_stage='active', name='Neural_Network_Model', tags={}>

In [7]:
# Entrenar el modelo
with mlflow.start_run(run_name="PyTorch_NN_Training"):
    # Registrar hiperparámetros
    mlflow.log_params({
        "input_dim": input_dim,
        "hidden_dim": hidden_dim,
        "output_dim": output_dim,
        "epochs": epochs,
        "batch_size": batch_size,
        "learning_rate": learning_rate,
        "dropout_rate": dropout_rate
    })
    
    # Inicializar modelo, criterio y optimizador
    model = SimpleNN(input_dim, hidden_dim, output_dim, dropout_rate)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    # Entrenamiento
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        # Evaluación
        model.eval()
        with torch.no_grad():
            train_outputs = model(X_train)
            train_preds = torch.argmax(train_outputs, dim=1)
            train_acc = accuracy_score(y_train, train_preds)
            
            test_outputs = model(X_test)
            test_preds = torch.argmax(test_outputs, dim=1)
            test_acc = accuracy_score(y_test, test_preds)
        
        # Registrar métricas
        avg_loss = total_loss / len(train_loader)
        mlflow.log_metrics({
            "train_loss": avg_loss,
            "train_accuracy": train_acc,
            "test_accuracy": test_acc
        }, step=epoch)
        
        if (epoch + 1) % 10 == 0:
            print(f"Epoch [{epoch+1}/{epochs}]")
            print(f"Loss: {avg_loss:.4f}")
            print(f"Train Accuracy: {train_acc:.4f}")
            print(f"Test Accuracy: {test_acc:.4f}\n")


Epoch [10/50]
Loss: 0.2075
Train Accuracy: 0.9685
Test Accuracy: 0.7357

Epoch [20/50]
Loss: 0.1305
Train Accuracy: 0.9865
Test Accuracy: 0.7312

Epoch [30/50]
Loss: 0.1003
Train Accuracy: 0.9907
Test Accuracy: 0.7341

Epoch [40/50]
Loss: 0.0876
Train Accuracy: 0.9915
Test Accuracy: 0.7319

Epoch [50/50]
Loss: 0.0820
Train Accuracy: 0.9928
Test Accuracy: 0.7327

🏃 View run PyTorch_NN_Training at: http://localhost:5000/#/experiments/919852534155930550/runs/9c75a722537242a78dec77e2166056f5
🧪 View experiment at: http://localhost:5000/#/experiments/919852534155930550


In [8]:
# Evaluación final del modelo
model.eval()
with torch.no_grad():
    # Métricas de entrenamiento
    train_outputs = model(X_train)
    train_preds = torch.argmax(train_outputs, dim=1)
    train_acc = accuracy_score(y_train, train_preds)
    train_report = classification_report(y_train, train_preds)
    
    # Métricas de prueba
    test_outputs = model(X_test)
    test_preds = torch.argmax(test_outputs, dim=1)
    test_acc = accuracy_score(y_test, test_preds)
    test_report = classification_report(y_test, test_preds)

    # Imprimir métricas por pantalla
    print("\nMétricas de Entrenamiento:")
    print(f"Accuracy: {train_acc:.4f}")
    print("\nReporte de Clasificación (Entrenamiento):")
    print(train_report)
    
    print("\nMétricas de Prueba:")
    print(f"Accuracy: {test_acc:.4f}")
    print("\nReporte de Clasificación (Prueba):")
    print(test_report)
    
    # Registrar métricas en MLflow
    mlflow.log_metric("final_train_accuracy", train_acc)
    mlflow.log_metric("final_test_accuracy", test_acc)
    
    # Guardar los reportes de clasificación
    mlflow.log_text(train_report, "train_classification_report.txt")
    mlflow.log_text(test_report, "test_classification_report.txt")
    
    # Calcular y registrar métricas adicionales por clase
    from sklearn.metrics import precision_recall_fscore_support
    
    # Métricas detalladas para conjunto de prueba
    precision, recall, f1, support = precision_recall_fscore_support(y_test, test_preds)
    
    for i, (p, r, f) in enumerate(zip(precision, recall, f1)):
        mlflow.log_metrics({
            f"class_{i}_precision": p,
            f"class_{i}_recall": r,
            f"class_{i}_f1": f
        })


Métricas de Entrenamiento:
Accuracy: 0.9928

Reporte de Clasificación (Entrenamiento):
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     10856
           1       0.99      0.99      0.99     13120

    accuracy                           0.99     23976
   macro avg       0.99      0.99      0.99     23976
weighted avg       0.99      0.99      0.99     23976


Métricas de Prueba:
Accuracy: 0.7327

Reporte de Clasificación (Prueba):
              precision    recall  f1-score   support

           0       0.69      0.72      0.70      2646
           1       0.77      0.74      0.76      3348

    accuracy                           0.73      5994
   macro avg       0.73      0.73      0.73      5994
weighted avg       0.73      0.73      0.73      5994



In [10]:
# Importar pickle
import pickle

# Guardar el modelo en formato .pkl
with open('../Models/pytorch/pytorch_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# También guardar el vectorizador para futuros usos
with open('../Models/pytorch/pytorch_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)