<a href="https://colab.research.google.com/github/Alvarolo01/AI-Data-Optimization-Engine/blob/main/core_optimizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
from typing import Dict, List, Optional
from datetime import datetime

class LLMDataOptimizer:
  """
  Optimization engine for AI training datasets.

  Holds one batch of prompt/response records in a pandas DataFrame and tags
  every record with a quality label using vectorized boolean conditions
  (no per-row Python loop).

  Attributes:
    batch_id: Identifier of the batch; echoed in the ingestion log line.
    data: The loaded records, or None until ingest_raw_data() has run.
    metrics: Mapping of metric name to a list of values. Initialised empty;
      no method of this class writes to it.
  """

  def __init__(self, batch_id: str):
    self.batch_id = batch_id
    self.data: Optional[pd.DataFrame] = None
    self.metrics: Dict[str, List[float]] = {}

  def ingest_raw_data(self, n_records: int = 100,
                      seed: Optional[int] = None) -> None:
    """
    Simulate ingestion of production data.

    Replaces self.data with a freshly generated DataFrame with the columns
    'prompt_id', 'response text', 'latency_ms' (normal, mean 250, std 50)
    and 'accuracy_score' (uniform on [0.1, 1.0)), then prints a log line.

    Args:
      n_records: Number of synthetic rows to generate. Defaults to 100.
      seed: When given, the random columns are drawn from a dedicated
        generator seeded with this value, so repeated calls produce the
        same data. When None, the global np.random state is used.

    Raises:
      ValueError: If n_records is negative.
    """
    if n_records < 0:
      raise ValueError(f"n_records debe ser >= 0 (recibido: {n_records}).")

    # With no seed, keep drawing from the module-level np.random functions so
    # callers that seed the global state get the same stream as before.
    rng = np.random if seed is None else np.random.default_rng(seed)

    # NOTE: the 'response text' key contains a space, unlike the other
    # columns; it is kept as-is because consumers may look it up by name.
    raw_payload = {
        'prompt_id': [f"ID_{i}" for i in range(n_records)],
        'response text': [
            f"Sample response content{i}" for i in range(n_records)
        ],
        'latency_ms': rng.normal(250, 50, n_records),
        'accuracy_score': rng.uniform(0.1, 1.0, n_records),
    }

    self.data = pd.DataFrame(raw_payload)
    print(f"✅ Batch {self.batch_id}: {n_records} registros cargados.")

  def apply_quality_logic(self,
                          gold_min_accuracy: float = 0.85,
                          gold_max_latency_ms: float = 200,
                          silver_min_accuracy: float = 0.70) -> pd.DataFrame:
    """
    Tag every record with a quality label.

    Adds (or overwrites) the 'evaluation_tag' column on self.data. Rules are
    evaluated in order and the first match wins:

      1. GOLD_QUALITY   - accuracy_score >= gold_min_accuracy and
                          latency_ms < gold_max_latency_ms
      2. SILVER_QUALITY - accuracy_score >= silver_min_accuracy
      3. REJECTED       - accuracy_score < silver_min_accuracy

    Rows matching none of the rules (e.g. a NaN accuracy_score, for which
    every comparison is False) are tagged 'PENDING'.

    Args:
      gold_min_accuracy: Minimum accuracy for the gold tier.
      gold_max_latency_ms: Exclusive upper latency bound for the gold tier.
      silver_min_accuracy: Minimum accuracy for the silver tier; anything
        below it is rejected.

    Returns:
      self.data itself (modified in place, not a copy).

    Raises:
      ValueError: If no data has been loaded yet.
    """
    if self.data is None:
      raise ValueError("No hay datos para procesar. Cargue datos primero.")

    accuracy = self.data['accuracy_score']
    latency = self.data['latency_ms']

    # np.select picks the label of the first condition that is True per row,
    # so the order of this list encodes the tier priority.
    conditions = [
      (accuracy >= gold_min_accuracy) & (latency < gold_max_latency_ms),
      accuracy >= silver_min_accuracy,
      accuracy < silver_min_accuracy,
    ]
    labels = ['GOLD_QUALITY', 'SILVER_QUALITY', 'REJECTED']

    self.data['evaluation_tag'] = np.select(conditions, labels, default='PENDING')
    return self.data


if __name__ == "__main__":
  # Demo run: create an optimizer for one batch, load the simulated records,
  # tag them, and print the first five tagged rows.
  optimizer = LLMDataOptimizer("PROD_NOV_2024")
  optimizer.ingest_raw_data()

  processed_df = optimizer.apply_quality_logic()
  print(processed_df.head(n=5))

✅ Batch PROD_NOV_2024: 100 registros cargados.
  prompt_id             response text  latency_ms  accuracy_score  \
0      ID_0  Sample response content0  249.338999        0.798135   
1      ID_1  Sample response content1  300.765651        0.734387   
2      ID_2  Sample response content2  259.436571        0.618169   
3      ID_3  Sample response content3  328.589720        0.422492   
4      ID_4  Sample response content4  225.817619        0.934432   

   evaluation_tag  
0  SILVER_QUALITY  
1  SILVER_QUALITY  
2        REJECTED  
3        REJECTED  
4  SILVER_QUALITY  
