In [6]:
import pandas as pd

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import re

In [7]:
# Load the raw test split; every later cell works on the `test_clean` copy so
# the frame as read from disk stays available in `test`.
test = pd.read_csv('test.csv')
test_clean = test.copy()

In [8]:
def clean_columns(df):
    """Add numeric screen, RAM and weight features to ``df`` (modified in place).

    Company, TypeName, Inches and OpSys are left untouched (they are meant to
    be one-hot encoded as they are); Product, Cpu, Gpu and Memory are handled
    in later cells.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns ``ScreenResolution``, ``Ram`` (e.g. ``'8GB'``)
        and ``Weight`` (e.g. ``'1.37kg'``).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the call can be chained or reassigned.

    Raises
    ------
    ValueError
        If a ``Ram`` or ``Weight`` value is not numeric once its unit is removed.
    """
    # SCREENRESOLUTION: extract "<width>x<height>" and each side as a number.
    # A row without a resolution yields NaN in all derived columns.
    resolution = df['ScreenResolution'].str.extract(
        r'(?P<full>(?P<width>\d+)x(?P<height>\d+))')
    df['ScreenResolutionNumber'] = resolution['full']
    df['ResWidth'] = resolution['width'].astype(float)
    df['ResHeight'] = resolution['height'].astype(float)
    df['AspectRatio'] = df['ResWidth'] / df['ResHeight']

    # Keyword flags found in the resolution text (case-insensitive regexes).
    # na=False makes a missing ScreenResolution count as "keyword absent"
    # instead of breaking the integer cast.
    keyword_flags = {
        'Touchscreen': 'Touchscreen',
        'IPS': 'IPS Panel',
        'Retina': 'Retina',
        '4K': '4K|Ultra HD',
        'FullHD': 'Full HD',
        'QuadHD': 'Quad HD',
    }
    for column, pattern in keyword_flags.items():
        df[column] = (df['ScreenResolution']
                      .str.contains(pattern, case=False, na=False)
                      .astype(int))

    # RAM: drop the 'GB' unit and convert to integer.
    df['Ram'] = df['Ram'].str.replace('GB', '', regex=False).str.strip().astype(int)

    # WEIGHT: drop the 'kg' unit and convert to float.
    df['Weight'] = df['Weight'].str.replace('kg', '', regex=False).str.strip().astype(float)

    return df




In [9]:
# Product column: derive a normalised base model name.

# Keep only the text before the first parenthesis (the whole name when there is
# none), collapse every run of whitespace to a single space, trim both ends and
# title-case the result so that e.g. 'Rog Strix' and 'ROG Strix' become equal.
# A single-character pattern is treated literally by .str.split, so '(' needs
# no escaping. Missing product names stay missing instead of raising.
test_clean['ProductBase'] = (
    test_clean['Product']
    .str.split('(', n=1).str[0]
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
    .str.title()
)

# Keyword -> product family. Keywords are compared case-insensitively and the
# dictionary order is the match priority (first hit wins).
product_families = {
    # Gaming
    'Rog': 'Gaming', 'Omen': 'Gaming', 'Predator': 'Gaming',
    'Legion': 'Gaming', 'Alienware': 'Gaming', 'Gt': 'Gaming',
    'Gs': 'Gaming', 'Ge': 'Gaming', 'Gl': 'Gaming', 'Gp': 'Gaming',

    # Business/Pro
    'Thinkpad': 'Business',
    'Elitebook': 'Business', 'Probook': 'Business',
    'Latitude': 'Business', 'Precision': 'Business',
    'Tecra': 'Business', 'Portege': 'Business', 'Zbook': 'Business',

    # Consumer
    'IdeaPad': 'Consumer', 'Inspiron': 'Consumer', 'Vostro': 'Consumer',
    'Aspire': 'Consumer', 'Pavilion': 'Consumer', 'Vivobook': 'Consumer',
    'Chromebook': 'Consumer', 'Yoga': 'Consumer', 'Envy': 'Consumer',

    # Ultrabooks/Premium
    'Xps': 'Premium', 'Spectre': 'Premium', 'Zenbook': 'Premium',
    'Macbook': 'Premium', 'Surface': 'Premium', 'Matebook': 'Premium',
    'Pixelbook': 'Premium'
}

def get_product_family(product_name):
    """Return the product family for a product name, or 'Other'.

    A keyword from ``product_families`` matches only at the start of a word in
    the name (case-insensitive). Matching anywhere inside a word made the
    two-letter keywords fire by accident: 'Ge' matched 'Portege' and labelled
    it 'Gaming' even though 'Portege' is listed as 'Business'.

    Parameters
    ----------
    product_name : str
        Product name; any non-string value (e.g. NaN) yields 'Other'.

    Returns
    -------
    str
        The family of the first matching keyword, else 'Other'.
    """
    if not isinstance(product_name, str):
        return 'Other'
    product_lower = product_name.lower()
    for key, family in product_families.items():
        # \b anchors the keyword to a word start, so 'gs' matches 'gs73vr'
        # but 'ge' does not match inside 'portege'.
        if re.search(r'\b' + re.escape(key.lower()), product_lower):
            return family
    return 'Other'

# Tag every row with the family of its normalised product name.
test_clean['ProductFamily'] = test_clean['ProductBase'].map(get_product_family)

# Detect which kinds of spec are written inside the product name's parentheses
def extract_specs_from_parentheses(product_string):
    """Flag the spec keywords found in the first parenthesised part of a name.

    Only the text after the first '(' and before the next '(' or ')' is
    inspected, lower-cased. Returns a pandas Series of 0/1 flags indexed by
    'has_specs_in_name', 'cpu_in_name', 'ram_in_name', 'storage_in_name' and
    'gpu_in_name'; all flags are 0 when the name contains no '('.
    """
    flags = {name: 0 for name in (
        'has_specs_in_name', 'cpu_in_name', 'ram_in_name',
        'storage_in_name', 'gpu_in_name')}

    if '(' not in product_string:
        return pd.Series(flags)

    flags['has_specs_in_name'] = 1
    inner = product_string.split('(')[1].split(')')[0].lower()

    def mentions(*terms):
        # 1 when at least one term occurs as a substring of the inner text.
        return int(any(term in inner for term in terms))

    flags['cpu_in_name'] = mentions('i3', 'i5', 'i7', 'i9', 'ryzen', 'a6', 'a9', 'e2')
    # 'gb' counts for both RAM and storage.
    flags['ram_in_name'] = mentions('gb')
    flags['storage_in_name'] = mentions('tb', 'gb', 'ssd', 'hdd')
    flags['gpu_in_name'] = mentions('radeon', 'geforce', 'quadro', 'gpu')

    return pd.Series(flags)

# Apply the function: one row of 0/1 spec flags per product name, appended to
# the frame as new columns.
specs_features = test_clean['Product'].apply(extract_specs_from_parentheses)
test_clean = pd.concat([test_clean, specs_features], axis=1)

# ProductBase: frequency encoding (better than one-hot with this many unique values).
# The counts are taken from this same frame.
product_counts = test_clean['ProductBase'].value_counts()
test_clean['ProductBaseFreq'] = test_clean['ProductBase'].map(product_counts)

# ProductFamily: one-hot encoding (few categories). This replaces the
# 'ProductFamily' column with 'Family_<name>' columns for the families present.
test_clean = pd.get_dummies(test_clean, columns=['ProductFamily'], prefix='Family')

# Optional: is this a model "with specs in its name"? (low/mid-range indicator)
# This is a plain copy of 'has_specs_in_name'.
test_clean['IsDetailedModel'] = test_clean['has_specs_in_name']

In [10]:
# CPU column
# Manufacturer: 'Intel' is checked before 'AMD'; anything else is 'Other'.
def _cpu_manufacturer(cpu):
    """Return 'Intel', 'AMD' or 'Other' from a case-sensitive substring check."""
    if 'Intel' in cpu:
        return 'Intel'
    if 'AMD' in cpu:
        return 'AMD'
    return 'Other'

test_clean['CpuManufacturer'] = test_clean['Cpu'].apply(_cpu_manufacturer)
# Extract the product line / series (e.g. Core i7, Ryzen, Celeron, ...)
def extract_cpu_line(cpu_string):
    """Return the CPU product line mentioned in ``cpu_string``.

    Each label below is searched as a case-insensitive substring, in the order
    listed; the first one found is returned, and 'Other' when none occurs.
    """
    lines_by_priority = (
        'Core i7', 'Core i5', 'Core i3', 'Xeon', 'Ryzen', 'Pentium',
        'Celeron', 'Core M', 'Atom', 'A12', 'A10', 'A9', 'A8', 'A6', 'A4',
        'E-Series',
    )
    haystack = cpu_string.lower()
    return next(
        (line for line in lines_by_priority if line.lower() in haystack),
        'Other',
    )

# CPU product line for every row.
test_clean['CpuLine'] = test_clean['Cpu'].apply(extract_cpu_line)
# Model number: the first run of 4 digits (else 3 digits) plus any upper-case
# suffix letters, e.g. 7500U from "Intel Core i7 7500U 2.7GHz".
test_clean['CpuModelNumber'] = test_clean['Cpu'].str.extract(r'(\d{4}[A-Z]*|\d{3}[A-Z]*)')[0]

# Clock frequency: the number written directly before 'GHz', as float.
test_clean['CpuFrequencyGHz'] = test_clean['Cpu'].str.extract(r'(\d+\.?\d*)GHz')[0].astype(float)

# Low voltage: a 'U' followed by whitespace or the end of the string.
# Raw string: '\s' inside a normal string literal is an invalid escape sequence.
test_clean['CpuIsLowVoltage'] = test_clean['Cpu'].str.contains(r'U(?:\s|$)', regex=True).astype(int)
# High performance: an H / HQ / HK suffix directly after a digit (e.g. 7700HQ).
# A bare 'H' alternative also matched the 'H' of 'GHz', so the flag fired for
# every CPU string that states a clock speed.
test_clean['CpuIsHighPerformance'] = test_clean['Cpu'].str.contains(r'\d(?:HQ|HK|H)\b', regex=True).astype(int)
test_clean['CpuIsQuadCore'] = test_clean['Cpu'].str.contains('Quad', case=False).astype(int)
test_clean['CpuIsDualCore'] = test_clean['Cpu'].str.contains('Dual', case=False).astype(int)


def estimate_cpu_generation(cpu_string, model_number):
    """Estimate the Intel Core generation from a 4-digit model number.

    6xxx -> 6th gen, 7xxx -> 7th gen, 8xxx -> 8th gen, etc. Suffix letters
    after the digits ('7500U', '6700HQ') are ignored; requiring the whole
    model string to be numeric rejected every suffixed model.

    Parameters
    ----------
    cpu_string : str
        Full CPU description; only strings containing 'Core i'
        (case-insensitive) are given a generation.
    model_number : str or NaN
        Model number such as '7500U'.

    Returns
    -------
    int or None
        First digit of the model number, or None when the CPU is not an Intel
        Core i-series part, the model number is missing, or it does not start
        with four digits.
    """
    if not isinstance(model_number, str) or not isinstance(cpu_string, str):
        return None

    # Only Intel Core i3/i5/i7/i9: the leading digit of other vendors' or
    # lines' model numbers does not follow this generation scheme.
    if 'core i' not in cpu_string.lower():
        return None

    leading = model_number[:4]
    if len(leading) == 4 and leading.isdigit():
        return int(leading[0])
    return None

# Estimate the CPU generation row by row from the raw Cpu text and the
# extracted model number.
test_clean['CpuGeneration'] = test_clean.apply(
    lambda row: estimate_cpu_generation(row['Cpu'], row['CpuModelNumber']), axis=1
)

# CPU tier mapping (based on domain knowledge): CpuLine label -> ordinal tier.
cpu_tier_mapping = {
    'Xeon': 5,            # Workstation/Server
    'Core i7': 4,         # High-end
    'Core i5': 3,         # Mid-range
    'Ryzen': 4,           # High-end (AMD)
    'Core i3': 2,         # Entry-level
    'A12': 3,             # Mid-range (AMD)
    'A10': 2,             # Low-mid (AMD)
    'A9': 2,              # Low-mid (AMD)
    'A8': 2,              # Low (AMD)
    'A6': 1,              # Low (AMD)
    'A4': 1,              # Low (AMD)
    'Core M': 2,          # Low power
    'Pentium': 1,         # Budget
    'Celeron': 1,         # Budget
    'Atom': 1,            # Ultra low power
    'E-Series': 1,        # Very low end
    'Other': 1
}

# A CpuLine value missing from the mapping would become NaN here.
test_clean['CpuTier'] = test_clean['CpuLine'].map(cpu_tier_mapping)

# Composite approximate performance score.
# NOTE(review): a missing frequency counts as 2.0 GHz in this score, whereas
# the CpuFrequencyGHz column itself is imputed with its median further down —
# confirm the two different defaults are intended.
test_clean['CpuPerformanceScore'] = (
    test_clean['CpuTier'] * 2 + 
    test_clean['CpuFrequencyGHz'].fillna(2.0) / 2 +
    test_clean['CpuIsHighPerformance'] * 1.5 -
    test_clean['CpuIsLowVoltage'] * 0.5
)

# One-hot encoding of the CPU line ('Cpu_<line>' columns for the lines present).
cpu_line_dummies = pd.get_dummies(test_clean['CpuLine'], prefix='Cpu')
test_clean = pd.concat([test_clean, cpu_line_dummies], axis=1)

# Useful binary variables
test_clean['CpuIsIntel'] = (test_clean['CpuManufacturer'] == 'Intel').astype(int)
test_clean['CpuIsAMD'] = (test_clean['CpuManufacturer'] == 'AMD').astype(int)
test_clean['CpuIsCorei7'] = (test_clean['CpuLine'] == 'Core i7').astype(int)
test_clean['CpuIsCorei5'] = (test_clean['CpuLine'] == 'Core i5').astype(int)

# Some values may have formatting problems: coerce anything non-numeric to NaN,
# then fill the gaps with the column median.
test_clean['CpuFrequencyGHz'] = pd.to_numeric(test_clean['CpuFrequencyGHz'], errors='coerce')
test_clean['CpuFrequencyGHz'] = test_clean['CpuFrequencyGHz'].fillna(test_clean['CpuFrequencyGHz'].median())

# The interaction between CPU and RAM could be powerful.
# NOTE(review): assumes 'Ram' is already numeric. clean_columns performs that
# conversion, but no call to clean_columns is visible in this notebook — confirm.
test_clean['CpuTier_Ram_Interaction'] = test_clean['CpuTier'] * test_clean['Ram']

In [11]:
# MEMORY column
def parse_memory(memory_string):
    """Turn a storage description into a Series of storage features.

    Accepts strings such as '256GB SSD + 1TB HDD' or '512GB SSD'. The text is
    lower-cased, stripped of spaces and of the word 'storage', and split on
    '+'; each part with a readable capacity (number followed by tb/gb/mb)
    counts as one drive. Capacities are expressed in GB (1 TB = 1024 GB).

    Returns a pandas Series with: total_capacity_gb, has_<type> and
    <type>_capacity_gb for ssd/hdd/flash/hybrid, is_dual_drive (1 when the
    string contains a '+'), and primary_type / secondary_type (types of the
    largest and second-largest drive; 'unknown' / 'none' by default).
    """
    features = {
        'total_capacity_gb': 0,
        'has_ssd': 0,
        'has_hdd': 0,
        'has_flash': 0,
        'has_hybrid': 0,
        'ssd_capacity_gb': 0,
        'hdd_capacity_gb': 0,
        'flash_capacity_gb': 0,
        'hybrid_capacity_gb': 0,
        'is_dual_drive': 0,
        'primary_type': 'unknown',
        'secondary_type': 'none'
    }

    # (type name, substrings identifying it); checked in this order, first hit wins.
    type_markers = (
        ('ssd', ('ssd',)),
        ('hdd', ('hdd', 'hard')),
        ('flash', ('flash',)),
        ('hybrid', ('hybrid',)),
    )

    normalized = memory_string.lower().replace(' ', '').replace('storage', '')
    segments = normalized.split('+')
    features['is_dual_drive'] = int(len(segments) > 1)

    found = []  # one (type, capacity_gb) pair per segment with a capacity
    for segment in segments:
        match = re.search(r'(\d+\.?\d*)(tb|gb|mb)', segment)
        if match is None:
            continue

        amount = float(match.group(1))
        unit = match.group(2)
        if unit == 'tb':
            size_gb = amount * 1024
        elif unit == 'mb':
            size_gb = amount / 1024
        else:  # 'gb'
            size_gb = amount

        kind = 'unknown'
        for name, markers in type_markers:
            if any(marker in segment for marker in markers):
                kind = name
                features['has_' + name] = 1
                features[name + '_capacity_gb'] += size_gb
                break

        found.append((kind, size_gb))

    # Largest drive first; ties keep their order of appearance.
    ranked = sorted(found, key=lambda item: item[1], reverse=True)
    features['total_capacity_gb'] = sum(size for _, size in ranked)

    if ranked:
        features['primary_type'] = ranked[0][0]
    if len(ranked) > 1:
        features['secondary_type'] = ranked[1][0]

    return pd.Series(features)

# Expand every Memory string into storage feature columns and append them.
memory_features = test_clean['Memory'].apply(parse_memory)
test_clean = pd.concat([test_clean, memory_features], axis=1)

# Share of the total capacity that is SSD (a total of 0 is replaced by 1 so
# the division is always defined).
test_clean['ssd_ratio'] = test_clean['ssd_capacity_gb'].div(
    test_clean['total_capacity_gb'].replace(0, 1)
)

# SSD only? (premium trait): has an SSD and neither an HDD nor a hybrid drive.
test_clean['is_ssd_only'] = (
    test_clean['has_ssd'].eq(1)
    & test_clean['has_hdd'].eq(0)
    & test_clean['has_hybrid'].eq(0)
).astype(int)

# HDD only? (budget trait): has an HDD and neither an SSD nor a hybrid drive.
test_clean['is_hdd_only'] = (
    test_clean['has_hdd'].eq(1)
    & test_clean['has_ssd'].eq(0)
    & test_clean['has_hybrid'].eq(0)
).astype(int)

# Is there an SSD large enough for the operating system? (SSD >= 128 GB)
test_clean['has_os_ssd'] = test_clean['ssd_capacity_gb'].ge(128).astype(int)

# Bucket the total capacity (low / medium / high)
def classify_capacity(total_gb):
    """Return 'low' below 256 GB, 'medium' below 1024 GB (1 TB), else 'high'."""
    if total_gb < 256:
        return 'low'
    return 'medium' if total_gb < 1024 else 'high'

# Capacity bucket for every row.
test_clean['capacity_class'] = test_clean['total_capacity_gb'].apply(classify_capacity)


def _classify_drive_config(is_dual_drive, has_ssd, has_hdd, primary_type, secondary_type):
    """Label a row's drive layout.

    Single-drive rows get '<primary_type>_only'. Dual-drive rows get
    'ssd_hdd_combo' when both an SSD and an HDD are present, 'ssd_raid' /
    'hdd_raid' only when the two largest drives are both SSD / both HDD, and
    'other_combo' otherwise. The earlier checks compared a flag with itself
    (`has_ssd and has_ssd`), so any dual layout containing an SSD — e.g.
    SSD + hybrid — was labelled 'ssd_raid'.
    """
    if not is_dual_drive:
        return primary_type + '_only'
    if has_ssd and has_hdd:
        return 'ssd_hdd_combo'
    if primary_type == secondary_type == 'ssd':
        return 'ssd_raid'
    if primary_type == secondary_type == 'hdd':
        return 'hdd_raid'
    return 'other_combo'


# Drive configuration label per row; built column-wise instead of iterating
# over rows with iterrows().
config_categories = [
    _classify_drive_config(dual, ssd, hdd, primary, secondary)
    for dual, ssd, hdd, primary, secondary in zip(
        test_clean['is_dual_drive'],
        test_clean['has_ssd'],
        test_clean['has_hdd'],
        test_clean['primary_type'],
        test_clean['secondary_type'],
    )
]

test_clean['drive_config'] = config_categories

# One-hot encode the drive configuration labels ('Drive_<config>' columns).
config_dummies = pd.get_dummies(test_clean['drive_config'], prefix='Drive')
test_clean = pd.concat([test_clean, config_dummies], axis=1)

# One-hot encode the capacity class ('Capacity_<class>' columns).
capacity_dummies = pd.get_dummies(test_clean['capacity_class'], prefix='Capacity')
test_clean = pd.concat([test_clean, capacity_dummies], axis=1)

# Approximate storage cost (value proxy).
# Estimated price per GB: SSD $0.20, HDD $0.03, Flash $0.50, Hybrid $0.10.
test_clean['storage_value_score'] = (
    test_clean['ssd_capacity_gb'].mul(0.20)
    .add(test_clean['hdd_capacity_gb'].mul(0.03))
    .add(test_clean['flash_capacity_gb'].mul(0.50))
    .add(test_clean['hybrid_capacity_gb'].mul(0.10))
)

# Estimated storage speed (performance proxy): capacity-weighted mean of a
# per-type speed weight (SSD 10, Flash 8, Hybrid 4, HDD 1). A total capacity
# of 0 is replaced by 1 so the division is always defined.
test_clean['storage_speed_score'] = (
    test_clean['ssd_capacity_gb'].mul(10)
    .add(test_clean['flash_capacity_gb'].mul(8))
    .add(test_clean['hybrid_capacity_gb'].mul(4))
    .add(test_clean['hdd_capacity_gb'].mul(1))
    .div(test_clean['total_capacity_gb'].replace(0, 1))
)

# These are the storage features expected to have the most impact:
critical_memory_features = [
    'total_capacity_gb',          # Total capacity
    'ssd_ratio',                  # Share of SSD
    'is_ssd_only',                # SSD only (premium)
    'has_os_ssd',                 # SSD for the operating system
    'storage_value_score',        # Estimated value
    'Drive_ssd_only',             # SSD-only configuration
    'Drive_ssd_hdd_combo',        # Combo configuration
    'Capacity_high'               # High capacity
]

# Sub-dataframe with only these features. The 'Drive_*' and 'Capacity_*'
# columns come from get_dummies, which only creates a column for a category
# that actually occurs in this frame; reindex adds any missing one filled
# with 0 instead of raising KeyError.
df_memory_critical = test_clean.reindex(columns=critical_memory_features, fill_value=0)

In [None]:
def classify_gpu_performance(gpu_string):
    """Classify a GPU name into a coarse performance category.

    Matching is by case-insensitive substring, vendor first (Intel, then
    AMD/Radeon, then Nvidia/GeForce/Quadro), then model tokens.

    Parameters
    ----------
    gpu_string : str
        GPU description, e.g. 'Nvidia GeForce GTX 1050'. Any non-string value
        (e.g. NaN) yields 'unknown'.

    Returns
    -------
    str
        One of: 'integrated_low', 'integrated_high', 'dedicated_very_low',
        'dedicated_low', 'dedicated_mid', 'dedicated_high', 'gaming_low',
        'gaming_mid', 'gaming_high', 'workstation_low', 'workstation_mid',
        'workstation_high', 'unknown'.
    """
    if not isinstance(gpu_string, str):
        # 'unknown' (rather than a bare 'integrated') so that every returned
        # label has an entry in the notebook's performance-score mapping.
        return 'unknown'

    gpu = gpu_string.lower()

    # 1. INTEGRATED (low cost)
    if 'intel hd' in gpu or 'intel uhd' in gpu or 'intel iris' in gpu or 'intel graphics' in gpu:
        if 'iris pro' in gpu or 'iris plus' in gpu:
            return 'integrated_high'  # Intel Iris (best integrated)
        return 'integrated_low'  # Basic Intel HD

    # 2. AMD
    elif 'amd' in gpu or 'radeon' in gpu:
        # 'rx 580' must be tested before the generic 'rx 5' prefix, which
        # would otherwise swallow it and make the high-end branch unreachable.
        if 'rx 580' in gpu:
            return 'dedicated_high'  # AMD high-end
        elif 'rx 5' in gpu:
            return 'dedicated_low'  # AMD entry-level (RX 5xx other than 580)
        elif 'r7' in gpu or 'firepro' in gpu or 'radeon pro' in gpu:
            return 'dedicated_mid'  # AMD mid-range
        elif 'r5' in gpu or 'r4' in gpu or 'r3' in gpu or 'r2' in gpu:
            return 'dedicated_low'  # AMD very low-end
        else:
            return 'dedicated_mid'

    # 3. NVIDIA
    elif 'nvidia' in gpu or 'geforce' in gpu or 'quadro' in gpu:
        # GAMING HIGH-END
        if 'gtx 1080' in gpu or 'gtx 1070' in gpu or 'gtx 980' in gpu:
            return 'gaming_high'
        # GAMING MID
        elif 'gtx 1060' in gpu or 'gtx 1050' in gpu or 'gtx 970' in gpu or 'gtx 965' in gpu:
            return 'gaming_mid'
        # GAMING LOW/ENTRY
        # '940m' covers names written as '940M' / '940MX' (with or without a
        # 'GTX' prefix), which the 'gt 940' token alone never matches.
        elif ('gtx 960' in gpu or 'gtx 950' in gpu or 'gt 940' in gpu
              or '940m' in gpu or 'mx150' in gpu or 'mx130' in gpu):
            return 'gaming_low'
        # WORKSTATION (Quadro)
        elif 'quadro' in gpu:
            if 'm3000' in gpu or 'm2200' in gpu or 'm2000' in gpu:
                return 'workstation_high'
            elif 'm1200' in gpu or 'm1000' in gpu or 'm620' in gpu or 'm520' in gpu:
                return 'workstation_mid'
            else:
                return 'workstation_low'
        # OLD/VERY LOW
        elif '920' in gpu or '930' in gpu or '150' in gpu:
            return 'dedicated_very_low'
        else:
            return 'dedicated_mid'

    return 'unknown'

# Performance category of every row's GPU.
test_clean['GpuCategory'] = test_clean['Gpu'].map(classify_gpu_performance)

def extract_gpu_features(gpu_string):
    """Extract manufacturer, type flags, model token and a VRAM guess from a GPU name.

    All matching is by substring on the lower-cased name. Returns a pandas
    Series with: gpu_manufacturer ('intel' / 'amd' / 'nvidia' / 'unknown'),
    is_dedicated_gpu, is_gaming_gpu, is_workstation_gpu, is_integrated_gpu
    (0/1), gpu_model_number (upper-cased regex match or None) and
    gpu_vram_estimate (GB, 0 when no listed model matches). A non-string
    input returns the defaults unchanged.
    """
    features = {
        'gpu_manufacturer': 'unknown',
        'is_dedicated_gpu': 0,
        'is_gaming_gpu': 0,
        'is_workstation_gpu': 0,
        'is_integrated_gpu': 0,
        'gpu_model_number': None,
        'gpu_vram_estimate': 0  # Estimate in GB based on the model
    }

    if not isinstance(gpu_string, str):
        return pd.Series(features)

    name = gpu_string.lower()

    def has_any(*tokens):
        # True when at least one token occurs in the lower-cased name.
        return any(token in name for token in tokens)

    # Manufacturer: Intel counts as integrated, AMD and Nvidia as dedicated.
    if has_any('intel'):
        features.update(gpu_manufacturer='intel', is_integrated_gpu=1)
    elif has_any('amd', 'radeon'):
        features.update(gpu_manufacturer='amd', is_dedicated_gpu=1)
    elif has_any('nvidia', 'geforce', 'quadro'):
        features.update(gpu_manufacturer='nvidia', is_dedicated_gpu=1)

    # Specific type: GeForce -> gaming; Quadro / Radeon Pro / FirePro -> workstation.
    if has_any('geforce'):
        features['is_gaming_gpu'] = 1
    elif has_any('quadro', 'radeon pro', 'firepro'):
        features['is_workstation_gpu'] = 1

    # Model token (e.g. 'GTX 1060' from "GTX 1060")
    found = re.search(
        r'(gtx?\s*\d{3,4}|quadro\s*\w+\d+|rx\s*\d{3,4}|\d{3,4}[a-z]*)',
        name,
        re.IGNORECASE,
    )
    if found:
        features['gpu_model_number'] = found.group(0).strip().upper()

    # VRAM estimate by model (domain knowledge); first matching row wins and
    # anything unlisted, including integrated GPUs, keeps the default 0.
    vram_by_model = (
        (('gtx 1080', 'gtx 1070', 'rx 580'), 8),
        (('gtx 1060', 'gtx 980', 'gtx 970'), 6),
        (('gtx 1050', 'gtx 960', 'quadro m2200'), 4),
        (('gtx 950', '940mx', 'rx 540', 'rx 550',
          'mx150', 'mx130', '930mx', '920mx'), 2),
    )
    for tokens, gigabytes in vram_by_model:
        if has_any(*tokens):
            features['gpu_vram_estimate'] = gigabytes
            break

    return pd.Series(features)

# One row of GPU features per Gpu string, appended to the frame as new columns.
gpu_features = test_clean['Gpu'].apply(extract_gpu_features)
test_clean = pd.concat([test_clean, gpu_features], axis=1)

# Performance score mapping (0-10): GpuCategory label -> score.
gpu_performance_score = {
    'gaming_high': 9,
    'workstation_high': 9,
    'gaming_mid': 7,
    'dedicated_high': 8,
    'workstation_mid': 7,
    'gaming_low': 5,
    'dedicated_mid': 6,
    'workstation_low': 5,
    'dedicated_low': 4,
    'integrated_high': 3,
    'dedicated_very_low': 2,
    'integrated_low': 1,
    'unknown': 0
}

# A GpuCategory value that is not a key of the mapping becomes NaN here.
test_clean['GpuPerformanceScore'] = test_clean['GpuCategory'].map(gpu_performance_score)

# Convenience aliases (plain copies of the extracted flags)
test_clean['HasDedicatedGpu'] = test_clean['is_dedicated_gpu']
test_clean['HasGamingGpu'] = test_clean['is_gaming_gpu']
test_clean['HasWorkstationGpu'] = test_clean['is_workstation_gpu']

# Interaction with CPU (typical gaming setup): gaming GPU plus a Core i7 / Core i5.
test_clean['GamingSetup'] = ((test_clean['HasGamingGpu'] == 1) & 
                           (test_clean['CpuLine'].isin(['Core i7', 'Core i5']))).astype(int)

# Interaction with CPU (workstation setup): workstation GPU plus a Core i7 / Xeon.
test_clean['WorkstationSetup'] = ((test_clean['HasWorkstationGpu'] == 1) & 
                                (test_clean['CpuLine'].isin(['Core i7', 'Xeon']))).astype(int)
# GpuCategory: one-hot of the most relevant categories
gpu_cat_dummies = pd.get_dummies(test_clean['GpuCategory'], prefix='GPU')
# Keep only the most important ones (frequency > 10 in this frame); a category
# at or below that count gets no 'GPU_<category>' column at all.
gpu_cat_counts = test_clean['GpuCategory'].value_counts()
important_cats = gpu_cat_counts[gpu_cat_counts > 10].index.tolist()
important_dummies = gpu_cat_dummies[[f'GPU_{cat}' for cat in important_cats if f'GPU_{cat}' in gpu_cat_dummies.columns]]
test_clean = pd.concat([test_clean, important_dummies], axis=1)

# Manufacturer: simple one-hot ('GPUMan_<manufacturer>' columns)
gpu_man_dummies = pd.get_dummies(test_clean['gpu_manufacturer'], prefix='GPUMan')
test_clean = pd.concat([test_clean, gpu_man_dummies], axis=1)

# These are the GPU features considered most important for predicting price.
# NOTE(review): this is only a list of names; the 'GPU_*' and 'GPUMan_*'
# entries exist in test_clean only if the corresponding dummy column was
# created above — confirm before indexing the frame with it.
critical_gpu_features = [
    'HasDedicatedGpu',           # Has a dedicated GPU? (large price impact)
    'GpuPerformanceScore',       # Numeric performance score
    'gpu_vram_estimate',         # Estimated VRAM
    'GPU_gaming_high',           # High-end gaming GPU
    'GPU_gaming_mid',            # Mid-range gaming GPU
    'GPU_workstation_high',      # Workstation GPU
    'GPU_integrated_low',        # Basic integrated GPU
    'GPUMan_nvidia',             # Manufacturer NVIDIA
    'GamingSetup',               # Complete gaming setup
    'WorkstationSetup'           # Workstation setup
]



Features de GPU creadas exitosamente.
Número total de features de GPU: 25


In [13]:
# Some Gpu values have extra spaces: trim both ends and replace double spaces.
# NOTE(review): the GPU features above were derived from the Gpu text as it was
# before this tidy-up — confirm whether this should run before they are computed.
test_clean['Gpu'] = test_clean['Gpu'].str.strip()
test_clean['Gpu'] = test_clean['Gpu'].str.replace('  ', ' ')

# "Premium Gaming" = high-end gaming GPU + Core i7 + SSD-only storage.
# The category is read from 'GpuCategory' rather than the 'GPU_gaming_high'
# dummy: that dummy column is only created when the category occurs more than
# 10 times in the frame, so indexing it can raise KeyError.
test_clean['PremiumGaming'] = ((test_clean['GpuCategory'] == 'gaming_high') &
                             (test_clean['CpuIsCorei7'] == 1) &
                             (test_clean['is_ssd_only'] == 1)).astype(int)

# "Budget" = integrated GPU + HDD-only storage.
test_clean['BudgetConfig'] = ((test_clean['is_integrated_gpu'] == 1) &
                            (test_clean['is_hdd_only'] == 1)).astype(int)