In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import re

# Cargar datos
df = pd.read_csv(r'C:\Users\alber\OneDrive\Documentos\GitHub\DS_ONLINE_THEBRIDGE_ATC\Masterclass\Kaggle\data\train.csv', index_col="laptop_ID")

# Procesamiento de datos
df["Ram"] = df["Ram"].str.replace("GB", "").astype(int)
df["Weight"] = df["Weight"].str.replace("kg", "").astype(float)

# Extraer información más detallada de la CPU
def extract_cpu_info(cpu):
    parts = str(cpu).split()
    if len(parts) > 2 and parts[0] == "Intel":
        return f"{parts[1]} {parts[2]}"
    return parts[0]
df["Cpu_Brand"] = df["Cpu"].apply(extract_cpu_info)

# Extraer la frecuencia del CPU con manejo de errores
def extract_cpu_freq(cpu):
    if isinstance(cpu, str):
        match = re.search(r'(\d+\.\d+)', cpu)
        return float(match.group(1)) if match else np.nan
    return np.nan
df["Cpu_Freq_GHz"] = df["Cpu"].apply(extract_cpu_freq)

# Extraer resolución de pantalla
def extract_resolution(screen_res):
    match = re.search(r'(\d+)x(\d+)', str(screen_res))
    if match:
        return int(match.group(1)) * int(match.group(2))
    return np.nan
df["Resolution"] = df["ScreenResolution"].apply(extract_resolution)

# Identificar Touchscreen
df["Touchscreen"] = df["ScreenResolution"].apply(lambda x: 1 if "Touchscreen" in str(x) else 0)

# Separar memoria en SSD y HDD
def process_memory(mem):
    ssd, hdd = 0, 0
    parts = str(mem).split()
    for i, part in enumerate(parts):
        if "SSD" in part:
            ssd = int(parts[i-1]) if i > 0 and parts[i-1].isdigit() else 0
        elif "HDD" in part:
            hdd = int(parts[i-1]) if i > 0 and parts[i-1].isdigit() else 0
    return pd.Series([ssd, hdd])
df[['SSD', 'HDD']] = df['Memory'].apply(process_memory)

# Extraer GPU Brand
def extract_gpu_brand(gpu):
    brands = ["Nvidia", "AMD", "Intel"]
    for brand in brands:
        if brand in str(gpu):
            return brand
    return "Other"
df["Gpu_Brand"] = df["Gpu"].apply(extract_gpu_brand)

# Codificar sistema operativo (Windows o no)
df['Windows'] = df['OpSys'].apply(lambda x: 1 if 'Windows' in str(x) else 0)

# Seleccionar características
X = df.drop(columns=["Price_in_euros", "Product", "Cpu", "OpSys", "Memory", "ScreenResolution", "Gpu"])
y = df["Price_in_euros"]

# Dividir en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identificar variables categóricas y numéricas
categorical_features = ["Company", "TypeName", "Cpu_Brand", "Gpu_Brand"]
numeric_features = ["Inches", "Ram", "Weight", "Resolution", "SSD", "HDD", "Windows", "Cpu_Freq_GHz", "Touchscreen"]

# Preprocesador con imputación para valores NaN
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features)
])

# Modelo con optimización de hiperparámetros
param_dist = {
    "regressor__n_estimators": [100, 200, 300],
    "regressor__learning_rate": [0.01, 0.05, 0.1],
    "regressor__max_depth": [3, 5, 7],
    "regressor__min_samples_split": [2, 4, 6]
}

model = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", GradientBoostingRegressor(random_state=42))
])

search = RandomizedSearchCV(model, param_dist, cv=3, scoring='neg_root_mean_squared_error', n_iter=10, n_jobs=-1, random_state=42)
search.fit(X_train, y_train)

# Evaluar modelo
best_model = search.best_estimator_
y_pred = best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Optimized RMSE: {rmse}")

# Cargar datos de prueba
X_pred = pd.read_csv(r'C:\Users\alber\OneDrive\Documentos\GitHub\DS_ONLINE_THEBRIDGE_ATC\Masterclass\Kaggle\data\test.csv', index_col="laptop_ID")

# Aplicar el mismo procesamiento
X_pred["Ram"] = X_pred["Ram"].str.replace("GB", "").astype(int)
X_pred["Weight"] = X_pred["Weight"].str.replace("kg", "").astype(float)
X_pred["Cpu_Brand"] = X_pred["Cpu"].apply(extract_cpu_info)
X_pred["Cpu_Freq_GHz"] = X_pred["Cpu"].apply(extract_cpu_freq)
X_pred["Resolution"] = X_pred["ScreenResolution"].apply(extract_resolution)
X_pred["Touchscreen"] = X_pred["ScreenResolution"].apply(lambda x: 1 if "Touchscreen" in str(x) else 0)
X_pred[['SSD', 'HDD']] = X_pred['Memory'].apply(process_memory)
X_pred["Gpu_Brand"] = X_pred["Gpu"].apply(extract_gpu_brand)
X_pred['Windows'] = X_pred['OpSys'].apply(lambda x: 1 if 'Windows' in str(x) else 0)

# Eliminar columnas innecesarias
X_pred = X_pred.drop(columns=["Product", "Cpu", "OpSys", "Memory", "ScreenResolution", "Gpu"])

# Hacer predicciones
predictions = best_model.predict(X_pred)


Optimized RMSE: 402.42549824567493
