In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, LabelEncoder
import xgboost as xgb

# Cargar datos
df = pd.read_csv(r'C:\Users\alber\OneDrive\Documentos\GitHub\DS_ONLINE_THEBRIDGE_ATC\Masterclass\Kaggle\data\train.csv')
test_df = pd.read_csv(r'C:\Users\alber\OneDrive\Documentos\GitHub\DS_ONLINE_THEBRIDGE_ATC\Masterclass\Kaggle\data\test.csv')  # Asegúrate de tener el archivo test.csv

def preprocess_data(df):
    # Eliminar columnas irrelevantes
    df.drop(columns=['ScreenResolution', 'Cpu', 'Gpu', 'Memory'], inplace=True)

    # Limpiar y transformar Weight y Ram
    df['Weight'] = df['Weight'].str.replace('kg', '').astype(float)
    df['Ram'] = df['Ram'].str.replace('GB', '').astype(int)

    # Extraer resolución horizontal
    df['Resolution_X'] = df['ScreenResolution'].str.extract(r'(\d+)x')[0].astype(float)

    # Crear características nuevas
    df['PPI'] = ((df['Resolution_X']**2) + (df['Inches']**2))**0.5 / df['Inches']

    # Crear dummy variables para Company, TypeName y OpSys
    df = pd.get_dummies(df, columns=['Company', 'TypeName', 'OpSys'], drop_first=True)

    return df

# Preprocesar los datos de entrenamiento y prueba
df_cleaned = preprocess_data(df)
test_cleaned = preprocess_data(test_df)

# Separar variables predictoras y objetivo
X = df_cleaned.drop('Price_in_euros', axis=1)
y = df_cleaned['Price_in_euros']

# Escalar características
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
test_scaled = scaler.transform(test_cleaned.drop(columns=['laptop_ID']))

# Dividir datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Modelo inicial con RandomForest
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

# Predicciones
y_pred = rf.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"RMSE con RandomForest: {rmse}")

# Optimización con GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=3, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Mejor modelo
grid_best_model = grid_search.best_estimator_
y_pred_optimized = grid_best_model.predict(X_test)
rmse_optimized = mean_squared_error(y_test, y_pred_optimized, squared=False)
print(f"RMSE optimizado con RandomForest: {rmse_optimized}")

# Probar XGBoost
xgboost_model = xgb.XGBRegressor(random_state=42)
xgboost_model.fit(X_train, y_train)

# Predicción con XGBoost
y_pred_xgb = xgboost_model.predict(X_test)
rmse_xgb = mean_squared_error(y_test, y_pred_xgb, squared=False)
print(f"RMSE con XGBoost: {rmse_xgb}")

# Si el RMSE está por debajo de 300, preparar las predicciones
if rmse_optimized < 300:
    # Generar predicciones para el conjunto de prueba
    test_predictions = grid_best_model.predict(test_scaled)

    # Crear archivo de submission
    submission = pd.DataFrame({
        'laptop_ID': test_df['laptop_ID'],  # Usar laptop_ID de test.csv
        'Price_in_euros': test_predictions
    })

    # Guardar el archivo
    submission.to_csv("submission.csv", index=False)
    print("Archivo submission.csv generado correctamente.")
else:
    print("El RMSE optimizado no es menor a 300. No se generará el archivo submission.csv.")


KeyError: 'ScreenResolution'