In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import urllib.request

# Cargar datos
df = pd.read_csv(r'C:\Users\alber\OneDrive\Documentos\GitHub\DS_ONLINE_THEBRIDGE_ATC\Masterclass\Kaggle\data\train.csv', index_col="laptop_ID")

# Procesamiento de datos
df["Ram"] = df["Ram"].str.replace("GB", "").astype(int)
df["Weight"] = df["Weight"].str.replace("kg", "").astype(float)

# Feature Engineering: Extraer información de la CPU
df["Cpu_Brand"] = df["Cpu"].apply(lambda x: x.split()[0])

# Seleccionar características
X = df.drop(columns=["Price_in_euros", "Product", "Cpu"])
y = df["Price_in_euros"]

# Dividir en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identificar variables categóricas y numéricas
categorical_features = ["Company", "TypeName", "ScreenResolution", "Gpu", "OpSys", "Cpu_Brand", "Memory"]
numeric_features = ["Inches", "Ram", "Weight"]

# Preprocesador
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numeric_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
])

# Modelo
model = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", RandomForestRegressor(n_estimators=200, random_state=42, max_depth=15, min_samples_split=4))
])

# Entrenar modelo
model.fit(X_train, y_train)

# Evaluar modelo
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")

# Predicción en el conjunto de test
X_pred = pd.read_csv(r'C:\Users\alber\OneDrive\Documentos\GitHub\DS_ONLINE_THEBRIDGE_ATC\Masterclass\Kaggle\data\test.csv', index_col="laptop_ID")

# Aplicar el mismo procesamiento
X_pred["Ram"] = X_pred["Ram"].str.replace("GB", "").astype(int)
X_pred["Weight"] = X_pred["Weight"].str.replace("kg", "").astype(float)
X_pred["Cpu_Brand"] = X_pred["Cpu"].apply(lambda x: x.split()[0])

# Eliminar columnas innecesarias
X_pred = X_pred.drop(columns=["Product", "Cpu"])

# Hacer predicciones
predictions = model.predict(X_pred)




RMSE: 375.42935787215805


In [4]:
# Guardar resultados
sample = pd.DataFrame({"laptop_ID": X_pred.index, "Price_in_euros": predictions})

In [5]:
sample.head()

Unnamed: 0,laptop_ID,Price_in_euros
0,209,1703.047139
1,1281,306.637998
2,1168,421.173468
3,1231,1065.410133
4,1020,831.497678


In [6]:
sample.shape

(391, 2)

In [7]:
submission = pd.DataFrame({"laptop_ID": X_pred.index, "Price_in_euros": predictions})

In [8]:
submission.head()

Unnamed: 0,laptop_ID,Price_in_euros
0,209,1703.047139
1,1281,306.637998
2,1168,421.173468
3,1231,1065.410133
4,1020,831.497678


In [9]:
submission.shape

(391, 2)

In [10]:
def chequeador(df_to_submit):
    """
    Esta función se asegura de que tu submission tenga la forma requerida por Kaggle.
    
    Si es así, se guardará el dataframe en un `csv` y estará listo para subir a Kaggle.
    
    Si no, LEE EL MENSAJE Y HAZLE CASO.
    
    Si aún no:
    - apaga tu ordenador, 
    - date una vuelta, 
    - enciendelo otra vez, 
    - abre este notebook y 
    - leelo todo de nuevo. 
    Todos nos merecemos una segunda oportunidad. También tú.
    """
    if df_to_submit.shape == sample.shape:
        if df_to_submit.columns.all() == sample.columns.all():
            if df_to_submit.laptop_ID.all() == sample.laptop_ID.all():
                print("You're ready to submit!")
                submission.to_csv("submission.csv", index = False) #muy importante el index = False
                urllib.request.urlretrieve("https://www.mihaileric.com/static/evaluation-meme-e0a350f278a36346e6d46b139b1d0da0-ed51e.jpg", "gfg.png")     
                img = Image.open("gfg.png")
                img.show()   
            else:
                print("Check the ids and try again")
        else:
            print("Check the names of the columns and try again")
    else:
        print("Check the number of rows and/or columns and try again")
        print("\nMensaje secreto del TA: No me puedo creer que después de todo este notebook hayas hecho algún cambio en las filas de `test.csv`. Lloro.")

In [11]:
chequeador(submission)

You're ready to submit!


NameError: name 'Image' is not defined