In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import re
import urllib.request

In [5]:
df = pd.read_csv(r'C:\Users\alber\OneDrive\Documentos\GitHub\DS_ONLINE_THEBRIDGE_ATC\Masterclass\Kaggle\data\train.csv', index_col="laptop_ID")


In [6]:
df.head()

Unnamed: 0_level_0,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_in_euros
laptop_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
755,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i3 6006U 2GHz,8GB,256GB SSD,Intel HD Graphics 520,Windows 10,1.86kg,539.0
618,Dell,Inspiron 7559,Gaming,15.6,Full HD 1920x1080,Intel Core i7 6700HQ 2.6GHz,16GB,1TB HDD,Nvidia GeForce GTX 960<U+039C>,Windows 10,2.59kg,879.01
909,HP,ProBook 450,Notebook,15.6,Full HD 1920x1080,Intel Core i7 7500U 2.7GHz,8GB,1TB HDD,Nvidia GeForce 930MX,Windows 10,2.04kg,900.0
2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
286,Dell,Inspiron 3567,Notebook,15.6,Full HD 1920x1080,Intel Core i3 6006U 2.0GHz,4GB,1TB HDD,AMD Radeon R5 M430,Linux,2.25kg,428.0


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 912 entries, 755 to 229
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Company           912 non-null    object 
 1   Product           912 non-null    object 
 2   TypeName          912 non-null    object 
 3   Inches            912 non-null    float64
 4   ScreenResolution  912 non-null    object 
 5   Cpu               912 non-null    object 
 6   Ram               912 non-null    object 
 7   Memory            912 non-null    object 
 8   Gpu               912 non-null    object 
 9   OpSys             912 non-null    object 
 10  Weight            912 non-null    object 
 11  Price_in_euros    912 non-null    float64
dtypes: float64(2), object(10)
memory usage: 92.6+ KB


In [9]:
df.tail()

Unnamed: 0_level_0,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_in_euros
laptop_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
28,Dell,Inspiron 5570,Notebook,15.6,Full HD 1920x1080,Intel Core i5 8250U 1.6GHz,8GB,256GB SSD,AMD Radeon 530,Windows 10,2.2kg,800.0
1160,HP,Spectre Pro,2 in 1 Convertible,13.3,Full HD / Touchscreen 1920x1080,Intel Core i5 6300U 2.4GHz,8GB,256GB SSD,Intel HD Graphics 520,Windows 10,1.48kg,1629.0
78,Lenovo,IdeaPad 320-15IKBN,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,2TB HDD,Intel HD Graphics 620,No OS,2.2kg,519.0
23,HP,255 G6,Notebook,15.6,1366x768,AMD E-Series E2-9000e 1.5GHz,4GB,500GB HDD,AMD Radeon R2,No OS,1.86kg,258.0
229,Dell,Alienware 17,Gaming,17.3,IPS Panel Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,16GB,256GB SSD + 1TB HDD,Nvidia GeForce GTX 1060,Windows 10,4.42kg,2456.34


In [13]:
df.describe()

Unnamed: 0,Inches,Price_in_euros
count,912.0,912.0
mean,14.981579,1111.72409
std,1.436719,687.959172
min,10.1,174.0
25%,14.0,589.0
50%,15.6,978.0
75%,15.6,1483.9425
max,18.4,6099.0


In [14]:

# Procesamiento de datos
df["Ram"] = df["Ram"].str.replace("GB", "").astype(int)
df["Weight"] = df["Weight"].str.replace("kg", "").astype(float)

# Extraer información más detallada de la CPU
def extract_cpu_info(cpu):
    parts = cpu.split()
    if len(parts) > 2 and parts[0] == "Intel":
        return f"{parts[1]} {parts[2]}"
    return parts[0]

df["Cpu_Brand"] = df["Cpu"].apply(extract_cpu_info)

# Extraer resolución de pantalla
def extract_resolution(screen_res):
    match = re.search(r'(\d+)x(\d+)', str(screen_res))  # Asegurar que el valor sea string
    if match:
        return int(match.group(1)) * int(match.group(2))  # Multiplicar ancho x alto
    return np.nan  # Devolver NaN si no hay coincidencia
df["Resolution"] = df["ScreenResolution"].apply(extract_resolution)

# Separar memoria en SSD y HDD
def process_memory(mem):
    ssd, hdd = 0, 0
    for part in str(mem).split():
        if "SSD" in part:
            ssd = int(re.search(r'(\d+)', part).group(1)) if re.search(r'(\d+)', part) else 0
        elif "HDD" in part:
            hdd = int(re.search(r'(\d+)', part).group(1)) if re.search(r'(\d+)', part) else 0
    return pd.Series([ssd, hdd])
df[['SSD', 'HDD']] = df['Memory'].apply(process_memory)

# Codificar sistema operativo (Windows o no)
df['Windows'] = df['OpSys'].apply(lambda x: 1 if 'Windows' in str(x) else 0)

# Seleccionar características
X = df.drop(columns=["Price_in_euros", "Product", "Cpu", "OpSys", "Memory", "ScreenResolution"])
y = df["Price_in_euros"]

# Dividir en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identificar variables categóricas y numéricas
categorical_features = ["Company", "TypeName", "Gpu", "Cpu_Brand"]
numeric_features = ["Inches", "Ram", "Weight", "Resolution", "SSD", "HDD", "Windows"]

# Preprocesador
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numeric_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
])

# Modelo con optimización de hiperparámetros
param_grid = {
    "regressor__n_estimators": [100, 200, 300],
    "regressor__max_depth": [10, 15, 20],
    "regressor__min_samples_split": [2, 4, 6],
    "regressor__min_samples_leaf": [1, 2, 4]
}

model = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", RandomForestRegressor(random_state=42))
])

search = GridSearchCV(model, param_grid, cv=3, scoring='neg_root_mean_squared_error', n_jobs=-1)
search.fit(X_train, y_train)

# Evaluar modelo
best_model = search.best_estimator_
y_pred = best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Optimized RMSE: {rmse}")



Optimized RMSE: 380.38124196031555


In [15]:
# Cargar datos de prueba
X_pred = pd.read_csv(r'C:\Users\alber\OneDrive\Documentos\GitHub\DS_ONLINE_THEBRIDGE_ATC\Masterclass\Kaggle\data\test.csv', index_col="laptop_ID")

# Aplicar el mismo procesamiento
X_pred["Ram"] = X_pred["Ram"].str.replace("GB", "").astype(int)
X_pred["Weight"] = X_pred["Weight"].str.replace("kg", "").astype(float)
X_pred["Cpu_Brand"] = X_pred["Cpu"].apply(extract_cpu_info)
X_pred["Resolution"] = X_pred["ScreenResolution"].apply(extract_resolution)
X_pred[['SSD', 'HDD']] = X_pred['Memory'].apply(process_memory)
X_pred['Windows'] = X_pred['OpSys'].apply(lambda x: 1 if 'Windows' in str(x) else 0)

# Eliminar columnas innecesarias
X_pred = X_pred.drop(columns=["Product", "Cpu", "OpSys", "Memory", "ScreenResolution"])

# Hacer predicciones
predictions = best_model.predict(X_pred)

In [17]:
X_pred

Unnamed: 0_level_0,Company,TypeName,Inches,Ram,Gpu,Weight,Cpu_Brand,Resolution,SSD,HDD,Windows
laptop_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
209,Lenovo,Gaming,15.6,16,Nvidia GeForce GTX 1060,2.400,Core i7,2073600,0,0,0
1281,Acer,Notebook,15.6,4,Intel HD Graphics 400,2.400,Celeron Dual,1049088,0,0,0
1168,Lenovo,Notebook,15.6,4,Intel HD Graphics 520,1.900,Core i3,1049088,0,0,0
1231,Dell,2 in 1 Convertible,15.6,8,Intel HD Graphics 620,2.191,Core i5,2073600,0,0,1
1020,HP,Notebook,14.0,4,Intel HD Graphics 620,1.950,Core i5,2073600,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
820,MSI,Gaming,17.3,16,Nvidia GeForce GTX 1070,2.900,Core i7,2073600,0,0,1
948,Toshiba,Notebook,14.0,4,Intel HD Graphics 520,1.470,Core i5,2073600,0,0,1
483,Dell,Workstation,15.6,8,Nvidia Quadro M1200,1.780,Core i7,2073600,0,0,1
1017,HP,Notebook,14.0,4,Intel HD Graphics 620,1.640,Core i5,1049088,0,0,1


In [16]:
predictions

array([1598.71566852,  291.30766279,  424.81445556,  891.68823807,
        986.05248333,  430.2524797 ,  861.2394    , 1049.30637143,
       1062.47097245,  383.77586   , 2239.01507788, 1415.04606111,
        578.3443    , 1571.81468801,  705.5247    ,  691.44336667,
       2078.25103877, 1313.13997778, 1628.83576722,  668.95185556,
       1713.03507317,  406.55691794,  730.0661    , 1264.46271896,
        468.40885661,  768.48453298,  539.91163333,  756.05634444,
       3054.82755111, 1215.74383148, 2399.46726677,  464.37657735,
        710.15751667, 3187.54067833, 2007.11526099, 1485.38321118,
        751.04898333, 1484.42053233,  924.50919852, 1435.41301515,
        619.19189167,  806.42566667,  540.44434444, 1167.83034719,
       1391.348388  , 1022.40915106,  938.45336052,  674.89242487,
        582.83284667,  368.01019072, 1777.25131472,  820.54372183,
       1062.09900815,  665.532765  , 1977.72318333, 2091.74337111,
        615.22516667, 1029.065282  , 1040.68732838,  545.47392

In [19]:
sample= pd.read_csv(r'C:\Users\alber\OneDrive\Documentos\GitHub\DS_ONLINE_THEBRIDGE_ATC\Masterclass\Kaggle\data\sample_submission.csv')

In [20]:
sample.head()

Unnamed: 0,laptop_ID,Price_in_euros
0,209,1949.1
1,1281,805.0
2,1168,1101.0
3,1231,1293.8
4,1020,1832.6


In [21]:
sample.shape

(391, 2)

In [22]:
submission = pd.DataFrame({"laptop_ID": X_pred.index, "Price_in_euros": predictions})

In [23]:
submission.head()

Unnamed: 0,laptop_ID,Price_in_euros
0,209,1598.715669
1,1281,291.307663
2,1168,424.814456
3,1231,891.688238
4,1020,986.052483


In [24]:
submission.shape

(391, 2)

In [27]:
def chequeador(df_to_submit):
    """
    Esta función se asegura de que tu submission tenga la forma requerida por Kaggle.
    
    Si es así, se guardará el dataframe en un `csv` y estará listo para subir a Kaggle.
    
    Si no, LEE EL MENSAJE Y HAZLE CASO.
    
    Si aún no:
    - apaga tu ordenador, 
    - date una vuelta, 
    - enciendelo otra vez, 
    - abre este notebook y 
    - leelo todo de nuevo. 
    Todos nos merecemos una segunda oportunidad. También tú.
    """
    if df_to_submit.shape == sample.shape:
        if df_to_submit.columns.all() == sample.columns.all():
            if df_to_submit.laptop_ID.all() == sample.laptop_ID.all():
                print("You're ready to submit!")
                submission.to_csv("submission5.csv", index = False) #muy importante el index = False
                urllib.request.urlretrieve("https://www.mihaileric.com/static/evaluation-meme-e0a350f278a36346e6d46b139b1d0da0-ed51e.jpg", "gfg.png")     
                img = Image.open("gfg.png")
                img.show()   
            else:
                print("Check the ids and try again")
        else:
            print("Check the names of the columns and try again")
    else:
        print("Check the number of rows and/or columns and try again")
        print("\nMensaje secreto del TA: No me puedo creer que después de todo este notebook hayas hecho algún cambio en las filas de `test.csv`. Lloro.")

In [28]:
chequeador(submission)

You're ready to submit!


NameError: name 'Image' is not defined