In [206]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns

In [207]:
#Cargar datos
df = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [208]:
#Informacion basica del dataset
print(f"Dimensiones del dataset: {df.shape}")
display(df.head())

Dimensiones del dataset: (300000, 11)


Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312


In [209]:
print("Información de las columnas:")
display(df.info())

Información de las columnas:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    300000 non-null  int64  
 1   Brand                 290295 non-null  object 
 2   Material              291653 non-null  object 
 3   Size                  293405 non-null  object 
 4   Compartments          300000 non-null  float64
 5   Laptop Compartment    292556 non-null  object 
 6   Waterproof            292950 non-null  object 
 7   Style                 292030 non-null  object 
 8   Color                 290050 non-null  object 
 9   Weight Capacity (kg)  299862 non-null  float64
 10  Price                 300000 non-null  float64
dtypes: float64(3), int64(1), object(7)
memory usage: 25.2+ MB


None

In [210]:
print("Estadísticas descriptivas:")
display(df.describe())

Estadísticas descriptivas:


Unnamed: 0,id,Compartments,Weight Capacity (kg),Price
count,300000.0,300000.0,299862.0,300000.0
mean,149999.5,5.44359,18.029994,81.411107
std,86602.684716,2.890766,6.966914,39.03934
min,0.0,1.0,5.0,15.0
25%,74999.75,3.0,12.097867,47.38462
50%,149999.5,5.0,18.068614,80.95612
75%,224999.25,8.0,24.002375,115.01816
max,299999.0,10.0,30.0,150.0


In [211]:
test['Size'] = test['Size'].fillna(test['Size'].mode()[0])
df.dropna(inplace=True)
print(df['Size'].unique())

['Medium' 'Small' 'Large']


In [212]:
display(df.info())
display(test.info())

<class 'pandas.core.frame.DataFrame'>
Index: 246686 entries, 0 to 299999
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    246686 non-null  int64  
 1   Brand                 246686 non-null  object 
 2   Material              246686 non-null  object 
 3   Size                  246686 non-null  object 
 4   Compartments          246686 non-null  float64
 5   Laptop Compartment    246686 non-null  object 
 6   Waterproof            246686 non-null  object 
 7   Style                 246686 non-null  object 
 8   Color                 246686 non-null  object 
 9   Weight Capacity (kg)  246686 non-null  float64
 10  Price                 246686 non-null  float64
dtypes: float64(3), int64(1), object(7)
memory usage: 22.6+ MB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 10 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    200000 non-null  int64  
 1   Brand                 193773 non-null  object 
 2   Material              194387 non-null  object 
 3   Size                  200000 non-null  object 
 4   Compartments          200000 non-null  float64
 5   Laptop Compartment    195038 non-null  object 
 6   Waterproof            195189 non-null  object 
 7   Style                 194847 non-null  object 
 8   Color                 193215 non-null  object 
 9   Weight Capacity (kg)  199923 non-null  float64
dtypes: float64(2), int64(1), object(7)
memory usage: 15.3+ MB


None

In [213]:
size_order = ['Small', 'Medium', 'Large']
# 2. Configura el encoder
encoder = OrdinalEncoder(categories=[size_order])

# 3. Aplica a la columna 'Size'
df['Size'] = encoder.fit_transform(df[['Size']])
test['Size'] = encoder.transform(test[['Size']])

In [214]:
categorical_cols = ['Brand', 'Material','Laptop Compartment','Waterproof', 'Style', 'Color']
df = pd.get_dummies(df,columns=categorical_cols)
test = pd.get_dummies(test,columns=categorical_cols)

In [215]:
display(df.head())

Unnamed: 0,id,Size,Compartments,Weight Capacity (kg),Price,Brand_Adidas,Brand_Jansport,Brand_Nike,Brand_Puma,Brand_Under Armour,...,Waterproof_Yes,Style_Backpack,Style_Messenger,Style_Tote,Color_Black,Color_Blue,Color_Gray,Color_Green,Color_Pink,Color_Red
0,0,1.0,7.0,11.611723,112.15875,False,True,False,False,False,...,False,False,False,True,True,False,False,False,False,False
1,1,0.0,10.0,27.078537,68.88056,False,True,False,False,False,...,True,False,True,False,False,False,False,True,False,False
2,2,0.0,2.0,16.64376,39.1732,False,False,False,False,True,...,False,False,True,False,False,False,False,False,False,True
3,3,0.0,8.0,12.93722,80.60793,False,False,True,False,False,...,False,False,True,False,False,False,False,True,False,False
4,4,1.0,1.0,17.749338,86.02312,True,False,False,False,False,...,True,False,True,False,False,False,False,True,False,False


In [216]:
X = df.drop(['id','Price'],axis=1)
Y = df['Price']
X_train = df.drop(['id','Price'],axis=1)
Y_train = df['Price']
X_valid = df.drop(['id','Price'],axis=1)
Y_valid = df['Price']
X_test = test.drop(['id'],axis=1)



In [217]:
model = xgb.XGBRegressor(
    objective='reg:squarederror',  # Función de pérdida para regresión
    n_estimators=1000,            # Número de árboles
    learning_rate=0.05,           # Tasa de aprendizaje
    max_depth=5,                  # Profundidad máxima de los árboles
    early_stopping_rounds=10,     # Parada temprana si no mejora
    eval_metric='rmse'            # Métrica de evaluación
)

model.fit(
    X_train, Y_train,
    eval_set=[(X_valid, Y_valid)],  # Datos de validación
    verbose=True                  # Muestra progreso
)

# Predicciones
predictions = model.predict(X_test)

[0]	validation_0-rmse:38.93690
[1]	validation_0-rmse:38.93410
[2]	validation_0-rmse:38.93158
[3]	validation_0-rmse:38.92925
[4]	validation_0-rmse:38.92715
[5]	validation_0-rmse:38.92496
[6]	validation_0-rmse:38.92313
[7]	validation_0-rmse:38.92113
[8]	validation_0-rmse:38.91932
[9]	validation_0-rmse:38.91772
[10]	validation_0-rmse:38.91603
[11]	validation_0-rmse:38.91439
[12]	validation_0-rmse:38.91287
[13]	validation_0-rmse:38.91138
[14]	validation_0-rmse:38.91005
[15]	validation_0-rmse:38.90842
[16]	validation_0-rmse:38.90697
[17]	validation_0-rmse:38.90556
[18]	validation_0-rmse:38.90403
[19]	validation_0-rmse:38.90260
[20]	validation_0-rmse:38.90143
[21]	validation_0-rmse:38.90017
[22]	validation_0-rmse:38.89893
[23]	validation_0-rmse:38.89773
[24]	validation_0-rmse:38.89655
[25]	validation_0-rmse:38.89537
[26]	validation_0-rmse:38.89423
[27]	validation_0-rmse:38.89298
[28]	validation_0-rmse:38.89175
[29]	validation_0-rmse:38.89068
[30]	validation_0-rmse:38.88953
[31]	validation_0-

In [218]:
print(len(sub.Price))

200000


In [219]:
sub = pd.read_csv('sample_submission.csv')
sub.Price = predictions
sub.to_csv('submission.csv', index=False)
sub.head()

Unnamed: 0,id,Price
0,300000,82.390839
1,300001,82.201706
2,300002,85.009735
3,300003,79.209511
4,300004,75.155807
