### Backpack Prediction Challenge

**Welcome to my notebook for the Backpack Price Prediction Challenge from the 2025 Kaggle Playground Series! 🏆 This competition presents an exciting opportunity to explore tabular data, refine our feature engineering, and experiment with different machine learning models to predict backpack prices.**


In [19]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from warnings import filterwarnings
filterwarnings('ignore')

In [20]:
# Importing the data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Checking the number of rows and columns of data
print("train: ",train.shape)
print("test: ", test.shape)

train:  (300000, 11)
test:  (200000, 10)


In [21]:
# See the first few lines of train data
train.head()

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312


In [22]:
# See the first few lines of test data
test.head()

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,300000,Puma,Leather,Small,2.0,No,No,Tote,Green,20.671147
1,300001,Nike,Canvas,Medium,7.0,No,Yes,Backpack,Green,13.564105
2,300002,Adidas,Canvas,Large,9.0,No,Yes,Messenger,Blue,11.809799
3,300003,Adidas,Nylon,Large,1.0,Yes,No,Messenger,Green,18.477036
4,300004,,Nylon,Large,2.0,Yes,Yes,Tote,Black,9.907953


In [23]:
# Missing value check
train.isnull().sum()

id                         0
Brand                   9705
Material                8347
Size                    6595
Compartments               0
Laptop Compartment      7444
Waterproof              7050
Style                   7970
Color                   9950
Weight Capacity (kg)     138
Price                      0
dtype: int64

In [24]:
import pandas as pd

# Fill in missing values ​​with median
train['Weight Capacity (kg)'].fillna(train['Weight Capacity (kg)'].median(), inplace=True)

# Check the results
print(train.isnull().sum())


id                         0
Brand                   9705
Material                8347
Size                    6595
Compartments               0
Laptop Compartment      7444
Waterproof              7050
Style                   7970
Color                   9950
Weight Capacity (kg)       0
Price                      0
dtype: int64


In [25]:
# Count the number of unique values ​​in each column
train.nunique()

id                      300000
Brand                        5
Material                     4
Size                         3
Compartments                10
Laptop Compartment           2
Waterproof                   2
Style                        3
Color                        6
Weight Capacity (kg)    181596
Price                    48212
dtype: int64

In [26]:
# Looking at unique values ​​in the "Price" column in the train dataset
train.Price.unique()

array([112.15875,  68.88056,  39.1732 , ...,  78.7574 , 131.37288,
        41.96325])

In [27]:
# See general information of the train dataset
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    300000 non-null  int64  
 1   Brand                 290295 non-null  object 
 2   Material              291653 non-null  object 
 3   Size                  293405 non-null  object 
 4   Compartments          300000 non-null  float64
 5   Laptop Compartment    292556 non-null  object 
 6   Waterproof            292950 non-null  object 
 7   Style                 292030 non-null  object 
 8   Color                 290050 non-null  object 
 9   Weight Capacity (kg)  300000 non-null  float64
 10  Price                 300000 non-null  float64
dtypes: float64(3), int64(1), object(7)
memory usage: 25.2+ MB


In [28]:
# See general information of the test dataset
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 10 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    200000 non-null  int64  
 1   Brand                 193773 non-null  object 
 2   Material              194387 non-null  object 
 3   Size                  195619 non-null  object 
 4   Compartments          200000 non-null  float64
 5   Laptop Compartment    195038 non-null  object 
 6   Waterproof            195189 non-null  object 
 7   Style                 194847 non-null  object 
 8   Color                 193215 non-null  object 
 9   Weight Capacity (kg)  199923 non-null  float64
dtypes: float64(2), int64(1), object(7)
memory usage: 15.3+ MB


In [29]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def convert_object_columns(df):
    # Convert categorical variables of type object to numeric values ​​using Label Encoder
    label_encoder = LabelEncoder()
    
    for col in df.select_dtypes(include=['object']).columns:
        if df[col].nunique() == 2:  
            df[col] = label_encoder.fit_transform(df[col].astype(str))
        else:
            # If there are categorical variables, convert them to numerical values
            df[col] = df[col].astype('category').cat.codes
    
    return df

# Transformation process for train and test data
train = convert_object_columns(train)
test = convert_object_columns(test)

# Check the results
print(train.dtypes)
print(test.dtypes)

id                        int64
Brand                      int8
Material                   int8
Size                       int8
Compartments            float64
Laptop Compartment        int32
Waterproof                int32
Style                      int8
Color                      int8
Weight Capacity (kg)    float64
Price                   float64
dtype: object
id                        int64
Brand                      int8
Material                   int8
Size                       int8
Compartments            float64
Laptop Compartment        int32
Waterproof                int32
Style                      int8
Color                      int8
Weight Capacity (kg)    float64
dtype: object


In [30]:
# Create input properties by dropping id and Rings columns

X = train.drop(['id', 'Price'], axis=1).values
y = train['Price'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

In [31]:
# Create an XGBoost Regression using the XGBRegressor class and then fit the training data to the model using the fit() method

from xgboost.sklearn import XGBRegressor

regressor = XGBRegressor()

regressor.fit(X_train, y_train)

In [32]:
# Make predictions based on input features (X_test) in the test set using a trained regression model
y_pred = regressor.predict(X_test)
print(y_pred)

[83.85969  79.585205 81.46645  ... 88.658226 78.268166 86.000046]


In [33]:
# Create a new data frame by removing certain column from the data frame named "test"
test1 = test.drop(columns = 'id')

In [34]:
# Create a NumPy array by selecting all rows and columns of dataframe test1
unseen = test1.iloc[:, :].values

In [35]:
# Make predictions on "unseen" data with a model called regressor
unseen_pred = regressor.predict(unseen)

In [36]:
# Adds a new column named "Rings" to the test data frame and assigns the values of the array named "unseen_pred" to this column
test['Price'] = unseen_pred

In [37]:
test.head()

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,300000,3,1,2,2.0,0,0,2,3,20.671147,81.832939
1,300001,2,0,1,7.0,0,1,0,3,13.564105,80.459015
2,300002,0,0,0,9.0,0,1,1,1,11.809799,87.955612
3,300003,0,2,0,1.0,1,0,1,3,18.477036,80.215759
4,300004,-1,2,0,2.0,1,1,2,0,9.907953,72.448708


In [38]:
# Create a DataFrame containing predictions
# submission = test[['id', 'Price']]

# Convert to the format required by the submission file (for example, CSV format)
# submission.to_csv('submission.csv', index=False)