# Backpack Prediction Challenges

### Step 1 - Data Ingestion

In [2]:
import pandas as pd
df = pd.read_csv("Backpack Prediction(KNN)_train.csv")
df.head()

Unnamed: 0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),id,Price
0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,0,112.15875
1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,1,68.88056
2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,2,39.1732
3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,3,80.60793
4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,4,86.02312


### Step 2 - Perform Basic Data Quality Checks

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Brand                 300000 non-null  object 
 1   Material              300000 non-null  object 
 2   Size                  300000 non-null  object 
 3   Compartments          300000 non-null  float64
 4   Laptop Compartment    300000 non-null  object 
 5   Waterproof            300000 non-null  object 
 6   Style                 300000 non-null  object 
 7   Color                 300000 non-null  object 
 8   Weight Capacity (kg)  300000 non-null  float64
 9   id                    300000 non-null  int64  
 10  Price                 300000 non-null  float64
dtypes: float64(3), int64(1), object(7)
memory usage: 25.2+ MB


In [5]:
df.isna().sum()

Brand                   0
Material                0
Size                    0
Compartments            0
Laptop Compartment      0
Waterproof              0
Style                   0
Color                   0
Weight Capacity (kg)    0
id                      0
Price                   0
dtype: int64

In [6]:
df.duplicated().sum()

np.int64(0)

In [7]:
df.columns

Index(['Brand', 'Material', 'Size', 'Compartments', 'Laptop Compartment',
       'Waterproof', 'Style', 'Color', 'Weight Capacity (kg)', 'id', 'Price'],
      dtype='object')

### Step 3 - Seperate X and Y

In [8]:
X = df.drop(columns=["id", "Price"])
Y = df[["Price"]]

In [9]:
X.head()

Unnamed: 0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723
1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537
2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376
3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722
4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338


In [10]:
Y.head()

Unnamed: 0,Price
0,112.15875
1,68.88056
2,39.1732
3,80.60793
4,86.02312


### Step 4 - Apply Preprocessing on X

In [11]:
X.dtypes

Brand                    object
Material                 object
Size                     object
Compartments            float64
Laptop Compartment       object
Waterproof               object
Style                    object
Color                    object
Weight Capacity (kg)    float64
dtype: object

In [12]:
cat = list(X.columns[X.dtypes == "object"])
con = list(X.columns[X.dtypes != "object"])

In [13]:
cat

['Brand',
 'Material',
 'Size',
 'Laptop Compartment',
 'Waterproof',
 'Style',
 'Color']

In [14]:
con

['Compartments', 'Weight Capacity (kg)']

In [15]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [16]:
num_pipe = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler()
)

In [17]:
cat_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore", sparse_output=False)
)

In [18]:
pre = ColumnTransformer(
    [
        ("num", num_pipe, con),
        ("cat", cat_pipe, cat)
    ]
).set_output(transform="pandas")

In [19]:
X_pre = pre.fit_transform(X)
X_pre.head()

Unnamed: 0,num__Compartments,num__Weight Capacity (kg),cat__Brand_Adidas,cat__Brand_Jansport,cat__Brand_Nike,cat__Brand_Puma,cat__Brand_Under Armour,cat__Material_Canvas,cat__Material_Leather,cat__Material_Nylon,...,cat__Waterproof_Yes,cat__Style_Backpack,cat__Style_Messenger,cat__Style_Tote,cat__Color_Black,cat__Color_Blue,cat__Color_Gray,cat__Color_Green,cat__Color_Pink,cat__Color_Red
0,0.538408,-0.921426,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1.576198,1.299024,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-1.19124,-0.199016,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.884338,-0.731135,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-1.53717,-0.040297,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


### Step 5 - Train Test Split

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
xtrain, xtest, ytrain, ytest = train_test_split(X_pre, Y, test_size=0.2, random_state=46)

In [22]:
xtrain.head()

Unnamed: 0,num__Compartments,num__Weight Capacity (kg),cat__Brand_Adidas,cat__Brand_Jansport,cat__Brand_Nike,cat__Brand_Puma,cat__Brand_Under Armour,cat__Material_Canvas,cat__Material_Leather,cat__Material_Nylon,...,cat__Waterproof_Yes,cat__Style_Backpack,cat__Style_Messenger,cat__Style_Tote,cat__Color_Black,cat__Color_Blue,cat__Color_Gray,cat__Color_Green,cat__Color_Pink,cat__Color_Red
161325,-0.84531,-1.403557,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
163496,-1.19124,-0.949295,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
261697,0.884338,1.512781,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
82951,-0.84531,1.176284,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
262857,-0.499381,-0.815456,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [23]:
ytrain.head()

Unnamed: 0,Price
161325,87.75687
163496,134.04751
261697,99.73909
82951,124.35109
262857,28.86967


In [24]:
xtest.head()

Unnamed: 0,num__Compartments,num__Weight Capacity (kg),cat__Brand_Adidas,cat__Brand_Jansport,cat__Brand_Nike,cat__Brand_Puma,cat__Brand_Under Armour,cat__Material_Canvas,cat__Material_Leather,cat__Material_Nylon,...,cat__Waterproof_Yes,cat__Style_Backpack,cat__Style_Messenger,cat__Style_Tote,cat__Color_Black,cat__Color_Blue,cat__Color_Gray,cat__Color_Green,cat__Color_Pink,cat__Color_Red
90250,1.576198,1.180568,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
68766,-0.153451,1.270299,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
28522,-0.499381,0.308236,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
11059,0.884338,-1.87062,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
101803,-1.53717,-0.877581,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [25]:
ytest.head()

Unnamed: 0,Price
90250,131.7932
68766,90.45775
28522,45.84078
11059,115.80788
101803,15.17464


In [26]:
xtrain.shape

(240000, 27)

In [27]:
ytrain.shape

(240000, 1)

### Step 6 - Model Selection

In [28]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb

In [38]:
models = [
    LinearRegression(),
    DecisionTreeRegressor(),
    RandomForestRegressor(max_depth=3)
]

In [39]:
from sklearn.metrics import root_mean_squared_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

In [40]:
def evaluate_single_model(model, xtrain, ytrain, xtest, ytest):

    # Fit the model 
    model.fit(xtrain, ytrain)

    # Predict the results for train and test
    ypred_train = model.predict(xtrain)
    ypred_test = model.predict(xtest)

    # Calculate the regression metrics for train and test
    rmse_train = root_mean_squared_error(ytrain, ypred_train)
    rmse_test = root_mean_squared_error(ytest, ypred_test)

    mse_train = mean_squared_error(ytrain, ypred_train)
    mse_test = mean_squared_error(ytest, ypred_test)

    r2_train = r2_score(ytrain, ypred_train)
    r2_test = r2_score(ytest, ypred_test)

    # 5 fold crossvalidaton on train data
    scores = cross_val_score(model, xtrain, ytrain, cv=5, scoring="r2", n_jobs=-1)
    r2_cv = scores.mean()

    # Create a dictionary for final results
    res = {
        "model_name": model.__class__.__name__,
        "model": model,
        "rmse_train": rmse_train,
        "mse_train": mse_train,
        "r2_train": r2_train,
        "rmse_test":rmse_test,
        "mse_test":mse_test,
        "r2_test":r2_test,
        "r2_cv": r2_cv
    }

    return res


In [41]:
def algo_evaluation(models: list, xtrain, ytrain, xtest, ytest):

    # Intialize blank list for results
    results = []

    # Apply for loop on models
    for model in models:
        r = evaluate_single_model(model, xtrain, ytrain, xtest, ytest)
        print(r)
        results.append(r)

    # Save the results in dataframe
    res_df = pd.DataFrame(results)

    # Sort the results
    sort_df = res_df.sort_values(by="rmse_test").reset_index(drop=True)

    # Get the best model
    best_model = sort_df.iloc[0]["model"]
    
    return sort_df.round(4), best_model

In [42]:
models

[LinearRegression(),
 DecisionTreeRegressor(),
 RandomForestRegressor(max_depth=3)]

In [43]:
res_df, best_model = algo_evaluation(models, xtrain.values, ytrain.values, xtest.values, ytest.values)

{'model_name': 'LinearRegression', 'model': LinearRegression(), 'rmse_train': np.float64(39.013395814398386), 'mse_train': np.float64(1522.0450529709176), 'r2_train': 0.0010222994918025474, 'rmse_test': np.float64(39.04781985913539), 'mse_test': np.float64(1524.732235751488), 'r2_test': 0.0007746420852517977, 'r2_cv': np.float64(0.0007978036529403098)}
{'model_name': 'DecisionTreeRegressor', 'model': DecisionTreeRegressor(), 'rmse_train': np.float64(0.8213958544282826), 'mse_train': np.float64(0.6746911496719683), 'r2_train': 0.9995571738090558, 'rmse_test': np.float64(56.2521504173904), 'mse_test': np.float64(3164.3044265807152), 'r2_test': -1.0737104844136578, 'r2_cv': np.float64(-1.0588312224345107)}


  return fit_method(estimator, *args, **kwargs)


{'model_name': 'RandomForestRegressor', 'model': RandomForestRegressor(max_depth=3), 'rmse_train': np.float64(39.00970248618719), 'mse_train': np.float64(1521.7568880608387), 'r2_train': 0.0012114333933751498, 'rmse_test': np.float64(39.040267429461466), 'mse_test': np.float64(1524.14248096387), 'r2_test': 0.001161134824712362, 'r2_cv': np.float64(0.0008177600781813288)}


In [44]:
res_df

Unnamed: 0,model_name,model,rmse_train,mse_train,r2_train,rmse_test,mse_test,r2_test,r2_cv
0,RandomForestRegressor,"(DecisionTreeRegressor(max_depth=3, max_featur...",39.0097,1521.7569,0.0012,39.0403,1524.1425,0.0012,0.0008
1,LinearRegression,LinearRegression(),39.0134,1522.0451,0.001,39.0478,1524.7322,0.0008,0.0008
2,DecisionTreeRegressor,DecisionTreeRegressor(),0.8214,0.6747,0.9996,56.2522,3164.3044,-1.0737,-1.0588


In [45]:
best_model

#### From above metrics we can say Random Forest is the best model here evaluated using RMSE.

### Use out of Sample Prediction

In [48]:
xnew = pd.read_csv("Backpack Prediction(KNN)_test.csv")
xnew.head()

Unnamed: 0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),id,Price
0,Puma,Leather,Small,2.0,No,No,Tote,Green,20.671147,300000,
1,Nike,Canvas,Medium,7.0,No,Yes,Backpack,Green,13.564105,300001,
2,Adidas,Canvas,Large,9.0,No,Yes,Messenger,Blue,11.809799,300002,
3,Adidas,Nylon,Large,1.0,Yes,No,Messenger,Green,18.477036,300003,
4,Under Armour,Nylon,Large,2.0,Yes,Yes,Tote,Black,9.907953,300004,


In [49]:
pre

In [50]:
xnew_pre = pre.transform(xnew)
xnew_pre.head()

Unnamed: 0,num__Compartments,num__Weight Capacity (kg),cat__Brand_Adidas,cat__Brand_Jansport,cat__Brand_Nike,cat__Brand_Puma,cat__Brand_Under Armour,cat__Material_Canvas,cat__Material_Leather,cat__Material_Nylon,...,cat__Waterproof_Yes,cat__Style_Backpack,cat__Style_Messenger,cat__Style_Tote,cat__Color_Black,cat__Color_Blue,cat__Color_Gray,cat__Color_Green,cat__Color_Pink,cat__Color_Red
0,-1.19124,0.379165,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.538408,-0.641138,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.230268,-0.89299,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,-1.53717,0.064173,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-1.19124,-1.166023,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0


In [51]:
preds = best_model.predict(xnew_pre)
preds



array([81.28671998, 82.39961601, 82.35096152, ..., 81.6945118 ,
       81.57372281, 80.92728841])

In [52]:
res = xnew[["id"]]
res

Unnamed: 0,id
0,300000
1,300001
2,300002
3,300003
4,300004
...,...
199995,499995
199996,499996
199997,499997
199998,499998


In [53]:
res["Price"] = preds.round(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res["Price"] = preds.round(2)


In [54]:
res

Unnamed: 0,id,Price
0,300000,81.29
1,300001,82.40
2,300002,82.35
3,300003,81.66
4,300004,79.88
...,...,...
199995,499995,80.23
199996,499996,78.16
199997,499997,81.69
199998,499998,81.57


In [55]:
res.to_csv("Submission.csv", index=False)