In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import root_mean_squared_error

from xgboost import XGBRFRegressor


In [2]:
# Load the raw data, using the "id" column as the row index.
# Only "training_extra.csv" is read for training; "train.csv" is not loaded here.
X_train_full = pd.read_csv("training_extra.csv", index_col="id")
X_test_full = pd.read_csv("test.csv", index_col="id")

In [3]:
# Separate the target ("Price") from the feature columns, then hold out 20% of
# the rows for validation. random_state is fixed so the split is repeatable.
X = X_train_full.drop(columns="Price")
y = X_train_full["Price"]

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [4]:
# Preview the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0_level_0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2164168,Jansport,Polyester,Large,5.0,No,Yes,,Red,6.668811
2791140,Adidas,Polyester,Medium,8.0,No,No,,Red,23.311326
2780243,Puma,Canvas,Small,7.0,No,No,Messenger,Pink,29.674513
4173761,Adidas,Canvas,Small,9.0,Yes,No,Tote,Gray,13.588812
854040,Puma,Canvas,Small,8.0,No,Yes,Messenger,Red,28.545519


In [5]:
# Names of the columns stored with the "object" dtype (the categorical
# features), in their original column order.
is_object_col = X_train.dtypes == "object"
obj_cols = X_train.columns[is_object_col].tolist()
obj_cols


['Brand',
 'Material',
 'Size',
 'Laptop Compartment',
 'Waterproof',
 'Style',
 'Color']

In [6]:
# Collect the distinct values (NaN included) of each categorical column and
# print one array per column, in the same order as obj_cols.
unique_labels = [X_train[name].unique() for name in obj_cols]

for labels in unique_labels:
    print(labels)

['Jansport' 'Adidas' 'Puma' 'Nike' 'Under Armour' nan]
['Polyester' 'Canvas' 'Leather' 'Nylon' nan]
['Large' 'Medium' 'Small' nan]
['No' 'Yes' nan]
['Yes' 'No' nan]
[nan 'Messenger' 'Tote' 'Backpack']
['Red' 'Pink' 'Gray' 'Black' 'Green' 'Blue' nan]


In [7]:
# Split the features by dtype for both splits: the object (categorical)
# columns listed in obj_cols on one side, every non-object column on the other.
cat_X_train, cat_X_val = X_train[obj_cols], X_val[obj_cols]

num_X_train, num_X_val = (
    frame.select_dtypes(exclude=["object"]) for frame in (X_train, X_val)
)


In [8]:
# Count the missing values in each training feature column.
X_train.isna().sum()

Brand                   93537
Material                81934
Size                    65022
Compartments                0
Laptop Compartment      72918
Waterproof              69766
Style                   76840
Color                   99068
Weight Capacity (kg)     1331
dtype: int64

In [9]:
# Categorical columns: fill gaps with the most frequent label of each column.
# The imputer is fitted on the training split only and then reused as-is for
# the validation split.
# Note: wrapping the imputer output in pd.DataFrame resets both the row index
# and the column labels to 0..n-1 integers.
cat_imputer = SimpleImputer(strategy="most_frequent")
cat_imputer.fit(X_train[obj_cols])
cat_X_train = pd.DataFrame(cat_imputer.transform(X_train[obj_cols]))
cat_X_val = pd.DataFrame(cat_imputer.transform(X_val[obj_cols]))

# Non-object columns: fill gaps with the training-split column mean.
num_inputer = SimpleImputer(strategy="mean")
num_inputer.fit(num_X_train)
num_X_train = pd.DataFrame(num_inputer.transform(num_X_train))
num_X_val = pd.DataFrame(num_inputer.transform(num_X_val))


In [10]:
# One-hot encode the imputed categorical columns. The encoder is fitted on the
# training split; categories unseen at fit time are encoded as all zeros.
OH_imputer = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# The encoder returns a plain array, so the original row index is re-attached
# while building each frame.
OH_X_train_cols = pd.DataFrame(
    OH_imputer.fit_transform(cat_X_train), index=X_train.index
)
OH_X_val_cols = pd.DataFrame(OH_imputer.transform(cat_X_val), index=X_val.index)

# Column labels are converted from integers to strings ("0", "1", ...).
OH_X_train_cols.columns = OH_X_train_cols.columns.astype("str")
OH_X_val_cols.columns = OH_X_val_cols.columns.astype("str")

# The imputed numeric frames lost their labels too: restore the row index and
# the two column names so they line up with the encoded frames.
numeric_names = {0: "Compartments", 1: "Weight Capacity (kg)"}
num_X_train = num_X_train.set_axis(X_train.index, axis=0).rename(columns=numeric_names)
num_X_val = num_X_val.set_axis(X_val.index, axis=0).rename(columns=numeric_names)

# Final model inputs: numeric columns first, then the one-hot columns.
X_train_ready = pd.concat([num_X_train, OH_X_train_cols], axis=1)
X_val_ready = pd.concat([num_X_val, OH_X_val_cols], axis=1)

In [11]:
# Fit XGBoost's random-forest regressor with its default hyperparameters on
# the prepared training features, then predict prices for the validation split.
model = XGBRFRegressor()
model.fit(X=X_train_ready, y=y_train)

predictions = model.predict(X=X_val_ready)

In [12]:
# Score the validation predictions with root mean squared error.
RMSE = root_mean_squared_error(y_true=y_val, y_pred=predictions)
print("Root Mean Squared Error Score: ", RMSE)

Root Mean Squared Error Score:  38.889697063242664


# Submission

## Test data preprocessing

In [13]:
# Count the missing values in each column of the test set.
X_test_full.isna().sum()

Brand                   6227
Material                5613
Size                    4381
Compartments               0
Laptop Compartment      4962
Waterproof              4811
Style                   5153
Color                   6785
Weight Capacity (kg)      77
dtype: int64

In [14]:
# Apply the imputers fitted on the training split to the test set; nothing is
# re-fitted here. As with train/validation, wrapping the output in pd.DataFrame
# leaves integer row and column labels.
cat_X_test = pd.DataFrame(cat_imputer.transform(X_test_full[obj_cols]))
num_X_test = pd.DataFrame(
    num_inputer.transform(X_test_full.select_dtypes(exclude=["object"]))
)

In [15]:
# One-hot encode the test categoricals with the encoder that was fitted on the
# training split. Use transform, not fit_transform: refitting on the test set
# would re-learn the categories from test data, so the encoded columns could
# differ in number or order from the ones the model was trained on.
OH_X_test_cols = pd.DataFrame(OH_imputer.transform(cat_X_test))

# The encoder returns a plain array, so put the original row index back.
OH_X_test_cols.index = X_test_full.index

# Give the numeric columns the same index in preparation for concatenation.
num_X_test.index = X_test_full.index

# Restore the numeric column names lost during imputation.
num_X_test = num_X_test.rename(columns={0: "Compartments", 1: "Weight Capacity (kg)"})

# Make the one-hot column labels strings ("0", "1", ...), as done for training.
OH_X_test_cols.columns = OH_X_test_cols.columns.astype("str")

# Final test inputs: numeric columns first, then the one-hot columns.
X_test_ready = pd.concat([num_X_test, OH_X_test_cols], axis=1)

# The model must see exactly the columns it was trained on, in the same order.
assert list(X_test_ready.columns) == list(X_train_ready.columns), (
    "test features do not line up with the training features"
)

In [16]:
# Predict prices for the prepared test features with the fitted model.
test_prediction = model.predict(X=X_test_ready)

In [17]:
# Pair each test row id with its predicted price.
submission_columns = {
    'id': X_test_full.index,
    'Price': test_prediction,
}
submission = pd.DataFrame(submission_columns)
submission

Unnamed: 0,id,Price
0,300000,81.833290
1,300001,83.080635
2,300002,81.452499
3,300003,81.117950
4,300004,78.784607
...,...,...
199995,499995,79.698769
199996,499996,79.776421
199997,499997,82.574570
199998,499998,81.801010


In [18]:
# Write the submission file; index=False keeps the row index out of the CSV,
# so only the "id" and "Price" columns are written.
submission.to_csv(path_or_buf='submission.csv', index=False)