In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import root_mean_squared_error

from xgboost import XGBRFRegressor


In [2]:
# Read the three CSV files, using each file's "id" column as the row index.
# All files take the same read options, so they are loaded in a single pass.
X_train_full, X_training_extra_full, X_test_full = (
    pd.read_csv(csv_name, index_col="id")
    for csv_name in ("train.csv", "training_extra.csv", "test.csv")
)


In [3]:
# Separate the target column ("Price") from the feature columns.
X = X_train_full.drop(columns="Price")
y = X_train_full["Price"]

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [4]:
# Preview the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0_level_0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
154452,Under Armour,Polyester,Small,8.0,No,Yes,Backpack,Red,16.817078
297220,,Leather,Medium,10.0,,Yes,Backpack,Blue,16.750179
177,Under Armour,Nylon,Medium,8.0,Yes,Yes,Messenger,Green,13.262331
44603,Adidas,Polyester,Small,10.0,No,No,Tote,Gray,29.694404
158177,Puma,Nylon,Small,2.0,No,Yes,Messenger,Blue,11.291522


In [5]:
# Names of every column whose dtype is "object", kept in their original
# column order. These are the columns treated as categorical below.
column_dtypes = X_train.dtypes
obj_cols = column_dtypes[column_dtypes == "object"].index.tolist()
obj_cols


['Brand',
 'Material',
 'Size',
 'Laptop Compartment',
 'Waterproof',
 'Style',
 'Color']

In [6]:
# Collect the distinct values (NaN included) of each object-typed column and
# print them, one column per line, in the same order as obj_cols.
unique_labels = []
for column_name in obj_cols:
    column_values = X_train[column_name].unique()
    unique_labels.append(column_values)
    print(column_values)

['Under Armour' nan 'Adidas' 'Puma' 'Jansport' 'Nike']
['Polyester' 'Leather' 'Nylon' nan 'Canvas']
['Small' 'Medium' 'Large' nan]
['No' nan 'Yes']
['Yes' 'No' nan]
['Backpack' 'Messenger' 'Tote' nan]
['Red' 'Blue' 'Green' 'Gray' 'Pink' nan 'Black']


In [7]:
# Split the training and validation features by dtype: the object-typed
# columns go into the cat_* frames and every other column into the num_* frames.
cat_X_train, cat_X_val = (frame[obj_cols] for frame in (X_train, X_val))
num_X_train, num_X_val = (
    frame.select_dtypes(exclude=["object"]) for frame in (X_train, X_val)
)


In [8]:
# Count the missing values in each column of the training features.
X_train.isna().sum()

Brand                   7767
Material                6693
Size                    5274
Compartments               0
Laptop Compartment      5938
Waterproof              5697
Style                   6399
Color                   7972
Weight Capacity (kg)     110
dtype: int64

In [9]:
# Categorical columns: replace missing values with each column's most frequent
# value. The imputer is fitted on the training split only and then applied to
# both the training and the validation split.
cat_imputer = SimpleImputer(strategy="most_frequent")
cat_imputer.fit(X_train[obj_cols])
cat_X_train = pd.DataFrame(cat_imputer.transform(X_train[obj_cols]))
cat_X_val = pd.DataFrame(cat_imputer.transform(X_val[obj_cols]))

# Numerical columns: same approach, filling with the training-split mean.
# The imputers hand back plain arrays, so the rebuilt frames carry positional
# column labels and a fresh default index until both are restored later on.
num_inputer = SimpleImputer(strategy="mean")
num_inputer.fit(num_X_train)
num_X_train = pd.DataFrame(num_inputer.transform(num_X_train))
num_X_val = pd.DataFrame(num_inputer.transform(num_X_val))


In [10]:
# Confirm that imputation left no missing values in the validation categoricals.
cat_X_val.isna().sum()

0    0
1    0
2    0
3    0
4    0
5    0
6    0
dtype: int64

In [11]:
# One-hot encode the categorical columns of the training and validation data.
# The encoder is fitted on the training split only; with handle_unknown="ignore"
# a category that was not seen during fitting is encoded as all zeros.
OH_imputer = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# The encoder returns bare arrays, so each frame is rebuilt on the original row
# index and its positional column labels are converted to strings.
OH_X_train_cols = pd.DataFrame(OH_imputer.fit_transform(cat_X_train), index=X_train.index)
OH_X_val_cols = pd.DataFrame(OH_imputer.transform(cat_X_val), index=X_val.index)
OH_X_train_cols.columns = OH_X_train_cols.columns.astype("str")
OH_X_val_cols.columns = OH_X_val_cols.columns.astype("str")

# The imputed numerical frames lost their labels as well: restore the row index
# (so the frames align when concatenated) and the original column names.
num_col_names = {0: "Compartments", 1: "Weight Capacity (kg)"}
num_X_train = num_X_train.set_axis(X_train.index, axis=0).rename(columns=num_col_names)
num_X_val = num_X_val.set_axis(X_val.index, axis=0).rename(columns=num_col_names)

# Final feature matrices: numerical columns first, then the one-hot columns.
X_train_ready = pd.concat([num_X_train, OH_X_train_cols], axis=1)
X_val_ready = pd.concat([num_X_val, OH_X_val_cols], axis=1)

In [12]:
# Confirm that the one-hot encoded validation columns contain no missing values.
OH_X_val_cols.isna().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
dtype: int64

In [13]:
# Train an XGBoost random-forest regressor with its default hyperparameters on
# the prepared training features, then predict the held-out validation rows.
model = XGBRFRegressor()
model.fit(X=X_train_ready, y=y_train)

predictions = model.predict(X=X_val_ready)

In [14]:
# Score the validation predictions with mean absolute error and root mean
# squared error, then report both.
MAE = mean_absolute_error(y_true=y_val, y_pred=predictions)
RMSE = root_mean_squared_error(y_true=y_val, y_pred=predictions)

print("MAE SCORE: ", MAE)
print("\nRoot Mean Squared Error Score: ", RMSE)

MAE SCORE:  33.68667046834461

Root Mean Squared Error Score:  38.91548931136094


# Submission

## Test data preprocessing

In [15]:
# Count the missing values in each column of the test set.
X_test_full.isna().sum()

Brand                   6227
Material                5613
Size                    4381
Compartments               0
Laptop Compartment      4962
Waterproof              4811
Style                   5153
Color                   6785
Weight Capacity (kg)      77
dtype: int64

In [16]:
# Dimensions of the test set as a (rows, columns) tuple.
X_test_full.shape

(200000, 9)

In [17]:
# Fill missing values in the test set with the imputers that were already
# fitted on the training split (transform only, no refitting on test data).
# As with the training data, the results are plain arrays re-wrapped in frames
# with positional column labels.
cat_X_test = pd.DataFrame(cat_imputer.transform(X_test_full[obj_cols]))
num_X_test = pd.DataFrame(
    num_inputer.transform(X_test_full.select_dtypes(exclude=["object"]))
)

In [18]:
# Missing-value counts after imputation. A cell renders only its final
# expression, so the numerical counts on the first line are computed but not
# displayed; the output shown is for the categorical columns.
num_X_test.isna().sum()
cat_X_test.isna().sum()

0    0
1    0
2    0
3    0
4    0
5    0
6    0
dtype: int64

In [19]:
# One-hot encode the test categoricals with the encoder that was fitted on the
# training split. This must be transform(), not fit_transform(): refitting on
# the test data would relearn the category set from the test rows, so the
# resulting columns could differ in number or order from the ones the model was
# trained on, and it would also overwrite the fitted training encoder.
OH_X_test_cols = pd.DataFrame(OH_imputer.transform(cat_X_test))

# The encoder returns a bare array: put the original row index back.
OH_X_test_cols.index = X_test_full.index

# Give the numerical columns the same index so the two frames align on concat.
num_X_test.index = X_test_full.index

# Restore the numerical column names lost during imputation.
num_X_test = num_X_test.rename(columns={0: "Compartments", 1: "Weight Capacity (kg)"})

# Column labels must be strings, matching the training feature matrix.
OH_X_test_cols.columns = OH_X_test_cols.columns.astype("str")

# Numerical columns first, then the one-hot columns (same layout as training).
X_test_ready = pd.concat([num_X_test, OH_X_test_cols], axis=1)

# Guard against silent feature misalignment between training and test data.
assert list(X_test_ready.columns) == list(X_train_ready.columns), (
    "test feature columns do not match the columns the model was trained on"
)

In [20]:
# Predict the target for the preprocessed test rows with the trained model.
test_prediction = model.predict(X_test_ready)

In [21]:
# Build the submission table: one row per test id alongside its predicted price.
submission_columns = {'id': X_test_full.index, 'Price': test_prediction}
submission = pd.DataFrame(submission_columns)
submission

Unnamed: 0,id,Price
0,300000,82.307327
1,300001,82.510323
2,300002,82.309326
3,300003,81.812820
4,300004,77.793205
...,...,...
199995,499995,78.578629
199996,499996,80.021179
199997,499997,82.634155
199998,499998,81.350471


In [22]:
# Write the submission to disk; index=False leaves out the positional row
# index so the file contains only the id and Price columns.
submission.to_csv('submission.csv', index=False)