In [None]:
#import library
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the labelled training split and the unlabelled test split.
# Both CSV files are read from the current working directory.
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [None]:
# Preview the first five training rows to check the column layout.
df_train.head(n=5)

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312


In [None]:
# Preview the first five test rows; this split has no Price column.
df_test.head(n=5)

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,300000,Puma,Leather,Small,2.0,No,No,Tote,Green,20.671147
1,300001,Nike,Canvas,Medium,7.0,No,Yes,Backpack,Green,13.564105
2,300002,Adidas,Canvas,Large,9.0,No,Yes,Messenger,Blue,11.809799
3,300003,Adidas,Nylon,Large,1.0,Yes,No,Messenger,Green,18.477036
4,300004,,Nylon,Large,2.0,Yes,Yes,Tote,Black,9.907953


In [None]:
# Keep the test ids for the submission file, then remove the identifier
# column from both frames so it is not used as a model feature.
#
# The drops are written without `inplace=True` and tolerate a missing "id"
# column, so re-running this cell does not raise KeyError and does not
# overwrite the ids captured on the first run.
if "id" in df_test.columns:
    test_ids = df_test["id"].copy()
df_test = df_test.drop(columns="id", errors="ignore")
df_train = df_train.drop(columns="id", errors="ignore")

In [None]:
# Display (rows, columns) of the training frame after the id column was dropped.
df_train.shape

(300000, 10)

In [None]:
# Count the missing values in every training column.
df_train.isna().sum()

Unnamed: 0,0
Brand,9705
Material,8347
Size,6595
Compartments,0
Laptop Compartment,7444
Waterproof,7050
Style,7970
Color,9950
Weight Capacity (kg),138
Price,0


In [None]:
# Names of the string-valued feature columns that are encoded and imputed
# as categories further down.
categorical_columns = [
    "Brand",
    "Material",
    "Size",
    "Laptop Compartment",
    "Waterproof",
    "Style",
    "Color",
]

In [None]:
# Impute missing values in df_train / df_test.
#
# Categorical columns: label-encode -> KNN-impute -> decode back to labels.
# Numeric columns (except the target "Price"): KNN-impute directly.
# All encoders and imputers are fitted on the training frame only and then
# applied to the test frame.

# --- 1. Encode the categorical columns, keeping missing entries as NaN ------
# Calling `.astype(str)` before encoding would turn NaN into the literal label
# "nan"; the imputer would then see no missing values and impute nothing.
# Only known labels are encoded here. Missing values, and test labels that
# never occur in the training data, stay NaN so the imputer fills them.
label_encoders = {}
for column in categorical_columns:
    le = LabelEncoder().fit(df_train[column].dropna())
    label_encoders[column] = le
    for frame in (df_train, df_test):
        known = frame[column].isin(le.classes_)
        codes = np.full(len(frame), np.nan)
        codes[known.to_numpy()] = le.transform(frame.loc[known, column])
        frame[column] = codes

# --- 2. Fill the NaN codes from the 5 nearest rows ---------------------------
# NOTE(review): the imputed code is the mean of the neighbours' integer codes,
# which is only a heuristic for unordered categories -- confirm this is the
# intended behaviour.
# NOTE(review): KNN imputation now has real work to do for every row with a
# missing value, so this cell can take noticeably longer on large frames.
imputer_cat = KNNImputer(n_neighbors=5, weights="uniform", metric="nan_euclidean")
df_train[categorical_columns] = imputer_cat.fit_transform(df_train[categorical_columns])
df_test[categorical_columns] = imputer_cat.transform(df_test[categorical_columns])

# --- 3. Decode the codes back to the original labels -------------------------
# Imputed codes may be fractional: round to the nearest integer (instead of
# truncating) and clip into the encoder's valid range before decoding.
for column in categorical_columns:
    le = label_encoders[column]
    last_code = len(le.classes_) - 1
    for frame in (df_train, df_test):
        codes = np.clip(np.rint(frame[column].to_numpy()), 0, last_code).astype(int)
        frame[column] = le.inverse_transform(codes)

# --- 4. Impute the numeric feature columns -----------------------------------
# The target column "Price" is excluded so it is never imputed and never used
# as a neighbour feature.
numeric_columns = [
    col
    for col in df_train.select_dtypes(include=[np.number]).columns
    if col != "Price"
]

imputer_num = KNNImputer(n_neighbors=5, weights="distance")  # k-nearest neighbours
df_train[numeric_columns] = imputer_num.fit_transform(df_train[numeric_columns])
df_test[numeric_columns] = imputer_num.transform(df_test[numeric_columns])

In [None]:
# Preview the first five training rows after imputation.
df_train.head(n=5)

Unnamed: 0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312


In [None]:
# Split the labelled data into a training part and a 20% hold-out part.
# The fixed random_state keeps the split reproducible between runs.
y = df_train["Price"]
X = df_train.drop(columns="Price")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error

# Columns passed to LightGBM as categorical features.
cat_features = ["Brand", "Material", "Size", "Laptop Compartment", "Waterproof", "Style", "Color"]

# Cast those columns to the pandas "category" dtype in both splits.
category_dtypes = {feature: "category" for feature in cat_features}
X_train = X_train.astype(category_dtypes)
X_test = X_test.astype(category_dtypes)

# Gradient-boosted trees: 1000 trees of depth 3, trained on the CPU.
lgbm = LGBMRegressor(
    boosting_type="gbdt",
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=3,
    device="cpu",
)

# Fit on the training part and predict prices for the hold-out part.
lgbm.fit(X_train, y_train, categorical_feature=cat_features)
y_pred = lgbm.predict(X_test)



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008858 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 305
[LightGBM] [Info] Number of data points in the train set: 240000, number of used features: 9
[LightGBM] [Info] Start training from score 81.448481


In [None]:
# Root-mean-squared error of the hold-out predictions.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
print("RMSE:", rmse)

RMSE: 38.92985286890283


In [None]:
# Cast the test features to the same "category" dtype used for training.
df_test = df_test.astype({feature: "category" for feature in cat_features})

# Predict test prices and write them next to the saved ids.
test_prediction = lgbm.predict(df_test)
output = pd.DataFrame({"id": test_ids, "Price": test_prediction})
output.to_csv("submission.csv", index=False)
print("submission file is downloaded")

submission file is downloaded
