* https://www.kaggle.com/competitions/playground-series-s5e2

In [2]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

pd.set_option('future.no_silent_downcasting', True)

In [3]:
from zipfile import ZipFile

with ZipFile("backpack.zip") as bp:
    
    with bp.open("test.csv") as t:
        test_df = pd.read_csv(t)

    with bp.open("train.csv") as t:
        train_df = pd.read_csv(t)

    with bp.open("training_extra.csv") as t:
        training_extra_df = pd.read_csv(t)

In [4]:
train_df

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.643760,39.17320
3,3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.937220,80.60793
4,4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312
...,...,...,...,...,...,...,...,...,...,...,...
299995,299995,Adidas,Leather,Small,9.0,No,No,Tote,Blue,12.730812,129.99749
299996,299996,Jansport,Leather,Large,6.0,No,Yes,Tote,Blue,26.633182,19.85819
299997,299997,Puma,Canvas,Large,9.0,Yes,Yes,Backpack,Pink,11.898250,111.41364
299998,299998,Adidas,Nylon,Small,1.0,No,Yes,Tote,Pink,6.175738,115.89080


In [5]:
train_df.isna().sum()

id                         0
Brand                   9705
Material                8347
Size                    6595
Compartments               0
Laptop Compartment      7444
Waterproof              7050
Style                   7970
Color                   9950
Weight Capacity (kg)     138
Price                      0
dtype: int64

In [6]:
def one_hot(_df: pd.DataFrame, cols: list) -> pd.DataFrame:
    df = _df.copy()
    df = df.join( [ pd.get_dummies(df[col]).astype(int) for col in cols ] )
    df = df.drop(cols, axis=1)
    return df


def fillna_mode(_df: pd.DataFrame, cols: list) -> pd.DataFrame:
    df = _df.copy()
    for col in cols:
        df[col] = df[col].fillna(df[col].mode()[0])
    return df


def fillna_mean(_df: pd.DataFrame, cols: list) -> pd.DataFrame:
    df = _df.copy()
    for col in cols:
        df[col] = df[col].fillna(df[col].mean())
    return df

def make_df(_df: pd.DataFrame) -> pd.DataFrame:
    df = _df.copy()

    df = fillna_mode(df, ["Laptop Compartment", "Waterproof", "Brand", "Material", "Size", "Style", "Color"])
    df = fillna_mean(df, ["Weight Capacity (kg)"])

    df["Laptop Compartment"] = df["Laptop Compartment"].replace({"No": False, "Yes": True}).astype(bool)
    df["Waterproof"] = df["Waterproof"].replace({"No": False, "Yes": True}).astype(bool)

    df = df.drop([
        "id"
    ], axis=1)

    # df["WeightCompartments"] = df["Weight Capacity (kg)"] * df["Compartments"]

    return df

def make_df_with_one_hot(_df: pd.DataFrame) -> pd.DataFrame:
    df = make_df(_df)

    df = one_hot(df, ["Brand", "Material", "Size", "Style", "Color"])

    return df

def get_xy(_df: pd.DataFrame, y_names: list) -> tuple[pd.DataFrame, pd.DataFrame]:
    df_x = _df.copy()

    df_y = df_x[y_names]
    df_x = df_x.drop(y_names, axis=1)

    return df_x, df_y

def normalize(_df: pd.DataFrame) -> pd.DataFrame:
    df = _df.copy()

    number_cols = [a for a, b in df.dtypes.items() if "float" in str(b) or "int" in str(b) ]

    df_number = df[number_cols]
    df_number = (df_number - df_number.min()) / (df_number.max() - df_number.min())
    df[number_cols] = df_number.astype("float64")

    
    cat_cols = [a for a, b in df.dtypes.items() if "float" not in str(b) and "int" not in str(b) and "bool" not in str(b) ]
    df[cat_cols] = df[cat_cols].astype("category")

    return df

In [8]:
# _train_df = make_df(train_df)
_train_df = make_df_with_one_hot(train_df)

train_x, train_y = get_xy(_train_df, ["Price"])
train_y.isna().sum()

Price    0
dtype: int64

In [9]:
_train_df_n = normalize(_train_df)
train_x_n = normalize(train_x)
train_y_n = normalize(train_y)

In [10]:
# _train_df_n["Weight Capacity (kg)"] = pd.cut( _train_df_n["Weight Capacity (kg)"], 10, labels=False)

In [11]:
_train_df_n.corr() #.abs() > 0.01

Unnamed: 0,Compartments,Laptop Compartment,Waterproof,Weight Capacity (kg),Price,Adidas,Jansport,Nike,Puma,Under Armour,...,Small,Backpack,Messenger,Tote,Black,Blue,Gray,Green,Pink,Red
Compartments,1.0,-0.003252,-0.004778,0.002541,-0.000131,0.002659,-0.00086,0.002752,-0.001719,-0.002992,...,-0.00225,-0.007422,0.002203,0.005106,0.001359,-7.3e-05,0.000714,0.000621,-0.001178,-0.001317
Laptop Compartment,-0.003252,1.0,-0.027695,0.003611,-0.001321,-0.003346,0.000299,0.001428,-0.00175,0.003552,...,0.000896,-0.00289,0.005303,-0.002568,-0.003201,-0.002203,0.005435,0.001852,0.000911,-0.003026
Waterproof,-0.004778,-0.027695,1.0,0.004568,-0.003984,-0.002535,-0.001854,0.003989,-0.001004,0.001547,...,0.00029,-0.00041,0.001428,-0.001057,7e-06,-0.000136,0.002567,-0.001988,0.001611,-0.002317
Weight Capacity (kg),0.002541,0.003611,0.004568,1.0,0.018013,0.001176,-0.001724,-0.000356,-0.005104,0.005788,...,-0.002697,-0.001777,0.000442,0.00131,-0.000687,-0.000431,0.00391,-0.001631,-0.001701,0.000612
Price,-0.000131,-0.001321,-0.003984,0.018013,1.0,-0.010527,0.004669,-0.001144,0.000464,0.007238,...,0.000233,-0.000149,0.000773,-0.000644,-0.009833,0.00667,-0.006427,0.010628,0.003158,-0.004422
Adidas,0.002659,-0.003346,-0.002535,0.001176,-0.010527,1.0,-0.263975,-0.267617,-0.26611,-0.275255,...,-0.001671,-0.001079,0.000793,0.000257,0.002142,-0.002014,-0.005333,-0.000212,0.002053,0.003305
Jansport,-0.00086,0.000299,-0.001854,-0.001724,0.004669,-0.263975,1.0,-0.233062,-0.23175,-0.239715,...,-0.00127,0.001374,-0.003768,0.002499,-0.000945,0.00265,-0.000832,-0.000815,0.001106,-0.001295
Nike,0.002752,0.001428,0.003989,-0.000356,-0.001144,-0.267617,-0.233062,1.0,-0.234947,-0.243022,...,0.001144,0.005082,-0.002334,-0.002649,-0.000721,0.000525,0.001441,0.001377,-0.003035,0.00071
Puma,-0.001719,-0.00175,-0.001004,-0.005104,0.000464,-0.26611,-0.23175,-0.234947,1.0,-0.241653,...,-0.000204,-0.00249,0.002442,-3.3e-05,-0.002167,-0.001568,0.002414,0.000386,0.002266,-0.001643
Under Armour,-0.002992,0.003552,0.001547,0.005788,0.007238,-0.275255,-0.239715,-0.243022,-0.241653,1.0,...,0.002078,-0.002757,0.002738,-7.1e-05,0.00149,0.000565,0.002663,-0.000714,-0.002482,-0.001317


In [12]:
_train_df2 = make_df_with_one_hot(training_extra_df)

train_x2, train_y2 = get_xy(_train_df2, ["Price"])

train_x_n2 = normalize(train_x2)
train_y_n2 = normalize(train_y2)

In [13]:
xgb = XGBRegressor(
    n_estimators=1000,
    # max_depth=6,
    learning_rate=0.01,
    n_jobs=-1,
    enable_categorical=True,
)

In [86]:
xgb.fit(train_x_n2, train_y_n2, eval_set=[(train_x_n, train_y_n)], verbose=True)

[0]	validation_0-rmse:0.28918
[1]	validation_0-rmse:0.28917
[2]	validation_0-rmse:0.28917
[3]	validation_0-rmse:0.28917
[4]	validation_0-rmse:0.28916
[5]	validation_0-rmse:0.28916
[6]	validation_0-rmse:0.28916
[7]	validation_0-rmse:0.28915
[8]	validation_0-rmse:0.28915
[9]	validation_0-rmse:0.28915
[10]	validation_0-rmse:0.28915
[11]	validation_0-rmse:0.28914
[12]	validation_0-rmse:0.28914
[13]	validation_0-rmse:0.28914
[14]	validation_0-rmse:0.28913
[15]	validation_0-rmse:0.28913
[16]	validation_0-rmse:0.28913
[17]	validation_0-rmse:0.28913
[18]	validation_0-rmse:0.28912
[19]	validation_0-rmse:0.28912
[20]	validation_0-rmse:0.28912
[21]	validation_0-rmse:0.28912
[22]	validation_0-rmse:0.28912
[23]	validation_0-rmse:0.28911
[24]	validation_0-rmse:0.28911
[25]	validation_0-rmse:0.28911
[26]	validation_0-rmse:0.28911
[27]	validation_0-rmse:0.28911
[28]	validation_0-rmse:0.28910
[29]	validation_0-rmse:0.28910
[30]	validation_0-rmse:0.28910
[31]	validation_0-rmse:0.28910
[32]	validation_0-

In [None]:
xgb.fit(train_x_n, train_y_n, eval_set=[(train_x_n2[:300_000], train_y_n2[:300_000])], verbose=True)

[0]	validation_0-rmse:0.28837
[1]	validation_0-rmse:0.28837
[2]	validation_0-rmse:0.28836
[3]	validation_0-rmse:0.28836
[4]	validation_0-rmse:0.28836
[5]	validation_0-rmse:0.28835
[6]	validation_0-rmse:0.28835
[7]	validation_0-rmse:0.28835
[8]	validation_0-rmse:0.28834
[9]	validation_0-rmse:0.28834
[10]	validation_0-rmse:0.28834
[11]	validation_0-rmse:0.28833
[12]	validation_0-rmse:0.28833
[13]	validation_0-rmse:0.28833
[14]	validation_0-rmse:0.28833
[15]	validation_0-rmse:0.28832
[16]	validation_0-rmse:0.28832
[17]	validation_0-rmse:0.28832
[18]	validation_0-rmse:0.28831
[19]	validation_0-rmse:0.28831
[20]	validation_0-rmse:0.28831
[21]	validation_0-rmse:0.28831
[22]	validation_0-rmse:0.28830
[23]	validation_0-rmse:0.28830
[24]	validation_0-rmse:0.28830
[25]	validation_0-rmse:0.28830
[26]	validation_0-rmse:0.28829
[27]	validation_0-rmse:0.28829
[28]	validation_0-rmse:0.28829
[29]	validation_0-rmse:0.28829
[30]	validation_0-rmse:0.28828
[31]	validation_0-rmse:0.28828
[32]	validation_0-

KeyboardInterrupt: 

In [153]:
rfr = RandomForestRegressor(
    n_estimators=100,
    n_jobs=-1,
)

In [17]:
train_y_n2["Price"].array.astype(float)

array([0.73415319, 0.84999793, 0.04721259, ..., 0.98655333, 0.05424215,
       0.68601474], shape=(3694318,))

In [154]:
rfr.fit(train_x_n, train_y_n["Price"].array.astype(float))

ValueError: Cannot cast object dtype to float32

In [14]:
from sklearn.metrics import root_mean_squared_error as RMSE

def Score(model, _df: pd.DataFrame, df_func = make_df):
    df = df_func(_df)

    df_x, df_y = get_xy(df, ["Price"])

    df_x_n = normalize(df_x)
    df_y_n = normalize(df_y)

    predict = model.predict(df_x_n) * 150
    return RMSE(df_y, predict)#, model.score(df_x_n, df_y_n)

In [77]:
Score(xgb, train_df)

(np.float64(39.765601684345185), 0.0016442537307739258)

In [135]:
Score(xgb, training_extra_df)

(np.float64(39.66071046120994), -2.765655517578125e-05)

In [15]:
def CalcY(model, _df: pd.DataFrame, df_func = make_df) -> pd.array:
    df_x = df_func(_df)
    df_x_n = normalize(df_x)
    return model.predict(df_x_n) * 150

In [16]:
def SaveCalcY(model, _df: pd.DataFrame, start=0, df_func = make_df):
    y = CalcY(model, _df, df_func)
    y_df = pd.DataFrame(y, index=range(start, start + len(y)))
    y_df = y_df.rename(columns={0: "Price"})
    y_df.to_csv(f"submission_{str(model.__class__.__name__)}.csv", index_label="id")

In [84]:
SaveCalcY(xgb, test_df, len(train_df))

In [59]:
SaveCalcY(rfr, test_df, len(train_df))

In [1]:
import keras
from keras import Sequential
from keras.layers import Dense, Input, Reshape, Conv2D, Conv2DTranspose, Flatten, MaxPool2D, Dropout

In [None]:
dim = 16

# model = Sequential([
#     Dense(1000, activation="relu"),
#     Dense(128, activation="relu"),
#     Dense(1, activation="sigmoid")
# ])

model = Sequential([
    Input((len( train_x_n.columns ),) ),
    Dense(dim ** 3, activation="relu"),
    Reshape((dim,) * 3),
    Conv2D(3, 3, 2, padding="same"),
    Conv2DTranspose(3, 3, 2, padding="same"),
    Flatten(),
    Dense(1, activation="sigmoid")
])

model.compile(
    optimizer="nadam",
    loss=keras.losses.mean_squared_error
)

In [195]:
model.summary()

In [202]:
model.fit(train_x_n, train_y_n, batch_size=32, epochs=10, validation_data=(train_x_n2[:100_000], train_y_n2[:100_000]))

Epoch 1/10
[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 3ms/step - loss: 0.0834 - val_loss: 0.0829
Epoch 2/10
[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 3ms/step - loss: 0.0833 - val_loss: 0.0832
Epoch 3/10
[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 3ms/step - loss: 0.0832 - val_loss: 0.0829
Epoch 4/10
[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 3ms/step - loss: 0.0832 - val_loss: 0.0829
Epoch 5/10
[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 3ms/step - loss: 0.0833 - val_loss: 0.0831
Epoch 6/10
[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 3ms/step - loss: 0.0830 - val_loss: 0.0830
Epoch 7/10
[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 3ms/step - loss: 0.0834 - val_loss: 0.0830
Epoch 8/10
[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 3ms/step - loss: 0.0835 - val_loss: 0.0830
Epoch 9/10
[1m9

<keras.src.callbacks.history.History at 0x1c786cf5760>

In [203]:
Score(model, train_df, make_df_with_one_hot)

[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 1ms/step


39.695945739746094

In [204]:
SaveCalcY(model, test_df, len(train_df), make_df_with_one_hot)

[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step
