In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pandas as pd
import ast
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder

In [2]:
df = pd.read_csv('data.csv')
df = df.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1, errors='ignore')

In [3]:
df

Unnamed: 0,brand,type,model,dimension,color_array,finish,price
0,Cristofori,grand,G62L,182.88,['ebony'],['satin'],20999.0
1,Fazioli,grand,F308,304.80,['ebony'],"['polish', 'satin']",302000.0
2,Fazioli,grand,F308,304.80,"['white', 'other']","['polish', 'satin']",332300.0
3,Fazioli,grand,F308,304.80,"['walnut', 'cherry', 'mahogany']","['satin', 'polish']",362500.0
4,Fazioli,grand,F308,304.80,"['mahogany', 'macassar']","['satin', 'polish']",392900.0
...,...,...,...,...,...,...,...
1992,Brodmann,vertical,CE 118,119.38,['ebony'],['polish'],11890.0
1993,Brodmann,vertical,PE 130,132.08,['ebony'],['polish'],22190.0
1994,Brodmann,vertical,PE 126 Institutaional,124.46,['ebony'],['polish'],17490.0
1995,Sauter,vertical,122 Peter Maly Artes,121.92,"['palisander', 'macassar']",['polish'],63500.0


In [4]:
most_common_finish = df["finish"].mode().iloc[0]
print("Most common finish:", most_common_finish)

Most common finish: ['polish']


In [5]:
def clean_data(finish):
    if finish == '[]':
        return 'polish'
    else:
        return finish
df['finish'] = df['finish'].apply(clean_data)
df.to_csv('data.csv')

In [6]:
df = pd.get_dummies(df, columns=["type"])

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1997 entries, 0 to 1996
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   brand          1997 non-null   object 
 1   model          1997 non-null   object 
 2   dimension      1997 non-null   float64
 3   color_array    1997 non-null   object 
 4   finish         1997 non-null   object 
 5   price          1997 non-null   float64
 6   type_grand     1997 non-null   bool   
 7   type_vertical  1997 non-null   bool   
dtypes: bool(2), float64(2), object(4)
memory usage: 97.6+ KB


In [8]:
X = df.drop("price", axis=1)
y = df["price"]

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [10]:
X_train

Unnamed: 0,brand,model,dimension,color_array,finish,type_grand,type_vertical
743,Schimmel,C 189 Tradition,182.88,['white'],['polish'],True,False
159,Yamaha,GC2,152.40,['ebony'],"['polish', 'satin']",True,False
1166,Yamaha,b1 Continental,109.22,['white'],['polish'],False,True
318,Bosendorfer,230VC,213.36,['white'],['satin'],True,False
924,Blüthner,11 Ambassador,152.40,['walnut'],polish,True,False
...,...,...,...,...,...,...,...
835,Kayserburg,KA151,152.40,['ebony'],['polish'],True,False
1216,Steingraeber,122 T-SFM,121.92,['ebony'],"['polish', 'satin']",False,True
1653,Blüthner,A,124.46,"['walnut', 'other']",['polish'],False,True
559,Steinway & Sons,M,152.40,['pommele'],['polish'],True,False


In [11]:
from sklearn.preprocessing import OrdinalEncoder, MultiLabelBinarizer
import ast
def safe_parse(val):
    try:
        return ast.literal_eval(val)
    except Exception:
        return []
X_train["color_array"] = X_train["color_array"].apply(safe_parse)
X_train["finish"] = X_train["finish"].apply(safe_parse)

color_binarizer = MultiLabelBinarizer()
finish_binarizer = MultiLabelBinarizer()
color_encoded = pd.DataFrame(color_binarizer.fit_transform(X_train["color_array"]),
                             columns=[f"color_{c}" for c in color_binarizer.classes_],
                             index=X_train.index)
finish_encoded = pd.DataFrame(finish_binarizer.fit_transform(X_train["finish"]),
                              columns=[f"finish_{f}" for f in finish_binarizer.classes_],
                              index=X_train.index)

X_train_encoded = pd.concat([X_train.drop(["color_array", "finish"], axis=1),
                             color_encoded, finish_encoded], axis=1)

categorical_cols = ["brand", "model"]
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_train_encoded[categorical_cols] = ordinal_encoder.fit_transform(X_train[categorical_cols])


In [12]:
# from sklearn.ensemble import RandomForestRegressor
# regressor = RandomForestRegressor(
#     n_estimators=200,      # More trees = better generalization
#     max_depth=20,  
#     min_samples_split = 5,# Let trees grow fully
#     min_samples_leaf=2,   # Default, but can tune
#     random_state=0,
#     n_jobs=-1              # Use all CPU cores
# )
# regressor.fit(X_train_encoded, y_train)

In [13]:
from sklearn.ensemble import HistGradientBoostingRegressor

gbm = HistGradientBoostingRegressor(max_iter=300, 
                                    random_state=0,
                                    l2_regularization = 1.0,
                                    learning_rate = 0.2,
                                    max_depth  = 5,
                                    min_samples_leaf = 10
                                    )
gbm.fit(X_train_encoded, y_train)


In [14]:
# from sklearn.model_selection import GridSearchCV

# param_grid = {
#     'n_estimators': [100, 200],
#     'max_depth': [None, 10, 20],
#     'min_samples_split': [2, 5],
#     'min_samples_leaf': [1, 2]
# }

# grid_search = GridSearchCV(
#     RandomForestRegressor(random_state=0, n_jobs=-1),
#     param_grid,
#     cv=5,
#     scoring='r2',
#     verbose=1
# )

# grid_search.fit(X_train_encoded, y_train)

# print("Best R2:", grid_search.best_score_)
# print("Best Params:", grid_search.best_params_)


In [15]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.ensemble import HistGradientBoostingRegressor

# param_grid = {
#     'max_iter': [100, 200, 300],
#     'learning_rate': [0.05, 0.1, 0.2],
#     'max_depth': [None, 5, 10],
#     'min_samples_leaf': [20, 10, 5],
#     'l2_regularization': [0.0, 0.1, 1.0]
# }

# grid = GridSearchCV(
#     HistGradientBoostingRegressor(random_state=0),
#     param_grid,
#     scoring='r2',
#     cv=5,
#     n_jobs=-1,
#     verbose=1
# )
# grid.fit(X_train_encoded, y_train)

# print("Best R²:", grid.best_score_)
# print("Best Params:", grid.best_params_)

# best_model = grid.best_estimator_


In [16]:
# Parse array fields
X_test["color_array"] = X_test["color_array"].apply(safe_parse)
X_test["finish"] = X_test["finish"].apply(safe_parse)

# Transform color and finish arrays
color_encoded_test = pd.DataFrame(
    color_binarizer.transform(X_test["color_array"]),
    columns=[f"color_{c}" for c in color_binarizer.classes_],
    index=X_test.index
)

finish_encoded_test = pd.DataFrame(
    finish_binarizer.transform(X_test["finish"]),
    columns=[f"finish_{f}" for f in finish_binarizer.classes_],
    index=X_test.index
)

X_test_encoded = pd.concat(
    [X_test.drop(["color_array", "finish"], axis=1),
     color_encoded_test, finish_encoded_test],
    axis=1
)

X_test_encoded[categorical_cols] = ordinal_encoder.transform(X_test[categorical_cols])


In [17]:
# y_pred = regressor.predict(X_test_encoded)
# np.set_printoptions(precision=2)
# print(np.concatenate((
#     y_pred.reshape(-1, 1),
#     y_test.values.reshape(-1, 1)
# ), axis=1))

In [18]:
y_pred = gbm.predict(X_test_encoded)
np.set_printoptions(precision=0)
print(np.concatenate(
    (y_pred.reshape(-1, 1), np.array(y_test).reshape(-1, 1)),
    axis=1
))

[[194024. 190301.]
 [ 29211.  10752.]
 [ 40201.  53900.]
 [ 56301.  78803.]
 [ 98275. 107409.]
 [ 16429.  18552.]
 [ 21144.  19130.]
 [126939. 124128.]
 [ 17597.   6695.]
 [ 72044.  36207.]
 [113290. 114652.]
 [191409. 195303.]
 [208378. 256999.]
 [ 64734.  25956.]
 [125199. 112580.]
 [ 19575.  19460.]
 [ 15834.  14995.]
 [ 33352.  34646.]
 [146503. 147100.]
 [ 37127.  45900.]
 [ 15754.  14290.]
 [152574. 147334.]
 [ 36579.   7900.]
 [ 44376.  56671.]
 [197372. 187507.]
 [ 38227.  11872.]
 [ 26045.  21627.]
 [156501. 173040.]
 [225575. 200000.]
 [ 35584.  27199.]
 [ 22371.  28000.]
 [234678. 238885.]
 [ 64441.  15456.]
 [ 73423.  19190.]
 [ 50597.  41668.]
 [135099. 153452.]
 [ 24216.  27899.]
 [ 30879.  33699.]
 [ 35919.  36178.]
 [ 11080.  12499.]
 [ 17793.  42615.]
 [ 53311.  58322.]
 [ 81882.  78112.]
 [ 57333.  36207.]
 [217360. 198300.]
 [255677. 273800.]
 [ 64734.  30478.]
 [205504. 349999.]
 [ 42285.  47000.]
 [ 29901.  29937.]
 [ 43058.  43000.]
 [ 13906.  18552.]
 [174083. 21

In [19]:
# from sklearn.metrics import r2_score
# r2_score(y_test, y_pred)

In [20]:
r2 = gbm.score(X_test_encoded, y_test)
print(f"GBM R2 Score: {r2:.2f}")

GBM R2 Score: 0.88


In [21]:
import joblib

preprocessing_objects = {
    "model": gbm,
    "ordinal_encoder": ordinal_encoder,
    "color_binarizer": color_binarizer,
    "finish_binarizer": finish_binarizer,
}

joblib.dump(preprocessing_objects, "model_pipeline.pkl")

['model_pipeline.pkl']

In [22]:
import sklearn
print(sklearn.__version__)

1.6.1
