In [1]:
import numpy as np 
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.feature_selection import SelectPercentile, chi2
import joblib

In [2]:
df = pd.read_csv('Luxury watch.csv')
df

Unnamed: 0,Brand,Model,Case Material,Strap Material,Movement Type,Water Resistance,Case Diameter (mm),Case Thickness (mm),Band Width (mm),Dial Color,Crystal Material,Complications,Power Reserve,Price (USD)
0,Rolex,Submariner,Stainless Steel,Stainless Steel,Automatic,300 meters,40.0,13.00,20.0,Black,Sapphire,Date,48 hours,9500
1,Omega,Seamaster,Titanium,Rubber,Automatic,600 meters,43.5,14.47,21.0,Blue,Sapphire,Date,60 hours,5800
2,Tag Heuer,Carrera,Stainless Steel,Leather,Automatic,100 meters,41.0,13.00,20.0,White,Sapphire,Chronograph,42 hours,4200
3,Breitling,Navitimer,Stainless Steel,Stainless Steel,Automatic,30 meters,43.0,14.25,22.0,Black,Sapphire,Chronograph,70 hours,7900
4,Cartier,Tank Solo,Stainless Steel,Leather,Quartz,30 meters,31.0,6.05,20.0,Silver,Sapphire,,,2800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502,Breguet,Classique,18k Rose Gold,Leather,Automatic,30 meters,38.5,8.25,20.0,White,Sapphire,Date,38 hours,21500
503,Blancpain,Fifty Fathoms,Stainless Steel,Fabric,Automatic,300 meters,45.0,15.50,23.0,Black,Sapphire,Date,120 hours,13500
504,Longines,Master Collection,Stainless Steel,Leather,Automatic,30 meters,38.5,9.70,20.0,Blue,Sapphire,Date,64 hours,1800
505,Vacheron Constantin,Overseas,Stainless Steel,Stainless Steel,Automatic,150 meters,41.5,11.00,22.0,Blue,Sapphire,Date,40 hours,19000


In [3]:
df.dropna(inplace=True)

In [4]:
df['Price (USD)'] = df['Price (USD)'].map(lambda x: x.replace(',',''))

In [5]:
targetcol = 'Price (USD)'
X = df.loc[:,~df.columns.isin([targetcol])]
y = df.loc[:,df.columns.isin([targetcol])]
catcols = list(X.select_dtypes(include=['object']).columns)
numcols = list(X.select_dtypes(include=['float64','int']).columns)

In [6]:
numcols

['Case Diameter (mm)', 'Case Thickness (mm)', 'Band Width (mm)']

In [7]:
catcols

['Brand',
 'Model',
 'Case Material',
 'Strap Material',
 'Movement Type',
 'Water Resistance',
 'Dial Color',
 'Crystal Material',
 'Complications',
 'Power Reserve']

In [8]:
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy='median')),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ("selector", SelectPercentile(chi2, percentile=50)),
])
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numcols),
        ("cat", categorical_transformer, catcols),
    ]
)

# Linear Regression baseline

In [9]:
pipeline = Pipeline(
    steps=[("preprocessor", preprocessor), ("LR", LinearRegression())]
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

pipeline.fit(X_train, y_train)
print("model r2 score: %.3f" % pipeline.score(X_test, y_test))

model r2 score: 0.840


# DecisionTree Regressor baseline

In [10]:
pipeline = Pipeline(
    steps=[("preprocessor", preprocessor), ("DTR", DecisionTreeRegressor())]
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

pipeline.fit(X_train, y_train)
print("model r2 score: %.3f" % pipeline.score(X_test, y_test))

model r2 score: 0.666


# RandomForest Regressor baseline

In [11]:
pipeline = Pipeline(
    steps=[("preprocessor", preprocessor), ("RFR", RandomForestRegressor())]
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

pipeline.fit(X_train, y_train)
print("model r2 score: %.3f" % pipeline.score(X_test, y_test))

model r2 score: 0.736


  return fit_method(estimator, *args, **kwargs)


# ExtraTreeRegressor baseline

In [12]:
pipeline = Pipeline(
    steps=[("preprocessor", preprocessor), ("XTR", ExtraTreesRegressor())]
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

pipeline.fit(X_train, y_train)
print("model r2 score: %.3f" % pipeline.score(X_test, y_test))

model r2 score: 0.776


  return fit_method(estimator, *args, **kwargs)


In [13]:
joblib.dump(pipeline, "model.joblib")

['model.joblib']

In [14]:
joblib.dump(list(X.columns), 'model_column_names.joblib')

['model_column_names.joblib']