In [1]:
# !pip install -r ../model_deployment/code/requirements.txt

In [19]:
import numpy as np 
import pandas as pd
import joblib

from transformer import LowercaseTransformer
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer


import warnings
warnings.filterwarnings("ignore")

# Data Preprocessing

In [20]:
# Load the raw dataset; the relative path is resolved against the notebook's working directory.
df = pd.read_csv('Luxury watch.csv')
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,Brand,Model,Case Material,Strap Material,Movement Type,Water Resistance,Case Diameter (mm),Case Thickness (mm),Band Width (mm),Dial Color,Crystal Material,Complications,Power Reserve,Price (USD)
0,Rolex,Submariner,Stainless Steel,Stainless Steel,Automatic,300 meters,40.0,13.00,20.0,Black,Sapphire,Date,48 hours,9500
1,Omega,Seamaster,Titanium,Rubber,Automatic,600 meters,43.5,14.47,21.0,Blue,Sapphire,Date,60 hours,5800
2,Tag Heuer,Carrera,Stainless Steel,Leather,Automatic,100 meters,41.0,13.00,20.0,White,Sapphire,Chronograph,42 hours,4200
3,Breitling,Navitimer,Stainless Steel,Stainless Steel,Automatic,30 meters,43.0,14.25,22.0,Black,Sapphire,Chronograph,70 hours,7900
4,Cartier,Tank Solo,Stainless Steel,Leather,Quartz,30 meters,31.0,6.05,20.0,Silver,Sapphire,,,2800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502,Breguet,Classique,18k Rose Gold,Leather,Automatic,30 meters,38.5,8.25,20.0,White,Sapphire,Date,38 hours,21500
503,Blancpain,Fifty Fathoms,Stainless Steel,Fabric,Automatic,300 meters,45.0,15.50,23.0,Black,Sapphire,Date,120 hours,13500
504,Longines,Master Collection,Stainless Steel,Leather,Automatic,30 meters,38.5,9.70,20.0,Blue,Sapphire,Date,64 hours,1800
505,Vacheron Constantin,Overseas,Stainless Steel,Stainless Steel,Automatic,150 meters,41.5,11.00,22.0,Blue,Sapphire,Date,40 hours,19000


### Convert the target column, water resistance, and power reserve from object to float

In [21]:
# Price: remove every ',' from the value's string form, then parse it as a float.
# Missing values survive because str(NaN) is 'nan' and float('nan') is NaN again.
df['Price (USD)'] = df['Price (USD)'].map(
    lambda price: float(str(price).replace(',', ''))
)

# Water resistance: remove the 'meters' text, then parse what is left as a float
# (float() ignores the surrounding whitespace that remains).
df['Water Resistance'] = df['Water Resistance'].map(
    lambda depth: float(str(depth).replace('meters', ''))
)

def convert_days_to_hours(value):
    """Convert a power-reserve label such as "48 hours" or "8 days" to hours.

    Parameters
    ----------
    value : str, number, or missing (NaN / None)
        Raw cell value. Text is expected to start with a number followed by a
        unit containing "day" or "hour" (singular or plural, any letter case).

    Returns
    -------
    float
        The duration in hours. ``np.nan`` is returned when the text has no
        recognisable unit or does not start with a number. Missing input and
        input that is already numeric are returned unchanged.
    """
    # Missing input is passed through untouched.
    if pd.isna(value):
        return value

    # A plain number is treated as already being in hours; returning it as-is
    # keeps the cell safe to re-run on an already-converted column.
    if isinstance(value, (int, float, np.number)):
        return value

    text = str(value).strip().lower()
    if 'day' in text:
        hours_per_unit = 24
    elif 'hour' in text:
        hours_per_unit = 1
    else:
        # No known unit (e.g. "N/A").
        return np.nan

    # text contains a unit word here, so split() yields at least one token.
    leading_token = text.split()[0]
    try:
        # float() rather than int() so decimal amounts like "1.5 days" work.
        amount = float(leading_token)
    except ValueError:
        return np.nan
    return amount * hours_per_unit
# Convert every 'Power Reserve' entry to hours. The helper yields numbers or
# NaN, so a float cast is all that is needed to give the column a float dtype.
df['Power Reserve'] = df['Power Reserve'].apply(convert_days_to_hours).astype(float)

### Remove rows with a missing target value (`Price (USD)`)

In [22]:
# Keep only rows whose price is present; NaN in other columns is left in place.
df = df.dropna(subset=['Price (USD)'])

### Remove duplicated rows

In [23]:
# duplicated() marks every repeat of an earlier identical row; keep the rest.
df = df[~df.duplicated()]

### Remove outliers

In [24]:
# Discard rows whose power reserve exceeds 1000. The mask is negated rather than
# written as "<= 1000" so rows with a missing power reserve are kept
# (NaN > 1000 evaluates to False).
df = df[~(df["Power Reserve"] > 1000)]

In [25]:
# Discard rows whose water resistance exceeds 1250. The mask is negated rather
# than written as "<= 1250" so rows with a missing value are kept
# (NaN > 1250 evaluates to False).
df = df[~(df["Water Resistance"] > 1250)]

### Create column transformer

In [26]:
targetcol = 'Price (USD)'

# Features: every column except the target.
X = df.drop(columns=[targetcol])

# Target as a 1-D Series. Selecting it as a one-column DataFrame would hand the
# regressor a column vector, which scikit-learn flattens with a
# DataConversionWarning (hidden here by the global warnings filter).
y = df[targetcol]

# Column groups routed to the categorical / numeric preprocessing pipelines.
catcols = X.select_dtypes(include=['object']).columns.tolist()
numcols = X.select_dtypes(include=['float64', 'int']).columns.tolist()

In [27]:
# Display the feature names that were selected as numeric.
numcols

['Water Resistance',
 'Case Diameter (mm)',
 'Case Thickness (mm)',
 'Band Width (mm)',
 'Power Reserve']

In [28]:
# Display the feature names that were selected as categorical (object dtype).
catcols

['Brand',
 'Model',
 'Case Material',
 'Strap Material',
 'Movement Type',
 'Dial Color',
 'Crystal Material',
 'Complications']

In [29]:
# Numeric features: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='median')),
    ("scaler", StandardScaler()),
])

# Categorical features, in order:
#   1. LowercaseTransformer - project-local class from `transformer`; presumably
#      lower-cases the text values (implementation not visible here - confirm).
#   2. Fill gaps with the literal string "missing".
#   3. One-hot encode; categories unseen during fit are ignored at predict time.
categorical_transformer = Pipeline(steps=[
    ("lowercase", LowercaseTransformer()),
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Send each column group through its own pipeline.
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numcols),
    ("cat", categorical_transformer, catcols),
])

# Models

### Split the dataset

In [30]:
# Hold out 20% of the rows; random_state=42 makes the shuffle repeatable.
# NOTE(review): no cell visible below uses X_train / X_test / y_train / y_test -
# the pipeline is fitted on the full X, y. Confirm whether an evaluation step is missing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Define model

In [31]:
# Seed the regressor so repeated runs build the same model; 42 matches the seed
# used for train_test_split in this notebook.
gb = GradientBoostingRegressor(random_state=42)

In [32]:
# Chain preprocessing and the regressor so a single object takes raw feature
# columns all the way to a prediction.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", gb),
])
# NOTE(review): fitted on the complete X, y rather than X_train / y_train -
# presumably intentional for the exported model; confirm.
pipeline.fit(X, y)

In [33]:
# Serialise the fitted pipeline to "model.joblib" (relative to the working directory).
# NOTE(review): the pipeline contains LowercaseTransformer from the local `transformer`
# module, so whatever loads this file presumably needs that module importable - confirm.
joblib.dump(pipeline, "model.joblib")

['model.joblib']