In [1]:
import pandas as pd
import numpy as np

In [2]:
pd.set_option('display.max_rows', 7)
pd.set_option('display.max_columns', None)

In [3]:
# read in the data

dataset = pd.read_csv('luxury_watches.csv')
display(dataset)

Unnamed: 0,watch_name,type_gender,brand,watch_color,watch_size,length_of_band,price
0,Rolex Datejust Combination Watch (16233),M,Rolex,yellow combination,18.0,36.0,7500.0
1,Cartier Benoit Gold Watch,F,Cartier,grey,16.0,28.0,7900.0
2,Rolex Milgauss Steel Watch (116400GV),M,Rolex,steel,17.0,40.0,9000.0
...,...,...,...,...,...,...,...
3165,Prada Gold Plated Watch,Both,Prada,brown,18.0,24.0,580.0
3166,Chanel Premier Steel Watch,F,Chanel,steel,17.0,18.5,1950.0
3167,Cartier Rotonde Gold Watch (W1556203),M,Cartier,brown,19.0,42.0,25500.0


# 1. Initial exploration

Explore the data in order to see what we need to do to clean the data

First I want to look for missing values. It might be that the missing values are not entered as NaN's and must be identified accordingly.

In [4]:
# define some variables

cols = dataset.columns
print(cols)

Index(['watch_name', 'type_gender', 'brand', 'watch_color', 'watch_size',
       'length_of_band', 'price'],
      dtype='object')


In [5]:
# check for missing values in each column

def check_nan(dataframe):
    df = pd.DataFrame([])
    for col in cols:

        nan_df = dataset[dataframe[col].isna() == True]
        df = pd.concat([df, nan_df], ignore_index=True)

    return df

In [6]:
pd.set_option('display.max_rows', None)
display(check_nan(dataset))
pd.set_option('display.max_rows', 7)

Unnamed: 0,watch_name,type_gender,brand,watch_color,watch_size,length_of_band,price
0,Hermes Kelly Steel Necklace Watch (KE1.210),,Hermes,other,17.0,20.0,600.0
1,Rolex Submariner Steel Watch (114060),,Rolex,other,17.0,40.0,12700.0
2,Rolex Combi Diamond Watch (79173),,Rolex,other,17.0,26.0,6500.0
3,Hermes H Hour Gold Plated Watch (HH1.501),,Hermes,other,17.0,26.0,1200.0
4,Gucci watch (150.5),,Gucci,other,17.0,42.5,360.0
5,Baume & Mercier Promise Diamond Watch,F,,white,13.0,34.0,800.0
6,Baume & Mercier Hampton Steel Watch (65647),M,,dark blue,19.0,38.0,1300.0
7,Hamilton Jazzmaster Gold Plated Watch,M,,brown,20.0,41.0,400.0
8,Hamilton Jazzmaster Seaview Steel Watch,M,,black,19.0,42.0,400.0
9,Hamilton Jazzmaster Steel Watch (H32705141),M,,steel,16.0,42.0,650.0


# 2. Manually insert type_gender

There are only five missing values relating to type_gender. These can easily be determined from a Google search and will be entered manually.

In [7]:
dataset[dataset['type_gender'].isna() == True]

Unnamed: 0,watch_name,type_gender,brand,watch_color,watch_size,length_of_band,price
217,Hermes Kelly Steel Necklace Watch (KE1.210),,Hermes,other,17.0,20.0,600.0
1017,Rolex Submariner Steel Watch (114060),,Rolex,other,17.0,40.0,12700.0
1211,Rolex Combi Diamond Watch (79173),,Rolex,other,17.0,26.0,6500.0
1533,Hermes H Hour Gold Plated Watch (HH1.501),,Hermes,other,17.0,26.0,1200.0
2835,Gucci watch (150.5),,Gucci,other,17.0,42.5,360.0


In [8]:
# change the type_gender
dataset.iloc[217, 1] = 'F'
dataset.iloc[1017, 1] = 'M'
dataset.iloc[1211, 1] = 'F'
dataset.iloc[1533, 1] = 'F'
dataset.iloc[2835, 1] = 'Both'

In [9]:
dataset[dataset['type_gender'].isna() == True]

Unnamed: 0,watch_name,type_gender,brand,watch_color,watch_size,length_of_band,price


# 3. Infer brand from watch_name

Some of the values for brand are missing. They should be available in the watch_name. We will try and isolate the brand from the watch_name and assign it accordingly.

In [10]:
pd.set_option('display.max_rows', None)
display(dataset[dataset['brand'].isna()])
pd.set_option('display.max_rows', 7)

Unnamed: 0,watch_name,type_gender,brand,watch_color,watch_size,length_of_band,price
81,Baume & Mercier Promise Diamond Watch,F,,white,13.0,34.0,800.0
162,Baume & Mercier Hampton Steel Watch (65647),M,,dark blue,19.0,38.0,1300.0
361,Hamilton Jazzmaster Gold Plated Watch,M,,brown,20.0,41.0,400.0
369,Hamilton Jazzmaster Seaview Steel Watch,M,,black,19.0,42.0,400.0
373,Hamilton Jazzmaster Steel Watch (H32705141),M,,steel,16.0,42.0,650.0
378,Hamilton Khaki Field Watches,M,,other,21.0,38.0,400.0
379,Hamilton Jazzmaster Steel Watch,M,,steel,17.0,36.0,600.0
447,Hamilton Khaki GMT Steel Watch,M,,steel,18.0,42.0,500.0
449,Breguet Marine Gold Watch,Both,,black,13.0,35.0,10000.0
453,Hamilton Khaki Field Mechanical Steel Watch,M,,other,19.0,38.0,350.0


In [11]:
pd.set_option('display.max_rows', None)
brands = pd.Series(dataset['brand'].unique())
brands = brands[brands.isna() == False]
brands = pd.concat([brands, pd.Series(['hamilton', 'rado', 'baume & mercier', 'tag heuer', 'harry winston', 'teg heuer', 'bulgari',
                                       'van kleep', 'roger duvira', 'lange & sohne', 'girard perregaux', 'perregaux', 'bell & ross', 'chaumé liens'])])
brands = brands.str.lower()
brands = brands.sort_values()
print(brands)
pd.set_option('display.max_rows', 7)

44        a lange & sohne
26        audemars piguet
2         baume & mercier
42            bell & rose
12            bell & ross
3               blancpain
39              boucheron
34                breguet
2               breitling
6                 bulgari
40               burberry
20                bvlgari
1                 cartier
6                  chanel
36                chaumet
13           chaumé liens
16                chopard
37         christian dior
28            chronoswiss
41                damiani
17                  fendi
22              ferragamo
35          franck muller
8     frederique constant
10       girard perregaux
21       girard-perregaux
46                 graham
9                   gucci
0                hamilton
4           harry winston
32           harrywinston
4                  hermes
25                 hublot
7                     iwc
5        jaeger lecoultre
9           lange & sohne
14               longines
15          louis vuitton
43        ma

In [12]:
def check_brand(watch_name, brands = brands):
    result = 'missing'
    for brand in brands:
        if brand in watch_name.lower():
            result = brand
    return result

In [13]:
print(check_brand('Hamilton Jazzmaster Maestro Chrono Steel Watch'))

hamilton


In [14]:
'tag heuer' in 'Tag Heuer Aquaracer Chrono Steel Watch (CAF2112)'.lower()

True

In [15]:
dataset['brand'] = dataset['watch_name'].apply(check_brand)

In [16]:
pd.set_option('display.max_rows', None)
display(dataset[dataset['brand'] == 'missing'])
pd.set_option('display.max_rows', 8)

Unnamed: 0,watch_name,type_gender,brand,watch_color,watch_size,length_of_band,price


# 4. Fix spelling errors in brand

In [17]:
pd.set_option('display.max_rows', None)
print(dataset['brand'].unique())
pd.set_option('display.max_rows', 8)

['rolex' 'cartier' 'breitling' 'blancpain' 'hermes' 'jaeger lecoultre'
 'chanel' 'iwc' 'frederique constant' 'gucci' 'patek philippe' 'tag heuer'
 'omega' 'tudor' 'longines' 'louis vuitton' 'chopard' 'fendi' 'piaget'
 'baume & mercier' 'bulgari' 'perregaux' 'ferragamo' 'tiffany' 'hublot'
 'audemars piguet' 'van kleep' 'bvlgari' 'chronoswiss' 'oris' 'montblanc'
 'vacheron constantin' 'harry winston' 'panerai' 'hamilton' 'breguet'
 'franck muller' 'rado' 'chaumet' 'christian dior' 'zenith' 'boucheron'
 'burberry' 'damiani' 'teg heuer' 'bell & ross' 'maurice lacroix'
 'lange & sohne' 'van cleef & arpels' 'ulysse nardin' 'chaumé liens'
 'graham' 'other' 'roger dubuis' 'roger duvira' 'prada']


In [19]:
pd.set_option('display.max_rows', None)
print(brands)
pd.set_option('display.max_rows', 8)

44        a lange & sohne
26        audemars piguet
2         baume & mercier
42            bell & rose
12            bell & ross
3               blancpain
39              boucheron
34                breguet
2               breitling
6                 bulgari
40               burberry
20                bvlgari
1                 cartier
6                  chanel
36                chaumet
13           chaumé liens
16                chopard
37         christian dior
28            chronoswiss
41                damiani
17                  fendi
22              ferragamo
35          franck muller
8     frederique constant
10       girard perregaux
21       girard-perregaux
46                 graham
9                   gucci
0                hamilton
4           harry winston
32           harrywinston
4                  hermes
25                 hublot
7                     iwc
5        jaeger lecoultre
9           lange & sohne
14               longines
15          louis vuitton
43        ma

In [23]:
brands_spelling_dict = dict({'bell & rose':'bell & ross', 'bvlgari':'bulgari', 'chaumé liens':'chaumet', 'christian dior':'dior',
                             'girard-perregaux':'girard perregaux', 'harrywinston':'harry winston', 'lange & sohne':'a lange & sohne',
                             'perregaux':'girard, perregaux', 'roger duvira':'roger dubuis', 'tagheuer':'tag heuer', 'teg heuer':'tag heuer',
                             'van kleep':'van cleef & arpels'})

In [24]:
dataset['brand'] = dataset['brand'].replace(brands_spelling_dict)

In [25]:
dataset

Unnamed: 0,watch_name,type_gender,brand,watch_color,watch_size,length_of_band,price
0,Rolex Datejust Combination Watch (16233),M,rolex,yellow combination,18.0,36.0,7500.0
1,Cartier Benoit Gold Watch,F,cartier,grey,16.0,28.0,7900.0
2,Rolex Milgauss Steel Watch (116400GV),M,rolex,steel,17.0,40.0,9000.0
3,Cartier Claire de Diamonds Gold Watch,F,cartier,green,16.0,32.0,15000.0
...,...,...,...,...,...,...,...
3164,Hublot Big Bang Chrono Gold Ceramic Watch,M,hublot,black,16.0,4.0,24000.0
3165,Prada Gold Plated Watch,Both,prada,brown,18.0,24.0,580.0
3166,Chanel Premier Steel Watch,F,chanel,steel,17.0,18.5,1950.0
3167,Cartier Rotonde Gold Watch (W1556203),M,cartier,brown,19.0,42.0,25500.0


In [29]:
pd.set_option('display.max_rows', None)
print(pd.Series(dataset['brand'].unique()).sort_values())
pd.set_option('display.max_rows', 8)

45        a lange & sohne
25        audemars piguet
19        baume & mercier
43            bell & ross
3               blancpain
40              boucheron
34                breguet
2               breitling
20                bulgari
41               burberry
1                 cartier
6                  chanel
37                chaumet
16                chopard
27            chronoswiss
42                damiani
38                   dior
17                  fendi
22              ferragamo
35          franck muller
8     frederique constant
21      girard, perregaux
47                 graham
9                   gucci
33               hamilton
31          harry winston
4                  hermes
24                 hublot
7                     iwc
5        jaeger lecoultre
14               longines
15          louis vuitton
44        maurice lacroix
29              montblanc
12                  omega
28                   oris
48                  other
32                panerai
10         p

# 5. Remove the brand name from the watch_name