# Rolex Data Exploration Notebook

In [13]:
import numpy as np 
import pandas as pd
import re
from collections import Counter
from functools import reduce

# Read in and Explore Data

In [15]:
df = pd.read_csv('../src/data/dataWithCurrencyVer002.csv')
# Drop the extra column 
df = df.drop('Unnamed: 0', axis = 1)
df.head()

Unnamed: 0,listing__statPrice,product-subtitle,Model,Box,Papers,Age,Movement,ConditionGrade,CaseSize,Case,Dial,Bracelet,LOT,Location,Seller
0,11923.9872,Datejust36,126234,Yes,Yes,2022,automatic,AAA,36mm,StainlessSteelandWhiteGold,SlateRomanNumeral,StainlessSteelJubilee,#004576,"MiltonKeynes,Buckinghamshire,UnitedKingdom",
1,34778.296,Day-Date40,228238,Yes,Yes,2022,automatic,AA,40mm,YellowGold,WhiteBaton,YellowGoldPresident,#,"BishopsStortford,Herts,UnitedKingdom",
2,15978.142848,Sea-Dweller,126660,Yes,Yes,2017,automatic,AAA,43mm,StainlessSteel,Black,StainlessSteelOyster,#3724,"Leicester,Leicestershire,UnitedKingdom",
3,16243.948396,Yacht-Master40,126622,Yes,Yes,2022,automatic,AAA,40mm,StainlessSteelandPlatinum,Slate,StainlessSteelOyster,#,"MiltonKeynes,Buckinghamshire,UnitedKingdom",
4,16450.6237,Submariner,126610LV,Yes,Yes,2022,automatic,AAA,41mm,StainlessSteel,Black,StainlessSteelOyster,#004597,"Buckinghamshire,UnitedKingdom",


In [18]:
# 864 entries in the df 
len(df)

864

In [17]:
# There are 22 values for Case Size 
df.CaseSize.value_counts()

40mm                           445
41mm                           116
36mm                            81
42mm                            68
44mm                            40
31mm                            28
43mm                            24
34mm                            12
39mm                            10
37mm                            10
26mm                             8
28mm                             5
29mm                             4
28mmx47mm                        2
35mm                             2
20mmx40mm                        2
41                               2
StainlessSteel                   1
38mm                             1
41MM                             1
StainlessSteelandYellowGold      1
32mm                             1
Name: CaseSize, dtype: int64

In [26]:
# There are 143 different values for Dial 
df.Dial.value_counts()

Black                          319
BlackBaton                      65
White                           44
Blue                            44
WhiteBaton                      40
                              ... 
SundustBaguetteDiamondBaton      1
IceBlueDiamondBaton              1
SalmonArabicNumeral              1
WhiteBatonDial                   1
BlackGilt                        1
Name: Dial, Length: 143, dtype: int64

In [47]:
# Original list of the 143 descriptors 
list(df.Dial.value_counts().index)

['Black',
 'BlackBaton',
 'White',
 'Blue',
 'WhiteBaton',
 'BlueBaton',
 'SilverBaton',
 'GreenBaton',
 'Green',
 'SlateRomanNumeral',
 'TurquoiseBaton',
 'ChampagneBaton',
 'ChocolateBaton',
 'BlackArabicNumeral',
 'WhiteRomanNumeral',
 'PinkBaton',
 'WhiteArabicNumeral',
 'Platinum',
 'IceBlueBaton',
 'Silver',
 'YellowBaton',
 'Champagne',
 'BlackwithDiamond',
 'SilverRomanNumeral',
 'SundustBaton',
 'SilverDiamond',
 'SteelandBlackBaton',
 'D-blue',
 'MintGreenBaton',
 'Steel/Black',
 'MotherofPearlwithDiamond',
 'BlueGraduated',
 'BlackArabic',
 'SteelBaton',
 'BlueArabicNumeral',
 'Chocolate',
 'Z-blue',
 'Meteorite',
 'Rhodium',
 'Slate',
 'Salmon',
 'BlueRomanNumeral',
 'Yellow',
 'RhodiumBaton',
 'WhiteDial',
 'SundustwithBaguette',
 'RolexMotifwithDiamond',
 'BlackDiamond',
 'PinkRomanNumeral',
 'Blue/BlackGradient',
 'BlackRomanNumeral',
 'BlueBlackGradient',
 'CoralRedBaton',
 'BlackMotherofPearlDiamond',
 'WhiteArabic',
 'ChampagneDiamond',
 'BlackDial',
 'BlackArabicNume

In [30]:
# Their approach: DialCounter gives the same thing as value_counts 
dial = list(df.Dial.values)
dialCounter = Counter(dial)
len(dialCounter)

143

In [38]:
dialCounter['Black']

319

In [31]:
keyWords = set()
for key in dialCounter:
    words = re.findall('[A-Z][^A-Z]*', key)
    for w in words:
        if 'with' in w:
            keyWords = keyWords.union(w.split('with'))
        else:
            keyWords = keyWords.union(words)

In [40]:
len(keyWords)

108

In [41]:
typos = {}

cleanKeyWords = set()
for word in keyWords:
    word = word.replace('eee','ee')
    word = re.sub('[^a-zA-Z]','',word)
    if word[-3:] in {'and','And', 'set', 'Set', 'amp'}:
        word=word[:-3]
    if word[-4:] in {'with','With','gold','Dial','dial'}:
        word=word[:-4]
    if word[-2:] in {'of'}:
        word=word[:-2]
    if word[-1:] in {'/'}:
        word=word[:-1]
    if word[:-1] in keyWords:
        word='' # remove plural form of word
    if word in typos:
        word=typos[word]
    if 1<len(word):
        cleanKeyWords.add(word)

In [50]:
# Cleaned descriptors: Now there are 73 descriptors 
cleanKeyWords

{'Anthracite',
 'Applied',
 'Arabic',
 'Azzurro',
 'Baguette',
 'Baton',
 'Blac',
 'Blue',
 'Bright',
 'Brown',
 'Candy',
 'Champagne',
 'Chocolate',
 'Coral',
 'Cream',
 'Dblue',
 'Diamond',
 'Ecru',
 'Flower',
 'Gilt',
 'Gold',
 'Golden',
 'Gradient',
 'Graduated',
 'Grape',
 'Green',
 'Grey',
 'Honeycomb',
 'Hour',
 'Ice',
 'Indices',
 'Intense',
 'Jubilee',
 'Lacquer',
 'Lapis',
 'Lazuli',
 'Lotus',
 'Markers',
 'Marks',
 'Meteorite',
 'Midnight',
 'Mint',
 'Mother',
 'Motif',
 'Numeral',
 'Olive',
 'Palm',
 'Panda',
 'Pav',
 'Pearl',
 'Pink',
 'Platinum',
 'Polar',
 'Purple',
 'Racing',
 'Red',
 'Rhodium',
 'Rolex',
 'Roman',
 'Ruby',
 'Salmon',
 'Sapphire',
 'Silver',
 'Slate',
 'Steel',
 'Stick',
 'Sunburst',
 'Sundust',
 'Turquoise',
 'White',
 'Yellow',
 'Zblue',
 'diamonds'}

In [51]:
# Now using the cleanKeyWords 
newDialCounter = {}
for word in cleanKeyWords:
        newDialCounter[word]=0
for i in range(len(df)):
    actual = df.loc[i,'Dial']
    for word in cleanKeyWords:
        if word in actual:
            newDialCounter[word]+=1

In [52]:
# I think this counts how many times each word appears in the df 
newDialCounter

{'Gradient': 6,
 'Sapphire': 1,
 'Motif': 8,
 'Zblue': 0,
 'White': 103,
 'Golden': 2,
 'Mother': 12,
 'Numeral': 62,
 'Graduated': 3,
 'Lotus': 1,
 'Stick': 1,
 'Pearl': 12,
 'Green': 35,
 'Meteorite': 7,
 'Midnight': 1,
 'Baguette': 7,
 'Dblue': 0,
 'Azzurro': 1,
 'Brown': 3,
 'Cream': 1,
 'Applied': 1,
 'Panda': 1,
 'Intense': 1,
 'Red': 3,
 'Indices': 1,
 'Markers': 2,
 'Pink': 10,
 'Racing': 1,
 'Marks': 1,
 'Grape': 2,
 'Lapis': 1,
 'Flower': 1,
 'Anthracite': 1,
 'Platinum': 5,
 'Arabic': 27,
 'Silver': 40,
 'Blue': 105,
 'Candy': 1,
 'Bright': 2,
 'Jubilee': 2,
 'Blac': 432,
 'Roman': 40,
 'Turquoise': 17,
 'Mint': 4,
 'Steel': 9,
 'Hour': 1,
 'Rhodium': 5,
 'Pav': 1,
 'Yellow': 6,
 'Polar': 1,
 'Ruby': 1,
 'Diamond': 37,
 'Coral': 4,
 'diamonds': 1,
 'Chocolate': 12,
 'Purple': 1,
 'Ice': 9,
 'Gold': 4,
 'Sunburst': 2,
 'Baton': 258,
 'Sundust': 8,
 'Palm': 1,
 'Champagne': 20,
 'Honeycomb': 1,
 'Lacquer': 1,
 'Grey': 3,
 'Olive': 1,
 'Ecru': 1,
 'Gilt': 1,
 'Lazuli': 1,
 'Sal

In [59]:
# groups for recoding
groups = { 'allDiamond': ['Pav'],
           'preciousStone': ['Lazuli', 'Meteorite', 'Lapis', 'Mother', 'Pearl'],
           'someDiamonds': ['Baguette', 'Jubilee', 'diamonds', 'Applied', 'Diamond', 'Indices']    
         }

In [60]:
# Creates new dictionary from previous one that gives a group as value for each descriptor as a key 
whichGroup = {}
for group in groups:
    for word in groups[group]:
        whichGroup[word]=group

In [62]:
whichGroup

{'Pav': 'allDiamond',
 'Lazuli': 'preciousStone',
 'Meteorite': 'preciousStone',
 'Lapis': 'preciousStone',
 'Mother': 'preciousStone',
 'Pearl': 'preciousStone',
 'Baguette': 'someDiamonds',
 'Jubilee': 'someDiamonds',
 'diamonds': 'someDiamonds',
 'Applied': 'someDiamonds',
 'Diamond': 'someDiamonds',
 'Indices': 'someDiamonds'}

In [65]:
# I'm not sure what this function does...

n = len(df)

for key in groups.keys():
    # This creates new columns in df that are all 0s 
    df[key] = [0]*n
# Note: 
groupsFlatten = set(reduce(lambda a,b:a+b, groups.values()))
for i in range(len(df)):
    actual = df.loc[i,'Dial']
    for word in groupsFlatten:
        if word in actual:
            df.loc[i,whichGroup[word]] = 1
df = df.drop('Dial', axis=1)

In [73]:
set(reduce(lambda a,b: a+b, groups.values()))

{'Applied',
 'Baguette',
 'Diamond',
 'Indices',
 'Jubilee',
 'Lapis',
 'Lazuli',
 'Meteorite',
 'Mother',
 'Pav',
 'Pearl',
 'diamonds'}

In [66]:
df

Unnamed: 0,listing__statPrice,product-subtitle,Model,Box,Papers,Age,Movement,ConditionGrade,CaseSize,Case,Bracelet,LOT,Location,Seller,allDiamond,preciousStone,someDiamonds
0,11923.987200,Datejust36,126234,Yes,Yes,2022,automatic,AAA,36mm,StainlessSteelandWhiteGold,StainlessSteelJubilee,#004576,"MiltonKeynes,Buckinghamshire,UnitedKingdom",,0,0,0
1,34778.296000,Day-Date40,228238,Yes,Yes,2022,automatic,AA,40mm,YellowGold,YellowGoldPresident,#,"BishopsStortford,Herts,UnitedKingdom",,0,0,0
2,15978.142848,Sea-Dweller,126660,Yes,Yes,2017,automatic,AAA,43mm,StainlessSteel,StainlessSteelOyster,#3724,"Leicester,Leicestershire,UnitedKingdom",,0,0,0
3,16243.948396,Yacht-Master40,126622,Yes,Yes,2022,automatic,AAA,40mm,StainlessSteelandPlatinum,StainlessSteelOyster,#,"MiltonKeynes,Buckinghamshire,UnitedKingdom",,0,0,0
4,16450.623700,Submariner,126610LV,Yes,Yes,2022,automatic,AAA,41mm,StainlessSteel,StainlessSteelOyster,#004597,"Buckinghamshire,UnitedKingdom",,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
859,16204.425000,SeaDwellerDeepsea,116660,Yes,Yes,2015,automatic,,44mm,StainlessSteel,StainlessSteel,#00035,"London,UnitedKingdom",,0,0,0
860,24134.250000,Daytona,16520,No,No,1997,automatic,,40mm,StainlessSteel,StainlessSteel,#00018,"Windsor,Berkshire,UnitedKingdom",,0,0,0
861,19446.367500,GMTMasterII,126710BLRO,Yes,Yes,2021,automatic,,40mm,StainlessSteel,StainlessSteelJubilee,#00009,"Canterbury,Kent,UnitedKingdom",,0,0,0
862,20687.625000,Submariner,116610LV,Yes,Yes,2020,automatic,,40mm,StainlessSteel,StainlessSteelOyster,#00031,"London,UnitedKingdom",,0,0,0
