In [1]:
import pandas as pd

In [19]:
# Load the museum dataset and discard the 'Unnamed: 0' column in one chained
# expression (presumably a row index saved by an earlier export -- TODO confirm).
df = pd.read_csv('data/museum_data.csv').drop(columns=['Unnamed: 0'])

df.head(10)

Unnamed: 0,Medium,Classification,Culture,Object Begin Date,Historical Period,height,diam,width,depth
0,cotton,Textiles,Indian,1730,Modern Times,296.5,-1.0,261.6,-1.0
1,gold,Armor,Anglo-American,1781,Modern Times,12.07,-1.0,11.43,-1.0
2,"alloy,copper,gold,iron,steel,wood",Swords,Japanese,1801,Contemporary Era,-1.0,-1.0,-1.0,105.5
3,"gold,steel",Armor,French,1525,Modern Times,-1.0,-1.0,-1.0,14.5
4,"gold,iron,silver",Sword,Japanese,1701,Modern Times,-1.0,8.3,-1.0,-1.0
5,"alloy,copper,glass,gold,steel,wood",Daggers,Japanese,1501,Modern Times,-1.0,-1.0,-1.0,39.2
6,"alloy,copper,gold,silver,steel,wood",Daggers,Japanese,1801,Contemporary Era,-1.0,-1.0,-1.0,34.4
7,"steel,wood",Daggers,Japanese,1350,Middle Ages,-1.0,-1.0,-1.0,35.1
8,wood,Daggers,Japanese,1801,Contemporary Era,-1.0,-1.0,-1.0,34.5
9,"steel,wood",Daggers,Japanese,1501,Modern Times,-1.0,-1.0,-1.0,43.9


### Data preprocessing: Medium, Classification and Culture

#### 1) Mean target encoding

In [18]:
# List the distinct labels found in the 'Historical Period' column.
df["Historical Period"].unique()

array(['Modern Times', 'Contemporary Era', 'Middle Ages', 'Antiquity'],
      dtype=object)

In [33]:
import matplotlib.pyplot as plt

# Intermediate numeric encoding of the target variable (Historical Period).
# The periods are ranked from the most recent (1) to the oldest (4); the
# author considers this acceptable because the distance between the integers
# is proportional to the difference between periods.
map_period = {"Contemporary Era":1., "Modern Times": 2., "Middle Ages":3., "Antiquity":4.}

# Vectorised lookup instead of a Python-level loop over every row.
df["num_period"] = df["Historical Period"].map(map_period)

# Series.map yields NaN for labels that are absent from the mapping; stop here
# rather than let an unexpected period propagate as NaN.
assert df["num_period"].notna().all(), "unmapped value in 'Historical Period'"



In [34]:
# Summary statistics of every numeric column, computed per Medium label.
# The ('num_period', 'mean') column of this table is what the mean target
# encoding of Medium is built from.
tmp_medium = df.groupby(by=["Medium"]).describe()
tmp_medium

Unnamed: 0_level_0,Object Begin Date,Object Begin Date,Object Begin Date,Object Begin Date,Object Begin Date,Object Begin Date,Object Begin Date,Object Begin Date,height,height,...,depth,depth,num_period,num_period,num_period,num_period,num_period,num_period,num_period,num_period
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Medium,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
alloy,28.0,1413.392857,146.925615,1300.0,1300.0,1400.0,1400.0,1995.0,28.0,2.957143,...,0.4,2.9,28.0,2.892857,0.416270,1.0,3.0,3.0,3.0,3.0
"alloy,bronze",3.0,1100.000000,1039.230485,-100.0,800.0,1700.0,1700.0,1700.0,3.0,37.866667,...,-1.0,-1.0,3.0,2.666667,1.154701,2.0,2.0,2.0,3.0,4.0
"alloy,bronze,color,copper",4.0,1597.000000,0.000000,1597.0,1597.0,1597.0,1597.0,1597.0,4.0,-1.000000,...,-1.0,-1.0,4.0,2.000000,0.000000,2.0,2.0,2.0,2.0,2.0
"alloy,bronze,copper",59.0,409.491525,1084.370696,-2300.0,-800.0,500.0,1499.0,1875.0,59.0,3.073220,...,-1.0,5.7,59.0,3.186441,0.918668,1.0,2.0,3.0,4.0,4.0
"alloy,bronze,copper,gilt",1.0,1780.000000,,1780.0,1780.0,1780.0,1780.0,1780.0,1.0,139.700000,...,-1.0,-1.0,1.0,2.000000,,2.0,2.0,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ware,1390.0,1571.657554,352.858053,-1046.0,1500.0,1700.0,1800.0,2011.0,1390.0,12.053597,...,-1.0,56.5,1390.0,2.151079,0.613194,1.0,2.0,2.0,2.0,4.0
"ware,wood",14.0,1545.571429,148.155695,1392.0,1400.0,1567.0,1637.5,1822.0,14.0,17.335714,...,-1.0,38.1,14.0,2.357143,0.633324,1.0,2.0,2.0,3.0,3.0
wood,6065.0,1731.643034,280.596372,-1580.0,1733.0,1800.0,1850.0,2018.0,6065.0,13.219166,...,-1.0,1360.8,6065.0,1.702391,0.635345,1.0,1.0,2.0,2.0,4.0
"wood,woodblock",18.0,1187.888889,323.324075,800.0,960.0,1000.0,1446.5,1867.0,18.0,90.811111,...,-1.0,-1.0,18.0,2.777778,0.548319,1.0,3.0,3.0,3.0,3.0


In [42]:
# Mean numeric period per Medium label, selected from the two-level columns
# of the summary table.
tmp_medium.loc[:, ('num_period', 'mean')]

Medium
alloy                        2.892857
alloy,bronze                 2.666667
alloy,bronze,color,copper    2.000000
alloy,bronze,copper          3.186441
alloy,bronze,copper,gilt     2.000000
                               ...   
ware                         2.151079
ware,wood                    2.357143
wood                         1.702391
wood,woodblock               2.777778
woodblock                    1.333333
Name: (num_period, mean), Length: 1076, dtype: float64

In [40]:
# Row labels of the summary table: one entry per distinct Medium value.
tmp_medium.index

Index(['alloy', 'alloy,bronze', 'alloy,bronze,color,copper',
       'alloy,bronze,copper', 'alloy,bronze,copper,gilt',
       'alloy,bronze,copper,glass,gold,iron,paste', 'alloy,bronze,copper,gold',
       'alloy,bronze,copper,gold,iron', 'alloy,bronze,copper,gold,iron,silver',
       'alloy,bronze,copper,gold,silver',
       ...
       'terracotta', 'terracotta,ware', 'terracotta,wood', 'thread',
       'thread,wood', 'ware', 'ware,wood', 'wood', 'wood,woodblock',
       'woodblock'],
      dtype='object', name='Medium', length=1076)

In [43]:
# Mean target encoding of Medium: every medium label is replaced by the mean
# numeric period of the objects that share that label.
# NOTE(review): the means are computed over the whole frame; if a train/test
# split is applied later, they should be fitted on the training part only to
# avoid leaking the target -- TODO confirm.
map_medium = tmp_medium[('num_period', 'mean')].to_dict()

# Vectorised lookup (no Python-level loop over the rows).
df["medium_num"] = df["Medium"].map(map_medium)

# Series.map returns NaN for labels absent from the mapping (for instance a
# missing Medium, which groupby leaves out by default); fail here instead of
# carrying NaNs into the encoded feature.
assert df["medium_num"].notna().all(), "some 'Medium' values have no encoding"
df.head()

Unnamed: 0,Medium,Classification,Culture,Object Begin Date,Historical Period,height,diam,width,depth,num_period,medium_num
0,cotton,Textiles,Indian,1730,Modern Times,296.5,-1.0,261.6,-1.0,2.0,1.854215
1,gold,Armor,Anglo-American,1781,Modern Times,12.07,-1.0,11.43,-1.0,2.0,2.927883
2,"alloy,copper,gold,iron,steel,wood",Swords,Japanese,1801,Contemporary Era,-1.0,-1.0,-1.0,105.5,1.0,1.666667
3,"gold,steel",Armor,French,1525,Modern Times,-1.0,-1.0,-1.0,14.5,2.0,1.914692
4,"gold,iron,silver",Sword,Japanese,1701,Modern Times,-1.0,8.3,-1.0,-1.0,2.0,2.166667


#### Classification

In [44]:
# Mean target encoding of Classification: every label is replaced by the mean
# numeric period of the objects that share that label.
# NOTE(review): describe() computes all summary statistics for every numeric
# column although only ('num_period', 'mean') is read below; it is kept so
# that `tmp_classif` stays available for inspection.
tmp_classif = df.groupby(["Classification"]).describe()
map_classif = tmp_classif[('num_period', 'mean')].to_dict()

# Vectorised lookup (no Python-level loop over the rows).
df["classif_num"] = df["Classification"].map(map_classif)

# Series.map returns NaN for labels absent from the mapping; fail here instead
# of carrying NaNs into the encoded feature.
assert df["classif_num"].notna().all(), "some 'Classification' values have no encoding"
df.head()

Unnamed: 0,Medium,Classification,Culture,Object Begin Date,Historical Period,height,diam,width,depth,num_period,medium_num,classif_num
0,cotton,Textiles,Indian,1730,Modern Times,296.5,-1.0,261.6,-1.0,2.0,1.854215,1.966078
1,gold,Armor,Anglo-American,1781,Modern Times,12.07,-1.0,11.43,-1.0,2.0,2.927883,2.295775
2,"alloy,copper,gold,iron,steel,wood",Swords,Japanese,1801,Contemporary Era,-1.0,-1.0,-1.0,105.5,1.0,1.666667,1.981467
3,"gold,steel",Armor,French,1525,Modern Times,-1.0,-1.0,-1.0,14.5,2.0,1.914692,2.295775
4,"gold,iron,silver",Sword,Japanese,1701,Modern Times,-1.0,8.3,-1.0,-1.0,2.0,2.166667,1.848797


#### Culture

In [45]:
# Mean target encoding of Culture: every label is replaced by the mean
# numeric period of the objects that share that label.
# NOTE(review): describe() computes all summary statistics for every numeric
# column although only ('num_period', 'mean') is read below; it is kept so
# that `tmp_culture` stays available for inspection.
tmp_culture = df.groupby(["Culture"]).describe()
map_culture = tmp_culture[('num_period', 'mean')].to_dict()

# Vectorised lookup (no Python-level loop over the rows).
df["culture_num"] = df["Culture"].map(map_culture)

# Series.map returns NaN for labels absent from the mapping; fail here instead
# of carrying NaNs into the encoded feature.
assert df["culture_num"].notna().all(), "some 'Culture' values have no encoding"
df.head()

Unnamed: 0,Medium,Classification,Culture,Object Begin Date,Historical Period,height,diam,width,depth,num_period,medium_num,classif_num,culture_num
0,cotton,Textiles,Indian,1730,Modern Times,296.5,-1.0,261.6,-1.0,2.0,1.854215,1.966078,1.656506
1,gold,Armor,Anglo-American,1781,Modern Times,12.07,-1.0,11.43,-1.0,2.0,2.927883,2.295775,2.0
2,"alloy,copper,gold,iron,steel,wood",Swords,Japanese,1801,Contemporary Era,-1.0,-1.0,-1.0,105.5,1.0,1.666667,1.981467,1.871935
3,"gold,steel",Armor,French,1525,Modern Times,-1.0,-1.0,-1.0,14.5,2.0,1.914692,2.295775,1.886682
4,"gold,iron,silver",Sword,Japanese,1701,Modern Times,-1.0,8.3,-1.0,-1.0,2.0,2.166667,1.848797,1.871935


In [48]:
# Build the final frame: the raw categorical columns (now available in encoded
# form) and the helper numeric target are left out; `df` itself is untouched.
df_mean_target_encoding = df.drop(
    columns=["Medium", "Classification", "Culture", "num_period"]
)
df_mean_target_encoding.head()

Unnamed: 0,Object Begin Date,Historical Period,height,diam,width,depth,medium_num,classif_num,culture_num
0,1730,Modern Times,296.5,-1.0,261.6,-1.0,1.854215,1.966078,1.656506
1,1781,Modern Times,12.07,-1.0,11.43,-1.0,2.927883,2.295775,2.0
2,1801,Contemporary Era,-1.0,-1.0,-1.0,105.5,1.666667,1.981467,1.871935
3,1525,Modern Times,-1.0,-1.0,-1.0,14.5,1.914692,2.295775,1.886682
4,1701,Modern Times,-1.0,8.3,-1.0,-1.0,2.166667,1.848797,1.871935


In [49]:
# Write the mean-target-encoded frame to CSV.
# NOTE(review): to_csv writes the row index by default, so the file gains an
# unnamed leading column (read back by pandas as 'Unnamed: 0'). Readers have to
# drop it, or this call could pass index=False -- confirm what downstream code
# expects before changing it.
df_mean_target_encoding.to_csv("data/museum_data_mean_target_encoding.csv")