# Target encoding for continents (bean origin/company location)

In [None]:
df_train = pd.get_dummies(df_train, columns= ['origin_continent']).copy()
df_train.rename(columns ={'origin_continent_AF':'AF', 'origin_continent_AS':'AS','origin_continent_NA':'NA',
            'origin_continent_OC':'OC','origin_continent_SA':'SA','origin_continent_Unknown':'Un'
                         }, inplace= True)

In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

In [5]:
df = pd.read_csv('combined_train.csv', keep_default_na=False)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2224 entries, 0 to 2223
Data columns (total 36 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Unnamed: 0                        2224 non-null   int64  
 1   company                           2224 non-null   object 
 2   company_location                  2224 non-null   object 
 3   review_date                       2224 non-null   int64  
 4   country_of_bean_origin            2224 non-null   object 
 5   specific_bean_origin_or_bar_name  2224 non-null   object 
 6   cocoa_percent                     2224 non-null   float64
 7   rating                            2224 non-null   float64
 8   counts_of_ingredients             2224 non-null   int64  
 9   taste                             2224 non-null   object 
 10  vanilla                           2224 non-null   int64  
 11  salt                              2224 non-null   int64  
 12  sugar 

In [7]:
df['company_continent'].unique()

array(['NA', 'EU', 'OC', 'SA', 'AS', 'AF'], dtype=object)

In [8]:
df.head()

Unnamed: 0.1,Unnamed: 0,company,company_location,review_date,country_of_bean_origin,specific_bean_origin_or_bar_name,cocoa_percent,rating,counts_of_ingredients,taste,...,floral,intense,spicy,sour,molasses,woody,vanilla_taste,sticky,fruit,smokey
0,0,5150,United States,2019,Madagascar,"Bejofo Estate, batch 1",76.0,3.75,3,"cocoa, blackberry, full body",...,0,0,0,0,0,0,0,0,1,0
1,1,5150,United States,2019,Dominican republic,"Zorzal, batch 1",76.0,3.5,3,"cocoa, vegetal, savory",...,0,0,0,0,0,0,0,0,0,0
2,2,5150,United States,2019,Tanzania,"Kokoa Kamili, batch 1",76.0,3.25,3,"rich cocoa, fatty, bready",...,0,0,0,0,0,0,0,0,0,0
3,3,A. Morin,France,2012,Peru,Peru,63.0,3.75,4,"fruity, melon, roasty",...,0,0,0,0,0,0,0,0,1,0
4,4,A. Morin,France,2012,Bolivia,Bolivia,70.0,3.5,4,"vegetal, nutty",...,0,0,0,0,0,0,0,0,0,0


In [10]:
df.drop(columns = ['Unnamed: 0'])

Unnamed: 0,company,company_location,review_date,country_of_bean_origin,specific_bean_origin_or_bar_name,cocoa_percent,rating,counts_of_ingredients,taste,vanilla,...,floral,intense,spicy,sour,molasses,woody,vanilla_taste,sticky,fruit,smokey
0,5150,United States,2019,Madagascar,"Bejofo Estate, batch 1",76.0,3.75,3,"cocoa, blackberry, full body",0,...,0,0,0,0,0,0,0,0,1,0
1,5150,United States,2019,Dominican republic,"Zorzal, batch 1",76.0,3.50,3,"cocoa, vegetal, savory",0,...,0,0,0,0,0,0,0,0,0,0
2,5150,United States,2019,Tanzania,"Kokoa Kamili, batch 1",76.0,3.25,3,"rich cocoa, fatty, bready",0,...,0,0,0,0,0,0,0,0,0,0
3,A. Morin,France,2012,Peru,Peru,63.0,3.75,4,"fruity, melon, roasty",0,...,0,0,0,0,0,0,0,0,1,0
4,A. Morin,France,2012,Bolivia,Bolivia,70.0,3.50,4,"vegetal, nutty",0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2219,Zotter,Austria,2014,Blend,Raw,80.0,2.75,4,"waxy, cloying, vegetal",0,...,0,0,0,0,0,0,0,0,0,0
2220,Zotter,Austria,2017,Colombia,"APROCAFA, Acandi",75.0,3.75,3,"strong nutty, marshmallow",0,...,0,0,0,0,0,0,0,0,0,0
2221,Zotter,Austria,2018,Belize,Maya Mtn,72.0,3.50,3,"muted, roasty, accessible",0,...,0,0,0,0,0,0,0,0,0,0
2222,Zotter,Austria,2018,Congo,Mountains of the Moon,70.0,3.25,3,"fatty, mild nuts, mild fruit",0,...,0,0,0,0,0,0,0,0,1,0


# 1. Encode the bean origin continent

## 1.1 Decide on whether an __additive smoothing__ is needed

Check the count of each bean origin, the mean and std of each origin. 

In [17]:
Count_BO = df.groupby('origin_continent')['rating'].count()

In [24]:
Count_BO

origin_continent
AF         309
AS         139
NA         694
OC          91
SA         851
Unknown    140
Name: rating, dtype: int64

In [19]:
type(Count_BO)

pandas.core.series.Series

In [21]:
Mu_BO = df.groupby('origin_continent')['rating'].mean()
Sigma_BO = df.groupby('origin_continent')['rating'].std()

In [22]:
Mu_BO

origin_continent
AF         3.206311
AS         3.181655
NA         3.198631
OC         3.222527
SA         3.215041
Unknown    3.082143
Name: rating, dtype: float64

In [23]:
Sigma_BO

origin_continent
AF         0.423671
AS         0.349438
NA         0.397442
OC         0.391668
SA         0.452636
Unknown    0.584914
Name: rating, dtype: float64

In [25]:
Sigma_BO.values/Sigma_BO.values

array([1., 1., 1., 1., 1., 1.])

In [26]:
df_BO_summary = pd.DataFrame({'Bean origin': Count_BO.index, 
                              'Count': Count_BO.values,
                              'Mean of ratings': Mu_BO.values,
                              'Std of ratings': Sigma_BO.values,
                              'c.o.v': Sigma_BO.values/Mu_BO.values})

In [27]:
df_BO_summary

Unnamed: 0,Bean origin,Count,Mean of ratings,Std of ratings,c.o.v
0,AF,309,3.206311,0.423671,0.132137
1,AS,139,3.181655,0.349438,0.109829
2,,694,3.198631,0.397442,0.124254
3,OC,91,3.222527,0.391668,0.121541
4,SA,851,3.215041,0.452636,0.140787
5,Unknown,140,3.082143,0.584914,0.189775


Go ahead without smoothing first

In [31]:
# alternative coding
stats_BO = df['rating'].groupby(df['origin_continent']).agg(['count', 'mean', 'std'])

In [32]:
stats_BO

Unnamed: 0_level_0,count,mean,std
origin_continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AF,309,3.206311,0.423671
AS,139,3.181655,0.349438
,694,3.198631,0.397442
OC,91,3.222527,0.391668
SA,851,3.215041,0.452636
Unknown,140,3.082143,0.584914


## 1.2 Encode

In [33]:
df['origin_continent'] = df['origin_continent'].map(Mu_BO)

In [36]:
df['origin_continent']

0       3.206311
1       3.198631
2       3.206311
3       3.215041
4       3.215041
          ...   
2219    3.082143
2220    3.215041
2221    3.198631
2222    3.206311
2223    3.082143
Name: origin_continent, Length: 2224, dtype: float64

# 2. Encode the company continent

In [37]:
stats_CC = df['rating'].groupby(df['company_continent']).agg(['count', 'mean', 'std'])
stats_CC

Unnamed: 0_level_0,count,mean,std
company_continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AF,25,3.05,0.462106
AS,100,3.1925,0.349702
EU,683,3.224378,0.453565
,1175,3.187319,0.42924
OC,85,3.294118,0.395617
SA,156,3.145833,0.439903


Use an __additive smoothing__ to account for AF's mean and OC's mean<br>
set $m = 100$: there must be at least 100 values for the sample mean to overtake the global mean.

In [38]:
# https://maxhalford.github.io/blog/target-encoding/
def calc_smooth_mean(df, by, on, m):
    # Compute the global mean
    mean = df[on].mean()

    # Compute the number of values and the mean of each group
    agg = df.groupby(by)[on].agg(['count', 'mean'])
    counts = agg['count']
    means = agg['mean']

    # Compute the "smoothed" means
    smooth = (counts * means + m * mean) / (counts + m)

    # Replace each value by the according smoothed mean
    return df[by].map(smooth)

In [41]:
df['company_continent'] = calc_smooth_mean(df = df, by = 'company_continent', on = 'rating', m = 100)

In [43]:
df['company_continent']

0       3.188201
1       3.188201
2       3.188201
3       3.221081
4       3.221081
          ...   
2219    3.221081
2220    3.221081
2221    3.221081
2222    3.221081
2223    3.221081
Name: company_continent, Length: 2224, dtype: float64

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2224 entries, 0 to 2223
Data columns (total 36 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Unnamed: 0                        2224 non-null   int64  
 1   company                           2224 non-null   object 
 2   company_location                  2224 non-null   object 
 3   review_date                       2224 non-null   int64  
 4   country_of_bean_origin            2224 non-null   object 
 5   specific_bean_origin_or_bar_name  2224 non-null   object 
 6   cocoa_percent                     2224 non-null   float64
 7   rating                            2224 non-null   float64
 8   counts_of_ingredients             2224 non-null   int64  
 9   taste                             2224 non-null   object 
 10  vanilla                           2224 non-null   int64  
 11  salt                              2224 non-null   int64  
 12  sugar 