In [513]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [514]:
#Downloading data into my dataframe
df = pd.read_csv(
    'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-18/chocolate.csv'
)

In [515]:
df

Unnamed: 0,ref,company_manufacturer,company_location,review_date,country_of_bean_origin,specific_bean_origin_or_bar_name,cocoa_percent,ingredients,most_memorable_characteristics,rating
0,2454,5150,U.S.A.,2019,Tanzania,"Kokoa Kamili, batch 1",76%,"3- B,S,C","rich cocoa, fatty, bready",3.25
1,2458,5150,U.S.A.,2019,Dominican Republic,"Zorzal, batch 1",76%,"3- B,S,C","cocoa, vegetal, savory",3.50
2,2454,5150,U.S.A.,2019,Madagascar,"Bejofo Estate, batch 1",76%,"3- B,S,C","cocoa, blackberry, full body",3.75
3,2542,5150,U.S.A.,2021,Fiji,"Matasawalevu, batch 1",68%,"3- B,S,C","chewy, off, rubbery",3.00
4,2546,5150,U.S.A.,2021,Venezuela,"Sur del Lago, batch 1",72%,"3- B,S,C","fatty, earthy, moss, nutty,chalky",3.00
...,...,...,...,...,...,...,...,...,...,...
2525,1205,Zotter,Austria,2014,Blend,Raw,80%,"4- B,S*,C,Sa","waxy, cloying, vegetal",2.75
2526,1996,Zotter,Austria,2017,Colombia,"APROCAFA, Acandi",75%,"3- B,S,C","strong nutty, marshmallow",3.75
2527,2036,Zotter,Austria,2018,Blend,"Dry Aged, 30 yr Anniversary bar",75%,"3- B,S,C","fatty, earthy, cocoa",3.00
2528,2170,Zotter,Austria,2018,Congo,Mountains of the Moon,70%,"3- B,S,C","fatty, mild nuts, mild fruit",3.25


I want to do a classification problem, lets take a look at the rating distribution.

In [516]:
ratings = pd.DataFrame(df['rating'].copy())
ratings

Unnamed: 0,rating
0,3.25
1,3.50
2,3.75
3,3.00
4,3.00
...,...
2525,2.75
2526,3.75
2527,3.00
2528,3.25


In [517]:
ratings.describe()

Unnamed: 0,rating
count,2530.0
mean,3.196344
std,0.445321
min,1.0
25%,3.0
50%,3.25
75%,3.5
max,4.0


The first quartile has a value of 3. So I think it's fair to assume any chocolate with less than 3 does not taste good.

In [518]:
ratings[ratings.rating < 3].count()

rating    566
dtype: int64

In [519]:
(112/2530)*566

25.05612648221344

Lets see if we can build a classification machine learning algorithm that will be able to predict if the chocolate will get a rating below 3 or not. Then we can say the machine learning algo can predict if a chocolate bar will taste bad or not :)

**Feature Engineering**

Creating a dummy variable that says if the chocolate got the rating 4 or not.

In [520]:
df.loc[df['rating'] < 3, 'bad_chocolate'] = 1
df.loc[df['rating'] >= 3, 'bad_chocolate'] = 0
df['bad_chocolate'].value_counts()

0.0    1964
1.0     566
Name: bad_chocolate, dtype: int64

Looks good!

In [521]:
#Removing the columns I dont need
df.columns

Index(['ref', 'company_manufacturer', 'company_location', 'review_date',
       'country_of_bean_origin', 'specific_bean_origin_or_bar_name',
       'cocoa_percent', 'ingredients', 'most_memorable_characteristics',
       'rating', 'bad_chocolate'],
      dtype='object')

In [522]:
columns_i_want = ['bad_chocolate',
       'country_of_bean_origin',
       'cocoa_percent', 
       'ingredients', 
       'most_memorable_characteristics'
]

df = df[columns_i_want]

In [523]:
df.dtypes

bad_chocolate                     float64
country_of_bean_origin             object
cocoa_percent                      object
ingredients                        object
most_memorable_characteristics     object
dtype: object

Changing cocoa percent to float and removing '%'

In [524]:
df['cocoa_percent'] = df['cocoa_percent'].str.replace('%', '').astype('float')

In [525]:
df.dtypes

bad_chocolate                     float64
country_of_bean_origin             object
cocoa_percent                     float64
ingredients                        object
most_memorable_characteristics     object
dtype: object

Success!

Time to make all type of country_of_bean_origin into dummy variables

In [526]:
df.country_of_bean_origin.describe()

count          2530
unique           62
top       Venezuela
freq            253
Name: country_of_bean_origin, dtype: object

Gonna first put them into a seperate df so I can but rare origins into its own column togheter with other rare origins.

In [527]:
bean_origin_dummies = pd.get_dummies(df.country_of_bean_origin)
bean_dummies_mean = bean_origin_dummies.describe().iloc[1,].reset_index()
bean_dummies_mean[bean_dummies_mean['mean'] < 0.01].count()

index    40
mean     40
dtype: int64

Quite a few countries gets used less than 1% of the time, lets make all those into the same column.

Lets extract all the country names that meets that criteria.

In [528]:
bean_dummies_mean[bean_dummies_mean['mean'] < 0.01]['index'].unique()

array(['Australia', 'Burma', 'Cameroon', 'China', 'Congo', 'Cuba',
       'DR Congo', 'El Salvador', 'Fiji', 'Gabon', 'Grenada', 'Honduras',
       'Indonesia', 'Ivory Coast', 'Jamaica', 'Liberia', 'Malaysia',
       'Martinique', 'Nigeria', 'Panama', 'Philippines', 'Principe',
       'Puerto Rico', 'Samoa', 'Sao Tome', 'Sao Tome & Principe',
       'Sierra Leone', 'Solomon Islands', 'Sri Lanka', 'St. Lucia',
       'St.Vincent-Grenadines', 'Sulawesi', 'Sumatra', 'Suriname',
       'Taiwan', 'Thailand', 'Tobago', 'Togo', 'Uganda', 'Vanuatu'],
      dtype=object)

In [529]:
rare_countries = ['Australia', 'Burma', 'Cameroon', 'China', 'Congo', 'Cuba',
       'DR Congo', 'El Salvador', 'Fiji', 'Gabon', 'Grenada', 'Honduras',
       'Indonesia', 'Ivory Coast', 'Jamaica', 'Liberia', 'Malaysia',
       'Martinique', 'Nigeria', 'Panama', 'Philippines', 'Principe',
       'Puerto Rico', 'Samoa', 'Sao Tome', 'Sao Tome & Principe',
       'Sierra Leone', 'Solomon Islands', 'Sri Lanka', 'St. Lucia',
       'St.Vincent-Grenadines', 'Sulawesi', 'Sumatra', 'Suriname',
       'Taiwan', 'Thailand', 'Tobago', 'Togo', 'Uganda', 'Vanuatu']

Done! now lets remove them and create a single dummy variable for all those countries.

In [530]:
pd.set_option('display.max_columns', None) #Doing this so I can see all columns

In [531]:
def create_rare_bean_origin_dummy(df, rare_countries):
    for country in rare_countries:
        if (df[country] == 1).any() == True:
            df.loc[df[country] == 1, 'rare_bean_origin'] = 1
    df.loc[df['rare_bean_origin'].isna(), 'rare_bean_origin'] = 0
    df['rare_bean_origin'] = df['rare_bean_origin'].astype('int')
    df.drop(rare_countries, axis=1, inplace=True)

In [532]:
create_rare_bean_origin_dummy(bean_origin_dummies, rare_countries)

In [533]:
bean_origin_dummies

Unnamed: 0,Belize,Blend,Bolivia,Brazil,Colombia,Costa Rica,Dominican Republic,Ecuador,Ghana,Guatemala,Haiti,India,Madagascar,Mexico,Nicaragua,Papua New Guinea,Peru,Tanzania,Trinidad,U.S.A.,Venezuela,Vietnam,rare_bean_origin
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2525,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2526,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2527,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2528,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


Looks good!

In [534]:
#Making all columns to lowercase
bean_origin_dummies.columns= bean_origin_dummies.columns.str.lower()
bean_origin_dummies

Unnamed: 0,belize,blend,bolivia,brazil,colombia,costa rica,dominican republic,ecuador,ghana,guatemala,haiti,india,madagascar,mexico,nicaragua,papua new guinea,peru,tanzania,trinidad,u.s.a.,venezuela,vietnam,rare_bean_origin
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2525,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2526,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2527,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2528,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


Time to add the bean origin dummies to the main df!

In [535]:
df.join(bean_origin_dummies)

Unnamed: 0,bad_chocolate,country_of_bean_origin,cocoa_percent,ingredients,most_memorable_characteristics,belize,blend,bolivia,brazil,colombia,costa rica,dominican republic,ecuador,ghana,guatemala,haiti,india,madagascar,mexico,nicaragua,papua new guinea,peru,tanzania,trinidad,u.s.a.,venezuela,vietnam,rare_bean_origin
0,0.0,Tanzania,76.0,"3- B,S,C","rich cocoa, fatty, bready",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,0.0,Dominican Republic,76.0,"3- B,S,C","cocoa, vegetal, savory",0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0.0,Madagascar,76.0,"3- B,S,C","cocoa, blackberry, full body",0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0.0,Fiji,68.0,"3- B,S,C","chewy, off, rubbery",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,0.0,Venezuela,72.0,"3- B,S,C","fatty, earthy, moss, nutty,chalky",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2525,1.0,Blend,80.0,"4- B,S*,C,Sa","waxy, cloying, vegetal",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2526,0.0,Colombia,75.0,"3- B,S,C","strong nutty, marshmallow",0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2527,0.0,Blend,75.0,"3- B,S,C","fatty, earthy, cocoa",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2528,0.0,Congo,70.0,"3- B,S,C","fatty, mild nuts, mild fruit",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


Looks good!

In [536]:
df = df.join(bean_origin_dummies)
df.drop('country_of_bean_origin', axis=1, inplace=True)
df

Unnamed: 0,bad_chocolate,cocoa_percent,ingredients,most_memorable_characteristics,belize,blend,bolivia,brazil,colombia,costa rica,dominican republic,ecuador,ghana,guatemala,haiti,india,madagascar,mexico,nicaragua,papua new guinea,peru,tanzania,trinidad,u.s.a.,venezuela,vietnam,rare_bean_origin
0,0.0,76.0,"3- B,S,C","rich cocoa, fatty, bready",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,0.0,76.0,"3- B,S,C","cocoa, vegetal, savory",0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0.0,76.0,"3- B,S,C","cocoa, blackberry, full body",0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0.0,68.0,"3- B,S,C","chewy, off, rubbery",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,0.0,72.0,"3- B,S,C","fatty, earthy, moss, nutty,chalky",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2525,1.0,80.0,"4- B,S*,C,Sa","waxy, cloying, vegetal",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2526,0.0,75.0,"3- B,S,C","strong nutty, marshmallow",0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2527,0.0,75.0,"3- B,S,C","fatty, earthy, cocoa",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2528,0.0,70.0,"3- B,S,C","fatty, mild nuts, mild fruit",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


Success!

In [537]:
df.dtypes

bad_chocolate                     float64
cocoa_percent                     float64
ingredients                        object
most_memorable_characteristics     object
belize                              uint8
                                   ...   
trinidad                            uint8
u.s.a.                              uint8
venezuela                           uint8
vietnam                             uint8
rare_bean_origin                    int32
Length: 27, dtype: object

I notice here that bad_chocolate is a float, no need for that! So changing it to an integer.

In [538]:
df['bad_chocolate'] = df['bad_chocolate'].astype(int)

In [539]:
df.dtypes

bad_chocolate                       int32
cocoa_percent                     float64
ingredients                        object
most_memorable_characteristics     object
belize                              uint8
                                   ...   
trinidad                            uint8
u.s.a.                              uint8
venezuela                           uint8
vietnam                             uint8
rare_bean_origin                    int32
Length: 27, dtype: object

Success!

In [540]:
df.ingredients

0           3- B,S,C
1           3- B,S,C
2           3- B,S,C
3           3- B,S,C
4           3- B,S,C
            ...     
2525    4- B,S*,C,Sa
2526        3- B,S,C
2527        3- B,S,C
2528        3- B,S,C
2529        3- B,S,C
Name: ingredients, Length: 2530, dtype: object

In [541]:
df['ingredients'].unique()

array(['3- B,S,C', '4- B,S,C,L', '2- B,S', '4- B,S,C,V', '5- B,S,C,V,L',
       '6-B,S,C,V,L,Sa', '5-B,S,C,V,Sa', nan, '4- B,S,V,L', '2- B,S*',
       '1- B', '3- B,S*,C', '3- B,S,L', '3- B,S,V', '4- B,S*,C,L',
       '4- B,S*,C,Sa', '3- B,S*,Sa', '4- B,S,C,Sa', '4- B,S*,V,L',
       '2- B,C', '4- B,S*,C,V', '5- B,S,C,L,Sa'], dtype=object)

Here I can see that I can get quite a few more columns by splitting the ingredients column up!

In [542]:
df['ingredients'].isna().sum()

87

Only 87 NAs for ingredients, so I think we are pretty safe here with removing all those rows!

In [543]:
df.dropna(inplace=True)

In [544]:
df['ingredients'].isna().sum()

0

Success!

Now, lets start with making a total ingredients column.

In [545]:
df['total_ingredients'] = df['ingredients'].str[0]
df

Unnamed: 0,bad_chocolate,cocoa_percent,ingredients,most_memorable_characteristics,belize,blend,bolivia,brazil,colombia,costa rica,dominican republic,ecuador,ghana,guatemala,haiti,india,madagascar,mexico,nicaragua,papua new guinea,peru,tanzania,trinidad,u.s.a.,venezuela,vietnam,rare_bean_origin,total_ingredients
0,0,76.0,"3- B,S,C","rich cocoa, fatty, bready",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,3
1,0,76.0,"3- B,S,C","cocoa, vegetal, savory",0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3
2,0,76.0,"3- B,S,C","cocoa, blackberry, full body",0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,3
3,0,68.0,"3- B,S,C","chewy, off, rubbery",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3
4,0,72.0,"3- B,S,C","fatty, earthy, moss, nutty,chalky",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2525,1,80.0,"4- B,S*,C,Sa","waxy, cloying, vegetal",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4
2526,0,75.0,"3- B,S,C","strong nutty, marshmallow",0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3
2527,0,75.0,"3- B,S,C","fatty, earthy, cocoa",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3
2528,0,70.0,"3- B,S,C","fatty, mild nuts, mild fruit",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3


In [546]:
df['total_ingredients'] = df['total_ingredients'].astype(int)

In [547]:
df['total_ingredients'].unique()

array([3, 4, 2, 5, 6, 1])

Looks good!

Now I can remove the first 3 characters from ingredients column so it only displays the ingredient names.

In [548]:
df['ingredients'] = df['ingredients'].str[3:]
df

Unnamed: 0,bad_chocolate,cocoa_percent,ingredients,most_memorable_characteristics,belize,blend,bolivia,brazil,colombia,costa rica,dominican republic,ecuador,ghana,guatemala,haiti,india,madagascar,mexico,nicaragua,papua new guinea,peru,tanzania,trinidad,u.s.a.,venezuela,vietnam,rare_bean_origin,total_ingredients
0,0,76.0,"B,S,C","rich cocoa, fatty, bready",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,3
1,0,76.0,"B,S,C","cocoa, vegetal, savory",0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3
2,0,76.0,"B,S,C","cocoa, blackberry, full body",0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,3
3,0,68.0,"B,S,C","chewy, off, rubbery",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3
4,0,72.0,"B,S,C","fatty, earthy, moss, nutty,chalky",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2525,1,80.0,"B,S*,C,Sa","waxy, cloying, vegetal",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4
2526,0,75.0,"B,S,C","strong nutty, marshmallow",0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3
2527,0,75.0,"B,S,C","fatty, earthy, cocoa",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3
2528,0,70.0,"B,S,C","fatty, mild nuts, mild fruit",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3


Looks good! Now I will extract each ingredient into its seperate column as a dummy variable

In [549]:
df.ingredients.unique()

array(['B,S,C', 'B,S,C,L', 'B,S', 'B,S,C,V', 'B,S,C,V,L', ',S,C,V,L,Sa',
       ',S,C,V,Sa', 'B,S,V,L', 'B,S*', 'B', 'B,S*,C', 'B,S,L', 'B,S,V',
       'B,S*,C,L', 'B,S*,C,Sa', 'B,S*,Sa', 'B,S,C,Sa', 'B,S*,V,L', 'B,C',
       'B,S*,C,V', 'B,S,C,L,Sa'], dtype=object)

In [550]:
ingredients_dummy = df['ingredients'].str.get_dummies(sep=',').copy()

In [551]:
#Making all columns to lowercase
ingredients_dummy.columns= ingredients_dummy.columns.str.lower()

In [552]:
ingredients_dummy

Unnamed: 0,b,c,l,s,s*,sa,v
0,1,1,0,1,0,0,0
1,1,1,0,1,0,0,0
2,1,1,0,1,0,0,0
3,1,1,0,1,0,0,0
4,1,1,0,1,0,0,0
...,...,...,...,...,...,...,...
2525,1,1,0,0,1,1,0
2526,1,1,0,1,0,0,0
2527,1,1,0,1,0,0,0
2528,1,1,0,1,0,0,0


Looks good!

Time to rename the ingredient characters to the right word.

B = Beans, S = Sugar, S* = Sweetener other than white cane or beet sugar, C = Cocoa Butter, V = Vanilla, L = Lecithin, Sa = Salt

In [553]:
ingredients_dummy.rename(columns={
    'b':'contains_beans',
    'c':'contains_cocoa_butter',
    'l':'contains_lecithin',
    's':'contains_sugar',
    's*':'contains_sweetener',
    'sa':'contains_salt',
    'v':'contains_vanilla'
    })

Unnamed: 0,contains_beans,contains_cocoa_butter,contains_lecithin,contains_sugar,contains_sweetener,contains_salt,contains_vanilla
0,1,1,0,1,0,0,0
1,1,1,0,1,0,0,0
2,1,1,0,1,0,0,0
3,1,1,0,1,0,0,0
4,1,1,0,1,0,0,0
...,...,...,...,...,...,...,...
2525,1,1,0,0,1,1,0
2526,1,1,0,1,0,0,0
2527,1,1,0,1,0,0,0
2528,1,1,0,1,0,0,0


Looks good!

In [554]:
ingredients_dummy.rename(columns={
    'b':'contains_beans',
    'c':'contains_cocoa_butter',
    'l':'contains_lecithin',
    's':'contains_sugar',
    's*':'contains_sweetener',
    'sa':'contains_salt',
    'v':'contains_vanilla'
    }, inplace=True)

In [555]:
df = df.join(ingredients_dummy)

In [556]:
df

Unnamed: 0,bad_chocolate,cocoa_percent,ingredients,most_memorable_characteristics,belize,blend,bolivia,brazil,colombia,costa rica,dominican republic,ecuador,ghana,guatemala,haiti,india,madagascar,mexico,nicaragua,papua new guinea,peru,tanzania,trinidad,u.s.a.,venezuela,vietnam,rare_bean_origin,total_ingredients,contains_beans,contains_cocoa_butter,contains_lecithin,contains_sugar,contains_sweetener,contains_salt,contains_vanilla
0,0,76.0,"B,S,C","rich cocoa, fatty, bready",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,3,1,1,0,1,0,0,0
1,0,76.0,"B,S,C","cocoa, vegetal, savory",0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0
2,0,76.0,"B,S,C","cocoa, blackberry, full body",0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0
3,0,68.0,"B,S,C","chewy, off, rubbery",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3,1,1,0,1,0,0,0
4,0,72.0,"B,S,C","fatty, earthy, moss, nutty,chalky",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,3,1,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2525,1,80.0,"B,S*,C,Sa","waxy, cloying, vegetal",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,1,1,0,0,1,1,0
2526,0,75.0,"B,S,C","strong nutty, marshmallow",0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0
2527,0,75.0,"B,S,C","fatty, earthy, cocoa",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0
2528,0,70.0,"B,S,C","fatty, mild nuts, mild fruit",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3,1,1,0,1,0,0,0


In [557]:
df.drop('ingredients', axis=1, inplace=True) #Dont need this anymore

In [558]:
df.dtypes

bad_chocolate                       int32
cocoa_percent                     float64
most_memorable_characteristics     object
belize                              uint8
blend                               uint8
                                   ...   
contains_lecithin                   int64
contains_sugar                      int64
contains_sweetener                  int64
contains_salt                       int64
contains_vanilla                    int64
Length: 34, dtype: object

Great! Now it's time to create dummy variables for all the most memorable characteristics.

In [559]:
characteristics_dummies = df['most_memorable_characteristics'].str.get_dummies(sep=',').copy()
characteristics_dummies.columns

Index([' "Andes" mint', ' Cadbury egg', ' Roasty', ' accessible', ' acidic',
       ' alcohol', ' almond', ' almond butter', ' almost burnt', ' alocohol',
       ...
       'waxy mouthfeel', 'well balanced', 'well defined', 'why bother', 'wine',
       'wood', 'woodsy', 'woody', 'yellow fruit', 'yogurt'],
      dtype='object', length=1201)

Just looking at this, it seems like quite a few characteristics explains the same thing, and some misspellings

I think I might need to try and find what the most popular words words are and only have dummy variables for that

In [560]:
words = pd.DataFrame(characteristics_dummies.transpose().index)
words.rename(columns={0: 'word'}, inplace=True)
words

Unnamed: 0,word
0,"""Andes"" mint"
1,Cadbury egg
2,Roasty
3,accessible
4,acidic
...,...
1196,wood
1197,woodsy
1198,woody
1199,yellow fruit


In [561]:
words = words['word'].str.strip() #removing whitespace
words

0       "Andes" mint
1        Cadbury egg
2             Roasty
3         accessible
4             acidic
            ...     
1196            wood
1197          woodsy
1198           woody
1199    yellow fruit
1200          yogurt
Name: word, Length: 1201, dtype: object

Creating a dataframe with all words seperated

In [562]:
words = pd.DataFrame(words)
seperated_words = ' '.join([i for i in words['word']]).split()
seperated_words = pd.DataFrame(seperated_words)
seperated_words.rename(columns={0: 'word'}, inplace=True)
seperated_words

Unnamed: 0,word
0,"""Andes"""
1,mint
2,Cadbury
3,egg
4,Roasty
...,...
2098,woodsy
2099,woody
2100,yellow
2101,fruit


In [563]:
seperated_words.value_counts().reset_index().head(40)

Unnamed: 0,word,0
0,mild,87
1,cocoa,57
2,fruit,51
3,burnt,36
4,off,36
...,...,...
35,sl.,12
36,choco,11
37,harsh,11
38,dark,11


I want to remove all the stopwords.

In [564]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\poker\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [565]:
# Import stopwords with nltk.
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [566]:
# Excluding stopwords with Python's list comprehension and pandas.DataFrame.apply.
no_stop_words = seperated_words['word'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in (stop)])
)

In [567]:
no_stop_words = pd.DataFrame(no_stop_words)
no_stop_words

Unnamed: 0,word
0,"""Andes"""
1,mint
2,Cadbury
3,egg
4,Roasty
...,...
2098,woodsy
2099,woody
2100,yellow
2101,fruit


In [568]:
no_stop_words.value_counts()

word     
             125
mild          87
cocoa         57
fruit         51
burnt         36
            ... 
developed      1
oreo           1
origin         1
overyly        1
punch          1
Length: 536, dtype: int64

Removing blank rows

In [569]:
no_stop_words.replace(r'^\s*$', np.nan, regex=True, inplace=True)
no_stop_words.value_counts()

word   
mild       87
cocoa      57
fruit      51
burnt      36
sour       35
           ..
oreo        1
origin      1
overyly     1
papaya      1
"Andes"     1
Length: 535, dtype: int64

Success!

Now I want to get the root words from the different words.

In [570]:
pd.set_option('display.max_rows', None)
top100_words = pd.DataFrame(no_stop_words.value_counts().reset_index().head(100)['word'].copy())
top100_words.values

array([['mild'],
       ['cocoa'],
       ['fruit'],
       ['burnt'],
       ['sour'],
       ['intense'],
       ['strong'],
       ['notes'],
       ['sweet'],
       ['berry'],
       ['bitter'],
       ['rich'],
       ['coffee'],
       ['roast'],
       ['slight'],
       ['tart'],
       ['spice'],
       ['flavor'],
       ['orange'],
       ['note'],
       ['roasted'],
       ['banana'],
       ['nutty'],
       ['smoke'],
       ['roasty'],
       ['late'],
       ['spicy'],
       ['black'],
       ['molasses'],
       ['floral'],
       ['brownie'],
       ['sl.'],
       ['candy'],
       ['choco'],
       ['harsh'],
       ['high'],
       ['dark'],
       ['caramel'],
       ['rubber'],
       ['base'],
       ['cherry'],
       ['red'],
       ['vanilla'],
       ['nibs'],
       ['earthy'],
       ['light'],
       ['pepper'],
       ['woody'],
       ['dominate'],
       ['nuts'],
       ['honey'],
       ['chocolate'],
       ['citrus'],
       ['licorice'],
      

Removing all non words and non root words

In [571]:
popular_words = ([['mild'],
       ['cocoa'],
       ['fruit'],
       ['burnt'],
       ['sour'],
       ['intense'],
       ['strong'],
       ['sweet'],
       ['berry'],
       ['bitter'],
       ['rich'],
       ['coffee'],
       ['roast'],
       ['tart'],
       ['spice'],
       ['orange'],
       ['banana'],
       ['nut'],
       ['smoke'],
       ['spicy'],
       ['molasses'],
       ['floral'],
       ['brownie'],
       ['candy'],
       ['choco'],
       ['harsh'],
       ['dark'],
       ['caramel'],
       ['rubber'],
       ['cherry'],
       ['vanilla'],
       ['nibs'],
       ['earthy'],
       ['light'],
       ['pepper'],
       ['wood'],
       ['honey'],
       ['citrus'],
       ['licorice'],
       ['metallic'],
       ['tobacco'],
       ['dried'],
       ['basic'],
       ['subtle'],
       ['green'],
       ['grapes'],
       ['strawberry'],
       ['dry'],
       ['lemon'],
       ['pungent'],
       ['chemical'],
       ['plum'],
       ['grits'],
       ['grape'],
       ['hot'],
       ['dairy'],
       ['mint'],
       ['ham'],
       ['sugar'],
       ['butter'],
       ['smoke'],
       ['mellow'],
       ['blueberry'],
       ['tropical'],
       ['raspberry'],
       ['malt'],
       ['fat'],
       ['melon'],
       ['tea'],
       ['milk']])

In [572]:
popular_words = pd.DataFrame(popular_words) #Turning it into a df

In [573]:
unique_popular_words = pd.DataFrame(popular_words[0].unique())

In [574]:
unique_popular_words

Unnamed: 0,0
0,mild
1,cocoa
2,fruit
3,burnt
4,sour
5,intense
6,strong
7,sweet
8,berry
9,bitter


Looks good! Now time get add the column names to the original df.

In [575]:
taste_dummy_step = pd.pivot(unique_popular_words, index=0, columns=0, values=0)

In [576]:
taste_dummies = pd.get_dummies(taste_dummy_step)

In [577]:
taste_dummes_list = taste_dummies.T.columns.to_list()

In [578]:
taste_dummes_list

['banana',
 'basic',
 'berry',
 'bitter',
 'blueberry',
 'brownie',
 'burnt',
 'butter',
 'candy',
 'caramel',
 'chemical',
 'cherry',
 'choco',
 'citrus',
 'cocoa',
 'coffee',
 'dairy',
 'dark',
 'dried',
 'dry',
 'earthy',
 'fat',
 'floral',
 'fruit',
 'grape',
 'grapes',
 'green',
 'grits',
 'ham',
 'harsh',
 'honey',
 'hot',
 'intense',
 'lemon',
 'licorice',
 'light',
 'malt',
 'mellow',
 'melon',
 'metallic',
 'mild',
 'milk',
 'mint',
 'molasses',
 'nibs',
 'nut',
 'orange',
 'pepper',
 'plum',
 'pungent',
 'raspberry',
 'rich',
 'roast',
 'rubber',
 'smoke',
 'sour',
 'spice',
 'spicy',
 'strawberry',
 'strong',
 'subtle',
 'sugar',
 'sweet',
 'tart',
 'tea',
 'tobacco',
 'tropical',
 'vanilla',
 'wood']

In [579]:
taste_columns = [word + "_taste" for word in taste_dummes_list]
taste_columns

['banana_taste',
 'basic_taste',
 'berry_taste',
 'bitter_taste',
 'blueberry_taste',
 'brownie_taste',
 'burnt_taste',
 'butter_taste',
 'candy_taste',
 'caramel_taste',
 'chemical_taste',
 'cherry_taste',
 'choco_taste',
 'citrus_taste',
 'cocoa_taste',
 'coffee_taste',
 'dairy_taste',
 'dark_taste',
 'dried_taste',
 'dry_taste',
 'earthy_taste',
 'fat_taste',
 'floral_taste',
 'fruit_taste',
 'grape_taste',
 'grapes_taste',
 'green_taste',
 'grits_taste',
 'ham_taste',
 'harsh_taste',
 'honey_taste',
 'hot_taste',
 'intense_taste',
 'lemon_taste',
 'licorice_taste',
 'light_taste',
 'malt_taste',
 'mellow_taste',
 'melon_taste',
 'metallic_taste',
 'mild_taste',
 'milk_taste',
 'mint_taste',
 'molasses_taste',
 'nibs_taste',
 'nut_taste',
 'orange_taste',
 'pepper_taste',
 'plum_taste',
 'pungent_taste',
 'raspberry_taste',
 'rich_taste',
 'roast_taste',
 'rubber_taste',
 'smoke_taste',
 'sour_taste',
 'spice_taste',
 'spicy_taste',
 'strawberry_taste',
 'strong_taste',
 'subtle_tas

Looks good!

In [580]:
pd.set_option('display.max_rows', 10)
df[taste_columns] = 0
df

Unnamed: 0,bad_chocolate,cocoa_percent,most_memorable_characteristics,belize,blend,bolivia,brazil,colombia,costa rica,dominican republic,ecuador,ghana,guatemala,haiti,india,madagascar,mexico,nicaragua,papua new guinea,peru,tanzania,trinidad,u.s.a.,venezuela,vietnam,rare_bean_origin,total_ingredients,contains_beans,contains_cocoa_butter,contains_lecithin,contains_sugar,contains_sweetener,contains_salt,contains_vanilla,banana_taste,basic_taste,berry_taste,bitter_taste,blueberry_taste,brownie_taste,burnt_taste,butter_taste,candy_taste,caramel_taste,chemical_taste,cherry_taste,choco_taste,citrus_taste,cocoa_taste,coffee_taste,dairy_taste,dark_taste,dried_taste,dry_taste,earthy_taste,fat_taste,floral_taste,fruit_taste,grape_taste,grapes_taste,green_taste,grits_taste,ham_taste,harsh_taste,honey_taste,hot_taste,intense_taste,lemon_taste,licorice_taste,light_taste,malt_taste,mellow_taste,melon_taste,metallic_taste,mild_taste,milk_taste,mint_taste,molasses_taste,nibs_taste,nut_taste,orange_taste,pepper_taste,plum_taste,pungent_taste,raspberry_taste,rich_taste,roast_taste,rubber_taste,smoke_taste,sour_taste,spice_taste,spicy_taste,strawberry_taste,strong_taste,subtle_taste,sugar_taste,sweet_taste,tart_taste,tea_taste,tobacco_taste,tropical_taste,vanilla_taste,wood_taste
0,0,76.0,"rich cocoa, fatty, bready",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,3,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,76.0,"cocoa, vegetal, savory",0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,76.0,"cocoa, blackberry, full body",0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,68.0,"chewy, off, rubbery",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,72.0,"fatty, earthy, moss, nutty,chalky",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,3,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2525,1,80.0,"waxy, cloying, vegetal",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2526,0,75.0,"strong nutty, marshmallow",0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2527,0,75.0,"fatty, earthy, cocoa",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2528,0,70.0,"fatty, mild nuts, mild fruit",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Now time to fix the dummies!

In [581]:
def sentence_to_word_dummy(df, dummies):
    count = 0
    for dummy in dummies:
        df.loc[df['most_memorable_characteristics'].str.contains(dummy), taste_columns[count]] = 1
        count = count + 1

In [582]:
sentence_to_word_dummy(df, taste_dummes_list)
df

Unnamed: 0,bad_chocolate,cocoa_percent,most_memorable_characteristics,belize,blend,bolivia,brazil,colombia,costa rica,dominican republic,ecuador,ghana,guatemala,haiti,india,madagascar,mexico,nicaragua,papua new guinea,peru,tanzania,trinidad,u.s.a.,venezuela,vietnam,rare_bean_origin,total_ingredients,contains_beans,contains_cocoa_butter,contains_lecithin,contains_sugar,contains_sweetener,contains_salt,contains_vanilla,banana_taste,basic_taste,berry_taste,bitter_taste,blueberry_taste,brownie_taste,burnt_taste,butter_taste,candy_taste,caramel_taste,chemical_taste,cherry_taste,choco_taste,citrus_taste,cocoa_taste,coffee_taste,dairy_taste,dark_taste,dried_taste,dry_taste,earthy_taste,fat_taste,floral_taste,fruit_taste,grape_taste,grapes_taste,green_taste,grits_taste,ham_taste,harsh_taste,honey_taste,hot_taste,intense_taste,lemon_taste,licorice_taste,light_taste,malt_taste,mellow_taste,melon_taste,metallic_taste,mild_taste,milk_taste,mint_taste,molasses_taste,nibs_taste,nut_taste,orange_taste,pepper_taste,plum_taste,pungent_taste,raspberry_taste,rich_taste,roast_taste,rubber_taste,smoke_taste,sour_taste,spice_taste,spicy_taste,strawberry_taste,strong_taste,subtle_taste,sugar_taste,sweet_taste,tart_taste,tea_taste,tobacco_taste,tropical_taste,vanilla_taste,wood_taste
0,0,76.0,"rich cocoa, fatty, bready",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,3,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,76.0,"cocoa, vegetal, savory",0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,76.0,"cocoa, blackberry, full body",0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,68.0,"chewy, off, rubbery",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,72.0,"fatty, earthy, moss, nutty,chalky",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,3,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2525,1,80.0,"waxy, cloying, vegetal",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2526,0,75.0,"strong nutty, marshmallow",0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2527,0,75.0,"fatty, earthy, cocoa",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2528,0,70.0,"fatty, mild nuts, mild fruit",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [583]:
df.describe()

Unnamed: 0,bad_chocolate,cocoa_percent,belize,blend,bolivia,brazil,colombia,costa rica,dominican republic,ecuador,ghana,guatemala,haiti,india,madagascar,mexico,nicaragua,papua new guinea,peru,tanzania,trinidad,u.s.a.,venezuela,vietnam,rare_bean_origin,total_ingredients,contains_beans,contains_cocoa_butter,contains_lecithin,contains_sugar,contains_sweetener,contains_salt,contains_vanilla,banana_taste,basic_taste,berry_taste,bitter_taste,blueberry_taste,brownie_taste,burnt_taste,butter_taste,candy_taste,caramel_taste,chemical_taste,cherry_taste,choco_taste,citrus_taste,cocoa_taste,coffee_taste,dairy_taste,dark_taste,dried_taste,dry_taste,earthy_taste,fat_taste,floral_taste,fruit_taste,grape_taste,grapes_taste,green_taste,grits_taste,ham_taste,harsh_taste,honey_taste,hot_taste,intense_taste,lemon_taste,licorice_taste,light_taste,malt_taste,mellow_taste,melon_taste,metallic_taste,mild_taste,milk_taste,mint_taste,molasses_taste,nibs_taste,nut_taste,orange_taste,pepper_taste,plum_taste,pungent_taste,raspberry_taste,rich_taste,roast_taste,rubber_taste,smoke_taste,sour_taste,spice_taste,spicy_taste,strawberry_taste,strong_taste,subtle_taste,sugar_taste,sweet_taste,tart_taste,tea_taste,tobacco_taste,tropical_taste,vanilla_taste,wood_taste
count,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0,2443.0
mean,0.215309,71.496725,0.030291,0.058944,0.032337,0.031519,0.031928,0.017192,0.090053,0.082276,0.014736,0.025379,0.01228,0.014327,0.069996,0.022104,0.040933,0.020057,0.094556,0.031928,0.017192,0.012689,0.100696,0.029881,0.118707,3.041343,0.995907,0.682767,0.201801,0.966025,0.031109,0.015145,0.144494,0.024969,0.013508,0.060581,0.043389,0.004503,0.022513,0.028653,0.019648,0.009005,0.029063,0.009824,0.019648,0.017192,0.013099,0.166189,0.03684,0.020876,0.010643,0.031928,0.026197,0.077773,0.068359,0.065493,0.133442,0.017192,0.009005,0.011052,0.011052,0.020876,0.009824,0.016373,0.004093,0.072043,0.009005,0.015145,0.019239,0.003684,0.003275,0.013099,0.016373,0.087597,0.007777,0.009824,0.039705,0.004912,0.163733,0.013917,0.015555,0.003275,0.015145,0.006959,0.060172,0.119116,0.024969,0.046664,0.081867,0.044617,0.058125,0.013099,0.018829,0.004912,0.003275,0.124437,0.033975,0.008187,0.015964,0.002865,0.041343,0.043389
std,0.411121,5.156974,0.171421,0.235568,0.176931,0.17475,0.175844,0.130013,0.286317,0.274841,0.120519,0.157304,0.110155,0.118858,0.255192,0.147052,0.198176,0.140225,0.29266,0.175844,0.130013,0.111953,0.300987,0.170295,0.323509,0.913728,0.063861,0.465494,0.401427,0.181201,0.173649,0.122156,0.351662,0.156063,0.11546,0.238609,0.203774,0.066964,0.148376,0.166864,0.138816,0.094487,0.168017,0.098648,0.138816,0.130013,0.113721,0.372327,0.188407,0.142998,0.102634,0.175844,0.159754,0.267869,0.252412,0.247445,0.340122,0.130013,0.094487,0.104567,0.104567,0.142998,0.098648,0.126932,0.063861,0.258611,0.094487,0.122156,0.137391,0.060596,0.057143,0.113721,0.126932,0.282766,0.087864,0.098648,0.195306,0.069928,0.370109,0.117172,0.12377,0.057143,0.122156,0.083145,0.237854,0.323991,0.156063,0.210961,0.274217,0.206504,0.234028,0.113721,0.13595,0.069928,0.057143,0.330147,0.181201,0.090127,0.125362,0.053463,0.199122,0.203774
min,0.0,42.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,70.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,70.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,74.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,100.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,6.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Looks fantastic!

In [584]:
df.drop('most_memorable_characteristics', axis=1, inplace=True) #Dont need this anymore

In [585]:
df

Unnamed: 0,bad_chocolate,cocoa_percent,belize,blend,bolivia,brazil,colombia,costa rica,dominican republic,ecuador,ghana,guatemala,haiti,india,madagascar,mexico,nicaragua,papua new guinea,peru,tanzania,trinidad,u.s.a.,venezuela,vietnam,rare_bean_origin,total_ingredients,contains_beans,contains_cocoa_butter,contains_lecithin,contains_sugar,contains_sweetener,contains_salt,contains_vanilla,banana_taste,basic_taste,berry_taste,bitter_taste,blueberry_taste,brownie_taste,burnt_taste,butter_taste,candy_taste,caramel_taste,chemical_taste,cherry_taste,choco_taste,citrus_taste,cocoa_taste,coffee_taste,dairy_taste,dark_taste,dried_taste,dry_taste,earthy_taste,fat_taste,floral_taste,fruit_taste,grape_taste,grapes_taste,green_taste,grits_taste,ham_taste,harsh_taste,honey_taste,hot_taste,intense_taste,lemon_taste,licorice_taste,light_taste,malt_taste,mellow_taste,melon_taste,metallic_taste,mild_taste,milk_taste,mint_taste,molasses_taste,nibs_taste,nut_taste,orange_taste,pepper_taste,plum_taste,pungent_taste,raspberry_taste,rich_taste,roast_taste,rubber_taste,smoke_taste,sour_taste,spice_taste,spicy_taste,strawberry_taste,strong_taste,subtle_taste,sugar_taste,sweet_taste,tart_taste,tea_taste,tobacco_taste,tropical_taste,vanilla_taste,wood_taste
0,0,76.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,3,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,76.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,76.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,68.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,72.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,3,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2525,1,80.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2526,0,75.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2527,0,75.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2528,0,70.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Some countries have a space in their name, I'm gonna replace the spaces with an underscore.

In [586]:
# remove spaces in columns name
df.columns = df.columns.str.replace(' ','_')
df

Unnamed: 0,bad_chocolate,cocoa_percent,belize,blend,bolivia,brazil,colombia,costa_rica,dominican_republic,ecuador,ghana,guatemala,haiti,india,madagascar,mexico,nicaragua,papua_new_guinea,peru,tanzania,trinidad,u.s.a.,venezuela,vietnam,rare_bean_origin,total_ingredients,contains_beans,contains_cocoa_butter,contains_lecithin,contains_sugar,contains_sweetener,contains_salt,contains_vanilla,banana_taste,basic_taste,berry_taste,bitter_taste,blueberry_taste,brownie_taste,burnt_taste,butter_taste,candy_taste,caramel_taste,chemical_taste,cherry_taste,choco_taste,citrus_taste,cocoa_taste,coffee_taste,dairy_taste,dark_taste,dried_taste,dry_taste,earthy_taste,fat_taste,floral_taste,fruit_taste,grape_taste,grapes_taste,green_taste,grits_taste,ham_taste,harsh_taste,honey_taste,hot_taste,intense_taste,lemon_taste,licorice_taste,light_taste,malt_taste,mellow_taste,melon_taste,metallic_taste,mild_taste,milk_taste,mint_taste,molasses_taste,nibs_taste,nut_taste,orange_taste,pepper_taste,plum_taste,pungent_taste,raspberry_taste,rich_taste,roast_taste,rubber_taste,smoke_taste,sour_taste,spice_taste,spicy_taste,strawberry_taste,strong_taste,subtle_taste,sugar_taste,sweet_taste,tart_taste,tea_taste,tobacco_taste,tropical_taste,vanilla_taste,wood_taste
0,0,76.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,3,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,76.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,76.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,68.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,72.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,3,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2525,1,80.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2526,0,75.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2527,0,75.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2528,0,70.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Success!

**Time for modeling**

In [587]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

I'm first gonna remove rare_bean_origin column, so it will be my reference variable.

In [588]:
model_df = df.copy()
model_df.drop('rare_bean_origin', axis=1, inplace=True)
model_df

Unnamed: 0,bad_chocolate,cocoa_percent,belize,blend,bolivia,brazil,colombia,costa_rica,dominican_republic,ecuador,ghana,guatemala,haiti,india,madagascar,mexico,nicaragua,papua_new_guinea,peru,tanzania,trinidad,u.s.a.,venezuela,vietnam,total_ingredients,contains_beans,contains_cocoa_butter,contains_lecithin,contains_sugar,contains_sweetener,contains_salt,contains_vanilla,banana_taste,basic_taste,berry_taste,bitter_taste,blueberry_taste,brownie_taste,burnt_taste,butter_taste,candy_taste,caramel_taste,chemical_taste,cherry_taste,choco_taste,citrus_taste,cocoa_taste,coffee_taste,dairy_taste,dark_taste,dried_taste,dry_taste,earthy_taste,fat_taste,floral_taste,fruit_taste,grape_taste,grapes_taste,green_taste,grits_taste,ham_taste,harsh_taste,honey_taste,hot_taste,intense_taste,lemon_taste,licorice_taste,light_taste,malt_taste,mellow_taste,melon_taste,metallic_taste,mild_taste,milk_taste,mint_taste,molasses_taste,nibs_taste,nut_taste,orange_taste,pepper_taste,plum_taste,pungent_taste,raspberry_taste,rich_taste,roast_taste,rubber_taste,smoke_taste,sour_taste,spice_taste,spicy_taste,strawberry_taste,strong_taste,subtle_taste,sugar_taste,sweet_taste,tart_taste,tea_taste,tobacco_taste,tropical_taste,vanilla_taste,wood_taste
0,0,76.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,3,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,76.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,76.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,68.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,72.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,3,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2525,1,80.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2526,0,75.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2527,0,75.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2528,0,70.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [589]:
x=model_df.copy().drop(columns=['bad_chocolate']).values #turning it into numpy array for XGBoost!
y=model_df['bad_chocolate'].values
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=1337)

In [590]:
print('Train_x:',x_train.shape)
print('Train_y:',y_train.shape)
print('Test_x:',x_test.shape)
print('Test_y:',y_test.shape)

Train_x: (2198, 100)
Train_y: (2198,)
Test_x: (245, 100)
Test_y: (245,)


Logistic Regression

In [591]:
from sklearn.linear_model import LogisticRegression

model_le=LogisticRegression(random_state=1337,max_iter=10000)

In [592]:
#doing 10-fold cross validation
model_le.fit(x_train,y_train)
parameters = {'C':[1.4, 1.3]} #Testing the C parameter in gridsearchCV
model_le_grid = GridSearchCV(model_le, parameters,cv=10,verbose=1,n_jobs=-1).fit(x_train,y_train)
print('Best parameters:',model_le_grid.best_params_)

print('Logistic Regression Cross validation score:',model_le_grid.best_score_*100)

Fitting 10 folds for each of 2 candidates, totalling 20 fits
Best parameters: {'C': 1.4}
Logistic Regression Cross validation score: 81.80074719800746


In [593]:
model_le=LogisticRegression(random_state=42,C=1.4) #C=1.4 gave the best result
model_le.fit(x_train,y_train)
print('Logistic Regression Train score:',model_le.score(x_train,y_train)*100)

Logistic Regression Train score: 83.80345768880801


Random forest

In [594]:
from sklearn.ensemble import RandomForestClassifier

model_rfc=RandomForestClassifier(n_jobs=-1, random_state=1337)
parameters = {'n_estimators':[50,60],'max_depth':[6,7]}
model_rfc_grid = GridSearchCV(model_rfc, parameters, cv=10, verbose=1, n_jobs=-1).fit(x_train,y_train)
print(model_rfc_grid.best_params_)

print('Random Forest Classifier Cross validation score:',model_rfc_grid.best_score_*100)

Fitting 10 folds for each of 4 candidates, totalling 40 fits
{'max_depth': 7, 'n_estimators': 50}
Random Forest Classifier Cross validation score: 79.57222914072229


In [595]:
#Entering the best values for the tested parameters
model_rfc=RandomForestClassifier(n_jobs=-1, n_estimators=50, max_depth=6, random_state=1337)
model_rfc.fit(x_train,y_train)
print('Random Forest Classifier Train score:',model_rfc.score(x_train,y_train)*100)

Random Forest Classifier Train score: 80.16378525932666


XGBoost

Log loss, short for logarithmic loss is a loss function for classification that quantifies the price paid for the inaccuracy of predictions in classification problems. Log loss penalizes false classifications by taking into account the probability of classification.

It's a pretty good evaluation metric for binary classifiers, gonna use it for my xgboost model

In [596]:
from xgboost import XGBClassifier
model_xgb=XGBClassifier(n_jobs=-1,random_state=1337,eval_metric='logloss')
parameters = {'n_estimators':[300,400],'max_depth':[3,4],'learning_rate':[0.1,0.08]}
model_xgb_grid = GridSearchCV(model_xgb, parameters,cv=10,verbose=1).fit(x_train,y_train)
print(model_xgb_grid.best_params_)

print('XGB Classifier Cross validation score:',model_xgb_grid.best_score_*100)

Fitting 10 folds for each of 8 candidates, totalling 80 fits
{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300}
XGB Classifier Cross validation score: 82.4846409298464


In [597]:
#Entering the winning parameters
model_xgb=XGBClassifier(
    n_jobs=-1, random_state=1337, n_estimators=300, max_depth=3, learning_rate=0.1, eval_metric='logloss'
)
model_xgb.fit(x_train,y_train)
print('XGB Classifier Train score:',model_xgb.score(x_train,y_train)*100)

XGB Classifier Train score: 86.03275705186533


XGBoost seems to be winning!

Lets see how the models do on the test data.

In [598]:
from sklearn.metrics import confusion_matrix

In [599]:
#Logistic regression
y_pred=model_le.predict(x_test)

print(confusion_matrix(y_test,y_pred)) 
#This result below means: 
#186 true negatives
#8 false negatives
#31 false positives
#20 true positives

[[186   8]
 [ 31  20]]


In [600]:
#Random forest
y_pred=model_rfc.predict(x_test)

print(confusion_matrix(y_test,y_pred)) 
#This result below means: 
#194 true negatives
#0 false negatives
#48 false positives
#3 true positives

[[194   0]
 [ 48   3]]


In [601]:
#XGBoost
y_pred=model_xgb.predict(x_test)

print(confusion_matrix(y_test,y_pred)) 
#This result below means: 
#187 true negatives
#7 false negatives
#31 false positives
#20 true positives

[[187   7]
 [ 31  20]]


In [602]:
from sklearn.metrics import classification_report

def test_score(model_name):
  for i in model_name: 
    print(f'{i.__class__} \n{classification_report(y_test,i.predict(x_test))}')

In [603]:
models=[model_le,model_rfc,model_xgb]
test_score(models)

<class 'sklearn.linear_model._logistic.LogisticRegression'> 
              precision    recall  f1-score   support

           0       0.86      0.96      0.91       194
           1       0.71      0.39      0.51        51

    accuracy                           0.84       245
   macro avg       0.79      0.68      0.71       245
weighted avg       0.83      0.84      0.82       245

<class 'sklearn.ensemble._forest.RandomForestClassifier'> 
              precision    recall  f1-score   support

           0       0.80      1.00      0.89       194
           1       1.00      0.06      0.11        51

    accuracy                           0.80       245
   macro avg       0.90      0.53      0.50       245
weighted avg       0.84      0.80      0.73       245

<class 'xgboost.sklearn.XGBClassifier'> 
              precision    recall  f1-score   support

           0       0.86      0.96      0.91       194
           1       0.74      0.39      0.51        51

    accuracy         

XGBoost is slightly better than Logistic Regression and Randomforest is the worst.

In [604]:
model_xgb.score(x_test,y_test)*100

84.48979591836735

On the test set XGBoost model has a mean accuracy of ~85% which is better than 75% which would be the accuracy if we always tried to predict that every chocolate would not be bad.

In [605]:
print(classification_report(y_test,model_xgb.predict(x_test)))

              precision    recall  f1-score   support

           0       0.86      0.96      0.91       194
           1       0.74      0.39      0.51        51

    accuracy                           0.84       245
   macro avg       0.80      0.68      0.71       245
weighted avg       0.83      0.84      0.83       245



Copying the above classification_report to excel to prepare it for my tableau dashboard

**Feature importance**

Lets take a look which columns XGBoost thinks are the most important.

In [606]:
feature_important = model_xgb.get_booster().get_score(importance_type='weight')
keys = list(feature_important.keys())
values = list(feature_important.values())

column_importance = pd.DataFrame(data=values, index=keys, columns=["score"]).sort_values(by = "score", ascending=False)

In [607]:
column_importance = column_importance.reset_index()
column_importance.rename(columns={'index' : 'column_index', 'score' : 'xgboost_score'}, inplace=True)
column_importance

Unnamed: 0,column_index,xgboost_score
0,f0,243.0
1,f23,92.0
2,f30,35.0
3,f26,33.0
4,f83,32.0
...,...,...
74,f35,2.0
75,f85,1.0
76,f90,1.0
77,f79,1.0


In [608]:
column_importance['column_index'] = column_importance['column_index'].str.replace('f', '')
column_importance['column_index']= column_importance['column_index'].astype(int)
column_importance.dtypes

column_index       int32
xgboost_score    float64
dtype: object

In [609]:
column_importance

Unnamed: 0,column_index,xgboost_score
0,0,243.0
1,23,92.0
2,30,35.0
3,26,33.0
4,83,32.0
...,...,...
74,35,2.0
75,85,1.0
76,90,1.0
77,79,1.0


In [610]:
colume_names = df.copy()
colume_names.drop('bad_chocolate', axis=1, inplace=True) #I do this so the data frame matches with X_test
colume_names.drop('rare_bean_origin', axis=1, inplace=True) #I do this so the data frame matches with X_test
colume_names.columns[0]

'cocoa_percent'

In [611]:
column_importance['column_name'] = colume_names.columns[column_importance['column_index']]

In [612]:
column_importance.drop('column_index', axis=1, inplace=True)

In [613]:
column_importance = column_importance[['column_name', 'xgboost_score']]
column_importance

Unnamed: 0,column_name,xgboost_score
0,cocoa_percent,243.0
1,total_ingredients,92.0
2,contains_vanilla,35.0
3,contains_lecithin,33.0
4,roast_taste,32.0
...,...,...
74,blueberry_taste,2.0
75,smoke_taste,1.0
76,strong_taste,1.0
77,plum_taste,1.0


In [614]:
pd.set_option('display.max_rows', 20)
column_importance.head(20)

Unnamed: 0,column_name,xgboost_score
0,cocoa_percent,243.0
1,total_ingredients,92.0
2,contains_vanilla,35.0
3,contains_lecithin,33.0
4,roast_taste,32.0
5,cocoa_taste,31.0
6,peru,31.0
7,pungent_taste,30.0
8,chemical_taste,30.0
9,harsh_taste,30.0


Time to see which of these columns have a negative or positive correlation with if the chocolate will taste bad or not.

In [615]:
top20_score_columns = column_importance['column_name'].head(20)
top20_score_columns = top20_score_columns.to_list()
top20_score_columns.insert(0, 'bad_chocolate')
top20_score_columns

['bad_chocolate',
 'cocoa_percent',
 'total_ingredients',
 'contains_vanilla',
 'contains_lecithin',
 'roast_taste',
 'cocoa_taste',
 'peru',
 'pungent_taste',
 'chemical_taste',
 'harsh_taste',
 'nut_taste',
 'cherry_taste',
 'burnt_taste',
 'vanilla_taste',
 'bitter_taste',
 'fruit_taste',
 'grape_taste',
 'rich_taste',
 'rubber_taste',
 'sweet_taste']

In [616]:
pd.set_option('display.max_rows', 21)
#Using spearman correlation to be risk averse
column_names_w_corr = df[top20_score_columns].corr(method='spearman').iloc[0,].reset_index().rename(
    columns={'index':'column_name', 'bad_chocolate' : 'bad_chocolate_corr'}
).iloc[1: , :]

column_names_w_corr

Unnamed: 0,column_name,bad_chocolate_corr
1,cocoa_percent,0.076407
2,total_ingredients,0.058016
3,contains_vanilla,0.124615
4,contains_lecithin,0.04926
5,roast_taste,-0.048129
6,cocoa_taste,-0.148249
7,peru,0.021318
8,pungent_taste,0.138892
9,chemical_taste,0.180057
10,harsh_taste,0.129572


Time two merge the two dataframes into one

In [617]:
score_and_corr = pd.merge(column_importance, column_names_w_corr, how='inner', on='column_name')
score_and_corr

Unnamed: 0,column_name,xgboost_score,bad_chocolate_corr
0,cocoa_percent,243.0,0.076407
1,total_ingredients,92.0,0.058016
2,contains_vanilla,35.0,0.124615
3,contains_lecithin,33.0,0.04926
4,roast_taste,32.0,-0.048129
5,cocoa_taste,31.0,-0.148249
6,peru,31.0,0.021318
7,pungent_taste,30.0,0.138892
8,chemical_taste,30.0,0.180057
9,harsh_taste,30.0,0.129572


Now making it a csv file so I can present my findings there in tableau.

In [618]:
score_and_corr.to_csv('score_and_correlation.csv')

**Success! Now I want to create one good looking dataframe that I can create a database from, so I can do some data analysis using SQL aswell!**

In [619]:
sql_df = pd.read_csv(
    'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-18/chocolate.csv'
)

In [620]:
sql_df

Unnamed: 0,ref,company_manufacturer,company_location,review_date,country_of_bean_origin,specific_bean_origin_or_bar_name,cocoa_percent,ingredients,most_memorable_characteristics,rating
0,2454,5150,U.S.A.,2019,Tanzania,"Kokoa Kamili, batch 1",76%,"3- B,S,C","rich cocoa, fatty, bready",3.25
1,2458,5150,U.S.A.,2019,Dominican Republic,"Zorzal, batch 1",76%,"3- B,S,C","cocoa, vegetal, savory",3.50
2,2454,5150,U.S.A.,2019,Madagascar,"Bejofo Estate, batch 1",76%,"3- B,S,C","cocoa, blackberry, full body",3.75
3,2542,5150,U.S.A.,2021,Fiji,"Matasawalevu, batch 1",68%,"3- B,S,C","chewy, off, rubbery",3.00
4,2546,5150,U.S.A.,2021,Venezuela,"Sur del Lago, batch 1",72%,"3- B,S,C","fatty, earthy, moss, nutty,chalky",3.00
...,...,...,...,...,...,...,...,...,...,...
2525,1205,Zotter,Austria,2014,Blend,Raw,80%,"4- B,S*,C,Sa","waxy, cloying, vegetal",2.75
2526,1996,Zotter,Austria,2017,Colombia,"APROCAFA, Acandi",75%,"3- B,S,C","strong nutty, marshmallow",3.75
2527,2036,Zotter,Austria,2018,Blend,"Dry Aged, 30 yr Anniversary bar",75%,"3- B,S,C","fatty, earthy, cocoa",3.00
2528,2170,Zotter,Austria,2018,Congo,Mountains of the Moon,70%,"3- B,S,C","fatty, mild nuts, mild fruit",3.25


Time to create dummy variables out of what it tastes like, gonna re use old function that I did earlier.

In [621]:
sql_df[taste_columns] = 0
sentence_to_word_dummy(sql_df, taste_dummes_list)
sql_df.drop('most_memorable_characteristics', axis=1, inplace=True) #Dont need this anymore
sql_df.drop('ref', axis=1, inplace=True) #Dont want this
sql_df

Unnamed: 0,company_manufacturer,company_location,review_date,country_of_bean_origin,specific_bean_origin_or_bar_name,cocoa_percent,ingredients,rating,banana_taste,basic_taste,berry_taste,bitter_taste,blueberry_taste,brownie_taste,burnt_taste,butter_taste,candy_taste,caramel_taste,chemical_taste,cherry_taste,choco_taste,citrus_taste,cocoa_taste,coffee_taste,dairy_taste,dark_taste,dried_taste,dry_taste,earthy_taste,fat_taste,floral_taste,fruit_taste,grape_taste,grapes_taste,green_taste,grits_taste,ham_taste,harsh_taste,honey_taste,hot_taste,intense_taste,lemon_taste,licorice_taste,light_taste,malt_taste,mellow_taste,melon_taste,metallic_taste,mild_taste,milk_taste,mint_taste,molasses_taste,nibs_taste,nut_taste,orange_taste,pepper_taste,plum_taste,pungent_taste,raspberry_taste,rich_taste,roast_taste,rubber_taste,smoke_taste,sour_taste,spice_taste,spicy_taste,strawberry_taste,strong_taste,subtle_taste,sugar_taste,sweet_taste,tart_taste,tea_taste,tobacco_taste,tropical_taste,vanilla_taste,wood_taste
0,5150,U.S.A.,2019,Tanzania,"Kokoa Kamili, batch 1",76%,"3- B,S,C",3.25,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,5150,U.S.A.,2019,Dominican Republic,"Zorzal, batch 1",76%,"3- B,S,C",3.50,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,5150,U.S.A.,2019,Madagascar,"Bejofo Estate, batch 1",76%,"3- B,S,C",3.75,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,5150,U.S.A.,2021,Fiji,"Matasawalevu, batch 1",68%,"3- B,S,C",3.00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,5150,U.S.A.,2021,Venezuela,"Sur del Lago, batch 1",72%,"3- B,S,C",3.00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2525,Zotter,Austria,2014,Blend,Raw,80%,"4- B,S*,C,Sa",2.75,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2526,Zotter,Austria,2017,Colombia,"APROCAFA, Acandi",75%,"3- B,S,C",3.75,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2527,Zotter,Austria,2018,Blend,"Dry Aged, 30 yr Anniversary bar",75%,"3- B,S,C",3.00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2528,Zotter,Austria,2018,Congo,Mountains of the Moon,70%,"3- B,S,C",3.25,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Now redoing how I did with the ingredients.

In [622]:
sql_df['total_ingredients'] = sql_df['ingredients'].str[0]
sql_df['ingredients'] = sql_df['ingredients'].str[3:]
sql_ingredients_dummy = sql_df['ingredients'].str.get_dummies(sep=',').copy()
sql_ingredients_dummy.columns= sql_ingredients_dummy.columns.str.lower()
sql_df = sql_df.join(sql_ingredients_dummy)
sql_df.drop('ingredients', axis=1, inplace=True) #Dont need this anymore
sql_df = sql_df.rename(columns={
    'b':'contains_beans',
    'c':'contains_cocoa_butter',
    'l':'contains_lecithin',
    's':'contains_sugar',
    's*':'contains_sweetener',
    'sa':'contains_salt',
    'v':'contains_vanilla'
    }
)
sql_df

Unnamed: 0,company_manufacturer,company_location,review_date,country_of_bean_origin,specific_bean_origin_or_bar_name,cocoa_percent,rating,banana_taste,basic_taste,berry_taste,bitter_taste,blueberry_taste,brownie_taste,burnt_taste,butter_taste,candy_taste,caramel_taste,chemical_taste,cherry_taste,choco_taste,citrus_taste,cocoa_taste,coffee_taste,dairy_taste,dark_taste,dried_taste,dry_taste,earthy_taste,fat_taste,floral_taste,fruit_taste,grape_taste,grapes_taste,green_taste,grits_taste,ham_taste,harsh_taste,honey_taste,hot_taste,intense_taste,lemon_taste,licorice_taste,light_taste,malt_taste,mellow_taste,melon_taste,metallic_taste,mild_taste,milk_taste,mint_taste,molasses_taste,nibs_taste,nut_taste,orange_taste,pepper_taste,plum_taste,pungent_taste,raspberry_taste,rich_taste,roast_taste,rubber_taste,smoke_taste,sour_taste,spice_taste,spicy_taste,strawberry_taste,strong_taste,subtle_taste,sugar_taste,sweet_taste,tart_taste,tea_taste,tobacco_taste,tropical_taste,vanilla_taste,wood_taste,total_ingredients,contains_beans,contains_cocoa_butter,contains_lecithin,contains_sugar,contains_sweetener,contains_salt,contains_vanilla
0,5150,U.S.A.,2019,Tanzania,"Kokoa Kamili, batch 1",76%,3.25,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0
1,5150,U.S.A.,2019,Dominican Republic,"Zorzal, batch 1",76%,3.50,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0
2,5150,U.S.A.,2019,Madagascar,"Bejofo Estate, batch 1",76%,3.75,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0
3,5150,U.S.A.,2021,Fiji,"Matasawalevu, batch 1",68%,3.00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0
4,5150,U.S.A.,2021,Venezuela,"Sur del Lago, batch 1",72%,3.00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2525,Zotter,Austria,2014,Blend,Raw,80%,2.75,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,1,1,0,0,1,1,0
2526,Zotter,Austria,2017,Colombia,"APROCAFA, Acandi",75%,3.75,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0
2527,Zotter,Austria,2018,Blend,"Dry Aged, 30 yr Anniversary bar",75%,3.00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0
2528,Zotter,Austria,2018,Congo,Mountains of the Moon,70%,3.25,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0


Looks good! Now lastly im gonna remove the "%" sign from cocoa percent column.

In [623]:
sql_df['cocoa_percent'] = sql_df['cocoa_percent'].str.replace('%', '').astype('float')
sql_df

Unnamed: 0,company_manufacturer,company_location,review_date,country_of_bean_origin,specific_bean_origin_or_bar_name,cocoa_percent,rating,banana_taste,basic_taste,berry_taste,bitter_taste,blueberry_taste,brownie_taste,burnt_taste,butter_taste,candy_taste,caramel_taste,chemical_taste,cherry_taste,choco_taste,citrus_taste,cocoa_taste,coffee_taste,dairy_taste,dark_taste,dried_taste,dry_taste,earthy_taste,fat_taste,floral_taste,fruit_taste,grape_taste,grapes_taste,green_taste,grits_taste,ham_taste,harsh_taste,honey_taste,hot_taste,intense_taste,lemon_taste,licorice_taste,light_taste,malt_taste,mellow_taste,melon_taste,metallic_taste,mild_taste,milk_taste,mint_taste,molasses_taste,nibs_taste,nut_taste,orange_taste,pepper_taste,plum_taste,pungent_taste,raspberry_taste,rich_taste,roast_taste,rubber_taste,smoke_taste,sour_taste,spice_taste,spicy_taste,strawberry_taste,strong_taste,subtle_taste,sugar_taste,sweet_taste,tart_taste,tea_taste,tobacco_taste,tropical_taste,vanilla_taste,wood_taste,total_ingredients,contains_beans,contains_cocoa_butter,contains_lecithin,contains_sugar,contains_sweetener,contains_salt,contains_vanilla
0,5150,U.S.A.,2019,Tanzania,"Kokoa Kamili, batch 1",76.0,3.25,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0
1,5150,U.S.A.,2019,Dominican Republic,"Zorzal, batch 1",76.0,3.50,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0
2,5150,U.S.A.,2019,Madagascar,"Bejofo Estate, batch 1",76.0,3.75,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0
3,5150,U.S.A.,2021,Fiji,"Matasawalevu, batch 1",68.0,3.00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0
4,5150,U.S.A.,2021,Venezuela,"Sur del Lago, batch 1",72.0,3.00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2525,Zotter,Austria,2014,Blend,Raw,80.0,2.75,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,1,1,0,0,1,1,0
2526,Zotter,Austria,2017,Colombia,"APROCAFA, Acandi",75.0,3.75,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0
2527,Zotter,Austria,2018,Blend,"Dry Aged, 30 yr Anniversary bar",75.0,3.00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0
2528,Zotter,Austria,2018,Congo,Mountains of the Moon,70.0,3.25,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1,0,1,0,0,0


Success!

In [624]:
sql_df.to_csv('sql_df.csv')