In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

%matplotlib inline

In [2]:
choc = pd.read_csv('flavors_of_cacao.csv')
choc.shape

(1795, 9)

In [3]:
choc.head(2)

Unnamed: 0,Company (Maker-if known),Specific Bean Origin or Bar Name,REF,Review Date,Cocoa Percent,Company Location,Rating,Bean Type,Broad Bean Origin
0,A. Morin,Agua Grande,1876,2016,63%,France,3.75,,Sao Tome
1,A. Morin,Kpime,1676,2015,70%,France,2.75,,Togo


In [4]:
# Do a little reworking of column names so they're less unwieldy
orig_cols = choc.columns
new_cols = ['maker', 'orig_of_bar', 'REF', 'review_date', 'perc_cocoa',
            'maker_loc', 'rating', 'bean_type', 'orig_of_bean']
choc = choc.rename(columns=dict(zip(orig_cols, new_cols)))
choc.head(2)

Unnamed: 0,maker,orig_of_bar,REF,review_date,perc_cocoa,maker_loc,rating,bean_type,orig_of_bean
0,A. Morin,Agua Grande,1876,2016,63%,France,3.75,,Sao Tome
1,A. Morin,Kpime,1676,2015,70%,France,2.75,,Togo


In [5]:
# much of this will be NaN, but much is usable too
choc.describe(include='all')

Unnamed: 0,maker,orig_of_bar,REF,review_date,perc_cocoa,maker_loc,rating,bean_type,orig_of_bean
count,1795,1795,1795.0,1795.0,1795,1795,1795.0,1794.0,1794
unique,416,1039,,,45,60,,41.0,100
top,Soma,Madagascar,,,70%,U.S.A.,,,Venezuela
freq,47,57,,,672,764,,887.0,214
mean,,,1035.904735,2012.325348,,,3.185933,,
std,,,552.886365,2.92721,,,0.478062,,
min,,,5.0,2006.0,,,1.0,,
25%,,,576.0,2010.0,,,2.875,,
50%,,,1069.0,2013.0,,,3.25,,
75%,,,1502.0,2015.0,,,3.5,,


## One thing that I see is that I have single missing values in both bean type and origin
Other interesting things like 70% being most frequent, or Venezuela being the top bean origin.  Need to deal with the NaN's in type and origin.

In [6]:
# Looks like this is one of the top makers, and a good rating.  Going to fill it in.
choc["bean_type"] = choc["bean_type"].fillna("Blend")
choc[choc["bean_type"].isnull()]                # there, got it

Unnamed: 0,maker,orig_of_bar,REF,review_date,perc_cocoa,maker_loc,rating,bean_type,orig_of_bean


In [7]:
# where's the NaN in origin?
#choc[choc["orig_of_bean"].isnull()]
#choc.loc[(choc["maker"]=="Mast Brothers")]

Ok, the only NaN in the 'maker' column is from Mast Brothers, and it's Trinitario; their other Trinitarios all originate in Venezuela, so I'm replacing the NaN with Venezuela

In [8]:
choc["orig_of_bean"] = choc["orig_of_bean"].fillna("Venezuela")
choc[choc["orig_of_bean"].isnull()]              # there, that worked

Unnamed: 0,maker,orig_of_bar,REF,review_date,perc_cocoa,maker_loc,rating,bean_type,orig_of_bean


In [9]:
# Let's see what ratings of 4 or better look like
choc.loc[choc.rating >= 4.0].nunique()

maker           45
orig_of_bar     87
REF             75
review_date     11
perc_cocoa      17
maker_loc       17
rating           2
bean_type       15
orig_of_bean    31
dtype: int64

# A question is already forming: 
### What factors are common to the best rated bars, such as percent of cocoa or bean type (or origin, etc)?  How can these be tweaked to aim for a rating of 4 or better?
However, I'd better check into the need to do data cleaning.

In [10]:
# I don't think I'll mess with these, they look ok
choc["maker"].unique()

array(['A. Morin', 'Acalli', 'Adi', 'Aequare (Gianduja)', 'Ah Cacao',
       "Akesson's (Pralus)", 'Alain Ducasse', 'Alexandre',
       'Altus aka Cao Artisan', 'Amano', 'Amatller (Simon Coll)',
       'Amazona', 'Ambrosia', 'Amedei', 'AMMA', 'Anahata', 'Animas',
       'Ara', 'Arete', 'Artisan du Chocolat',
       'Artisan du Chocolat (Casa Luker)', 'Askinosie', 'Bahen & Co.',
       'Bakau', 'Bar Au Chocolat', "Baravelli's", 'Batch', 'Beau Cacao',
       'Beehive', 'Belcolade', 'Bellflower', 'Belyzium', 'Benoit Nihant',
       'Bernachon', 'Beschle (Felchlin)', 'Bisou', 'Bittersweet Origins',
       'Black Mountain', 'Black River (A. Morin)', 'Blanxart',
       'Blue Bandana', 'Bonnat', 'Bouga Cacao (Tulicorp)', 'Bowler Man',
       "Brasstown aka It's Chocolate", 'Brazen', 'Breeze Mill', 'Bright',
       'Britarev', 'Bronx Grrl Chocolate', 'Burnt Fork Bend',
       'Cacao Arabuco', 'Cacao Atlanta', 'Cacao Barry', 'Cacao de Origen',
       'Cacao de Origin', 'Cacao Hunters', 'Cacao M

The cell below has a lot of commented out lines; I had to look at a few different things, and rather than displaying the output for each, I just used commenting out.  

It looks like the vast majority of the empty 'orig_of_bean' fields are blends, so I'm going to fill them all in as 'Various'; I think I'll also fill in empty 'bean_type' fields all as 'Blend' too.

In [11]:
# We have a LOT of empty fields here, 887 of them
#print(choc["bean_type"].value_counts())
#print(choc["orig_of_bean"].value_counts())
#print(choc["orig_of_bean"].sort_values().unique())
choc.loc[choc["orig_of_bean"].str.len() == 1].head(20)  # there are actually 73 of these, remove head() to see

Unnamed: 0,maker,orig_of_bar,REF,review_date,perc_cocoa,maker_loc,rating,bean_type,orig_of_bean
77,Amedei,Nine,111,2007,75%,Italy,4.0,Blend,
85,Amedei,Toscano Black,170,2007,63%,Italy,3.5,Blend,
86,Amedei,Toscano Black,40,2006,70%,Italy,5.0,Blend,
87,Amedei,Toscano Black,75,2006,66%,Italy,4.0,Blend,
144,Bahen & Co.,Houseblend,1474,2015,80%,Australia,3.25,,
148,Bahen & Co.,Houseblend,999,2012,70%,Australia,2.5,Blend,
182,Bernachon,Nature,797,2012,55%,France,2.75,,
214,Blanxart,Organic Dark,322,2009,72%,Spain,2.75,,
245,Bonnat,One Hundred,81,2006,100%,France,1.5,,
277,Burnt Fork Bend,Blend,1303,2014,72%,U.S.A.,3.25,Blend,


In [12]:
choc.loc[choc["orig_of_bean"].str.len() == 1] = "VARIOUS"
choc["orig_of_bean"].sort_values().value_counts().head(7)

Venezuela             215
Ecuador               193
Peru                  165
Madagascar            145
Dominican Republic    141
VARIOUS                73
Nicaragua              60
Name: orig_of_bean, dtype: int64

In [13]:
# there's a variety of duplicates and other corrections here; I'm choosing to combine the various subtypes
# within a larger type into the single larger type ie: Forastero (Nacional) becomes just Forastero
choc.loc[choc["bean_type"].str.len() == 1] = "Blend"
choc.loc[choc["bean_type"] == "Trinitario, Criollo"] = "Criollo, Trinitario"
choc["bean_type"].sort_values().value_counts().head(20)

Blend                     858
Trinitario                418
Criollo                   152
Forastero                  87
VARIOUS                    73
Forastero (Nacional)       52
Criollo, Trinitario        48
Forastero (Arriba)         37
Criollo (Porcelana)        10
Forastero (Parazinho)       8
Forastero (Arriba) ASS      6
EET                         3
Nacional (Arriba)           3
Beniano                     3
Matina                      3
Trinitario, Forastero       2
Forastero (Catongo)         2
Criollo (Ocumare 61)        2
Criollo, Forastero          2
Nacional                    2
Name: bean_type, dtype: int64