In [197]:
import numpy as np
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [198]:
# Load the "flavors_of_cacao" CSV into a DataFrame; the path is relative, so it
# is resolved against the notebook's working directory.
chocolate_data = pd.read_csv("data/flavors_of_cacao.csv")
# Bare last expression: show the loaded frame as the cell output.
chocolate_data

Unnamed: 0,Company \n(Maker-if known),Specific Bean Origin\nor Bar Name,REF,Review\nDate,Cocoa\nPercent,Company\nLocation,Rating,Bean\nType,Broad Bean\nOrigin
0,A. Morin,Agua Grande,1876,2016,63%,France,3.75,,Sao Tome
1,A. Morin,Kpime,1676,2015,70%,France,2.75,,Togo
2,A. Morin,Atsane,1676,2015,70%,France,3.00,,Togo
3,A. Morin,Akata,1680,2015,70%,France,3.50,,Togo
4,A. Morin,Quilla,1704,2015,70%,France,3.50,,Peru
...,...,...,...,...,...,...,...,...,...
1790,Zotter,Peru,647,2011,70%,Austria,3.75,,Peru
1791,Zotter,Congo,749,2011,65%,Austria,3.00,Forastero,Congo
1792,Zotter,Kerala State,749,2011,65%,Austria,3.50,Forastero,India
1793,Zotter,Kerala State,781,2011,62%,Austria,3.25,,India


In [199]:
# Improve column-name formatting: replace newlines embedded in the CSV headers with spaces.
def name_cleaner(old_name):
    """Return ``old_name`` with every newline character turned into a space.

    Intended as the mapper passed to ``DataFrame.rename`` so that multi-line
    CSV headers become single-line column names. All other characters are
    left untouched.
    """
    # Splitting on the newline and re-joining with a space swaps each newline
    # for exactly one space.
    return " ".join(old_name.split("\n"))

# Apply name_cleaner to every column label.
# Renaming with a callable is naturally idempotent (labels without a newline
# pass through unchanged), so re-running this cell is safe without any
# try/except. The previous bare `except:` could only hide genuine failures
# (e.g. a NameError on a fresh kernel) behind a misleading
# "Columns already renamed." message.
chocolate_data = chocolate_data.rename(columns=name_cleaner)


In [200]:
# Preview the first rows to confirm the column headers no longer contain newlines.
chocolate_data.head()

Unnamed: 0,Company (Maker-if known),Specific Bean Origin or Bar Name,REF,Review Date,Cocoa Percent,Company Location,Rating,Bean Type,Broad Bean Origin
0,A. Morin,Agua Grande,1876,2016,63%,France,3.75,,Sao Tome
1,A. Morin,Kpime,1676,2015,70%,France,2.75,,Togo
2,A. Morin,Atsane,1676,2015,70%,France,3.0,,Togo
3,A. Morin,Akata,1680,2015,70%,France,3.5,,Togo
4,A. Morin,Quilla,1704,2015,70%,France,3.5,,Peru


In [201]:
# Map the verbose CSV headers to short, identifier-style column names.
# NOTE: the first key contains a non-breaking space (\xa0) followed by a
# regular space; it has to match the header exactly as it looks after the
# newline-to-space cleanup, otherwise the column is not renamed.
renaming_table = {
    "Company\xa0 (Maker-if known)": "Company",
    "Specific Bean Origin or Bar Name": "Bean_origin_bar_name",
    "REF": "Ref",
    "Review Date": "Review_date",
    "Cocoa Percent": "Cocoa_percent",
    "Company Location": "Company_location",
    "Rating": "Rating",
    "Bean Type": "Bean_type",
    "Broad Bean Origin": "Broad_bean_origin"
}

# rename() ignores mapping keys that are not present (errors="ignore" is the
# default), so it never raised here and the old bare `except:` guarded
# nothing. That also means a mistyped key would fail silently, so verify the
# outcome instead. The check stays valid when the cell is re-run after the
# columns have already been renamed.
chocolate_data = chocolate_data.rename(columns=renaming_table)

assert set(renaming_table.values()) <= set(chocolate_data.columns), (
    "Some columns were not renamed; check the header keys in renaming_table."
)

In [202]:
# Preview the first rows to confirm the columns now carry the short names from renaming_table.
chocolate_data.head()

Unnamed: 0,Company,Bean_origin_bar_name,Ref,Review_date,Cocoa_percent,Company_location,Rating,Bean_type,Broad_bean_origin
0,A. Morin,Agua Grande,1876,2016,63%,France,3.75,,Sao Tome
1,A. Morin,Kpime,1676,2015,70%,France,2.75,,Togo
2,A. Morin,Atsane,1676,2015,70%,France,3.0,,Togo
3,A. Morin,Akata,1680,2015,70%,France,3.5,,Togo
4,A. Morin,Quilla,1704,2015,70%,France,3.5,,Peru


In [203]:
# Summarise dtypes and non-null counts per column (Cocoa_percent is still an
# object/string column at this point).
chocolate_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1795 entries, 0 to 1794
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Company               1795 non-null   object 
 1   Bean_origin_bar_name  1795 non-null   object 
 2   Ref                   1795 non-null   int64  
 3   Review_date           1795 non-null   int64  
 4   Cocoa_percent         1795 non-null   object 
 5   Company_location      1795 non-null   object 
 6   Rating                1795 non-null   float64
 7   Bean_type             1794 non-null   object 
 8   Broad_bean_origin     1794 non-null   object 
dtypes: float64(1), int64(2), object(6)
memory usage: 126.3+ KB


In [204]:
# Count NaN values per column to see which columns have missing data.
chocolate_data.isna().sum()

Company                 0
Bean_origin_bar_name    0
Ref                     0
Review_date             0
Cocoa_percent           0
Company_location        0
Rating                  0
Bean_type               1
Broad_bean_origin       1
dtype: int64

In [205]:
# Drop every row that contains at least one NaN value.
# dropna() does not raise when there is nothing to drop, so the former bare
# `except:` (which printed an empty line) protected nothing and could only
# hide real errors. Reassigning instead of mutating in place keeps the cell
# safe to re-run.
# NOTE(review): blank Bean_type entries appear to be stored as '\xa0' strings
# rather than NaN (see the unique-values output further down), so they are NOT
# removed by this step -- confirm whether they should be treated as missing.
chocolate_data = chocolate_data.dropna()

In [206]:
# Preview the first rows after dropping rows with NaN values.
chocolate_data.head()

Unnamed: 0,Company,Bean_origin_bar_name,Ref,Review_date,Cocoa_percent,Company_location,Rating,Bean_type,Broad_bean_origin
0,A. Morin,Agua Grande,1876,2016,63%,France,3.75,,Sao Tome
1,A. Morin,Kpime,1676,2015,70%,France,2.75,,Togo
2,A. Morin,Atsane,1676,2015,70%,France,3.0,,Togo
3,A. Morin,Akata,1680,2015,70%,France,3.5,,Togo
4,A. Morin,Quilla,1704,2015,70%,France,3.5,,Peru


In [207]:
# Re-check dtypes and non-null counts now that rows with NaN values are gone.
chocolate_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1793 entries, 0 to 1794
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Company               1793 non-null   object 
 1   Bean_origin_bar_name  1793 non-null   object 
 2   Ref                   1793 non-null   int64  
 3   Review_date           1793 non-null   int64  
 4   Cocoa_percent         1793 non-null   object 
 5   Company_location      1793 non-null   object 
 6   Rating                1793 non-null   float64
 7   Bean_type             1793 non-null   object 
 8   Broad_bean_origin     1793 non-null   object 
dtypes: float64(1), int64(2), object(6)
memory usage: 140.1+ KB


In [208]:
# Collect the distinct values of the three categorical columns so spelling
# variants and placeholder entries can be spotted by eye.
company_location_unique, bean_type_unique, broad_bean_origin_unique = (
    chocolate_data[column_name].unique()
    for column_name in ("Company_location", "Bean_type", "Broad_bean_origin")
)

# Show all three arrays together as the cell output.
(company_location_unique, bean_type_unique, broad_bean_origin_unique)

(array(['France', 'U.S.A.', 'Fiji', 'Ecuador', 'Mexico', 'Switzerland',
        'Netherlands', 'Spain', 'Peru', 'Canada', 'Italy', 'Brazil',
        'U.K.', 'Australia', 'Wales', 'Belgium', 'Germany', 'Russia',
        'Puerto Rico', 'Venezuela', 'Colombia', 'Japan', 'New Zealand',
        'Costa Rica', 'South Korea', 'Amsterdam', 'Scotland', 'Martinique',
        'Sao Tome', 'Argentina', 'Guatemala', 'South Africa', 'Bolivia',
        'St. Lucia', 'Portugal', 'Singapore', 'Denmark', 'Vietnam',
        'Grenada', 'Israel', 'India', 'Czech Republic',
        'Domincan Republic', 'Finland', 'Madagascar', 'Philippines',
        'Sweden', 'Poland', 'Austria', 'Honduras', 'Nicaragua',
        'Lithuania', 'Niacragua', 'Chile', 'Ghana', 'Iceland', 'Eucador',
        'Hungary', 'Suriname', 'Ireland'], dtype=object),
 array(['\xa0', 'Criollo', 'Trinitario', 'Forastero (Arriba)', 'Forastero',
        'Forastero (Nacional)', 'Criollo, Trinitario',
        'Criollo (Porcelana)', 'Blend', 'Trinita

In [212]:
# Preview the current state of the frame (Cocoa_percent still holds "63%"-style strings).
chocolate_data.head()


Unnamed: 0,Company,Bean_origin_bar_name,Ref,Review_date,Cocoa_percent,Company_location,Rating,Bean_type,Broad_bean_origin
0,A. Morin,Agua Grande,1876,2016,63%,France,3.75,,Sao Tome
1,A. Morin,Kpime,1676,2015,70%,France,2.75,,Togo
2,A. Morin,Atsane,1676,2015,70%,France,3.0,,Togo
3,A. Morin,Akata,1680,2015,70%,France,3.5,,Togo
4,A. Morin,Quilla,1704,2015,70%,France,3.5,,Peru


In [214]:
# Convert "63%"-style strings into floats (63.0).
# Casting to str first keeps the cell re-runnable: once the column is already
# float, calling `.str` on it directly raises AttributeError. On a re-run the
# values round-trip unchanged ("63.0" -> 63.0).
# regex=False makes the replacement a literal match on every pandas version.
chocolate_data['Cocoa_percent'] = (
    chocolate_data['Cocoa_percent']
    .astype(str)
    .str.replace('%', '', regex=False)
    .astype(float)
)

In [215]:
# Preview the first rows to confirm Cocoa_percent is now numeric (e.g. 63.0 instead of "63%").
chocolate_data.head()

Unnamed: 0,Company,Bean_origin_bar_name,Ref,Review_date,Cocoa_percent,Company_location,Rating,Bean_type,Broad_bean_origin
0,A. Morin,Agua Grande,1876,2016,63.0,France,3.75,,Sao Tome
1,A. Morin,Kpime,1676,2015,70.0,France,2.75,,Togo
2,A. Morin,Atsane,1676,2015,70.0,France,3.0,,Togo
3,A. Morin,Akata,1680,2015,70.0,France,3.5,,Togo
4,A. Morin,Quilla,1704,2015,70.0,France,3.5,,Peru
