# Imports 

In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
import plotly.plotly as py
import cufflinks as cf
cf.go_offline()
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import math
from IPython.display import display, HTML

## Import Dataset

In [2]:
# Read the cookie dataset from its CSV file into a DataFrame.
cookies = pd.read_csv(filepath_or_buffer='data/cookies_target.csv')

### Categorical

Butter type

Mixins

In [3]:
# Show how often each category occurs in the two categorical columns.
for column_name in ('butter type', 'mixins'):
    print(column_name)
    category_counts = cookies[column_name].value_counts()
    display(category_counts)

butter type


melted    3813
cubed     1088
Name: butter type, dtype: int64

mixins


chocolate                         1847
raisins                           1135
chocolate, oats                    729
nuts, chocolate                    476
nuts,raisins                       292
nuts, oats, chocolate              254
nuts, oats                          86
chocolate, peanut butter            45
raisins, oats                       19
peanut butter                       12
oats                                 4
chocolate, oats, peanut butter       2
Name: mixins, dtype: int64

There are a few different values but all of them are a combination of the following elements:
* raisins
* nuts
* chocolate
* oats
* peanut butter

We will deal with this later in the feature engineering part

### Create dummies from 'mixins'

In [4]:
# Fixed column order of the mixin indicator flags produced by expanse().
order = ['chocolate', 'raisins', 'oats', 'nuts', 'peanut butter' ]
def expanse(val):
    """Encode a comma-separated mixin list as a fixed-order string of 0/1 flags.

    Parameters
    ----------
    val : str
        Mixin names separated by commas, e.g. ``'nuts, oats'``. Whitespace
        around each name is ignored, and empty entries (an empty string or a
        trailing comma) are skipped.

    Returns
    -------
    str
        One flag per entry of ``order``, joined by commas, where ``'1'`` marks
        a mixin that is present, e.g. ``'0,0,1,1,0'`` for ``'nuts, oats'``.

    Raises
    ------
    ValueError
        If a non-empty name is not listed in ``order``.
    """
    # Keep every flag a string so the list is homogeneous and can be joined directly.
    flags = ['0'] * len(order)
    for name in val.split(','):
        name = name.strip()
        if not name:
            # Nothing between two commas (or an empty input): not a mixin.
            continue
        flags[order.index(name)] = '1'
    return ','.join(flags)

In [5]:
# Replace every raw mixin string with its flag encoding from expanse().
cookies['mixins'] = cookies['mixins'].apply(expanse)

In [6]:
# Split each encoded flag string into separate columns, one per entry of `order`
# (len(order) - 1 splits yield len(order) pieces).
df1 = cookies["mixins"].str.split(pat=",", n=len(order) - 1, expand=True)
df1.columns = order

df1.head(n=5)


Unnamed: 0,chocolate,raisins,oats,nuts,peanut butter
0,0,1,0,0,0
1,0,1,0,0,0
2,1,0,0,1,0
3,1,0,0,0,0
4,1,0,1,1,0


In [7]:

# The split produced string columns; cast every mixin flag column to int64.
df1 = df1.astype({mixin: 'int64' for mixin in order})

In [8]:
# Attach the mixin indicator columns to the main frame, matching rows on the index.
cookies = cookies.merge(df1, left_index=True, right_index=True)

In [9]:
# Remove the 'mixins' column from the frame in place.
cookies.drop(columns=['mixins'], inplace=True)

In [10]:
# Preview the first rows of the frame.
cookies.head(n=5)

Unnamed: 0.1,Unnamed: 0,sugar to flour ratio,sugar index,bake temp,chill time,calories,density,pH,grams baking soda,bake time,quality,butter type,weight,crunch factor,aesthetic appeal,chocolate,raisins,oats,nuts,peanut butter
0,0,0.25,9.5,300,15.0,136.0,0.99367,8.1,0.44,12.1,1,melted,15.2,1.3,3,0,1,0,0,0
1,1,0.23,3.3,520,34.0,113.0,0.99429,8.16,0.48,8.4,0,melted,12.4,1.71,3,0,1,0,0,0
2,2,0.18,1.9,360,33.0,106.0,0.98746,8.21,0.83,14.0,1,melted,9.4,1.78,3,1,0,0,1,0
3,3,0.18,10.5,490,41.0,124.0,0.9963,8.14,0.35,10.5,0,melted,12.2,1.59,3,1,0,0,0,0
4,4,0.24,2.4,770,6.0,33.0,0.9974,8.09,0.57,9.4,0,cubed,19.8,1.3,3,1,0,1,1,0


In [11]:
# Remove the 'Unnamed: 0' column in place.
# NOTE(review): presumably a row index saved by an earlier export — verify.
cookies.drop(columns=['Unnamed: 0'], inplace=True)

In [12]:
# Remove the 'chocolate' indicator column in place.
# NOTE(review): mixins can co-occur (e.g. 'nuts, chocolate'), so this column is
# not implied by the remaining indicators — confirm that dropping it is intended.
cookies.drop(columns=['chocolate'], inplace=True)

In [13]:
# Preview the first rows of the frame.
cookies.head(n=5)

Unnamed: 0,sugar to flour ratio,sugar index,bake temp,chill time,calories,density,pH,grams baking soda,bake time,quality,butter type,weight,crunch factor,aesthetic appeal,raisins,oats,nuts,peanut butter
0,0.25,9.5,300,15.0,136.0,0.99367,8.1,0.44,12.1,1,melted,15.2,1.3,3,1,0,0,0
1,0.23,3.3,520,34.0,113.0,0.99429,8.16,0.48,8.4,0,melted,12.4,1.71,3,1,0,0,0
2,0.18,1.9,360,33.0,106.0,0.98746,8.21,0.83,14.0,1,melted,9.4,1.78,3,0,0,1,0
3,0.18,10.5,490,41.0,124.0,0.9963,8.14,0.35,10.5,0,melted,12.2,1.59,3,0,0,0,0
4,0.24,2.4,770,6.0,33.0,0.9974,8.09,0.57,9.4,0,cubed,19.8,1.3,3,0,1,1,0


In [14]:
# Build one indicator column per butter type.
# The dtype is pinned to int64 because pandas 2.0 changed the get_dummies
# default from uint8 to bool, which would yield True/False instead of 0/1.
butter_type = pd.get_dummies(cookies['butter type'], dtype='int64')

In [15]:
# Remove the 'butter type' column from the frame in place.
cookies.drop(columns=['butter type'], inplace=True)

In [16]:
# Preview the first rows of the butter-type indicator columns.
butter_type.head(n=5)

Unnamed: 0,cubed,melted
0,0,1
1,0,1
2,0,1
3,0,1
4,1,0


In [17]:
# Attach the butter-type indicator columns to the main frame, matching rows on the index.
cookies = cookies.merge(butter_type, left_index=True, right_index=True)

In [18]:
# Preview the first rows of the frame.
cookies.head(n=5)

Unnamed: 0,sugar to flour ratio,sugar index,bake temp,chill time,calories,density,pH,grams baking soda,bake time,quality,weight,crunch factor,aesthetic appeal,raisins,oats,nuts,peanut butter,cubed,melted
0,0.25,9.5,300,15.0,136.0,0.99367,8.1,0.44,12.1,1,15.2,1.3,3,1,0,0,0,0,1
1,0.23,3.3,520,34.0,113.0,0.99429,8.16,0.48,8.4,0,12.4,1.71,3,1,0,0,0,0,1
2,0.18,1.9,360,33.0,106.0,0.98746,8.21,0.83,14.0,1,9.4,1.78,3,0,0,1,0,0,1
3,0.18,10.5,490,41.0,124.0,0.9963,8.14,0.35,10.5,0,12.2,1.59,3,0,0,0,0,0,1
4,0.24,2.4,770,6.0,33.0,0.9974,8.09,0.57,9.4,0,19.8,1.3,3,0,1,1,0,1,0


## Export CSV

In [19]:

# Write the frame to CSV without the row index.
cookies.to_csv(path_or_buf='data/cookies_dummies.csv', index=False)

In [20]:
#cookiest = pd.read_csv('data/cookies_dummies.csv')

In [21]:
#cookiest.head()