In [8]:
import pandas as pd
import os
import sys
# Make the project's source folder (two levels above the working directory)
# importable for the package import that follows.
src_dir = os.path.join(os.getcwd(), os.pardir, os.pardir, '03-src')
sys.path.append(src_dir)

import decisionclass.decision_functions as hmd

from sklearn.preprocessing import MinMaxScaler

# from itertools import combinations
%matplotlib inline

In [2]:
# Load the raw cereal table from the project's raw-data folder.
cereal_df = pd.read_csv(filepath_or_buffer='../../01-data/01-raw/grocery-store/cereal.csv')

In [16]:
# Preview the first five rows.
cereal_df.head(n=5)

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating,net_carbs
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973,-5.0
1,100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.0,33.983679,6.0
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505,-2.0
3,All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.5,93.704912,-6.0
5,Apple Cinnamon Cheerios,G,C,110,2,2,180,1.5,10.5,10,70,25,1,1.0,0.75,29.509541,9.0


In [3]:
# Summary statistics; scan the min row for out-of-range values.
cereal_df.describe()

Unnamed: 0,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
count,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0
mean,106.883117,2.545455,1.012987,159.675325,2.151948,14.597403,6.922078,96.077922,28.246753,2.207792,1.02961,0.821039,42.665705
std,19.484119,1.09479,1.006473,83.832295,2.383364,4.278956,4.444885,71.286813,22.342523,0.832524,0.150477,0.232716,14.047289
min,50.0,1.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,0.0,1.0,0.5,0.25,18.042851
25%,100.0,2.0,0.0,130.0,1.0,12.0,3.0,40.0,25.0,1.0,1.0,0.67,33.174094
50%,110.0,3.0,1.0,180.0,2.0,14.0,7.0,90.0,25.0,2.0,1.0,0.75,40.400208
75%,110.0,3.0,2.0,210.0,3.0,17.0,11.0,120.0,25.0,3.0,1.0,1.0,50.828392
max,160.0,6.0,5.0,320.0,14.0,23.0,15.0,330.0,100.0,3.0,1.5,1.5,93.704912


---
---

## Clean the data

Can we interpret the null values (-1)?

In [4]:
# Rows where carbo, sugars, or potass holds the -1 placeholder.
cereal_df.loc[cereal_df[['carbo', 'sugars', 'potass']].eq(-1).any(axis=1)]

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
4,Almond Delight,R,C,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75,34.384843
20,Cream of Wheat (Quick),N,H,100,3,0,80,1.0,21.0,0,-1,0,2,1.0,1.0,64.533816
57,Quaker Oatmeal,Q,H,100,5,2,0,2.7,-1.0,-1,110,0,1,1.0,0.67,50.828392


Cream of Wheat nutrition information found online [here](https://www.creamofwheat.com/product/original).

In [5]:
# Row 20 (Cream of Wheat): replace the -1 potassium placeholder with 0.
cereal_df.loc[20, 'potass'] = 0

Quaker Oatmeal nutrition information found online [here](https://www.nutritionix.com/i/quaker-oats/old-fashioned-oats/51c54a5d97c3e6efadd60584). 
Values are scaled to 2/3 of 1 cup.

In [6]:
# Row 57 (Quaker Oatmeal): replace the -1 placeholders with the looked-up values.
cereal_df.loc[57, 'carbo'] = 18
cereal_df.loc[57, 'sugars'] = 1

Unable to determine Almond Delight nutrition information (or even what exactly Almond Delight is!). 
We will drop this row.

In [7]:
# Remove row 4 (Almond Delight), whose potassium value could not be recovered.
cereal_df = cereal_df.drop(index=4)

---

### Clean the data:
Are any of the values off? 
1. Is ```calories``` at least $9\cdot$ ```fat``` $+4\cdot($ ```carbo``` $+$ ```protein``` $)$? (Note that the summary statistics show that equality won't always hold.)
2. Is ```carbo``` $>$ ```sugars```?

In [15]:
# Rows whose listed calories fall below 9*fat + 4*carbo + 4*protein.
cereal_df[
    cereal_df['calories']
    < 9 * cereal_df['fat'] + 4 * cereal_df['carbo'] + 4 * cereal_df['protein']
]

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating,net_carbs
54,Puffed Rice,Q,C,50,1,0,0,0.0,13.0,0,15,0,3,0.5,1.0,60.756112,13.0
57,Quaker Oatmeal,Q,H,100,5,2,0,2.7,18.0,1,110,0,1,1.0,0.67,50.828392,15.3
65,Shredded Wheat spoon size,N,C,90,3,0,0,3.0,20.0,0,120,0,1,1.0,0.67,72.801787,17.0


None of these values are dramatic discrepancies. 
We will change the total calorie count to be $9\cdot$ ```fat``` $+4\cdot($ ```carbo``` $+$ ```protein``` $)$.

In [24]:
# Current calorie value for row 54 before it is recomputed.
cereal_df.loc[54, 'calories']

50

In [None]:
# Overwrite the listed calories of the three flagged rows with the
# macronutrient estimate 9*fat + 4*(carbo + protein).
# Every term is read as a scalar from the same row via .at, so each assignment
# writes one value into one cell (a whole column cannot be stored in a cell).
for row in [54, 57, 65]:
    cereal_df.at[row, 'calories'] = (
        9 * cereal_df.at[row, 'fat']
        + 4 * (cereal_df.at[row, 'carbo'] + cereal_df.at[row, 'protein'])
    )

In [13]:
# Rows where the sugars value is larger than the carbo value.
cereal_df[cereal_df['carbo'] < cereal_df['sugars']]

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating,net_carbs
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973,-5.0
6,Apple Jacks,K,C,110,2,0,125,1.0,11.0,14,30,25,2,1.0,1.0,33.174094,10.0
14,Cocoa Puffs,G,C,110,1,1,180,0.0,12.0,13,55,25,2,1.0,1.0,22.736446,12.0
18,Count Chocula,G,C,110,1,1,180,0.0,12.0,13,65,25,2,1.0,1.0,22.396513,12.0
24,Froot Loops,K,C,110,2,1,125,1.0,11.0,13,30,25,2,1.0,1.0,32.207582,10.0
30,Golden Crisp,P,C,100,2,0,45,0.0,11.0,15,40,25,1,1.0,0.88,35.252444,11.0
52,Post Nat. Raisin Bran,P,C,120,3,1,200,6.0,11.0,14,260,25,3,1.33,0.67,37.840594,5.0
66,Smacks,K,C,110,2,1,70,1.0,9.0,15,40,25,2,1.0,0.75,31.230054,8.0


---
---

### Feature Engineering

* Net carbs is something people track, so let's make a column for that.
* Make a calories per cup ratio column
* 

In [9]:
# Net carbs: carbo minus fiber, stored as a new column.
cereal_df['net_carbs'] = cereal_df['carbo'] - cereal_df['fiber']

### Scale Features out of 10

First, remove columns ```mfr```, ```type```, ```vitamins```, ```shelf```, ```weight```, & ```rating```.