In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
#ML
from prophet import Prophet
# Enlarge seaborn's default font sizes for the notebook's figures.
# NOTE(review): the later sns.set_theme(style="white") / sns.set(rc=...) calls do not
# pass font_scale, so they appear to reset it to the default - confirm that is intended.
sns.set_theme(font_scale=1.5)

## Read in data

In [None]:
# Load the abbreviated nutrient table and keep only the fields this notebook analyses.
df = pd.read_csv('./data/ABBREV.csv')

# NOTE(review): "total" fat here is mono-unsaturated + saturated only, as in the
# original analysis; no other fat column is added in - confirm this is intended.
df['Total_Fat_(g)'] = df['FA_Mono_(g)'] + df['FA_Sat_(g)']

# Source column -> analysis column. One mapping drives both the selection and the
# rename, so the two cannot drift apart. The individual fat columns are kept (as
# Mono_Fat_(g) / Saturated_Fat_(g)) because the correlation heatmap and the cheese
# scatter plot further down read them by those names.
COLUMN_NAMES = {
    'NDB_No': 'id',
    'Shrt_Desc': 'Name',
    'Energ_Kcal': 'Calories',
    'Protein_(g)': 'Protein_(g)',
    'Carbohydrt_(g)': 'Carbs_(g)',
    'Fiber_TD_(g)': 'Fiber_(g)',
    'Sugar_Tot_(g)': 'Sugar_(g)',
    'Calcium_(mg)': 'Calcium_(mg)',
    'Sodium_(mg)': 'Sodium_(mg)',
    'FA_Mono_(g)': 'Mono_Fat_(g)',
    'FA_Sat_(g)': 'Saturated_Fat_(g)',
    'Total_Fat_(g)': 'Total_Fat_(g)',
}
df = df[list(COLUMN_NAMES)].rename(columns=COLUMN_NAMES)
df.head()

In [None]:
# Pairwise Pearson correlation of the nutrient fields, shown as a colour-graded table
sns.set_theme(style="white")
correlation = df.loc[
    :,
    [
        "Calories",
        'Carbs_(g)',
        'Protein_(g)',
        "Sugar_(g)",
        "Calcium_(mg)",
        "Mono_Fat_(g)",
        "Saturated_Fat_(g)",
        "Sodium_(mg)",
    ],
].copy()
# min_periods=1: a single overlapping observation is enough to report a coefficient
corr = correlation.corr(method='pearson', min_periods=1)
corr.style.background_gradient(cmap='coolwarm')

#### Notes: There don't appear to be any strong correlations between the fields. The highest is between calories and mono fat, followed by sugar and carbs.

## Cheese

In [None]:
# Every food whose description starts with "CHEESE,", highest-calorie first.
# na=False makes a missing Name count as "no match"; without it the mask would
# contain NaN for such rows and could not be used to index the frame.
cheese = (
    df[df["Name"].str.startswith("CHEESE,", na=False)]
    .sort_values(by='Calories', ascending=False)
)
#cheese.to_csv('./data/cheese.csv')

In [None]:
sns.set_theme(style="white")
# Correlate the numeric columns only: `cheese` still carries the text column "Name",
# and DataFrame.corr raises on non-numeric data in pandas >= 2.0 (older versions
# silently dropped such columns, so the result is unchanged there).
corr = cheese.select_dtypes(include='number').corr(
    method='pearson',  # The method of correlation
    min_periods=1,
)
corr.style.background_gradient(cmap='coolwarm')

In [None]:
# Calories (x) against saturated fat (y) for the cheese subset
sns.set(rc={"figure.figsize": (20, 5)})
scatter = sns.scatterplot(
    data=cheese,
    x='Calories',
    y='Saturated_Fat_(g)',
    legend='auto',
    s=50,
)
scatter.set_title(
    "Correlation between Calories and Saturated Fat in Cheese",
    fontdict=dict(size=20, weight='bold'),
)
scatter.set_xlabel('Calories', fontdict=dict(size=15))
scatter.set_ylabel('Saturated Fat (g)', fontdict=dict(size=15))
# Fixed viewing window, applied directly on the axes returned by scatterplot
scatter.set_xlim(0, 500)
scatter.set_ylim(-2, 25)
plt.show()

### Notes: Strong correlation between saturated fat and calorie content in cheese

## Milk

In [None]:
# Every food whose description starts with "MILK,", highest-calorie first.
# na=False makes a missing Name count as "no match"; without it the mask would
# contain NaN for such rows and could not be used to index the frame.
milk = (
    df[df["Name"].str.startswith("MILK,", na=False)]
    .sort_values(by='Calories', ascending=False)
)
#milk.to_csv('./data/milks.csv')
milk.head()

In [None]:
sns.set_theme(style="white")
# Correlate the numeric columns only: `milk` still carries the text column "Name",
# and DataFrame.corr raises on non-numeric data in pandas >= 2.0 (older versions
# silently dropped such columns, so the result is unchanged there).
corr = milk.select_dtypes(include='number').corr(method='pearson', min_periods=1)
corr.style.background_gradient(cmap='coolwarm')

#### Notes: High correlation among many fields, especially sugar, protein, carbs, and calcium

### Are "lowfat milks" really low in fat?

In [None]:
#milk = milk.sort_values(by='Saturated_Fat_(g)', ascending=False)
# Label fragments to look for in a milk's description
fat = [
    'NONFAT',
    'FAT FREE',
    'LOWFAT',
    'LOW FAT',
]
# Joined with "|" so str.contains (regex by default) matches any one of them
pattern = "|".join(fat)

# Keep the milks whose name contains at least one fragment, then save them to disk
lowfat_milk = milk.loc[milk["Name"].str.contains(pattern)]
lowfat_milk.to_csv('./data/lowfatmilks.csv')
