# Interactive Visualizations of Predictors

In [1]:
import pandas as pd
from bokeh.io import output_notebook, show
from bokeh.plotting import figure, output_file, show
import pandas as pd
import numpy as np
from bokeh.models import ColumnDataSource,HoverTool
from bokeh.models import (ColumnDataSource,HoverTool,LinearColorMapper,BasicTicker,PrintfTickFormatter,ColorBar)
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual, IntSlider
from bokeh.io import output_notebook, show, curdoc, push_notebook

# Send Bokeh output to the notebook instead of a standalone HTML file.
output_notebook()

# Read the raw survey CSV, discard its 'ID' column, and preview the result.
data = pd.read_csv('nutrition_raw_anonymized_data.csv').drop('ID', axis=1)
data.head(10)

ImportError: Missing required dependencies ['numpy']

#### Loaded data into dataframe

In [None]:
# Recode the Yes/No answers in the first 26 columns as 1/0.
# 'belly' sits in that range but holds different values, so it is excluded here.
cols = data.columns.tolist()[0:26]
cols.remove('belly')
# Assign the result back instead of calling replace(..., inplace=True) on
# data[col]: an in-place call on a column selection is a chained operation
# that warns in pandas 2.x and does not touch `data` under Copy-on-Write.
data[cols] = data[cols].replace(['Yes', 'No'], [1, 0])

In [None]:
# Preview the first 10 rows of the frame.
data.head(10)

In [None]:
# Collapse each FREQ/QUAN column pair into a single frequency x quantity column.
# The pairing relies purely on column order: `temp` always holds the most
# recently seen FREQ column, and the next QUAN column is multiplied by it.
temp = pd.Series()

food_columns = data.columns.tolist()[26:282]
for name in food_columns:
    # str.find(...) > 0 matches the marker anywhere except at position 0.
    is_freq = name.find('FREQ') > 0
    is_quan = name.find('QUAN') > 0
    if is_freq:
        temp = data[name]
    if is_quan:
        # Overwrite the QUAN column with the product, then drop the FREQ
        # column that was just consumed.
        data[name] = temp.astype(int) * data[name].astype(int)
        data = data.drop(temp.name, axis=1)
        


In [None]:
# Preview the first 10 rows of the frame.
data.head(10)

In [None]:
# Recode the 'belly' column: 'Innie' -> 1, 'Outie' -> 0.
# The result is assigned back rather than using replace(..., inplace=True) on
# the column selection, which is a chained operation that warns in pandas 2.x
# and leaves `data` unchanged under Copy-on-Write.
data['belly'] = data['belly'].replace(['Innie', 'Outie'], [1, 0])

In [None]:
# Preview the first 10 rows of the frame.
data.head(10)

#### Converted categorical values to binary values

In [None]:
# Pairwise correlation matrix of the columns of `data`.
correlations = data.corr()

# Copy the feature names into an ordinary 'Feature_1' column (taken before the
# new column is added, so it lists only the correlated features).
feature_names = correlations.columns.tolist()
correlations['Feature_1'] = pd.Series(feature_names).values

correlations.head(10)

#### Determined correlations between variables

In [None]:
# Reshape the square correlation matrix into long format: one row per
# (Feature_1, Feature_2) pair with its 'Correlation' value.
correlations = correlations.set_index('Feature_1')
correlations.columns.name = 'Feature_2'
stacked = correlations.stack()
pivoted_df = pd.DataFrame(stacked, columns=['Correlation']).reset_index()

In [None]:
# Preview the first 10 rows of the long-format correlation table.
pivoted_df.head(10)

In [None]:
# Features kept as Feature_1 (used as the plot's x range).
# Alternative selections that were tried:
#targets = data.columns.tolist()[:26]
#targets = ['cancer','diabetes','heart_disease']
targets = [
    'DT_VITD', 'DT_VITK', 'DT_VITC', 'DT_THIA', 'DT_SEL', 'DT_RIBO',
    'DT_VITB6', 'DT_TFOL', 'DT_VB12', 'DT_CALC', 'DT_IRON', 'DT_MAGN',
    'DT_PHOS', 'DT_POTA', 'DT_SODI', 'DT_ZINC', 'DT_COPP', 'T_FLAVONOIDS',
    'BCOMPLEXTYPEVITSAMOUNT', 'ANTIOXIDANTCOMBOAMOUNT',
    'cancer', 'diabetes', 'heart_disease',
    'ever_smoked', 'currently_smoke', 'smoke_often', 'smoke_rarely',
    'never_smoked', 'quit_smoking',
    'left_hand', 'right_hand', 'cat', 'dog',
]

# Keep only the rows whose Feature_1 is a target, grouped in `targets` order.
# A single pd.concat replaces the per-target DataFrame.append loop:
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every iteration.
new_df = pd.concat(
    [pivoted_df[pivoted_df['Feature_1'] == target] for target in targets]
)
pivoted_df = new_df

#### Pivoted dataframe for columndatasource definition

In [None]:
# Wrap the long-format correlation table so Bokeh glyphs can read its columns.
source = ColumnDataSource(pivoted_df)

# Three-colour (grey -> red) mapper whose limits are the smallest and largest
# absolute correlation in the table.
abs_corr = abs(pivoted_df['Correlation'])
colormap = LinearColorMapper(
    palette=['#bdbdbd', '#969696', '#a50f15'],
    low=abs_corr.min(),
    high=abs_corr.max(),
)

# Axis categories: every feature on the y axis, only the targets on the x axis.
f1 = list(correlations.index)
f2 = targets

# Figure: targets across the top, all features (in reverse order) down the side.
p = figure(
    title="Correlations",
    x_range=f2,
    y_range=f1[::-1],
    plot_height=1200,
    plot_width=900,
    x_axis_location="above",
    toolbar_location='below',
    tools="hover,pan,wheel_zoom,box_zoom,reset,save",
)

# Axis styling: labelled, but with no axis lines or tick marks, and small
# tick labels.
p.xaxis.axis_label = 'Features'
p.yaxis.axis_label = 'Features'
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.major_label_text_font_size = "5pt"


In [None]:
#Defining plot point
r = p.rect(x="Feature_1", y="Feature_2", height=1, width=1, source=source, fill_color={'field': 'Correlation', 'transform': colormap},
       line_color=None)

#Defining legend and adding it to figure
legend = ColorBar(color_mapper=colormap, ticker=BasicTicker(desired_num_ticks=5),
                  formatter=PrintfTickFormatter(format="%d%%"), major_label_text_font_size="1pt", 
                  location=(0, 0), 
                  )
p.add_layout(legend, 'right')

In [None]:
#Defining hover tool
p.select_one(HoverTool).tooltips = [('Feature 1 & Feature 2', '@Feature_1 @Feature_2'),
                                   ('Correlation', '@Correlation')]

In [None]:
# Display the finished correlation figure.
show(p)


#### Note: 
#### 1. Please hover over the rectangular glyphs in the color-mapped plot above to learn which factors converge at a point of correlation. The ones in red signify high correlation. However, they aren't always useful in our analysis: some are part of the same ideological construct, some are obvious, and some are correlations between identical factors from each axis.

#### 2. The X axis does not contain all attributes because Bokeh threw an error stating that the IOPub data rate was exceeded when I tried to cram around 150 variables into the X range, let alone all 1000 (approx.) variables. Therefore, only the features that I wanted to focus on were used for the plot above.

### There are many interesting correlations observed between variables here.

#### 1. Diabetic folks tend to consume a lot of plain non-fat yogurt. This proves the prevalence of the common misconception among the general population that fat-free foods are perfectly healthy and that sugar consumption is fine. According to studies, it is white sugar, the most widely used form of sugar, that is very detrimental to health. Sadly, this is an ingredient not just in "sugary" food but even in something as innocuous as ketchup!

#### 2. Beer and coffee seem to be more desired by smokers than the rest. It could be that these beverages go well with cigarettes.

#### 3. Smokers are also observed to be big fans of cream cheese, hotdogs, steak, butter, corn, sausage, bacon and soy. Considering how all of these are either meat or something that goes hand in hand with it, it stands to reason that smokers are highly likely to be non-vegetarians.

#### 4. Another interesting note is that while smokers tend to go for cream cheese, the ones who quit smoking prefer cottage cheese. It's very probable that there's something about cheese that smokers, perhaps unconsciously, like, and that in order to stick to a healthy lifestyle after kicking the habit, they switch to the more nutritious cottage cheese. It's also worth noting that those who quit smoking tend not just to give up this vice but often decide to eat healthily as well. This is supported by the observation that they tend to eat legumes, refried beans, whole-grain crackers and vegetable soup more than those who never smoked.

#### 5. Cat owners are found to have more affinity towards coffee than dog owners. It is very possible that cat owners are more susceptible to being awakened at odd hours of the night, leading to excessive coffee consumption. I argue that it could be because they're sleep-deprived and have to rely more on coffee to be productive.

#### 6. Points worth mentioning about dog owners include their relative inclination towards fish other than salmon and tuna, and brown rice. One can hypothesize that since dogs are high-maintenance pets, owners sometimes tend to buy them cheaper fishes and brown rice as opposed to red meat to cut costs.

#### 7. Interestingly, dog owners are observed to have higher work times than cat owners. Although this contradicts the fact that dog owners are more likely to return home sooner than cat owners to play with their significantly more sociable pets, it could be that dog owners are happier, more motivated and, like their furry friends, tend to work hard and satisfy the alpha of their pack.

#### While it'd be a more fascinating study to observe characteristics of patients of cancer, diabetes and heart disease and make strong inferences, given the small dataset of 54 samples, it's hard to find good correlations between them and even harder for machine learning models to learn them well enough to make decently accurate predictions. Therefore, as an alternative, since this dataset has rather strong, reliable correlations between nutrients and food, I did a little research and noted down the nutrients, a lack of which, causes cancer. Using that information, I tracked down food products in this dataset that are rich in these said nutrients and built predictors that are trained to predict their quantities in a person's system based on how many units of these food products they consume per week.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

In [None]:
# NOTE(review): `bokeh.charts` is presumably unavailable in recent Bokeh
# releases (it was split out into the separate `bkcharts` package) - confirm
# against the installed Bokeh version before running this cell.
from bokeh import charts, plotting, models
# Send Bokeh output to the notebook.
plotting.output_notebook()

In [None]:
import functools

# One entry per predicted nutrient:
#   (bar label, target column, feature columns, `update` argument fed to each feature)
# The feature/argument pairing is positional.
_NUTRIENT_MODELS = (
    ('Vitamin D', 'DT_VITD',
     ('EGGSQUAN', 'SLICEDCHEESEQUAN', 'SALMONQUAN', 'D_MILK', 'COCOAQUAN'),
     ('eggs', 'cheese', 'salmon', 'milk', 'cocoa')),
    ('Vitamin B12', 'DT_VB12',
     ('EGGSQUAN', 'MILKQUAN', 'TUNAQUAN', 'SHELLFISHQUAN', 'BEEFPORKDISHQUAN',
      'STEAKQUAN', 'BACONSAUSAGEQUAN'),
     ('eggs', 'milk', 'tuna', 'shellfish', 'beef', 'steak', 'bacon')),
    ('Vitamin B Complex', 'BCOMPLEXTYPEVITSAMOUNT',
     ('AVOCADOQUAN', 'WALNUTSQUAN', 'TOMATOJUICEQUAN', 'PF_NUTSDS', 'M_SOY_ND',
      'GROUP_CREAM_CHEESE_SOUR_CREAM_DIP_TOTAL_GRAMS',
      'GROUP_TOFU_OR_TEMPEH_TOTAL_GRAMS',
      'GROUP_BUTTER_AT_TABLE_TOTAL_GRAMS'),
     ('avocado', 'walnut', 'tomato', 'nuts', 'soy', 'cheese', 'tofu', 'butter')),
    ('Selenium', 'DT_SEL',
     ('POTATOESQUAN', 'HOTDOGQUAN', 'BEEFPORKDISHQUAN',
      'FRIEDORBREADEDCHICKENQUAN', 'D_CHEESE', 'PSEGGS'),
     ('potatoes', 'hotdog', 'beef', 'chicken', 'cheese', 'eggs')),
    ('Magnesium', 'DT_MAGN',
     ('WHOLEGRAINCEREALQUAN', 'BROCCOLIQUAN', 'COOKEDGREENSQUAN',
      'SWEETPOTATOESQUAN', 'BERRIESQUAN', 'BEANSQUAN', 'VEGETABLESOUPQUAN',
      'A_NUT_S', 'V_LEGUMES'),
     ('cereal', 'broccoli', 'greens', 'potatoes', 'berries', 'beans',
      'vegsoup', 'nuts', 'legumes')),
)


def _load_nutrition_data(path='nutrition_raw_anonymized_data.csv'):
    """Read the survey CSV and apply the notebook's recoding steps.

    Drops 'ID', recodes Yes/No (first 26 columns except 'belly') and
    Innie/Outie ('belly') as 1/0, and replaces every FREQ/QUAN column pair in
    columns 26..281 with a single frequency x quantity column.
    """
    import pandas as pd

    data = pd.read_csv(path)
    data = data.drop('ID', axis=1)

    cols = data.columns.tolist()[0:26]
    cols.remove('belly')
    # Assign back instead of replace(..., inplace=True) on data[col]: the
    # chained in-place form does not modify `data` under pandas Copy-on-Write.
    data[cols] = data[cols].replace(['Yes', 'No'], [1, 0])

    # `temp` holds the most recently seen FREQ column; the pairing with the
    # following QUAN column relies on column order.
    temp = pd.Series(dtype='float64')
    for col in data.columns.tolist()[26:282]:
        if col.find('FREQ') > 0:
            temp = data[col]
        if col.find('QUAN') > 0:
            data[col] = temp.astype(int) * data[col].astype(int)
            data = data.drop(temp.name, axis=1)

    data['belly'] = data['belly'].replace(['Innie', 'Outie'], [1, 0])
    return data


@functools.lru_cache(maxsize=1)
def _fitted_nutrient_models():
    """Return (data, models): the recoded frame and one fitted regression per
    entry of _NUTRIENT_MODELS, in the same order.

    Cached so the CSV is read and the models are fitted once, not on every
    slider movement.
    """
    from sklearn.linear_model import LinearRegression

    data = _load_nutrition_data()
    models = tuple(
        LinearRegression().fit(data[list(features)], data[target])
        for _label, target, features, _sliders in _NUTRIENT_MODELS
    )
    return data, models


def update(eggs,cheese,salmon,milk,cocoa,tuna,shellfish,beef,steak,bacon,avocado,walnut,tomato,nuts,soy,tofu,butter,potatoes,hotdog,chicken,cereal,broccoli,greens,berries,beans,vegsoup,legumes):
    """Predict five nutrient levels from the slider values and plot them.

    Each argument is the slider value for one food item. For every nutrient in
    _NUTRIENT_MODELS a linear regression (fitted on the survey data) predicts
    the nutrient from the relevant slider values; the predictions are drawn as
    a bar chart with a line through the dataset means of the same nutrients.

    Returns None; the chart is shown in the notebook as a side effect.
    """
    import pandas as pd

    servings = dict(
        eggs=eggs, cheese=cheese, salmon=salmon, milk=milk, cocoa=cocoa,
        tuna=tuna, shellfish=shellfish, beef=beef, steak=steak, bacon=bacon,
        avocado=avocado, walnut=walnut, tomato=tomato, nuts=nuts, soy=soy,
        tofu=tofu, butter=butter, potatoes=potatoes, hotdog=hotdog,
        chicken=chicken, cereal=cereal, broccoli=broccoli, greens=greens,
        berries=berries, beans=beans, vegsoup=vegsoup, legumes=legumes,
    )

    data, models = _fitted_nutrient_models()

    datalist = []
    for (label, _target, features, sliders), model in zip(_NUTRIENT_MODELS, models):
        # Predict from a one-row frame carrying the training column names, so
        # the input matches what the model was fitted on.
        row = pd.DataFrame([[servings[name] for name in sliders]],
                           columns=list(features))
        # predict() returns a 1-element array; index it rather than calling
        # float() on the array itself.
        datalist.append({'Nutrient': label, 'gm': float(model.predict(row)[0])})

    df = pd.DataFrame(datalist)
    bar = charts.Bar(data=df, values='gm', label='Nutrient',
               color=['red'], width=600, height=600, legend=False)
    # Dataset means, listed Magnesium, Selenium, B Complex, B12, D - i.e. the
    # reverse of the order in `datalist`.
    # NOTE(review): presumably this matches the category order Bar puts on the
    # x axis - confirm against the rendered chart.
    mean_targets = ['DT_MAGN', 'DT_SEL', 'BCOMPLEXTYPEVITSAMOUNT', 'DT_VB12', 'DT_VITD']
    bar.line([0.5, 1.5, 2.5, 3.5, 4.5],
             [data[target].mean() for target in mean_targets],
             line_width=2)
    show(bar,notebook_handle=True)
    push_notebook()

#### Please use the sliders below to specify the amount of each of those food items you consume per week. 
#### The bar chart below will be updated as you use the sliders. 
#### The update may take a few seconds sometimes since linear regression models under the hood take in values from these sliders and return predicted values of each nutrient in the dataset, lack of which leads to cancer.

In [None]:
# One identical integer slider (1..30, step 1, starting at 1) per `update`
# argument, created in the same order as the function's parameters.
food_sliders = {
    food: IntSlider(min=1, max=30, step=1, value=1)
    for food in (
        'eggs', 'cheese', 'salmon', 'milk', 'cocoa', 'tuna', 'shellfish',
        'beef', 'steak', 'bacon', 'avocado', 'walnut', 'tomato', 'nuts',
        'soy', 'tofu', 'butter', 'potatoes', 'hotdog', 'chicken', 'cereal',
        'broccoli', 'greens', 'berries', 'beans', 'vegsoup', 'legumes',
    )
}
interact(update, **food_sliders)

#### The blue line connects points in 2-D space that indicate average levels of each of these nutrients in the human body.
#### Please use 'Wheel Zoom' and 'Pan' tools to zoom in, zoom out and navigate if necessary to see what the average amount of a nutrient is. 
#### Also, please be advised that nutrients like vitamin B Complex and vitamin B12 are typically present in much smaller quantities than minerals like magnesium and selenium. So please zoom in and zoom out accordingly for more clarity.
#### While this bar chart doesn't help one in determining if one is susceptible to cancer, it depicts what the average levels of these cancer-preventing nutrients are and what your estimated levels are in comparison.