In [1]:
import pandas as pd
import numpy as np
import os
import warnings
import plotly.express as px

In [2]:
# Silence every warning category for the rest of the session so that library
# notices do not clutter the cell outputs. Note that this also hides warnings
# that may point at real problems.
warnings.simplefilter("ignore")

## Importing our datasets
The datasets provided are split into area-level datasets and validation datasets. The area-level datasets are derived from 420M food items purchased by 1.6M Clubcard (Tesco loyalty card) holders in the Greater London area.

1. Area Level Datasets: The area-level datasets are split into Borough, Ward, Middle Layer Super Output Area (MSOA) and Lower Layer Super Output Area (LSOA), listed in order of area size from largest to smallest. Splitting the data into different area levels is more convenient for analysis, as the Greater London area contains 411 different Tesco stores. These datasets include month-by-month purchase data for the year 2015 and yearly purchase data, which is the average over all the months. The ones chosen for this analysis are the yearly datasets for the borough and ward area levels.

2. Validation Datasets: The validation datasets include datasets for child obesity in London boroughs from 2015/2016, child obesity in London wards from 2013/2014, diabetes estimates in London wards in 2016, obesity in boroughs in 2012, obesity hospitalization in boroughs in 2016 and a postcode to area-level ID dataset. The ones chosen for this analysis are diabetes estimates in London wards in 2016 and obesity in boroughs in 2012.



In [3]:
# Area Level Datasets: yearly grocery aggregates at borough and ward level.
# Forward slashes keep the relative paths portable across Windows, macOS and
# Linux, and avoid the invalid "\y" escape sequence that the backslash form
# embeds in the string literal.
borough_year = pd.read_csv('AreaLevels/year_borough_grocery.csv')
osward_year = pd.read_csv('AreaLevels/year_osward_grocery.csv')

In [4]:
# Validation Datasets: ward-level diabetes estimates (2016) and borough-level
# obesity figures (2012).
# Forward slashes keep the relative paths portable across Windows, macOS and
# Linux, and avoid the invalid "\d" / "\l" escape sequences that the backslash
# form embeds in the string literals.
diabetes_est_osward_2016 = pd.read_csv('Validation/diabetes_estimates_osward_2016.csv')
obesity_borough_2012 = pd.read_csv('Validation/london_obesity_borough_2012.csv')

## Obesity and overweightness in adults

The boroughs are listed only by ID, and the datasets do not provide the names of the areas; this additional dataset maps each area ID to its corresponding area name.

ref: https://data.london.gov.uk/dataset/obesity-adults

In [5]:
# Lookup table that maps each area id to its area name
area_lookup_path = 'Extra_data_sets/area_id_area_name.csv'
area_id_to_borough_name = pd.read_csv(area_lookup_path)

In the area level dataset, each nutrient is given a weight that is measured in grams. These nutrients include: weight, volume, fat, sugar, saturate, protein, carbohydrates, salt and fibre. This weight is measured as:

$$ \mathrm{Weight}(a) = \frac{\sum_{p \in P_a} \mathrm{grams}(p)}{|P_a|} $$

where a is the nutrient and $P_a$ is the population

In [6]:
# Restrict the yearly (2015) borough data to the area identifier and the
# average weight columns.
cols_nuts_obes = [
    'area_id',
    'weight', 'volume',
    'fat', 'sugar', 'saturate', 'protein', 'carb', 'salt', 'fibre',
]
nutrients_borough = borough_year.loc[:, cols_nuts_obes]

In [7]:
# The obesity dataset names its identifier column 'oslaua'; rename it so both
# frames share the 'area_id' key. (Renaming is a no-op if it was already done.)
obesity_borough_2012 = obesity_borough_2012.rename(columns={'oslaua': 'area_id'})
# Merge the yearly nutrient weights with the obesity data.
# The left-hand side is rebuilt from borough_year on every run, so re-running
# this cell cannot merge the obesity columns in a second time (which would
# produce '_x'/'_y' suffixed duplicates and break the later 'f_obese' lookups).
# The outer join keeps areas that appear in only one of the two datasets, with
# NaN in the columns that are missing for them.
nutrients_borough = borough_year[cols_nuts_obes].merge(obesity_borough_2012, on='area_id', how='outer')

We can calculate the correlation coefficient between:
1. Weight of nutrients and Obesity
2. Weight of nutrients and being Overweight

This coefficient is a statistical measure of the linear relationship between two variables. It can take values ranging between -1 and +1. A correlation coefficient of +1 shows a perfect positive correlation (direct relationship) and -1 a perfect negative correlation (inverse relationship). Calculating this value for the given measures tells us how strongly nutritional intake is associated with obesity and being overweight; note that correlation on its own does not establish that one causes the other.

In [8]:
# Nutrient columns whose average weight is correlated against the health indicators
nutrient_columns = ['fat', 'sugar', 'saturate', 'protein', 'carb', 'salt', 'fibre']

# Correlation of each nutrient weight with the 'f_obese' column of the boroughs
obesity_correlations = [
    nutrients_borough[nutrient].corr(nutrients_borough['f_obese'])
    for nutrient in nutrient_columns
]
# Correlation of each nutrient weight with the 'f_overweight' column of the boroughs
overweight_correlations = [
    nutrients_borough[nutrient].corr(nutrients_borough['f_overweight'])
    for nutrient in nutrient_columns
]

# One row per nutrient, one column per condition, for ease of plotting
correlation_nutrients_obesity_overweight = pd.DataFrame(
    {
        'Nutrient': nutrient_columns,
        'Obesity': obesity_correlations,
        'Overweight': overweight_correlations,
    }
)

In [9]:
# Bar chart of how each nutrient correlates with obesity in the boroughs
fig = px.bar(
    correlation_nutrients_obesity_overweight,
    x='Nutrient',
    y='Obesity',
    text='Obesity',
    title='Correlation Between Nutrients and Obesity',
)
# Print the coefficient, rounded to two decimals, outside each bar
fig.update_traces(texttemplate='%{text:.2f}', textposition='outside')
# Axis titles, bars sorted in descending order, rotated tick labels and a
# fixed [-1, 1] y-range so the charts are comparable with each other
fig.update_layout(
    xaxis=dict(title='Nutrient', categoryorder='total descending', tickangle=-45),
    yaxis=dict(title='Correlation Coefficient', range=[-1, 1]),
    uniformtext=dict(minsize=8, mode='hide'),
)

fig.show()

In [10]:
# Bar chart of how each nutrient correlates with being overweight in the boroughs
fig = px.bar(
    correlation_nutrients_obesity_overweight,
    x='Nutrient',
    y='Overweight',
    text='Overweight',
    title='Correlation Between Nutrients and Being Overweight',
)
# Print the coefficient, rounded to two decimals, outside each bar
fig.update_traces(texttemplate='%{text:.2f}', textposition='outside')
# Axis titles, bars sorted in descending order, rotated tick labels and a
# fixed [-1, 1] y-range so the charts are comparable with each other
fig.update_layout(
    xaxis=dict(title='Nutrient', categoryorder='total descending', tickangle=-45),
    yaxis=dict(title='Correlation Coefficient', range=[-1, 1]),
    uniformtext=dict(minsize=8, mode='hide'),
)
fig.show()


In [11]:
# Reshape to long format (one row per nutrient/condition pair) so that both
# conditions can be drawn side by side on the same chart
long_df = pd.melt(
    correlation_nutrients_obesity_overweight,
    id_vars='Nutrient',
    var_name='Condition',
    value_name='Correlation',
)

axis_labels = {'Nutrient': 'Nutrient', 'Correlation': 'Correlation Coefficient'}
fig = px.bar(
    long_df,
    x='Nutrient',
    y='Correlation',
    color='Condition',
    barmode='group',
    text='Correlation',
    title='Correlation between Nutrients and Conditions (Obese & Overweight)',
    labels=axis_labels,
)

# Print the coefficient, rounded to two decimals, outside each bar
fig.update_traces(texttemplate='%{text:.2f}', textposition='outside')

# Rotated tick labels and a fixed [-1, 1] y-range
fig.update_layout(
    xaxis=dict(tickangle=-45),
    yaxis=dict(range=[-1, 1]),
    uniformtext=dict(minsize=8, mode='hide'),
)

fig.show()


The temporal discrepancy between the obesity and product (borough area level) datasets, collected in 2012 and 2015 respectively, was not explicitly addressed in the correlation analysis. This decision was based on the underlying assumption that the variables under investigation exhibit stable relationships over time, rendering the impact of the intervening years negligible for the purpose of this initial exploration. 

## Type 2 Diabetes and nutrients
Calculating the correlation coefficient between nutrients and the estimated prevalence of Type-2 Diabetes in the population. This was done using the 2015 ward-level Tesco data, as the diabetes data was collected for London wards in 2016. As before, the temporal discrepancy between the datasets was not explicitly addressed in the correlation analysis. This decision was based on the underlying assumption that the variables under investigation exhibit stable relationships over time, rendering the impact of the intervening years negligible for the purpose of this initial exploration. 

In [12]:
# Ward-level data reduced to the area id and the nutrient columns
ward_nutrients = osward_year[['area_id', *nutrient_columns]]
# Diabetes prevalence estimate for each ward
ward_diabetes = diabetes_est_osward_2016[['area_id', 'estimated_diabetes_prevalence']]
# Join the two on the area id, then discard every row with a missing value
nuts_diabetes_osward = ward_nutrients.merge(ward_diabetes, on='area_id', how='outer').dropna()

In [13]:
# Correlation between each nutrient and the estimated diabetes prevalence of the wards
diabetes_correlations = [
    nuts_diabetes_osward[nutrient].corr(nuts_diabetes_osward['estimated_diabetes_prevalence'])
    for nutrient in nutrient_columns
]

# One row per nutrient, for ease of plotting
correlation_diabetes_nutrients = pd.DataFrame(
    {
        'Nutrient': nutrient_columns,
        'estimated_diabetes_prevalence': diabetes_correlations,
    }
)


In [14]:
# Bar chart of how each nutrient correlates with the diabetes estimates of the wards
fig = px.bar(
    correlation_diabetes_nutrients,
    x='Nutrient',
    y='estimated_diabetes_prevalence',
    text='estimated_diabetes_prevalence',
    title='Correlation Between Nutrients and Type-2 Diabetes',
)
# Print the coefficient, rounded to two decimals, outside each bar
fig.update_traces(texttemplate='%{y:.2f}', textposition='outside')
# Axis titles, bars sorted in descending order, rotated tick labels and a
# fixed [-1, 1] y-range
fig.update_layout(
    xaxis=dict(title='Nutrient', categoryorder='total descending', tickangle=-45),
    yaxis=dict(title='Correlation Coefficient', range=[-1, 1]),
    uniformtext=dict(minsize=8, mode='hide'),
)

fig.show()

# Average Incomes, Taxes and Benefits by Decile Groups of ALL Households (weekly)
ref: https://www.ons.gov.uk/peoplepopulationandcommunity/personalandhouseholdfinances/incomeandwealth/datasets/averageincomestaxesandbenefitsbydecilegroupsofallhouseholds

In [15]:
# Raw income workbook. The forward slash keeps the relative path portable
# across Windows, macOS and Linux, and avoids the invalid "\i" escape sequence
# that the backslash form embeds in the string literal.
income_data_2002_2022 = pd.read_excel('Extra_data_sets/income_data.xlsx')

`Conf%` means the margin of error. Assuming it is zero, we can drop these columns.

In [16]:
# Columns whose header contains "Unnamed" are removed from the income table
income_columns_to_drop = [
    col for col in income_data_2002_2022.columns if "Unnamed" in str(col)
]

# Cleaning pipeline, in order:
#   1. drop the "Unnamed" columns
#   2. drop the first row
#   3. renumber the rows from zero and drop the 'Code' column
#   4. drop rows with any missing value
#   5. rename 'Area' to 'Area Name' and attach the area ids through an outer
#      join, so names found in only one of the two tables are kept with NaN
income_data_2002_2022 = (
    income_data_2002_2022
    .drop(columns=income_columns_to_drop)
    .drop(index=income_data_2002_2022.index[:1])
    .reset_index(drop=True)
    .drop(columns=['Code'])
    .dropna()
    .rename(columns={'Area': 'Area Name'})
    .merge(area_id_to_borough_name, on='Area Name', how='outer')
)

In [17]:
# Borough columns to compare against income: nutrients plus id, weight and volume
cols_keep = nutrient_columns + ['area_id', 'weight', 'volume']

# 2015 income column together with the area id used as the join key
income_2015 = income_data_2002_2022[[2015, 'area_id']]

# Join on the area id, drop incomplete rows and give the 2015 column a
# descriptive name
wei_vol_income_2015 = (
    borough_year[cols_keep]
    .merge(income_2015, on='area_id', how='outer')
    .dropna()
    .rename(columns={2015: 'pay'})
)
# Exclude area E09000001 (presumably the City of London; confirm against the lookup table)
wei_vol_income_2015 = wei_vol_income_2015.loc[wei_vol_income_2015['area_id'] != 'E09000001']

In [18]:
# Spot check: correlation between the protein weight and 2015 pay
protein_2015 = wei_vol_income_2015['protein']
pay_2015 = wei_vol_income_2015['pay']
protein_2015.corr(pay_2015)


0.4241809496163161

In [28]:
import plotly.express as px

# Correlation between each nutrient column and 2015 pay
correlation_income = [
    wei_vol_income_2015[val].corr(wei_vol_income_2015['pay'])
    for val in nutrient_columns
]

# One row per nutrient, for ease of plotting
correlation_df = pd.DataFrame(
    {
        'Value': nutrient_columns,
        'Correlation': correlation_income,
    }
)

In [29]:
# Bar chart of how each nutrient correlates with income
fig = px.bar(
    correlation_df,
    x='Value',
    y='Correlation',
    text='Correlation',
    title='Correlation Between Income and Nutrients',
)

# Print the coefficient, rounded to two decimals, outside each bar
fig.update_traces(texttemplate='%{y:.2f}', textposition='outside')

# Axis titles, bars sorted in descending order, rotated tick labels and a
# fixed [-1, 1] y-range
fig.update_layout(
    xaxis=dict(title='Nutrient', categoryorder='total descending', tickangle=-45),
    yaxis=dict(title='Correlation Coefficient', range=[-1, 1]),
    uniformtext=dict(minsize=8, mode='hide'),
)

fig.show()

In [22]:
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import iplot

# Assuming income_data_2002_2022 is already loaded and cleaned

# Drop rows with any NaN values FIRST and only then remove 'area_id'.
# The cleaning step attaches 'area_id' through an outer merge, so a row without
# an id is an area that is missing from the borough lookup. Dropping the id
# column before dropna() would keep such rows in an average that the chart
# title describes as "Across All Boroughs".
income_data = income_data_2002_2022.dropna().drop(columns=['area_id'])

# Convert year columns to numeric; values that cannot be parsed become NaN
year_columns = income_data.columns[1:]  # This assumes the first column is 'Area Name'
income_data[year_columns] = income_data[year_columns].apply(pd.to_numeric, errors='coerce')

# Compute the average income for each year across all remaining rows
# (mean() skips NaN values)
average_income_by_year = income_data[year_columns].mean()

# Initialize a figure
fig = go.Figure()

# Add a trace for the average income
fig.add_trace(
    go.Scatter(
        x=year_columns,
        y=average_income_by_year,
        mode='lines+markers',
        name='Average Income'
    )
)

# Update layout. The grid is switched on directly here; the previous version
# first set showgrid=False and then immediately overrode it with showgrid=True.
fig.update_layout(
    title='Average Weekly Income Over Time Across All Boroughs',
    xaxis_title='Year',
    yaxis_title='Average Weekly Income (£)',
    xaxis=dict(showgrid=True, gridwidth=1),
    yaxis=dict(showgrid=True, gridwidth=1),
    hovermode='closest'
)

# Show plot (fig.show() is what every other figure in this notebook uses)
fig.show()


Kensington and Chelsea has the highest average income.

In [23]:
import numpy as np
from sklearn.linear_model import LinearRegression

# Income rows that have an area id and a value for every year, excluding area
# E09000001 (presumably the City of London; confirm against the lookup table).
av_income = income_data_2002_2022.dropna()
av_income = av_income[av_income.area_id != 'E09000001']

# Align income and nutrients on 'area_id' rather than on row position.
# The previous version dropped row 0 of borough_year and assumed the remaining
# rows were in the same order (and covered the same boroughs) as av_income.
# Any difference would silently pair one borough's income with another
# borough's nutrients.
borough_nutrients = borough_year.set_index('area_id')[nutrient_columns]
assert borough_nutrients.index.is_unique, "borough_year must hold one row per area_id"
av_income = av_income[av_income['area_id'].isin(borough_nutrients.index)].reset_index(drop=True)

# Row k of every structure below refers to the same borough
area_ids = av_income['area_id'].tolist()
area_names = av_income['Area Name'].tolist()

# One feature (2015 income) per borough: shape (n_boroughs, 1)
incomes_2015 = av_income[[2015]].to_numpy(dtype=float)
# One target per nutrient, in av_income order: shape (n_boroughs, n_nutrients)
nutrients_2015 = borough_nutrients.loc[area_ids].to_numpy()

model = LinearRegression()

X = np.asarray(incomes_2015)
y = np.asarray(nutrients_2015)

# Multi-output linear regression: predicts every nutrient column from income
model.fit(X, y)

In [24]:
import numpy as np

# Build a nested mapping year -> nutrient -> area id -> value.
# 2015 uses the observed nutrient values; every other year uses the values the
# fitted model predicts from that year's incomes.
years = np.arange(2002, 2023)
nutrients_area_year = {}
for year in years:
    if year == 2015:
        values_by_area = nutrients_2015
    else:
        # The model expects one single-feature row per area
        incomes_yr = np.array([[inc] for inc in av_income[year].values])
        values_by_area = model.predict(incomes_yr)

    # Row j belongs to area_ids[j]; column i belongs to nutrient_columns[i]
    nutrients_area_year[year] = {
        nutrient: {id_: values_by_area[j][i] for j, id_ in enumerate(area_ids)}
        for i, nutrient in enumerate(nutrient_columns)
    }


In [25]:
# For every year, average each nutrient over all areas
mean_nutrients_over_year = [
    pd.DataFrame(nutrients_area_year[year]).mean()
    for year in years
]

# Rows are nutrients, columns are years
mean_nutrients_df = pd.concat(mean_nutrients_over_year, axis=1)
mean_nutrients_df.columns = years

In [26]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import math

# Lay the nutrients out on a two-column grid, one subplot per nutrient
cols = 2
rows = math.ceil(len(mean_nutrients_df.index) / cols)
fig = make_subplots(rows=rows, cols=cols, subplot_titles=mean_nutrients_df.index)

for position, nutrient in enumerate(mean_nutrients_df.index):
    # divmod gives the zero-based grid cell; plotly's grid is one-based
    grid_row, grid_col = divmod(position, cols)
    trend = go.Scatter(
        x=mean_nutrients_df.columns,
        y=mean_nutrients_df.loc[nutrient],
        mode='lines+markers',
        name=nutrient,
    )
    fig.add_trace(trend, row=grid_row + 1, col=grid_col + 1)

# Same axis titles on every subplot
fig.update_xaxes(title_text='Year')
fig.update_yaxes(title_text='Average Value (g)')
# Figure height scales with the number of grid rows
fig.update_layout(height=400 * rows, width=1000, title_text="Nutrient Trends Over Years", showlegend=False)

fig.show()


In [27]:
# Assuming mean_nutrients_df is already built (rows: nutrients, columns: years)

# Draw one stand-alone figure per nutrient
for nutrient in mean_nutrients_df.index:
    # Line for this nutrient, named with its first letter capitalised
    trend = go.Scatter(
        x=mean_nutrients_df.columns,
        y=mean_nutrients_df.loc[nutrient],
        mode='lines+markers',
        name=nutrient.capitalize(),
    )
    fig = go.Figure(data=trend)

    # Title and axis labels; the legend is hidden since there is a single trace
    fig.update_layout(
        title=f'{nutrient.capitalize()} Trends Over Years',
        xaxis_title='Year',
        yaxis_title='Average Value (g)',
        showlegend=False,
    )

    fig.show()
