In [505]:
import pandas as pd
import numpy as np
import os
import warnings

In [506]:
warnings.filterwarnings("ignore")

# Functions

In [507]:
def merge_data_according_to_area_level(file_path, combined_data):
    """Append one monthly area-level CSV to the rows collected so far.

    The month label is the part of the file name before the first
    underscore (for example ``Jan_borough_grocery.csv`` gives ``Jan``).
    Files whose name contains ``year`` are skipped without being read.

    Parameters
    ----------
    file_path : str
        Path of the CSV file. Forward and backward slashes are both
        accepted as directory separators.
    combined_data : pandas.DataFrame
        Rows collected so far; an empty frame on the first call.

    Returns
    -------
    pandas.DataFrame
        ``combined_data`` itself for skipped files. Otherwise the rows
        collected so far followed by the rows of this file, with
        lower-cased column names and an added ``month`` column.
    """
    # Look at the file name only, so that folder names can influence neither
    # the month label nor the yearly-file check.
    file_name = file_path.replace('\\', '/').rsplit('/', 1)[-1]

    # Decide before reading: skipped files are never parsed.
    if 'year' in file_name:
        return combined_data

    df = pd.read_csv(file_path, encoding='utf-8')
    df.columns = df.columns.str.lower()
    df['month'] = file_name.split('_')[0]

    if combined_data.empty:
        return df

    # `keys` would label the concatenated frames (one key per frame), not the
    # columns, so the column names must not be passed to it.
    return pd.concat([combined_data, df], axis=0, ignore_index=True, join='outer')

In [508]:
def sort_area_ids(df, col_name):
    """Return the rows of ``df`` ordered by the number embedded in an area ID.

    The sort key is the run of digits that follows the first ``E`` in each
    value of ``col_name``, compared as an integer. The input frame is left
    untouched and the result gets a fresh 0..n-1 index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ID column.
    col_name : str
        Name of the column holding string IDs that match ``E<digits>``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df``, sorted by the numeric
        part of ``col_name``.

    Raises
    ------
    ValueError
        If a value in ``col_name`` has no ``E<digits>`` part, because the
        missing key cannot be converted to an integer.
    """
    # expand=False yields a Series, so no temporary column is needed on df.
    numeric_part = df[col_name].str.extract(r'E(\d+)', expand=False).astype(int)

    # Sort by position rather than by label: this works for any index
    # (including duplicate labels) and keeps tied rows in their input order.
    order = np.argsort(numeric_part.to_numpy(), kind='stable')

    # drop=True discards the old index directly instead of materialising it
    # as an 'index' column, which would clash with a real column of that name.
    return df.iloc[order].reset_index(drop=True)

# 1. Importing our datasets
### (A) Merge Area Levels DataSets:
(i) Create a new Empty DataSet for each of the area levels

(ii) Add all columns from the first dataset adding a new column for the month at which the data was collected

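Unfinished earlier draft of `merge_data_according_to_area_level`, kept for reference only; the working definition is in the Functions section above:

```python
def merge_data_according_to_area_level(dataset, file_path, combined_data):
    df = pd.read_csv(file_path, encoding='utf-8')
    df.columns = df.columns.str.lower()
    if combined_data.empty:
```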

In [509]:
# Every CSV in the folder is routed to the frame of the area level named in
# its file name; merge_data_according_to_area_level appends it to that frame.
folder_path_area_levels = 'AreaLevels'
borough_mon_by_mon = pd.DataFrame()
lsoa_mon_by_mon = pd.DataFrame()
osward_mon_by_mon = pd.DataFrame()
msoa_mon_by_mon = pd.DataFrame()

# sorted() makes the load order independent of the order os.listdir returns.
for file_name in sorted(os.listdir(folder_path_area_levels)):
    # Anything that is not a CSV (hidden files, notes, ...) is not data.
    if not file_name.lower().endswith('.csv'):
        continue

    full_file_path = os.path.join(folder_path_area_levels, file_name)
    try:
        # Match on the file name only, so the folder path cannot select a branch.
        if 'borough' in file_name:
            borough_mon_by_mon = merge_data_according_to_area_level(full_file_path, borough_mon_by_mon)
        elif 'lsoa' in file_name:
            lsoa_mon_by_mon = merge_data_according_to_area_level(full_file_path, lsoa_mon_by_mon)
        elif 'osward' in file_name:
            osward_mon_by_mon = merge_data_according_to_area_level(full_file_path, osward_mon_by_mon)
        elif 'msoa' in file_name:
            msoa_mon_by_mon = merge_data_according_to_area_level(full_file_path, msoa_mon_by_mon)
        else:
            # Previously such files were silently appended to the MSOA frame.
            print(f"Skipping {file_name}: no known area level in the file name")

    except UnicodeDecodeError as e:
        # Report the files causing problems so they can be fixed or deleted if necessary
        print(f"Error reading {file_name}: UnicodeDecodeError - {e}")

In [510]:
# Three-letter month abbreviations in calendar order, numbered from 1 to 12.
_month_abbreviations = [
    'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
]
month_to_num = {
    abbreviation: number
    for number, abbreviation in enumerate(_month_abbreviations, start=1)
}

In [511]:
# Give every monthly frame a numeric 'Month_Num' column, looked up from its
# 'month' label through month_to_num.
for monthly_frame in (borough_mon_by_mon, osward_mon_by_mon, msoa_mon_by_mon, lsoa_mon_by_mon):
    monthly_frame['Month_Num'] = monthly_frame['month'].map(month_to_num)

In [512]:
# Step 1: make 'Month_Num' the index of each monthly frame (in place).
for monthly_frame in (borough_mon_by_mon, osward_mon_by_mon, msoa_mon_by_mon, lsoa_mon_by_mon):
    monthly_frame.set_index('Month_Num', inplace=True)

# Step 2: build month-ordered copies; the frames above keep their row order.
borough_mon_by_mon_sorted = borough_mon_by_mon.sort_index()
osward_mon_by_mon_sorted = osward_mon_by_mon.sort_index()
msoa_mon_by_mon_sorted = msoa_mon_by_mon.sort_index()
lsoa_mon_by_mon_sorted = lsoa_mon_by_mon.sort_index()

In [513]:
# The month label is no longer needed on the sorted frames, whose index
# already carries the month number; remove it from each of them in place.
for sorted_frame in (
    borough_mon_by_mon_sorted,
    osward_mon_by_mon_sorted,
    msoa_mon_by_mon_sorted,
    lsoa_mon_by_mon_sorted,
):
    sorted_frame.drop(columns=['month'], inplace=True)

In [514]:
# Column names of each sorted monthly frame, as plain lists.
col_bor = list(borough_mon_by_mon_sorted.columns)
col_msoa = list(msoa_mon_by_mon_sorted.columns)
col_osw = list(osward_mon_by_mon_sorted.columns)
col_lsoa = list(lsoa_mon_by_mon_sorted.columns)

print(col_bor)

['area_id', 'weight', 'weight_perc2.5', 'weight_perc25', 'weight_perc50', 'weight_perc75', 'weight_perc97.5', 'weight_std', 'weight_ci95', 'volume', 'volume_perc2.5', 'volume_perc25', 'volume_perc50', 'volume_perc75', 'volume_perc97.5', 'volume_std', 'volume_ci95', 'fat', 'fat_perc2.5', 'fat_perc25', 'fat_perc50', 'fat_perc75', 'fat_perc97.5', 'fat_std', 'fat_ci95', 'saturate', 'saturate_perc2.5', 'saturate_perc25', 'saturate_perc50', 'saturate_perc75', 'saturate_perc97.5', 'saturate_std', 'saturate_ci95', 'salt', 'salt_perc2.5', 'salt_perc25', 'salt_perc50', 'salt_perc75', 'salt_perc97.5', 'salt_std', 'salt_ci95', 'sugar', 'sugar_perc2.5', 'sugar_perc25', 'sugar_perc50', 'sugar_perc75', 'sugar_perc97.5', 'sugar_std', 'sugar_ci95', 'protein', 'protein_perc2.5', 'protein_perc25', 'protein_perc50', 'protein_perc75', 'protein_perc97.5', 'protein_std', 'protein_ci95', 'carb', 'carb_perc2.5', 'carb_perc25', 'carb_perc50', 'carb_perc75', 'carb_perc97.5', 'carb_std', 'carb_ci95', 'fibre', 'fi

### (B) Importing area levels yearly datasets

In [515]:
# Yearly aggregate per area level. The paths are built with os.path.join:
# a literal 'AreaLevels\year_...' contains the invalid escape '\y' and only
# resolves on Windows.
borough_year = pd.read_csv(os.path.join('AreaLevels', 'year_borough_grocery.csv'))
osward_year = pd.read_csv(os.path.join('AreaLevels', 'year_osward_grocery.csv'))
msoa_year = pd.read_csv(os.path.join('AreaLevels', 'year_msoa_grocery.csv'))
lsoa_year = pd.read_csv(os.path.join('AreaLevels', 'year_lsoa_grocery.csv'))

### (C) Validation data sets


In [516]:
# Validation datasets. The paths are built with os.path.join: literals such
# as 'Validation\child_...' rely on invalid escape sequences ('\c', '\d',
# '\l', '\o') being passed through and only resolve on Windows.
child_obesity_borough_2015_2016 = pd.read_csv(os.path.join('Validation', 'child_obesity_london_borough_2015-2016.csv'))
child_obesity_osward_2013_2014 = pd.read_csv(os.path.join('Validation', 'child_obesity_london_ward_2013-2014.csv'))
diabetes_est_osward_2016 = pd.read_csv(os.path.join('Validation', 'diabetes_estimates_osward_2016.csv'))
obesity_borough_2012 = pd.read_csv(os.path.join('Validation', 'london_obesity_borough_2012.csv'))
postcodes_area_ids = pd.read_csv(os.path.join('Validation', 'london_pcd2geo_2015.csv'))
obseity_hospitilization_borough_2016 = pd.read_csv(os.path.join('Validation', 'obesity_hospitalization_borough_2016.csv'))

### (D) Other datasets

In [517]:
# food_categories.csv is read from the notebook's working directory.
food_categories = pd.read_csv(filepath_or_buffer='food_categories.csv')

# 2. Getting area_id to postcode values

In [518]:
# Each lookup is postcodes_area_ids without 'oa11' and without three of the
# four area-level code columns below, so exactly one of those codes remains.
_area_level_codes = ['lsoa11', 'msoa11', 'osward', 'oslaua']

postcode_to_borough = postcodes_area_ids.drop(
    columns=['oa11'] + [code for code in _area_level_codes if code != 'oslaua'])
postcode_to_osward = postcodes_area_ids.drop(
    columns=['oa11'] + [code for code in _area_level_codes if code != 'osward'])
postcode_to_msoa = postcodes_area_ids.drop(
    columns=['oa11'] + [code for code in _area_level_codes if code != 'msoa11'])
postcode_to_lsao = postcodes_area_ids.drop(
    columns=['oa11'] + [code for code in _area_level_codes if code != 'lsoa11'])

## Obesity and overweightness in adults

In [537]:
# Lookup table read from Extra_data_sets; it is later joined on 'Area Name'.
area_id_to_borough_name = pd.read_csv(filepath_or_buffer='Extra_data_sets/area_id_area_name.csv')

In [540]:
# Columns taken from the yearly borough data, listed group by group.
_quantity_columns = ['weight', 'volume']
_nutrient_weight_columns = ['fat', 'sugar', 'saturate', 'protein', 'carb', 'salt', 'fibre']
_energy_columns = ['energy_fat', 'energy_sugar', 'energy_saturate', 'energy_protein',
                   'energy_fibre', 'energy_carb', 'energy_tot']

cols_nuts_obes = ['area_id', *_quantity_columns, *_nutrient_weight_columns, *_energy_columns]
nuts_borough = borough_year.loc[:, cols_nuts_obes]

ref: https://data.london.gov.uk/dataset/obesity-adults

In [542]:
# Use the same key name as the grocery frames so the two can be merged on it.
obesity_borough_2012 = obesity_borough_2012.rename({'oslaua': 'area_id'}, axis='columns')

In [543]:
# Rebuild from borough_year on every run. Merging into nuts_borough itself
# would add the obesity columns a second time (with _x/_y suffixes) whenever
# this cell is re-run, breaking the 'f_obese' / 'f_overweight' lookups below.
nuts_borough = borough_year[cols_nuts_obes].merge(obesity_borough_2012, on='area_id', how='outer')

In [546]:
nutrient_columns = ['weight', 'volume','fat','sugar','saturate','protein', 'carb', 'salt', 'fibre', 'energy_fat', 'energy_sugar', 'energy_saturate', 'energy_protein', 'energy_fibre', 'energy_carb', 'energy_tot']

# Correlate every column above with the obesity and overweight columns and
# keep each result; previously each value was overwritten by the next one.
correlation_rows = []
for nutrient in nutrient_columns:
    correlation_obesity = nuts_borough[nutrient].corr(nuts_borough['f_obese'])
    correlation_overweight = nuts_borough[nutrient].corr(nuts_borough['f_overweight'])
    correlation_rows.append({
        'Nutrient': nutrient,
        'f_obese': correlation_obesity,
        'f_overweight': correlation_overweight,
    })

all_nutrient_correlations = pd.DataFrame(correlation_rows)
all_nutrient_correlations

In [547]:
import plotly.express as px

# From here on only these seven nutrient columns are analysed.
nutrient_columns = ['fat','sugar','saturate','protein', 'carb', 'salt', 'fibre']

# Correlation of each nutrient column with the two outcome columns.
obesity_correlations = [
    nuts_borough[name].corr(nuts_borough['f_obese']) for name in nutrient_columns
]
overweight_correlations = [
    nuts_borough[name].corr(nuts_borough['f_overweight']) for name in nutrient_columns
]

# One row per nutrient, one column per outcome.
correlation_df = pd.DataFrame(
    data={
        'Nutrient': nutrient_columns,
        'f_obese': obesity_correlations,
        'f_overweight': overweight_correlations,
    }
)


In [548]:
# Plotting the correlation values for f_obese
# Bar chart of the f_obese correlations, bars ordered from highest to lowest.
fig = px.bar(
    data_frame=correlation_df,
    x='Nutrient',
    y='f_obese',
    title='Correlation Between Nutrients and Obesity',
)
fig.update_layout(
    xaxis_title='Nutrient',
    yaxis_title='Correlation Coefficient',
    xaxis_categoryorder='total descending',
)
fig.show()

In [549]:
# Bar chart of the f_overweight correlations, bars ordered from highest to lowest.
fig = px.bar(
    data_frame=correlation_df,
    x='Nutrient',
    y='f_overweight',
    title='Correlation Between Nutrients and Being Overweight',
)
fig.update_layout(
    xaxis_title='Nutrient',
    yaxis_title='Correlation Coefficient',
    xaxis_categoryorder='total descending',
)
fig.show()


In [550]:
import plotly.express as px
import pandas as pd

# Reshape to one row per (nutrient, condition) pair so that both conditions
# can be drawn side by side.
long_df = pd.melt(
    correlation_df,
    id_vars='Nutrient',
    var_name='Condition',
    value_name='Correlation',
)

axis_labels = {'Nutrient': 'Nutrient', 'Correlation': 'Correlation Coefficient'}
fig = px.bar(
    long_df,
    x='Nutrient',
    y='Correlation',
    color='Condition',
    barmode='group',
    text='Correlation',
    title='Correlation between Nutrients and Conditions (Obese & Overweight)',
    labels=axis_labels,
)

# Print each value with two decimals outside its bar.
fig.update_traces(texttemplate='%{text:.2f}', textposition='outside')

# Fix the y-axis to [-1, 1] and slant the tick labels.
fig.update_layout(
    xaxis_tickangle=-45,
    uniformtext_minsize=8,
    uniformtext_mode='hide',
    yaxis_range=[-1, 1],
)

fig.show()


## Diabetes and nutrients

In [552]:
# Ward-level nutrient columns joined with the diabetes prevalence column on
# 'area_id'; rows with a missing value on either side are dropped afterwards.
_ward_nutrients = osward_year[['area_id'] + nutrient_columns]
_ward_diabetes = diabetes_est_osward_2016[['area_id', 'estimated_diabetes_prevalence']]
nuts_diabetes_osward = pd.merge(_ward_nutrients, _ward_diabetes, on='area_id', how='outer').dropna()

In [553]:
import plotly.express as px

# Correlation of each nutrient column with the diabetes prevalence column.
# (The former `overweight_correlations = []` here was unused and only erased
# the overweight results computed in the obesity section.)
diabetes_correlations = [
    nuts_diabetes_osward[nutrient].corr(nuts_diabetes_osward['estimated_diabetes_prevalence'])
    for nutrient in nutrient_columns
]

# NOTE: this rebinds correlation_df, so the obesity charts above cannot be
# re-run after this cell without re-running their own table cell first.
correlation_df = pd.DataFrame({
    'Nutrient': nutrient_columns,
    'estimated_diabetes_prevalence' : diabetes_correlations
})


In [554]:
# Bar chart of the diabetes correlations, bars ordered from highest to lowest.
fig = px.bar(
    data_frame=correlation_df,
    x='Nutrient',
    y='estimated_diabetes_prevalence',
    text='estimated_diabetes_prevalence',
    title='Correlation Between Nutrients and Type-2 Diabetes',
)

# Print each value with two decimals outside its bar.
fig.update_traces(texttemplate='%{y:.2f}', textposition='outside')

# All layout settings in one call: axis titles, bar order, slanted tick
# labels and a fixed [-1, 1] y-axis.
fig.update_layout(
    xaxis_title='Nutrient',
    yaxis_title='Correlation Coefficient',
    xaxis_categoryorder='total descending',
    xaxis_tickangle=-45,
    uniformtext_minsize=8,
    uniformtext_mode='hide',
    yaxis_range=[-1, 1],
)

fig.show()

## Diabetes and Overweight/Obesity??

In [555]:
# Pair the 'oslaua' and 'osward' codes of every postcode ('pcd').
osw_bor = pd.merge(
    postcode_to_borough[['pcd', 'oslaua']],
    postcode_to_osward[['pcd', 'osward']],
    on='pcd',
    how='outer',
)

**TODO:** this section is unfinished; continue the diabetes vs. overweight/obesity analysis later.

# Average Incomes, Taxes and Benefits by Decile Groups of ALL Households (weekly)
ref: https://www.ons.gov.uk/peoplepopulationandcommunity/personalandhouseholdfinances/incomeandwealth/datasets/averageincomestaxesandbenefitsbydecilegroupsofallhouseholds

In [557]:
# Forward slash instead of 'Extra_data_sets\income_data.xlsx': '\i' is an
# invalid escape sequence and the backslash form only resolves on Windows.
income_data_2002_2022 = pd.read_excel('Extra_data_sets/income_data.xlsx')

`Conf%` is the margin of error. Assuming it is zero, we can drop these columns.

In [561]:
# Columns whose header contains "Unnamed", and the first row, are discarded.
income_columns_to_drop = [
    header for header in income_data_2002_2022.columns if "Unnamed" in str(header)
]
income_rows_to_drop = income_data_2002_2022.index[:1]

# Clean-up as a single pipeline: drop the columns and the row chosen above,
# renumber the rows, remove the helper 'index' column and 'Code', drop
# incomplete rows, rename 'Area' and attach the area lookup by 'Area Name'.
income_data_2002_2022 = (
    income_data_2002_2022
    .drop(columns=income_columns_to_drop)
    .drop(income_rows_to_drop)
    .reset_index()
    .drop(columns=['index', 'Code'])
    .dropna()
    .rename(columns={'Area': 'Area Name'})
    .merge(area_id_to_borough_name, on='Area Name', how='outer')
)

In [569]:
cols_keep = [*nutrient_columns, 'area_id', 'weight', 'volume']

# Yearly borough values joined with the 2015 income column on 'area_id';
# incomplete rows are dropped and the income column is renamed to 'pay'.
_income_2015 = income_data_2002_2022[[2015, 'area_id']]
wei_vol_income_2015 = (
    borough_year[cols_keep]
    .merge(_income_2015, on='area_id', how='outer')
    .dropna()
    .rename(columns={2015: 'pay'})
)

In [573]:
import plotly.express as px

# Same list object as nutrient_columns, under the name used in this section.
nutrient_wei_vol_columns = nutrient_columns

# Correlation of each of those columns with the 'pay' column.
_pay = wei_vol_income_2015['pay']
correlation_income = [
    wei_vol_income_2015[column].corr(_pay) for column in nutrient_wei_vol_columns
]

correlation_df = pd.DataFrame(
    data={
        'Value': nutrient_wei_vol_columns,
        'Correlation': correlation_income,
    }
)


In [574]:
# Bar chart of the income correlations, bars ordered from highest to lowest.
fig = px.bar(
    data_frame=correlation_df,
    x='Value',
    y='Correlation',
    text='Correlation',
    title='Correlation Between Income and Nutrients',
)

# Print each value with two decimals outside its bar.
fig.update_traces(texttemplate='%{y:.2f}', textposition='outside')

# All layout settings in one call: axis titles, bar order, slanted tick
# labels and a fixed [-1, 1] y-axis.
fig.update_layout(
    xaxis_title='Nutrient',
    yaxis_title='Correlation Coefficient',
    xaxis_categoryorder='total descending',
    xaxis_tickangle=-45,
    uniformtext_minsize=8,
    uniformtext_mode='hide',
    yaxis_range=[-1, 1],
)

fig.show()

In [575]:
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import iplot

# Requires income_data_2002_2022 as produced by the cleaning cell above.

# Work on a copy without 'area_id'; rows with any missing value are dropped.
income_data = income_data_2002_2022.drop(columns=['area_id']).dropna()

# Pick the year columns by name (everything except 'Area Name') rather than
# by position, so the result does not depend on 'Area Name' being first.
year_columns = income_data.columns.drop('Area Name')
income_data[year_columns] = income_data[year_columns].apply(pd.to_numeric, errors='coerce')

# One row per borough; if a name occurs more than once its first row is used.
income_by_borough = income_data.drop_duplicates(subset='Area Name').set_index('Area Name')[year_columns]

fig = go.Figure()

# One line per borough, without re-filtering the frame for every borough.
for borough, yearly_income in income_by_borough.iterrows():
    fig.add_trace(
        go.Scatter(
            x=year_columns,
            y=yearly_income,
            mode='lines+markers',
            name=borough
        )
    )

# Grid lines are switched on once here; the earlier version set
# showgrid=False and then immediately overrode it with showgrid=True.
fig.update_layout(
    title='Average Weekly Income Over Time by Borough',
    xaxis_title='Year',
    yaxis_title='Average Weekly Income',
    xaxis=dict(showgrid=True, gridwidth=1),
    yaxis=dict(showgrid=True, gridwidth=1),
    hovermode='closest'
)

# Show plot
iplot(fig)


Kensington and Chelsea has the highest average weekly income.

In [799]:
import numpy as np
from sklearn.linear_model import LinearRegression

# Complete income rows only, without area E09000001 and limited to areas
# that also have a row in the yearly borough data.
av_income = income_data_2002_2022.dropna()
av_income = av_income[av_income.area_id != 'E09000001']
av_income = av_income[av_income.area_id.isin(borough_year['area_id'])]

area_ids = av_income['area_id'].tolist()
area_names = av_income['Area Name'].tolist()

# Fetch the nutrient rows by area_id, in the row order of av_income. The
# earlier `borough_year[...].drop(0)` paired the two tables by position,
# which is only right if both happen to list the same areas in the same order.
nutrients_by_area = borough_year.drop_duplicates(subset='area_id').set_index('area_id')[nutrient_columns]

incomes_2015 = [[inc] for inc in av_income[2015].values]
nutrients_2015 = list(nutrients_by_area.loc[area_ids].values)

# One regression with a single feature (2015 income) and one target per
# nutrient column.
model = LinearRegression()

X = np.array(incomes_2015, dtype=float)
y = np.array(nutrients_2015, dtype=float)

model.fit(X, y)

In [864]:
import numpy as np

# nutrients_area_year[year][nutrient][area_id] -> value.
# 2015 holds the values the model was fitted on; every other year holds the
# model's prediction from that year's income column.
years = np.arange(2002, 2023)
nutrients_area_year = {}
for year in years:
    if year == 2015:
        nutrient_values = nutrients_2015
    else:
        # The closing parenthesis of np.array(...) was missing here, which
        # made the whole cell a SyntaxError.
        incomes_yr = np.array([[inc] for inc in av_income[year].values])
        y_pred = model.predict(incomes_yr)
        nutrient_values = y_pred

    # Row j belongs to area_ids[j], column i to nutrient_columns[i].
    nutrients_area_year[year] = {
        nutrient: {id_: nutrient_values[j][i] for j, id_ in enumerate(area_ids)}
        for i, nutrient in enumerate(nutrient_columns)
    }


In [902]:
# For every year: the mean over all areas of each nutrient column.
mean_nutrients_over_year = [
    pd.DataFrame(nutrients_area_year[year]).mean() for year in years
]

# Rows are nutrients, columns are years.
mean_nutrients_df = pd.concat(mean_nutrients_over_year, axis=1)
mean_nutrients_df.columns = years

In [931]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import math

# Grid with two panels per row, one panel per nutrient.
cols = 2
rows = math.ceil(len(mean_nutrients_df.index) / cols)
fig = make_subplots(rows=rows, cols=cols, subplot_titles=mean_nutrients_df.index)

for position, nutrient in enumerate(mean_nutrients_df.index):
    # Zero-based position -> one-based (row, column), filled row by row.
    grid_row, grid_col = divmod(position, cols)
    trend = go.Scatter(
        x=mean_nutrients_df.columns,
        y=mean_nutrients_df.loc[nutrient],
        mode='lines+markers',
        name=nutrient,
    )
    fig.add_trace(trend, row=grid_row + 1, col=grid_col + 1)

fig.update_layout(
    height=400*rows,
    width=1000,
    title_text="Nutrient Trends Over Years",
    showlegend=False,
)
fig.update_xaxes(title_text='Year')
fig.update_yaxes(title_text='Average Value')

fig.show()
