In [1]:
import pandas as pd
import os

In [6]:
# Directory holding the pre-processed "Buildings" CSV inputs. Every input path
# in this cell is derived from this single constant, so relocating the data
# only requires editing one line.
BUILDINGS_DIR = "/Users/alexander/Documents/MSc Data Science/S2/Applying Data Science/dp/processed/Buildings"

# Set the file paths
da15_file_path = os.path.join(BUILDINGS_DIR, "dwelling_ages_gm2015.csv")
da21_file_path = os.path.join(BUILDINGS_DIR, "dwelling_ages_gm2021.csv")
fire_file_path = os.path.join(BUILDINGS_DIR, "fire_gm.csv")
flooding16_file_path = os.path.join(BUILDINGS_DIR, "flooding2016.csv")
flooding18_file_path = os.path.join(BUILDINGS_DIR, "flooding2018.csv")
flooding20_file_path = os.path.join(BUILDINGS_DIR, "flooding2020.csv")
accomm01_file_path = os.path.join(BUILDINGS_DIR, "gm_accomm2001.csv")
accomm11_file_path = os.path.join(BUILDINGS_DIR, "gm_accomm2011.csv")
accomm21_file_path = os.path.join(BUILDINGS_DIR, "gm_accomm2021.csv")
hosp_file_path = os.path.join(BUILDINGS_DIR, "hospitals.csv")
schools_file_path = os.path.join(BUILDINGS_DIR, "schools_gm.csv")

# Read the CSV files
d15 = pd.read_csv(da15_file_path)
d21 = pd.read_csv(da21_file_path)
fire = pd.read_csv(fire_file_path)
flood16 = pd.read_csv(flooding16_file_path) # LAD area. No preprocessing left.
flood18 = pd.read_csv(flooding18_file_path) # LAD area. No preprocessing left.
flood20 = pd.read_csv(flooding20_file_path) # LAD area. No preprocessing left.
accomm01 = pd.read_csv(accomm01_file_path) # Complete
accomm11 = pd.read_csv(accomm11_file_path) # Complete
accomm21 = pd.read_csv(accomm21_file_path) # Complete
hospitals = pd.read_csv(hosp_file_path)
schools = pd.read_csv(schools_file_path)

# Base directory of the per-year files (one CSV per year)
base_dir = os.path.join(BUILDINGS_DIR, "hh_by_year")

# Years range: 2004 to 2017 inclusive (range() excludes its stop value, hence the + 1)
years = range(2004, 2017 + 1)

# Dictionary to hold DataFrames, keyed by year
hh_data = {}

# Load each file; a missing file for any year raises from pd.read_csv
for year in years:
    file_name = f"hh_gm_{year}.csv"
    file_path = os.path.join(base_dir, file_name)
    hh_data[year] = pd.read_csv(file_path)

### 1. LSOA areas per year

Making sure every table contains the same number of LSOAs. LSOA counts by year:

+ 2001: 1646
+ 2011: 1673
+ 2021: 1702

In [9]:
# LSOA lookup table. The cells below read its 'lsoa21cd' and 'lsoa21nm'
# columns to build the list of areas every output table should cover.
lsoa2021_path = "/Users/alexander/Documents/MSc Data Science/S2/Applying Data Science/dp/lookup/processed/gm_lsoas2021.csv"
lsoa2021 = pd.read_csv(lsoa2021_path)

### 1.1. Buildings 

In [22]:
## Dwelling Ages from 2015
# Build a base with one row per LSOA in the lookup, using the key names d15 uses.
base = lsoa2021[['lsoa21cd', 'lsoa21nm']].rename(columns={'lsoa21cd': 'LSOA_ID'})

# YEAR and TYPE are taken from the first d15 row (we assume they're the same
# throughout the dataset).
default_year = d15['YEAR'].iloc[0]
default_type = d15['TYPE'].iloc[0]

# Because YEAR and TYPE are merge keys, any d15 row whose values differ from
# the first row would not match the base and would be dropped by the left
# join. Warn (without failing) so that this cannot happen silently.
if d15['YEAR'].nunique(dropna=False) > 1 or d15['TYPE'].nunique(dropna=False) > 1:
    print("Warning: d15 holds more than one YEAR/TYPE value; rows differing from the first row will not be merged.")

base['YEAR'] = default_year
base['TYPE'] = default_type

# Left join: every base row is kept; LSOAs absent from d15 get NaN in all
# d15-only columns. The indicator column records which rows had no match.
d15_full = pd.merge(
    base,
    d15,
    on=['LSOA_ID', 'lsoa21nm', 'YEAR', 'TYPE'],
    how='left',
    indicator=True,
)

# Count the rows that were genuinely absent from d15. Counting NaNs in a data
# column instead would also count rows that exist in d15 but hold a NaN there.
n_missing = (d15_full['_merge'] == 'left_only').sum()

# The indicator is only needed for the count; remove it before export.
d15_full = d15_full.drop(columns='_merge')

print(f"{n_missing} LSOAs were missing in the original d15 and have been added with NaNs.")

95 LSOAs were missing in the original d15 and have been added with NaNs.


In [23]:
# Destination of the completed 2015 dwelling-ages table.
# NOTE(review): a later cell reads this file from final_versions/buildings/,
# not final_versions/ — confirm the file is moved there before that cell runs.
export_path = "/Users/alexander/Documents/MSc Data Science/S2/Applying Data Science/dp/final_versions/dwelling_ages_gm2015_full.csv"

# Write without the positional index so only the data columns are stored.
d15_full.to_csv(path_or_buf=export_path, index=False)

In [24]:
## Dwelling Ages from 2021
# Build a base with one row per LSOA in the lookup, using the key names d21 uses.
base21 = lsoa2021[['lsoa21cd', 'lsoa21nm']].rename(columns={'lsoa21cd': 'LSOA_ID'})

# year and TYPE are taken from the first d21 row (assumed constant across d21).
default_year_21 = d21['year'].iloc[0]
default_type_21 = d21['TYPE'].iloc[0]

# Because year and TYPE are merge keys, any d21 row whose values differ from
# the first row would not match the base and would be dropped by the left
# join. Warn (without failing) so that this cannot happen silently.
if d21['year'].nunique(dropna=False) > 1 or d21['TYPE'].nunique(dropna=False) > 1:
    print("Warning: d21 holds more than one year/TYPE value; rows differing from the first row will not be merged.")

# Add these to the base
base21['year'] = default_year_21
base21['TYPE'] = default_type_21

# Left join: every base row is kept; LSOAs absent from d21 get NaN in all
# d21-only columns. The indicator column records which rows had no match.
d21_full = pd.merge(
    base21,
    d21,
    on=['LSOA_ID', 'lsoa21nm', 'year', 'TYPE'],
    how='left',
    indicator=True,
)

# Count the rows that were genuinely absent from d21. Counting NaNs in a data
# column instead would also count rows that exist in d21 but hold a NaN there.
n_missing_21 = (d21_full['_merge'] == 'left_only').sum()

# The indicator is only needed for the count; remove it before export.
d21_full = d21_full.drop(columns='_merge')

print(f"{n_missing_21} LSOAs were missing in the original d21 and have been added with NaNs.")

# Export the result
export_path_21 = "/Users/alexander/Documents/MSc Data Science/S2/Applying Data Science/dp/final_versions/dwelling_ages_gm2021_full.csv"
d21_full.to_csv(export_path_21, index=False)

print(f"File exported to: {export_path_21}")

66 LSOAs were missing in the original d21 and have been added with NaNs.
File exported to: /Users/alexander/Documents/MSc Data Science/S2/Applying Data Science/dp/final_versions/dwelling_ages_gm2021_full.csv


In [27]:
## Fire stations: already has 1702 areas

# Stamp every row with the reference year.
fire = fire.assign(YEAR=2024)

# Put the identifying columns first; all remaining columns keep their order.
leading = ['LSOA_ID', 'lsoa21nm', 'YEAR']
trailing = [name for name in fire.columns if name not in leading]
fire = fire[leading + trailing]

# Write the result without the positional index.
fire_export_path = "/Users/alexander/Documents/MSc Data Science/S2/Applying Data Science/dp/final_versions/fire_gm_2024.csv"
fire.to_csv(fire_export_path, index=False)

print(f"File exported to: {fire_export_path}")

File exported to: /Users/alexander/Documents/MSc Data Science/S2/Applying Data Science/dp/final_versions/fire_gm_2024.csv


In [36]:
## Hospitals

# Stamp every row with the reference year.
hospitals = hospitals.assign(YEAR=2017)

# Keep exactly these five columns, in this order (any other column is dropped).
hospitals = hospitals.loc[:, ['LSOA_ID', 'lsoa21nm', 'YEAR', 'n_hospitals', 'TYPE']]

# Write the result without the positional index.
hospitals_export_path = "/Users/alexander/Documents/MSc Data Science/S2/Applying Data Science/dp/final_versions/hospitals_gm_2017.csv"
hospitals.to_csv(hospitals_export_path, index=False)

print(f"File exported to: {hospitals_export_path}")

File exported to: /Users/alexander/Documents/MSc Data Science/S2/Applying Data Science/dp/final_versions/hospitals_gm_2017.csv


In [41]:
# Attach the LSOA name from the lookup to each schools row.
# Any 'lsoa21nm' column already present is dropped first. This cell rebinds
# `schools`, so on a re-run the column would already exist, the merge would
# produce 'lsoa21nm_x'/'lsoa21nm_y', and the column selection below would
# raise KeyError. Dropping it first makes the cell safe to re-run.
lsoa_names = lsoa2021[['lsoa21cd', 'lsoa21nm']].rename(columns={'lsoa21cd': 'LSOA_ID'})
schools = (
    schools
    .drop(columns='lsoa21nm', errors='ignore')
    .merge(lsoa_names, on='LSOA_ID', how='left')  # left join: unmatched LSOA_IDs get a NaN name
)

# Add YEAR
schools['YEAR'] = 2025

# Reorder columns: identifiers first, everything else in its existing order
cols = ['LSOA_ID', 'lsoa21nm', 'YEAR'] + [col for col in schools.columns if col not in ['LSOA_ID', 'lsoa21nm', 'YEAR']]
schools = schools[cols]

# Export path
schools_export_path = "/Users/alexander/Documents/MSc Data Science/S2/Applying Data Science/dp/final_versions/schools_gm_2025.csv"

# Export
schools.to_csv(schools_export_path, index=False)

print(f"File exported to: {schools_export_path}")


File exported to: /Users/alexander/Documents/MSc Data Science/S2/Applying Data Science/dp/final_versions/schools_gm_2025.csv


## Adding MSOA areas

In [2]:
# Lookup used to attach MSOA fields. The cells below read its 'lsoa21cd',
# 'msoa21cd' and 'msoa21nm' columns.
msoa21_lookup_path = "/Users/alexander/Documents/MSc Data Science/S2/Applying Data Science/dp/lookup/processed/gm_lsoas2021.csv"
msoa21 = pd.read_csv(msoa21_lookup_path)

In [3]:
# Directory the finished building tables are read from. All paths in this cell
# are derived from this single constant.
# NOTE(review): the export cells earlier in this notebook write to
# final_versions/ (without the buildings/ sub-folder) — confirm the files are
# moved into buildings/ before this cell runs. The gm_accomm*.csv files are
# not written by any cell visible in this notebook.
FINAL_BUILDINGS_DIR = "/Users/alexander/Documents/MSc Data Science/S2/Applying Data Science/dp/final_versions/buildings"

da15 = pd.read_csv(f"{FINAL_BUILDINGS_DIR}/dwelling_ages_gm2015_full.csv")
da21 = pd.read_csv(f"{FINAL_BUILDINGS_DIR}/dwelling_ages_gm2021_full.csv")
fire = pd.read_csv(f"{FINAL_BUILDINGS_DIR}/fire_gm_2024.csv")
a01 = pd.read_csv(f"{FINAL_BUILDINGS_DIR}/gm_accomm2001.csv")
a11 = pd.read_csv(f"{FINAL_BUILDINGS_DIR}/gm_accomm2011.csv")
a21 = pd.read_csv(f"{FINAL_BUILDINGS_DIR}/gm_accomm2021.csv")
hops = pd.read_csv(f"{FINAL_BUILDINGS_DIR}/hospitals_gm_2017.csv")
school = pd.read_csv(f"{FINAL_BUILDINGS_DIR}/schools_gm_2025.csv")

In [12]:
# Attach MSOA code and name to each da15 row by looking up its LSOA_ID in the
# msoa21 table. Left join: da15 rows without a match keep NaN MSOA fields.
msoa_fields = msoa21[['lsoa21cd', 'msoa21cd', 'msoa21nm']]
merged = (
    da15
    .merge(msoa_fields, how='left', left_on='LSOA_ID', right_on='lsoa21cd')
    .rename(columns={'msoa21cd': 'MSOA_ID'})  # msoa21nm keeps its name
    .drop(columns=['lsoa21cd'])               # duplicate of LSOA_ID after the join
)

# Column order: LSOA identifiers, then MSOA identifiers, then everything else
# in its existing order.
leading = ['LSOA_ID', 'lsoa21nm', 'MSOA_ID', 'msoa21nm']
trailing = [name for name in merged.columns if name not in leading]
merged = merged[leading + trailing]

# Write the result without the positional index.
da15_path = "/Users/alexander/Documents/MSc Data Science/S2/Applying Data Science/dp/final_versions/updatedMSOA/dwelling_ages_gm2015_full.csv"
merged.to_csv(da15_path, index=False)

print(f"File exported to: {da15_path}")

File exported to: /Users/alexander/Documents/MSc Data Science/S2/Applying Data Science/dp/final_versions/updatedMSOA/dwelling_ages_gm2015_full.csv


In [15]:
# Attach MSOA code and name to each da21 row by looking up its LSOA_ID in the
# msoa21 table. Left join: da21 rows without a match keep NaN MSOA fields.
msoa_fields = msoa21[['lsoa21cd', 'msoa21cd', 'msoa21nm']]
merged_da21 = (
    da21
    .merge(msoa_fields, how='left', left_on='LSOA_ID', right_on='lsoa21cd')
    .rename(columns={'msoa21cd': 'MSOA_ID'})  # msoa21nm keeps its name
    .drop(columns=['lsoa21cd'])               # duplicate of LSOA_ID after the join
)

# Column order: LSOA identifiers, then MSOA identifiers, then everything else
# in its existing order.
leading = ['LSOA_ID', 'lsoa21nm', 'MSOA_ID', 'msoa21nm']
trailing = [name for name in merged_da21.columns if name not in leading]
merged_da21 = merged_da21[leading + trailing]

# Write the result without the positional index.
# NOTE(review): da21 was read from dwelling_ages_gm2021_full.csv but is
# exported as dwelling_quality_gm2021_full.csv — confirm the rename is intended.
da21_path = "/Users/alexander/Documents/MSc Data Science/S2/Applying Data Science/dp/final_versions/updatedMSOA/dwelling_quality_gm2021_full.csv"
merged_da21.to_csv(da21_path, index=False)

print(f"File exported to: {da21_path}")


File exported to: /Users/alexander/Documents/MSc Data Science/S2/Applying Data Science/dp/final_versions/updatedMSOA/dwelling_quality_gm2021_full.csv


In [18]:
# Attach MSOA code and name to each fire row by looking up its LSOA_ID in the
# msoa21 table. Left join: fire rows without a match keep NaN MSOA fields.
msoa_fields = msoa21[['lsoa21cd', 'msoa21cd', 'msoa21nm']]
merged_fire = (
    fire
    .merge(msoa_fields, how='left', left_on='LSOA_ID', right_on='lsoa21cd')
    .rename(columns={'msoa21cd': 'MSOA_ID'})  # msoa21nm keeps its name
    .drop(columns=['lsoa21cd'])               # duplicate of LSOA_ID after the join
)

# Column order: LSOA identifiers, then MSOA identifiers, then everything else
# in its existing order.
leading = ['LSOA_ID', 'lsoa21nm', 'MSOA_ID', 'msoa21nm']
trailing = [name for name in merged_fire.columns if name not in leading]
merged_fire = merged_fire[leading + trailing]

# Write the result without the positional index.
fire_path = "/Users/alexander/Documents/MSc Data Science/S2/Applying Data Science/dp/final_versions/updatedMSOA/fire_gm_2024.csv"
merged_fire.to_csv(fire_path, index=False)

print(f"File exported to: {fire_path}")


File exported to: /Users/alexander/Documents/MSc Data Science/S2/Applying Data Science/dp/final_versions/updatedMSOA/fire_gm_2024.csv


In [21]:
# Attach MSOA code and name to each a01 row by looking up its LSOA_ID in the
# msoa21 table. Left join: a01 rows without a match keep NaN MSOA fields.
# NOTE(review): a01 is read from gm_accomm2001.csv while the lookup is keyed
# on 'lsoa21cd' — confirm every LSOA_ID in a01 is present in the lookup.
msoa_fields = msoa21[['lsoa21cd', 'msoa21cd', 'msoa21nm']]
merged_a01 = (
    a01
    .merge(msoa_fields, how='left', left_on='LSOA_ID', right_on='lsoa21cd')
    .rename(columns={'msoa21cd': 'MSOA_ID'})  # msoa21nm keeps its name
    .drop(columns=['lsoa21cd'])               # duplicate of LSOA_ID after the join
)

# Column order: LSOA identifiers (this table names the LSOA 'LSOA Name'),
# then MSOA identifiers, then everything else in its existing order.
leading = ['LSOA_ID', 'LSOA Name', 'MSOA_ID', 'msoa21nm']
trailing = [name for name in merged_a01.columns if name not in leading]
merged_a01 = merged_a01[leading + trailing]

# Write the result without the positional index.
# NOTE(review): the sibling exports are named gm_accomm2011.csv and
# gm_accomm2021.csv; this one is gm_accomm_01.csv — confirm the naming.
a01_path = "/Users/alexander/Documents/MSc Data Science/S2/Applying Data Science/dp/final_versions/updatedMSOA/gm_accomm_01.csv"
merged_a01.to_csv(a01_path, index=False)

print(f"File exported to: {a01_path}")


File exported to: /Users/alexander/Documents/MSc Data Science/S2/Applying Data Science/dp/final_versions/updatedMSOA/gm_accomm_01.csv


In [25]:
# Attach MSOA code and name to each a11 row by looking up its LSOA_ID in the
# msoa21 table. Left join: a11 rows without a match keep NaN MSOA fields.
# NOTE(review): a11 is read from gm_accomm2011.csv while the lookup is keyed
# on 'lsoa21cd' — confirm every LSOA_ID in a11 is present in the lookup.
msoa_fields = msoa21[['lsoa21cd', 'msoa21cd', 'msoa21nm']]
merged_a11 = (
    a11
    .merge(msoa_fields, how='left', left_on='LSOA_ID', right_on='lsoa21cd')
    .rename(columns={'msoa21cd': 'MSOA_ID'})  # msoa21nm keeps its name
    .drop(columns=['lsoa21cd'])               # duplicate of LSOA_ID after the join
)

# Column order: LSOA identifiers (this table names the LSOA 'LSOA Name'),
# then MSOA identifiers, then everything else in its existing order.
leading = ['LSOA_ID', 'LSOA Name', 'MSOA_ID', 'msoa21nm']
trailing = [name for name in merged_a11.columns if name not in leading]
merged_a11 = merged_a11[leading + trailing]

# Write the result without the positional index.
a11_path = "/Users/alexander/Documents/MSc Data Science/S2/Applying Data Science/dp/final_versions/updatedMSOA/gm_accomm2011.csv"
merged_a11.to_csv(a11_path, index=False)

print(f"File exported to: {a11_path}")


File exported to: /Users/alexander/Documents/MSc Data Science/S2/Applying Data Science/dp/final_versions/updatedMSOA/gm_accomm2011.csv


In [28]:
# Attach MSOA code and name to each a21 row by looking up its LSOA_ID in the
# msoa21 table. Left join: a21 rows without a match keep NaN MSOA fields.
msoa_fields = msoa21[['lsoa21cd', 'msoa21cd', 'msoa21nm']]
merged_a21 = (
    a21
    .merge(msoa_fields, how='left', left_on='LSOA_ID', right_on='lsoa21cd')
    .rename(columns={'msoa21cd': 'MSOA_ID'})  # msoa21nm keeps its name
    .drop(columns=['lsoa21cd'])               # duplicate of LSOA_ID after the join
)

# Column order: LSOA identifiers (this table names the LSOA 'LSOA Name'),
# then MSOA identifiers, then everything else in its existing order.
leading = ['LSOA_ID', 'LSOA Name', 'MSOA_ID', 'msoa21nm']
trailing = [name for name in merged_a21.columns if name not in leading]
merged_a21 = merged_a21[leading + trailing]

# Write the result without the positional index.
a21_path = "/Users/alexander/Documents/MSc Data Science/S2/Applying Data Science/dp/final_versions/updatedMSOA/gm_accomm2021.csv"
merged_a21.to_csv(a21_path, index=False)

print(f"File exported to: {a21_path}")


File exported to: /Users/alexander/Documents/MSc Data Science/S2/Applying Data Science/dp/final_versions/updatedMSOA/gm_accomm2021.csv


In [31]:
# Attach MSOA code and name to each hops (hospitals) row by looking up its
# LSOA_ID in the msoa21 table. Left join: rows without a match keep NaN MSOA
# fields.
msoa_fields = msoa21[['lsoa21cd', 'msoa21cd', 'msoa21nm']]
merged_hops = (
    hops
    .merge(msoa_fields, how='left', left_on='LSOA_ID', right_on='lsoa21cd')
    .rename(columns={'msoa21cd': 'MSOA_ID'})  # msoa21nm keeps its name
    .drop(columns=['lsoa21cd'])               # duplicate of LSOA_ID after the join
)

# Column order: LSOA identifiers, then MSOA identifiers, then everything else
# in its existing order.
leading = ['LSOA_ID', 'lsoa21nm', 'MSOA_ID', 'msoa21nm']
trailing = [name for name in merged_hops.columns if name not in leading]
merged_hops = merged_hops[leading + trailing]

# Write the result without the positional index.
hops_path = "/Users/alexander/Documents/MSc Data Science/S2/Applying Data Science/dp/final_versions/updatedMSOA/hospitals_gm_2017.csv"
merged_hops.to_csv(hops_path, index=False)

print(f"File exported to: {hops_path}")


File exported to: /Users/alexander/Documents/MSc Data Science/S2/Applying Data Science/dp/final_versions/updatedMSOA/hospitals_gm_2017.csv


In [35]:
# Attach MSOA code and name to each school row by looking up its LSOA_ID in
# the msoa21 table. Left join: rows without a match keep NaN MSOA fields.
msoa_fields = msoa21[['lsoa21cd', 'msoa21cd', 'msoa21nm']]
merged_school = (
    school
    .merge(msoa_fields, how='left', left_on='LSOA_ID', right_on='lsoa21cd')
    .rename(columns={'msoa21cd': 'MSOA_ID'})  # msoa21nm keeps its name
    .drop(columns=['lsoa21cd'])               # duplicate of LSOA_ID after the join
)

# Column order: LSOA identifiers, then MSOA identifiers, then everything else
# in its existing order.
leading = ['LSOA_ID', 'lsoa21nm', 'MSOA_ID', 'msoa21nm']
trailing = [name for name in merged_school.columns if name not in leading]
merged_school = merged_school[leading + trailing]

# Write the result without the positional index.
# NOTE(review): the input file is schools_gm_2025.csv but the export is
# school_gm_2025.csv (singular) — confirm the naming.
school_path = "/Users/alexander/Documents/MSc Data Science/S2/Applying Data Science/dp/final_versions/updatedMSOA/school_gm_2025.csv"
merged_school.to_csv(school_path, index=False)

print(f"File exported to: {school_path}")


File exported to: /Users/alexander/Documents/MSc Data Science/S2/Applying Data Science/dp/final_versions/updatedMSOA/school_gm_2025.csv


In [37]:
# Load the ps_2024 table. The next cell expects it to carry 'LSOA_ID' and
# 'lsoa21nm' columns.
ps_source_path = "/Users/alexander/Documents/MSc Data Science/S2/Applying Data Science/dp/final_versions/ps_2024.csv"
ps = pd.read_csv(ps_source_path)

In [39]:
# Attach MSOA code and name to each ps row by looking up its LSOA_ID in the
# msoa21 table. Left join: ps rows without a match keep NaN MSOA fields.
msoa_fields = msoa21[['lsoa21cd', 'msoa21cd', 'msoa21nm']]
merged_ps = (
    ps
    .merge(msoa_fields, how='left', left_on='LSOA_ID', right_on='lsoa21cd')
    .rename(columns={'msoa21cd': 'MSOA_ID'})  # msoa21nm keeps its name
    .drop(columns=['lsoa21cd'])               # duplicate of LSOA_ID after the join
)

# Column order: LSOA identifiers, then MSOA identifiers, then everything else
# in its existing order.
leading = ['LSOA_ID', 'lsoa21nm', 'MSOA_ID', 'msoa21nm']
trailing = [name for name in merged_ps.columns if name not in leading]
merged_ps = merged_ps[leading + trailing]

# Write the result without the positional index.
ps_path = "/Users/alexander/Documents/MSc Data Science/S2/Applying Data Science/dp/final_versions/updatedMSOA/ps_2024.csv"
merged_ps.to_csv(ps_path, index=False)

print(f"File exported to: {ps_path}")


File exported to: /Users/alexander/Documents/MSc Data Science/S2/Applying Data Science/dp/final_versions/updatedMSOA/ps_2024.csv
