In [None]:
# importing libraries
import pandas as pd
import json

In [None]:
# Load the council/code lookup used later to recode the migration columns.
# The context manager guarantees the file handle is closed once the JSON has
# been parsed (the handle was previously left open for the kernel's lifetime).
with open('./CleanedData/Council_to_Code.json') as council_code_file:
    # json.load parses straight from the file object into a dictionary
    code_dict = json.load(council_code_file)

In [None]:
# Read the income-tercile table: one row per council, one column per year.
df_tercile = (
    pd.read_csv("./CleanedData/Income_Tercile_Map.csv")
    # discard the first row of the file and keep everything after it
    .iloc[1:]
    # the column read in as "Year" holds the council labels used as lookup keys
    .rename(columns={"Year": "Council Code"})
)
# display the resulting table
df_tercile

Unnamed: 0,Council Code,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025
1,Aberdeen City,High,High,High,High,High,High,High,High,High,High
2,Aberdeenshire,Low,Low,Mid,Mid,Mid,Mid,Mid,Mid,Mid,Mid
3,Adur,Mid,Mid,Mid,Low,Low,Low,Low,Mid,Low,Low
4,Amber Valley,High,High,Mid,High,Mid,Low,Low,Low,Mid,Mid
5,Angus,Low,Low,Low,Low,Low,Low,Mid,Low,Mid,Low
...,...,...,...,...,...,...,...,...,...,...,...
359,Wychavon,Mid,Mid,Mid,Mid,Low,Mid,Mid,High,High,Mid
360,Wyre,Low,Low,Low,Low,Low,Low,Low,Low,Low,Low
361,Wyre Forest,Low,Low,Low,Low,Low,Low,Low,Low,Low,Low
362,York UA,Mid,Mid,Low,Mid,Mid,Mid,High,Mid,High,Mid


In [None]:
# Lookup from each "Council Code" entry to its 2016 income tercile.
# Note: the per-year cleaning loop further down rebuilds `tercile` for every
# year it processes, so this value only matters if inspected before that loop.
tercile = {
    council: band
    for council, band in zip(df_tercile["Council Code"], df_tercile["2016"])
}


In [None]:
# Clean the raw migration file for each year 2016-2024 and write one
# aggregated CSV per year to ./CleanedData/Migration_Cleaned_{year}.csv.

# Age bands to build: label -> (first column, last column) of an inclusive
# label slice over the single-year "Age_*" columns. None leaves the slice
# open-ended, i.e. it runs to the last column of the frame.
age_bands = {
    "0-16": ("Age_0", "Age_16"),
    "17-34": ("Age_17", "Age_34"),
    "35-54": ("Age_35", "Age_54"),
    "55-66": ("Age_55", "Age_66"),
    "67+": ("Age_67", None),
}

for year in range(2016, 2025):
    # years before 2022 are read from CSV, later years from Excel
    if year < 2022:
        df = pd.read_csv(f"./Temp/Migration_{year}.csv")
    else:
        df = pd.read_excel(f"./Data/Migration/Migration_{year}.xlsx")

    # lookup from "Council Code" entry to that year's income tercile
    tercile = dict(zip(df_tercile["Council Code"], df_tercile[f"{year}"]))

    # give the origin / destination columns readable names
    df = df.rename(columns={'outla': "Origin", 'inla': "Destination"})

    # drop the year and sex columns (the year column is capitalised differently
    # in the pre-2022 files); raises KeyError if they are missing
    year_column = "Year" if year < 2022 else "year"
    df = df.drop(columns=[year_column, "sex"])

    # recode any Origin / Destination value that appears as a key of code_dict
    df = df.replace({"Destination": code_dict, "Origin": code_dict})

    # NOTE(review): whitespace is stripped only after the code_dict recoding,
    # so padded values would not have matched code_dict keys above - confirm
    # this order is intended.
    df["Origin"] = df["Origin"].str.strip()
    df["Destination"] = df["Destination"].str.strip()

    # replace each Origin value that is a key of `tercile` with its tercile
    df = df.replace({"Origin": tercile})

    # Sum the single-year age columns into bands. Every band is computed
    # BEFORE any aggregate column is added to the frame: the "67+" slice is
    # open-ended, so assigning the earlier bands first would append them after
    # the age columns and they would be summed into "67+" as well.
    band_totals = {
        label: df.loc[:, first:last].sum(axis=1)
        for label, (first, last) in age_bands.items()
    }
    for label, total in band_totals.items():
        df[label] = total

    # keep only the key columns and the age bands, in output order
    df = df[["Destination", "Origin", *age_bands]]
    df = df.sort_values(by="Destination")

    # keep only rows whose Origin was mapped to an income tercile
    df = df[df["Origin"].isin(["Mid", "Low", "High"])]

    # total the age bands for each (Origin tercile, Destination) pair
    df = df.groupby(["Origin", "Destination"], as_index=False).sum()
    df = df.sort_values(by="Destination")

    # write the cleaned data for this year
    df.to_csv(f"./CleanedData/Migration_Cleaned_{year}.csv", index=True)

In [None]:
# Combine the per-year cleaned files into one wide table with a row per
# destination and year.
age_cols = ['0-16', '17-34', '35-54', '55-66', '67+']
# one wide dataframe per year is collected here and concatenated at the end
all_years = []
for year in range(2016, 2025):
    # Read the file written by the cleaning loop. This previously pointed at
    # ./Temp/, a directory the cleaning loop never writes to, so a fresh run
    # would fail or silently pick up stale files.
    df = pd.read_csv(f"./CleanedData/Migration_Cleaned_{year}.csv")

    # age-band totals for every (Destination, Origin tercile) pair
    by_pair = df.groupby(['Destination', 'Origin'])[age_cols].sum()

    # total across all ages per origin tercile -> columns income_<tercile>
    income_sums = by_pair.sum(axis=1).unstack('Origin').add_prefix('income_')
    # total per age band across all origin terciles
    age_sums = df.groupby('Destination')[age_cols].sum()
    result = income_sums.join(age_sums)

    # one column per (age band, origin tercile) combination; after unstacking
    # the column tuples are (age band, tercile), flattened to "<tercile>_<age>"
    combo = by_pair.unstack('Origin')
    combo.columns = [f'{origin}_{age}' for age, origin in combo.columns]

    final_df = result.join(combo)
    # make the row label unique across years: "<Destination>_<year>"
    final_df.index = final_df.index.map(lambda x: f"{x}_{year}")
    all_years.append(final_df)

# stack all years into a single dataframe
full_df = pd.concat(all_years)


In [None]:
# Write the combined table to disk; the index is kept because it carries the
# "<Destination>_<year>" row labels.
regression_ready_path = "./CleanedData/Income_Inequality_Regression_Ready.csv"
full_df.to_csv(regression_ready_path, index=True)
