In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from functools import reduce
import plotly.express as px
import numpy as np
import plotly.graph_objects as go
import json

Below we standardise file formatting

In [2]:
code_to_desc = {}

for year in range(2015, 2026):
    if year > 2022:
        df = pd.read_excel(f"./Data/Income/Gross_Pay_{year}.xlsx",sheet_name=1).iloc[3:]
    else:
        df = pd.read_excel(f"./Data/Income/Gross_Pay_{year}.xls",sheet_name=1).iloc[3:]
    df.columns = df.iloc[0]
    df.columns = df.columns.astype(str)
    code_to_desc.update(df.set_index('Code')['Description'].to_dict())
    df = df.rename_axis("Council Code", axis=1)
    df = df.dropna(subset=['Code'])
    print (df.columns)
    df = df.drop(columns=["Description", "(thousand)", "change", "Mean", "25", "75"], axis=1)
    df = df.rename(columns={'Median': "50"})
    df.set_index("Code", inplace=True)
    df = df.iloc[:, :-3]
    df.index.name = None
    df = df[["10", "20", "30", "40", "50", "60", "70", "80", "90"]]
    df.to_csv(f"./Temp/Gross_Pay_{year}.csv", index=True)   

with open("./CleanedData/Council_to_Code.json", "w") as f:
    json.dump(code_to_desc, f, indent=4)

Index(['Description', 'Code', '(thousand)', 'Median', 'change', 'Mean',
       'change', '10', '20', '25', '30', '40', '60', '70', '75', '80', '90',
       'nan', 'nan', 'nan'],
      dtype='object', name='Council Code')
Index(['Description', 'Code', '(thousand)', 'Median', 'change', 'Mean',
       'change', '10', '20', '25', '30', '40', '60', '70', '75', '80', '90',
       'nan', 'nan', 'nan'],
      dtype='object', name='Council Code')
Index(['Description', 'Code', '(thousand)', 'Median', 'change', 'Mean',
       'change', '10', '20', '25', '30', '40', '60', '70', '75', '80', '90',
       'nan', 'nan', 'nan'],
      dtype='object', name='Council Code')
Index(['Description', 'Code', '(thousand)', 'Median', 'change', 'Mean',
       'change', '10', '20', '25', '30', '40', '60', '70', '75', '80', '90',
       'nan', 'nan', 'nan'],
      dtype='object', name='Council Code')
Index(['Description', 'Code', '(thousand)', 'Median', 'change', 'Mean',
       'change', '10', '20', '25', '30', '40

Next we load each dataframe into a list 

In [3]:
dfs = [pd.read_csv(f"./Temp/Gross_Pay_{year}.csv") for year in range(2016, 2026)]

for i, df in enumerate(dfs):
    df = dfs[i]
    df.set_index("Unnamed: 0", inplace=True)
    df = df.tail(-1)
    df.index.name = None
    df = df.rename_axis("Council Code", axis=1)
    dfs[i] = df

Next we take the common council codes for each dataframe so each dataframe has the same dimensions

In [4]:

common_idx = reduce(lambda a, b: a.intersection(b.index), dfs, dfs[0].index)
dfs = [df.loc[common_idx] for df in dfs]
dfs[-1]

Council Code,10,20,30,40,50,60,70,80,90
K02000001,11425,18560,24532,28591,32890,38000,44500,52809,69381
K03000001,11456,18613,24580,28646,32972,38061,44629,52929,69750
K04000001,11424,18589,24563,28627,32991,38058,44677,53162,70250
E92000001,11439,18653,24669,28769,33142,38292,44962,53630,71090
E12000001,10727,17213,22769,26092,29266,33006,38097,45254,56226
...,...,...,...,...,...,...,...,...,...
S12000029,10510,16577,22873,27610,31984,36257,42531,49383,x
S12000030,13366,20888,25654,30216,34244,37413,44045,50139,x
S12000039,11013,16971,22896,26576,30033,34887,41103,46589,x
S12000040,12198,19715,24994,28624,32535,37660,43582,50103,x


In [5]:
years = list(range(2016, 2026))
combined_df = pd.concat(dfs, axis=1)
combined_df.columns = pd.MultiIndex.from_product([years, dfs[0].columns], names=['Year', 'Analysis'])
combined_df.to_csv(f"./CleanedData/IncomeData.csv", index=True)  


In [6]:
combined_df = combined_df.rename(index=code_to_desc)
combined_df.index = combined_df.index.str.strip()
combined_df.sort_index(inplace=True)
combined_df

Year,2016,2016,2016,2016,2016,2016,2016,2016,2016,2017,...,2024,2025,2025,2025,2025,2025,2025,2025,2025,2025
Analysis,10,20,30,40,50,60,70,80,90,10,...,90,10,20,30,40,50,60,70,80,90
Aberdeen City,x,13839,18469,22476,26507,31263,37210,45487,x,7424,...,x,12521,21151,26166,30249,35483,42605,48795,57190,x
Aberdeenshire,x,9405,12749,16731,20456,24688,29589,35388,x,6480,...,x,9812,15892,22900,27478,31551,36743,43604,51817,x
Adur,x,x,x,x,x,x,x,x,x,x,...,x,x,12564,18100,22903,28238,35178,x,x,x
Amber Valley,9416,13490,16230,19898,23197,27735,31332,39645,x,7759,...,x,12570,18433,23271,26852,31100,35307,38948,46309,x
Angus,7409,11734,15517,17604,20467,23839,26642,29806,x,x,...,x,x,14207,19290,24551,27520,31569,36992,46567,x
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wychavon,x,x,x,16921,20864,23442,27868,x,x,7663,...,x,11166,19993,24996,27194,32022,36325,41273,47320,x
Wyre,x,8542,11434,15281,18155,19449,23791,x,x,x,...,x,x,15251,20382,24491,27657,29924,34803,x,x
Wyre Forest,x,x,x,x,x,x,x,x,x,x,...,x,x,15381,20861,25219,27346,29584,35842,x,x
York UA,7019,11958,15420,18240,21670,25045,29880,36288,x,x,...,x,x,18228,24917,28003,32151,37407,44894,55662,x


In [None]:
import pandas as pd
import numpy as np

df = combined_df.copy()
df = df.replace("x", np.nan)  
df = df.apply(pd.to_numeric)

years = sorted(list(set(df.columns.get_level_values(0))))
percentiles = ['10','20','30','40','50','60','70','80','90']

rows_drop_backwards = []

for idx, row in df.iterrows():

    for p in ['10','90']:
        last_value = None
        nan_run = 0

        for y in years:
            col = (y, p)

            if col not in df.columns:
                continue

            val = row[col]

            if pd.isna(val):
                nan_run += 1

                if nan_run >= 2:
                    rows_drop_backwards.append(idx)
                    break

                if last_value is not None:
                    df.at[idx, col] = last_value

            else:
                last_value = val
                nan_run = 0

        if idx in rows_drop_backwards:
            break

df = df.drop(rows_drop_backwards)
rows_drop_middle = []

for idx, row in df.iterrows():
    for p in ['50','60','70','80','90']:

        nan_run = 0

        for y in years:
            col = (y, p)

            if col not in df.columns:
                continue

            if pd.isna(row[col]):
                nan_run += 1
            else:
                nan_run = 0

            if nan_run >= 2:
                rows_drop_middle.append(idx)
                break

        if idx in rows_drop_middle:
            break

df = df.drop(rows_drop_middle)
def nearest_left(series, pos):
    for i in range(pos - 1, -1, -1):
        if pd.notna(series.iloc[i]):
            return series.iloc[i]
    return None

def nearest_right(series, pos):
    for i in range(pos + 1, len(series)):
        if pd.notna(series.iloc[i]):
            return series.iloc[i]
    return None


rows_drop_sideways = []

for idx, row in df.iterrows():

    row_vals = row.values
    s = row.copy()
    row_pos = df.index.get_loc(idx)

    for col_i in range(len(row_vals)):

        if pd.isna(row_vals[col_i]):

            left = nearest_left(s, col_i)
            right = nearest_right(s, col_i)
            if left is None and right is None:
                rows_drop_sideways.append(idx)
                break
            if left is not None and right is not None:
                val = (left + right) / 2
            elif left is not None:
                val = left
            else:
                val = right

            df.iat[row_pos, col_i] = val

df = df.drop(rows_drop_sideways)

cleaned_df = df.copy()
cleaned_df




errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead



Year,2016,2016,2016,2016,2016,2016,2016,2016,2016,2017,...,2024,2025,2025,2025,2025,2025,2025,2025,2025,2025
Analysis,10,20,30,40,50,60,70,80,90,10,...,90,10,20,30,40,50,60,70,80,90
Birmingham,8055.0,12848.0,16749.0,20191.0,23788.0,27714.0,32959.0,38544.0,47934.0,7955.0,...,67913.0,12006.0,20036.0,25280.0,29551.0,33374.0,38241.0,44963.0,53678.0,69286.0
City of Edinburgh,9279.0,15113.0,19413.0,22571.0,25971.0,30300.0,35922.0,42850.0,57613.0,9525.0,...,72820.0,14777.0,24343.0,28725.0,33171.0,38345.0,44297.0,52145.0,63839.0,82187.0
Cornwall UA,5850.0,9624.0,12667.0,15536.0,17868.0,21027.0,24795.0,29305.0,17671.5,6038.0,...,48159.0,9281.0,13572.0,18867.0,23402.0,27006.0,31230.0,35545.0,41297.0,50153.0
Derbyshire,7200.0,10943.0,14359.0,17601.0,20927.0,24306.0,28553.0,34126.0,43291.0,6897.0,...,51566.0,11995.0,16875.0,22883.0,26301.0,30000.0,34502.0,39471.0,46021.0,55441.0
Devon,6603.0,10588.0,14582.0,17756.0,20726.0,23747.0,27786.0,32728.0,40729.0,6878.0,...,50948.0,10153.0,16410.0,22368.0,25716.0,29114.0,33255.0,37779.0,44390.0,53241.0
East,6515.0,10792.0,14927.0,18622.0,22330.0,26219.0,31200.0,37716.0,48636.0,6673.0,...,61999.0,9932.0,16361.0,23099.0,27399.0,31513.0,36723.0,43000.0,50881.0,67033.0
East Midlands,7298.0,11449.0,14991.0,18014.0,21122.0,24668.0,29098.0,34814.0,44149.0,7160.0,...,56006.0,11008.0,16852.0,23095.0,26481.0,30000.0,34312.0,39606.0,46863.0,58581.0
England,7465.0,11977.0,16000.0,19640.0,23334.0,27562.0,32602.0,39450.0,51402.0,7564.0,...,66976.0,11439.0,18653.0,24669.0,28769.0,33142.0,38292.0,44962.0,53630.0,71090.0
England and Wales,7468.0,11958.0,15944.0,19530.0,23178.0,27385.0,32429.0,39143.0,50879.0,7547.0,...,66101.0,11424.0,18589.0,24563.0,28627.0,32991.0,38058.0,44677.0,53162.0,70250.0
Essex,6182.0,10303.0,14340.0,17967.0,21387.0,25176.0,30205.0,36877.0,48224.0,6420.0,...,61249.0,9559.0,15360.0,22391.0,26640.0,30898.0,35812.0,41694.0,50074.0,66286.0


In [8]:
combined_percentiles = combined_df.loc[:, [col for col in combined_df.columns if col[1] not in ['Mean']]]
combined_percentiles = combined_percentiles.apply(pd.to_numeric, errors='coerce')



In [9]:
combined_median = combined_df.loc[:, [col for col in combined_df.columns if col[1]  in ['50']]]
combined_median = combined_median.apply(pd.to_numeric, errors='coerce')

combined_median

Year,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025
Analysis,50,50,50,50,50,50,50,50,50,50
Aberdeen City,26507.0,25349.0,26087.0,27435.0,27970.0,26844.0,29429.0,33319.0,35160.0,35483.0
Aberdeenshire,20456.0,20726.0,22702.0,23064.0,23814.0,23473.0,25061.0,26724.0,29736.0,31551.0
Adur,,22762.0,22914.0,21447.0,22877.0,22977.0,23563.0,28029.0,27753.0,28238.0
Amber Valley,23197.0,24060.0,22683.0,24950.0,24450.0,23301.0,23924.0,26342.0,29414.0,31100.0
Angus,20467.0,20022.0,21112.0,,21785.0,22779.0,25822.0,26604.0,28748.0,27520.0
...,...,...,...,...,...,...,...,...,...,...
Wychavon,20864.0,21401.0,22810.0,22525.0,22543.0,24343.0,,29891.0,31721.0,32022.0
Wyre,18155.0,,17902.0,19227.0,20064.0,20985.0,23043.0,24953.0,26691.0,27657.0
Wyre Forest,,18181.0,18158.0,20783.0,20293.0,22665.0,23501.0,,25605.0,27346.0
York UA,21670.0,21404.0,20681.0,23926.0,24463.0,25366.0,27576.0,29121.0,32511.0,32151.0


In [None]:

median_df = combined_median.xs('50', axis=1, level=1).copy()
median_df = median_df.apply(pd.to_numeric, errors='coerce')

rows_with_adjacent_nans = []
for idx, row in median_df.iterrows():
    row_is_nan = row.isna().values
    has_adjacent = False

    for i in range(len(row_is_nan) - 1):
        if row_is_nan[i] and row_is_nan[i+1]:
            has_adjacent = True
            break

    if has_adjacent:
        rows_with_adjacent_nans.append(idx)

clean_df = median_df.drop(rows_with_adjacent_nans).copy()

def find_left_value(series, pos):
    """Find nearest non-NaN to the left."""
    for i in range(pos - 1, -1, -1):
        if pd.notna(series.iloc[i]):
            return series.iloc[i]
    return None

def find_right_value(series, pos):
    """Find nearest non-NaN to the right."""
    for i in range(pos + 1, len(series)):
        if pd.notna(series.iloc[i]):
            return series.iloc[i]
    return None

rows_to_drop = []

for idx in clean_df.index:
    row = clean_df.loc[idx]

    for col_pos in range(len(row)):
        if pd.isna(row.iloc[col_pos]):

            left_val = find_left_value(row, col_pos)
            right_val = find_right_value(row, col_pos)
            if left_val is None and right_val is None:
                rows_to_drop.append(idx)
                break
            if left_val is not None and right_val is not None:
                new_value = (left_val + right_val) / 2.0
            elif left_val is not None:
                new_value = left_val
            else:
                new_value = right_val
            clean_df.at[idx, row.index[col_pos]] = new_value
clean_df = clean_df.drop(rows_to_drop)
for col in clean_df.columns:
    combined_median[(col, '50')] = combined_median.index.map(
        lambda x: clean_df.at[x, col] if x in clean_df.index else np.nan
    )

combined_median = combined_median.loc[clean_df.index].copy()

In [11]:
combined_median

Year,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025
Analysis,50,50,50,50,50,50,50,50,50,50
Aberdeen City,26507.0,25349.0,26087.0,27435.0,27970.0,26844.0,29429.0,33319.0,35160.0,35483.0
Aberdeenshire,20456.0,20726.0,22702.0,23064.0,23814.0,23473.0,25061.0,26724.0,29736.0,31551.0
Adur,22762.0,22762.0,22914.0,21447.0,22877.0,22977.0,23563.0,28029.0,27753.0,28238.0
Amber Valley,23197.0,24060.0,22683.0,24950.0,24450.0,23301.0,23924.0,26342.0,29414.0,31100.0
Angus,20467.0,20022.0,21112.0,21448.5,21785.0,22779.0,25822.0,26604.0,28748.0,27520.0
...,...,...,...,...,...,...,...,...,...,...
Wychavon,20864.0,21401.0,22810.0,22525.0,22543.0,24343.0,27117.0,29891.0,31721.0,32022.0
Wyre,18155.0,18028.5,17902.0,19227.0,20064.0,20985.0,23043.0,24953.0,26691.0,27657.0
Wyre Forest,18181.0,18181.0,18158.0,20783.0,20293.0,22665.0,23501.0,24553.0,25605.0,27346.0
York UA,21670.0,21404.0,20681.0,23926.0,24463.0,25366.0,27576.0,29121.0,32511.0,32151.0


In [None]:
import numpy as np
import pandas as pd

if not isinstance(combined_median.columns, pd.MultiIndex):
    raise ValueError("df.columns must be a MultiIndex")

new_cols = {}

for year in combined_median.columns.levels[0]:

    year_df = combined_median[year]

    values = year_df.values.flatten()
    values = values[~np.isnan(values)]

    low_cut = np.percentile(values, 33)
    high_cut = np.percentile(values, 67)
    base_vals = year_df.mean(axis=1)

    tercile_series = base_vals.apply(
        lambda v: ( "Low" if v < low_cut else "Mid" if v < high_cut else "High" ))
    new_cols[(year, "Tercile")] = tercile_series

tercile_df = pd.DataFrame(new_cols, index=combined_median.index)
tercile_df.columns = pd.MultiIndex.from_tuples(tercile_df.columns, names=combined_median.columns.names)
df_with_tercile = pd.concat([combined_median, tercile_df], axis=1)
df_with_tercile = df_with_tercile.sort_index(axis=1)
df_with_tercile = df_with_tercile.drop(columns=df_with_tercile.columns[df_with_tercile.columns.get_level_values(1) == "50"])
df_with_tercile.to_csv(f"./CleanedData/Income_Tercile_Map.csv", index=True)   



In [None]:
df_long = combined_percentiles.stack(level=[0,1]).reset_index()
df_long.columns = ['Council', 'Year', 'Percentile', 'Revenue']

df_long['Year'] = pd.to_numeric(df_long['Year'])
df_long['Percentile'] = pd.to_numeric(df_long['Percentile'])
df_long['Revenue'] = pd.to_numeric(df_long['Revenue'])

fig = go.Figure()

x_min, x_max = df_long['Year'].min(), df_long['Year'].max()
y_min, y_max = df_long['Percentile'].min(), df_long['Percentile'].max()
z_min, z_max = df_long['Revenue'].min(), df_long['Revenue'].max()


councils = df_long['Council'].unique()
for i, council in enumerate(councils):
    df_c = df_long[df_long['Council'] == council]
    fig.add_trace(go.Scatter3d(x=df_c['Year'],y=df_c['Percentile'],z=df_c['Revenue'],mode='markers',marker=dict(size=5),name=council, visible=(i==0)))

buttons = []
for i, council in enumerate(councils):
    visible = [False]*len(councils)
    visible[i] = True
    buttons.append(dict(label=council, method="update", args=[{"visible": visible}, {"title": f"Region: {council}"}]))

fig.update_layout(
    updatemenus=[dict(active=0, buttons=buttons, x=1.1, y=0.8)],
    scene=dict(
        xaxis=dict(title='Year', range=[x_min, x_max]),
        yaxis=dict(title='Percentile', range=[y_min, y_max]),
        zaxis=dict(title='Revenue', range=[z_min, z_max]),
    ),
    title="Income Percentiles By Council By Region By Year"
)

fig.show()


  df_long = combined_percentiles.stack(level=[0,1]).reset_index()
