In [None]:
import pandas as pd
import seaborn as sb
import numpy as np

In [None]:
# Folder that holds every input file used by this notebook.
datapath = "./data/"
# Lookup table linking Local Authority (LA) codes to recognisable LA names.
la_codes_file = 'Local_Authority_Districts_December_2017_Names_and_Codes_in_the_United_Kingdom.csv'
la_codes = pd.read_csv(f"{datapath}{la_codes_file}", sep=',')

In [None]:
# Pick the Local Authority to spotlight; change la_name for whatever LA you like.
# The mask has its own name rather than `filter`, which would shadow the Python
# builtin of the same name for every later cell in the kernel.
la_name = 'County Durham'
la_name_mask = la_codes['LAD17NM'] == la_name
matching_codes = la_codes.loc[la_name_mask, 'LAD17CD']
if matching_codes.empty:
    # Fail with a readable message instead of a bare positional IndexError.
    raise ValueError(f"no Local Authority named {la_name!r} found in the LAD17NM column")
# The LA code of the first (expected to be the only) matching row.
durham_code = matching_codes.iloc[0]

In [None]:
# attainment statistics broken down by LA
# ('ud' presumably stands for "underlying data" — TODO confirm)
la_ud_file = 'ks2_2019_revised_la_ud.xlsx'
print("reading in the file: " + datapath+la_ud_file)

# the file has a few worksheets:
# 'la ud' has the data, 'la ud metadata' gives info about the data columns and values.
# Both sheets are requested in a single call so the workbook is opened and parsed
# once; read_excel returns a dict keyed by sheet name when given a list.
la_ud_sheets = pd.read_excel(datapath + la_ud_file, sheet_name=['la ud', 'la ud metadata'])
df_la_ud = la_ud_sheets['la ud']
df_la_ud_meta = la_ud_sheets['la ud metadata']


In [None]:
# Flag the metadata rows whose column name relates to maths (i.e. contains "mat").
# The vectorised string test replaces the explicit loop. regex=False keeps it a
# plain substring match, and na=False treats a missing col_name as "not maths"
# instead of raising TypeError the way `"mat" in name` does on NaN.
# .tolist() keeps math_filter a plain list of booleans, as before.
math_filter = (
    df_la_ud_meta['col_name']
    .str.contains("mat", regex=False, na=False)
    .tolist()
)

In [None]:
# Widen pandas' column display so the full text of the 'label' column is visible.
pd.set_option('display.max_colwidth', 100)
# List the maths-related columns and their descriptions to spot what might be interesting.
maths_metadata = df_la_ud_meta.loc[math_filter, ['col_name', 'label']]
maths_metadata

In [None]:
# Number of data rows present at each geographic level.
geographic_level_counts = df_la_ud['geographic_level'].value_counts()
geographic_level_counts

In [None]:
# Keep only the local-authority-level rows: rows at any other level
# have no entry in the 'new_la_code' column.
has_la_code = df_la_ud['new_la_code'].notna()
df_la_ud = df_la_ud[has_la_code]

# Confirm that only the expected geographic level(s) remain.
df_la_ud['geographic_level'].value_counts()

## Durham Data

#### I'm not doing anything other than looking at it and deciding what columns I'm going to use for the national data

In [None]:
# Carve out the rows belonging to the chosen LA (Durham),
# then see which breakdown categories those rows contain.
durham_filter = df_la_ud['new_la_code'].eq(durham_code)
durham_ud = df_la_ud[durham_filter]
durham_ud['breakdown'].value_counts()

In [None]:
# Row counts per free-school-meals category within the Durham rows.
durham_ud.loc[:, 'free_school_meals'].value_counts()

In [None]:
# Working subset chosen after exploring the data: the identifying/breakdown
# columns first, then the maths measures (the pt_* columns are the ones plotted
# as percentages further down).
columns = [
    'new_la_code', 'disadvantaged', 'breakdown', 'gender',
    't_mat_elig',
    't_mat_exp', 'pt_mat_exp',
    't_mat_high', 'pt_mat_high',
    't_mat_notachieved', 'pt_mat_notachieved',
]

In [None]:
# Durham rows restricted to the chosen maths columns.
durham_math_ud = durham_ud.loc[:, columns]

In [None]:
# Narrower selection: the same identifying columns, but only the t_* maths columns.
columns2 = [
    'new_la_code', 'disadvantaged', 'breakdown', 'gender',
    't_mat_elig', 't_mat_exp', 't_mat_high', 't_mat_notachieved',
]

In [None]:
# Extend columns2 with the maths score columns.
# 't_mat_elig' is already part of columns2, so concatenating it again would select
# the same column twice; dict.fromkeys drops repeats while keeping first-seen order.
columns3 = list(dict.fromkeys(columns2 + ['t_mat_elig','t_matscore_elig','t_matscore']))
print(columns3)

In [None]:
# Show the Durham rows for the 'disadvantaged' breakdown with the columns3 selection.
# Select from durham_ud (all columns): durham_math_ud was cut down to `columns`,
# which does not contain 't_matscore_elig' / 't_matscore', so indexing it with
# columns3 raises KeyError.
# NOTE(review): assumes both score columns exist in the 'la ud' worksheet — confirm.
durham_ud.loc[durham_ud['breakdown']=='disadvantaged',columns3]

## Maths data for England

In [None]:
# All local-authority rows, restricted to the chosen maths columns.
la_math_ud = df_la_ud.loc[:, columns]

In [None]:
# Keep the rows that split pupils into disadvantaged / not disadvantaged,
# using the all-pupil totals rather than the per-gender rows for now.
is_disadvantage_breakdown = la_math_ud['breakdown'] == 'disadvantaged'
is_gender_total = la_math_ud['gender'] == 'Total'
totals_per_la = la_math_ud[is_disadvantage_breakdown & is_gender_total]
# Check that the filter has worked: each category should have the same number of rows.
totals_per_la['disadvantaged'].value_counts()

In [None]:
# Order by LA code so that each authority's rows sit next to each other;
# this view should show two rows per Local Authority.
totals_per_la.sort_values('new_la_code')

In [None]:
# group the results by LA code and disadvantaged indicator and sum based on those groupings
# for the purpose of viewing the data here
# totals_per_la.groupby(['new_la_code','disadvantaged']).sum()

## Get the percentage totals for disadvantaged vs non-disadvantaged pupils

In [None]:
# Build the x/y values for the "expected standard" scatter plot, keyed by LA code:
# x is disadvantaged, y is not disadvantaged.
filter_disadvantaged=totals_per_la['disadvantaged']=='Disadvantaged'
filter_nondisadvantaged=totals_per_la['disadvantaged']=='DisadvantagedAllOther'

x_series_exp=totals_per_la.loc[filter_disadvantaged,'pt_mat_exp']
x_series_exp.index=totals_per_la.loc[filter_disadvantaged,'new_la_code'].values

# Select the y rows with the explicit non-disadvantaged filter rather than
# ~filter_disadvantaged, so a row with any other category value cannot end up in y.
y_series_exp=totals_per_la.loc[filter_nondisadvantaged,'pt_mat_exp']
y_series_exp.index=totals_per_la.loc[filter_nondisadvantaged,'new_la_code'].values

In [None]:
# Filter logic check: every row must match exactly one of the two filters,
# i.e. the two boolean masks must differ on every row.
assert (filter_disadvantaged != filter_nondisadvantaged).all()

In [None]:
# Order both series by LA code so that position i refers to the same LA in each.
x_series_exp, y_series_exp = x_series_exp.sort_index(), y_series_exp.sort_index()

In [None]:
## do the same for the higher standard ('pt_mat_high'), keyed by LA code
x_series_high=totals_per_la.loc[filter_disadvantaged,'pt_mat_high']
x_series_high.index = totals_per_la.loc[filter_disadvantaged,'new_la_code'].values

# Select the y rows with the explicit non-disadvantaged filter rather than
# ~filter_disadvantaged, so a row with any other category value cannot end up in y.
y_series_high=totals_per_la.loc[filter_nondisadvantaged,'pt_mat_high']
y_series_high.index = totals_per_la.loc[filter_nondisadvantaged,'new_la_code'].values

In [None]:
# Order both series by LA code so that position i refers to the same LA in each.
x_series_high, y_series_high = x_series_high.sort_index(), y_series_high.sort_index()

In [None]:
# Inspect the disadvantaged 'pt_mat_high' values, indexed and sorted by LA code.
x_series_high

In [None]:
# Durham's own points, as [higher standard, expected standard] for each axis:
# x holds the disadvantaged values, y the not-disadvantaged values.
x_series_durham = [series[durham_code] for series in (x_series_high, x_series_exp)]
y_series_durham = [series[durham_code] for series in (y_series_high, y_series_exp)]

In [None]:
# Show Durham's points: ([high, expected] disadvantaged, [high, expected] not disadvantaged).
x_series_durham,y_series_durham

### find the LAs below the line on the graph

In [None]:
# Boolean mask over LA codes: True where the not-disadvantaged percentage is no
# higher than the disadvantaged one, i.e. the point lies on or below the y = x line.
# Note that ties (points exactly on the line) are included.
below_line_points = y_series_exp <= x_series_exp

In [None]:
# Show the (disadvantaged, not disadvantaged) percentages of the flagged LAs.
below_line_x = x_series_exp[below_line_points].values
below_line_y = y_series_exp[below_line_points].values
(below_line_x, below_line_y)

## Plot scatter graph of disadvantaged vs non-disadvantaged


In [None]:
import matplotlib.pyplot as plt

In [None]:
# Pick the seaborn look under whichever name this matplotlib knows: the bare
# 'seaborn' style name was deprecated in matplotlib 3.6 in favour of
# 'seaborn-v0_8' and is rejected by newer releases.
seaborn_style = 'seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8'
# Apply the style BEFORE creating the figure. Setting it after the first subplot
# exists leaves that panel in the default style on a fresh kernel while the
# second panel gets the seaborn style.
plt.style.use(seaborn_style)

fig, (ax_exp, ax_high) = plt.subplots(1, 2, figsize=(10, 4))

# points for the y = x line, where disadvantaged == non-disadvantaged
parity = np.arange(0, 101)

# ---- 1st plot: expected standard ----
ax_exp.set_xlim(left=40, right=101)
ax_exp.set_ylim(bottom=40, top=101)
ax_exp.set_xlabel('disadvantaged, %')
ax_exp.set_ylabel('not disadvantaged, %')
ax_exp.set_title('% pupils reaching expected standard by LA, maths', size=14)

# plot a line to show disadvantaged == non-disadvantaged
ax_exp.plot(parity, parity, c='g')

# plot all LAs
ax_exp.scatter(x_series_exp, y_series_exp, c='r', marker='+')

# Highlight the LAs where pt_disadvantaged >= pt_not_disadvantaged.
# The higher-standard series is deliberately NOT drawn on this panel: it used the
# same yellow '+' marker as this highlight, which made the two indistinguishable,
# and it has its own panel on the right.
ax_exp.scatter(x_series_exp[below_line_points], y_series_exp[below_line_points], c='y', marker='+')

# plot durham (both of its points are drawn; the axis limits decide which are visible)
ax_exp.scatter(x_series_durham, y_series_durham, c='b')

# ---- 2nd plot: higher standard ----
ax_high.set_xlim(left=0, right=40)
ax_high.set_ylim(bottom=0, top=70)
ax_high.set_xlabel('disadvantaged, %')
ax_high.set_ylabel('not disadvantaged, %')
ax_high.set_title('% pupils reaching higher standard by LA, maths', size=14)

# plot a line to show disadvantaged == non-disadvantaged
ax_high.plot(parity, parity, c='g')

# plot all LAs
ax_high.scatter(x_series_high, y_series_high, c='r', marker='+')

# plot durham
ax_high.scatter(x_series_durham, y_series_durham, c='b')

# TODO: plot school (not implemented yet)

plt.show()


In [None]:
# Look over the per-LA totals again before reshaping them for the bar charts.
totals_per_la

In [None]:
# Build one row per LA holding the three maths percentages for disadvantaged
# pupils followed by the same three for not-disadvantaged pupils.
columns_bar_disad = ['pt_mat_notachieved','pt_mat_exp','pt_mat_high']
column_names_disad=['NA_disad','EXP_disad','HIGH_disad']
# The not-disadvantaged measures are taken in reverse order, so their labels must
# be reversed too. Labelling the reversed data with the forward-ordered names
# would put the 'pt_mat_high' values under 'NA_not' and vice versa.
columns_bar_not = columns_bar_disad[::-1]
column_names_not = ['NA_not','EXP_not','HIGH_not'][::-1]

is_disadvantaged = totals_per_la['disadvantaged']=='Disadvantaged'
is_not_disadvantaged = totals_per_la['disadvantaged']=='DisadvantagedAllOther'

# disadvantaged data, indexed and sorted by LA code
df_bar = (
    totals_per_la.loc[is_disadvantaged, ['new_la_code'] + columns_bar_disad]
    .set_index('new_la_code')
    .sort_index()
)
df_bar.columns = column_names_disad

# not-disadvantaged data, indexed and sorted the same way
temp_df = (
    totals_per_la.loc[is_not_disadvantaged, ['new_la_code'] + columns_bar_not]
    .set_index('new_la_code')
    .sort_index()
)
temp_df.columns = column_names_not

# Join on the LA-code index so the '_not' labels are kept. Assigning temp_df to
# df_bar[columns_bar_not] copied the values positionally under the raw pt_mat_*
# names and discarded the intended labels.
df_bar = df_bar.join(temp_df)

In [None]:
# Inspect the combined per-LA frame that feeds the bar charts.
df_bar

In [None]:
# First 20 LAs, first three columns (the disadvantaged measures).
df_bar.iloc[:20, :3]

In [None]:
# Stacked horizontal bars for the first 20 LAs: the first three columns of df_bar.
df_bar.iloc[:20, :3].plot(kind='barh', stacked=True, alpha=0.5)

In [None]:
# Stacked horizontal bars for the first 20 LAs: the columns of df_bar from the fourth onwards.
df_bar.iloc[:20, 3:].plot(kind='barh', stacked=True, alpha=0.5)

In [None]:
# Pick the seaborn look under whichever name this matplotlib knows: the bare
# 'seaborn' style name was deprecated in matplotlib 3.6 in favour of
# 'seaborn-v0_8' and is rejected by newer releases.
seaborn_style = 'seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8'
# Apply the style before the axes are created so that it takes effect on them.
plt.style.use(seaborn_style)

# One explicit figure/axes pair instead of the implicit pyplot "current axes".
fig, ax = plt.subplots()
ax.set_xlim(left=0, right=101)
ax.set_ylim(bottom=0, top=101)
ax.set_xlabel('disadvantaged, %')
ax.set_ylabel('not disadvantaged, %')
ax.set_title('% pupils reaching each level, by LA, maths', size=14)

# plot a line to show disadvantaged == non-disadvantaged
parity = np.arange(0, 101)
ax.plot(parity, parity, c='g')

# plot all LAs: higher standard in red, expected standard in yellow
ax.scatter(x_series_high, y_series_high, c='r')
ax.scatter(x_series_exp, y_series_exp, c='y')

# plot durham
ax.scatter(x_series_durham, y_series_durham, c='b')

plt.show()

# could draw lines showing movement of each LA from last year