In [2]:
import pandas as pd
import seaborn as sns
import seaborn.objects as so
import matplotlib.pyplot as plt

# Relative location of the provisional 2022/23 national GCSE results CSV.
GCSE_national_data_path = "Data/2223_national_data_provisional.csv"

# Load the full file into a DataFrame using pandas' default parsing options.
GCSE_national_data = pd.read_csv(filepath_or_buffer=GCSE_national_data_path)

# Preview the first five rows to sanity-check the columns that were read.
GCSE_national_data.head(n=5)

Unnamed: 0,time_period,time_identifier,geographic_level,country_code,country_name,version,establishment_type,breakdown,gender,prior_attainment,...,t_ent_hist_geog,pt_ent_hist_geog,t_multilan_e,pt_multilan_e,t_entered_art,pt_entered_art,t_entered_music_qual,pt_entered_music_qual,t_entered_arts_qual,pt_entered_arts_qual
0,202223,Academic year,National,E92000001,England,Provisional,Academies and free schools,Prior attainment,Boys,High prior,...,7231,14.0,2620,5.1,13198,25.5,4669,9.0,17316,33.4
1,202223,Academic year,National,E92000001,England,Provisional,Academies and free schools,Prior attainment,Boys,Low prior,...,3621,7.0,495,1.0,17413,33.7,3226,6.2,24433,47.3
2,202223,Academic year,National,E92000001,England,Provisional,Academies and free schools,Prior attainment,Boys,Mid prior,...,12610,10.5,2070,1.7,33929,28.2,8205,6.8,48951,40.6
3,202223,Academic year,National,E92000001,England,Provisional,Academies and free schools,Total,Boys,Total,...,24890,10.4,6435,2.7,69226,29.0,17217,7.2,96840,40.5
4,202223,Academic year,National,E92000001,England,Provisional,Academies and free schools,Prior attainment,Girls,High prior,...,5250,9.6,3252,6.0,27569,50.5,4851,8.9,30687,56.2


In [3]:
# Second analysis: subjects entered at GCSE, broken down by gender.
# The national file also carries other breakdowns (the preview above shows
# prior attainment, for example), so keep only the 2022/23 "All schools" rows
# whose breakdown is "Total" -- the per-gender headline figures.

is_2022_23 = GCSE_national_data["time_period"] == 202223
is_all_schools = GCSE_national_data["establishment_type"] == "All schools"
is_total_breakdown = GCSE_national_data["breakdown"] == "Total"

subject_data_df = GCSE_national_data[is_2022_23 & is_all_schools & is_total_breakdown]

# Raw column name -> readable name. Insertion order also fixes the column
# order of the reduced frame (after the leading "gender" column).
# NOTE(review): the two avg_* targets end with a ':' -- this looks like a typo,
# but it is kept byte-for-byte because the outputs shown below use these labels.
column_renames = {
    "t_schools": "total_num_schools",
    "t_pupils": "total_num_pupils",
    "t_entbasics": "tot_num_entering_english_and_maths_GCSEs",
    "pt_entbasics": "perc_pupils_entering_english_and_maths_GCSEs",
    "t_triplesci_e": "tot_num_entering_triple_science",
    "pt_triplesci_e": "perc_pupils_entering_triple_science",
    "t_eballsci_ptq_ee": "tot_num_entering_one_of_phys_chem_bio_compsci",
    "pt_eballsci_ptq_ee": "perc_pupils_entering_one_of_phys_chem_bio_compsci",
    "t_ent_comb_sci": "tot_num_entering_combined_science",
    "pt_ent_comb_sci": "perc_pupils_entering_combined_science",
    "t_ent_hist_geog": "tot_num_entering_hist_and_geog",
    "pt_ent_hist_geog": "perc_pupils_entering_hist_and_geog",
    "t_entered_art": "tot_num_entering_any_arts_subj",
    "pt_entered_art": "perc_pupils_entering_any_arts_subj",
    "t_entered_music_qual": "tot_num_entering_music_qualification",
    "pt_entered_music_qual": "perc_pupils_entering_music_qualification",
    "t_multilan_e": "tot_num_entering_more_than_one_lang",
    "pt_multilan_e": "perc_pupils_entering_more_than_one_lang",
    "avg_att8": "avg_attainment8_score:",
    "avg_p8score": "avg_progress8_score:",
}

selected_columns = ["gender", *column_renames]

# Select -> drop rows with any missing value -> rename -> renumber rows from 0.
subject_data_reduced_df = (
    subject_data_df[selected_columns]
    .dropna(how="any")
    .rename(columns=column_renames)
    .reset_index(drop=True)
)

subject_data_reduced_df.head()


Unnamed: 0,gender,total_num_schools,total_num_pupils,tot_num_entering_english_and_maths_GCSEs,perc_pupils_entering_english_and_maths_GCSEs,tot_num_entering_triple_science,perc_pupils_entering_triple_science,tot_num_entering_one_of_phys_chem_bio_compsci,perc_pupils_entering_one_of_phys_chem_bio_compsci,tot_num_entering_combined_science,...,tot_num_entering_hist_and_geog,perc_pupils_entering_hist_and_geog,tot_num_entering_any_arts_subj,perc_pupils_entering_any_arts_subj,tot_num_entering_music_qualification,perc_pupils_entering_music_qualification,tot_num_entering_more_than_one_lang,perc_pupils_entering_more_than_one_lang,avg_attainment8_score:,avg_progress8_score:
0,Boys,5258,345201,306059,88.7,85025,24.6,85452,24.8,219593,...,32556,9.4,97520,28.3,24138,7.0,10205,3.0,42.1,-0.21
1,Girls,5213,326255,296793,91.0,81828,25.1,81943,25.1,215215,...,20294,6.2,160144,49.1,21866,6.7,13723,4.2,47.2,0.1
2,Total,5717,671456,602852,89.8,166853,24.8,167395,24.9,434808,...,52850,7.9,257664,38.4,46004,6.9,23928,3.6,44.6,-0.06


In [12]:
# Show the column labels of the reduced frame (the renamed, readable names)
# so they can be copied exactly when referenced in later cells.
subject_data_reduced_df.columns

Index(['gender', 'total_num_schools', 'total_num_pupils',
       'tot_num_entering_english_and_maths_GCSEs',
       'perc_pupils_entering_english_and_maths_GCSEs',
       'tot_num_entering_triple_science',
       'perc_pupils_entering_triple_science',
       'tot_num_entering_one_of_phys_chem_bio_compsci',
       'perc_pupils_entering_one_of_phys_chem_bio_compsci',
       'tot_num_entering_combined_science',
       'perc_pupils_entering_combined_science',
       'tot_num_entering_hist_and_geog', 'perc_pupils_entering_hist_and_geog',
       'tot_num_entering_any_arts_subj', 'perc_pupils_entering_any_arts_subj',
       'tot_num_entering_music_qualification',
       'perc_pupils_entering_music_qualification',
       'tot_num_entering_more_than_one_lang',
       'perc_pupils_entering_more_than_one_lang', 'avg_attainment8_score:',
       'avg_progress8_score:'],
      dtype='object')

In [None]:
To analyze the gender differences between different GCSE subjects, you can utilize Python to conduct exploratory data analysis (EDA) and statistical tests. Here’s a step-by-step guide:

Steps:

1. Data Collection:
   - Obtain a dataset containing GCSE results, including information about subjects and gender.
2. Data Preprocessing:
   - Load the data into Python using pandas.
   - Clean the data, removing any irrelevant columns or rows.
3. Exploratory Data Analysis (EDA):
   - Group the data by subject and gender.
   - Visualize the distribution of grades for each subject by gender.
   - Compute summary statistics and create visualizations (like boxplots or histograms) to compare the performance of different genders in various subjects.
4. Statistical Testing:
   - Perform statistical tests (such as t-tests or ANOVA) to determine if there are significant differences in grades between genders for specific subjects.

In [None]:
# Visualize distribution of grades by subject and gender.
# One group of boxes per subject on the x-axis, split by gender via `hue`.
# The x column was previously the empty string '', which seaborn cannot resolve
# to a column; 'subject' matches both the plot title and the groupby below.
# NOTE(review): `gcse_data_cleaned` is not created in any cell shown above. This
# cell assumes a long-format frame with 'subject', 'gender' and 'grade' columns
# -- TODO confirm, or build that frame before running this cell.
sns.boxplot(x='subject', y='grade', hue='gender', data=gcse_data_cleaned)
plt.title('Distribution of Grades by Subject and Gender')
plt.xticks(rotation=45)  # tilt the subject labels on the x-axis
plt.show()

# Compute summary statistics for each subject by gender:
# `describe()` is applied to 'grade' within every (subject, gender) group.
summary_stats = gcse_data_cleaned.groupby(['subject', 'gender'])['grade'].describe()
print(summary_stats)
