In [1]:
import pandas as pd
import numpy as np
import python_functions as pf

In [2]:
PISA_m = pd.read_excel('data/2000-2022_m_r.xls', sheet_name='mathematics', header=11)
PISA_r = pd.read_excel('data/2000-2022_m_r.xls', sheet_name='reading', header=11)
PISA_sc = pd.read_excel('data/2000-2022_sc.xls', sheet_name='science', header=11)

In [3]:
PISA_m = PISA_m.drop('Unnamed: 0', axis=1)
PISA_r = PISA_r.drop('Unnamed: 0', axis=1)
PISA_sc = PISA_sc.drop('Unnamed: 0', axis=1)

---

Put column names in lower case, snake case and rename:

In [4]:
dataframes = [PISA_m, PISA_r, PISA_sc]

pf.columns_lower_snake_case(dataframes)

Index(['year', 'jurisdiction', 'average', 'standard_error'], dtype='object')
------------------------------
Index(['year', 'jurisdiction', 'average', 'standard_error'], dtype='object')
------------------------------
Index(['year', 'jurisdiction', 'average', 'standard_error'], dtype='object')
------------------------------


[                                                  year  \
 0                                                 2022   
 1                                                 2022   
 2                                                 2022   
 3                                                 2022   
 4                                                 2022   
 ..                                                 ...   
 785                     ‡ Reporting standards not met.   
 786                                   — Not available.   
 787                                  † Not applicable.   
 788  NOTE: B-S-J-G (China) refers to the four PISA ...   
 789  SOURCE: Organization for Economic Cooperation ...   
 
                              jurisdiction     average standard_error  
 0    Selected countries and jurisdictions  437.628559       0.266429  
 1            International Average (OECD)  472.358125       0.397751  
 2                               Australia  487.084254       1.779614  
 3

In [5]:
PISA_m.sample(5)

Unnamed: 0,year,jurisdiction,average,standard_error
108,2018,Denmark,509.398375,1.735002
518,2006,New Zealand,521.988849,2.389559
466,2009,Mongolia,—,†
384,2012,Trinidad and Tobago,—,†
377,2012,Qatar,376.448398,0.755736


In [6]:
PISA_r.sample(5)

Unnamed: 0,year,jurisdiction,average,standard_error
312,2012,Ireland,523.17321,2.550366
88,2022,Singapore,542.553322,1.872188
521,2006,Portugal,472.304305,3.560561
395,2009,Austria,470.283632,2.948082
20,2022,Italy,481.598278,2.680228


In [7]:
PISA_sc.sample(5)

Unnamed: 0,year,jurisdiction,average,standard_error
749,2000,Jamaica,—,†
757,2000,Malaysia (2015),—,†
200,2015,Belgium,501.999714,2.289589
276,2015,Paraguay,—,†
537,2006,Bosnia and Herzegovina,—,†


---

Create new DataFrames, for each subject and year one DataFrame:

In [8]:
years = ['2000', '2003', '2006', '2009', '2012', '2015', '2018', '2022']

# Create a new list to safe the new DataFrames
new_dataframes = []

# Iterate over every year
for year in years:
    # Filter the row for every DataFrame for each year
    PISA_m_year = PISA_m[PISA_m['year'] == year].copy()
    PISA_r_year = PISA_r[PISA_r['year'] == year].copy()
    PISA_sc_year = PISA_sc[PISA_sc['year'] == year].copy()
    
    # Add the new DataFrames to the list
    new_dataframes.extend([PISA_m_year, PISA_r_year, PISA_sc_year])

# Save the newly created DataFrames in new Variables
PISA_m_2000, PISA_r_2000, PISA_sc_2000 = new_dataframes[:3]
PISA_m_2003, PISA_r_2003, PISA_sc_2003 = new_dataframes[3:6]
PISA_m_2006, PISA_r_2006, PISA_sc_2006 = new_dataframes[6:9]
PISA_m_2009, PISA_r_2009, PISA_sc_2009 = new_dataframes[9:12]
PISA_m_2012, PISA_r_2012, PISA_sc_2012 = new_dataframes[12:15]
PISA_m_2015, PISA_r_2015, PISA_sc_2015 = new_dataframes[15:18]
PISA_m_2018, PISA_r_2018, PISA_sc_2018 = new_dataframes[18:21]
PISA_m_2022, PISA_r_2022, PISA_sc_2022 = new_dataframes[21:]

In [9]:
dataframes = [PISA_m_2000, PISA_r_2000, PISA_sc_2000,
              PISA_m_2003, PISA_r_2003, PISA_sc_2003,
              PISA_m_2006, PISA_r_2006, PISA_sc_2006,
              PISA_m_2009, PISA_r_2009, PISA_sc_2009,
              PISA_m_2012, PISA_r_2012, PISA_sc_2012,
              PISA_m_2015, PISA_r_2015, PISA_sc_2015,
              PISA_m_2018, PISA_r_2018, PISA_sc_2018,
              PISA_m_2022, PISA_r_2022, PISA_sc_2022]

for df in dataframes:
    df.reset_index(inplace=True, drop=True)

Check that only one year is saved in each DataFrame:

In [11]:
print(PISA_m_2000['year'].unique(), 
      PISA_m_2003['year'].unique(), 
      PISA_m_2006['year'].unique(), 
      PISA_m_2009['year'].unique(), 
      PISA_m_2012['year'].unique(), 
      PISA_m_2015['year'].unique(), 
      PISA_m_2018['year'].unique(), 
      PISA_m_2022['year'].unique())

['2000'] ['2003'] ['2006'] ['2009'] ['2012'] ['2015'] ['2018'] ['2022']


In [12]:
print(PISA_r_2000['year'].unique(),
    PISA_r_2003['year'].unique(),
    PISA_r_2006['year'].unique(),
    PISA_r_2009['year'].unique(),
    PISA_r_2012['year'].unique(),
    PISA_r_2015['year'].unique(),
    PISA_r_2018['year'].unique(),
    PISA_r_2022['year'].unique())

['2000'] ['2003'] ['2006'] ['2009'] ['2012'] ['2015'] ['2018'] ['2022']


In [13]:
print(PISA_sc_2000['year'].unique(),
    PISA_sc_2003['year'].unique(),
    PISA_sc_2006['year'].unique(),
    PISA_sc_2009['year'].unique(),
    PISA_sc_2012['year'].unique(),
    PISA_sc_2015['year'].unique(),
    PISA_sc_2018['year'].unique(),
    PISA_sc_2022['year'].unique())

['2000'] ['2003'] ['2006'] ['2009'] ['2012'] ['2015'] ['2018'] ['2022']


---

Change the types for two columns and round the numbers:

In [14]:
dataframes = [PISA_m_2000, PISA_r_2000, PISA_sc_2000,
              PISA_m_2003, PISA_r_2003, PISA_sc_2003,
              PISA_m_2006, PISA_r_2006, PISA_sc_2006,
              PISA_m_2009, PISA_r_2009, PISA_sc_2009,
              PISA_m_2012, PISA_r_2012, PISA_sc_2012,
              PISA_m_2015, PISA_r_2015, PISA_sc_2015,
              PISA_m_2018, PISA_r_2018, PISA_sc_2018,
              PISA_m_2022, PISA_r_2022, PISA_sc_2022]

for df in dataframes:
    df['average'] = df['average'].apply(pd.to_numeric, errors='coerce')
    df['average'] = df['average'].round(2)

    df['standard_error'] = df['standard_error'].apply(pd.to_numeric, errors='coerce')
    df['standard_error'] = df['standard_error'].round(2)

In [15]:
PISA_m_2006.sample(5)

Unnamed: 0,year,jurisdiction,average,standard_error
21,2006,Japan,523.1,3.34
26,2006,Mexico,405.65,2.93
11,2006,Estonia,514.58,2.75
66,2006,Kazakhstan (2015),,
52,2006,Bulgaria,413.45,6.13


---