In [None]:
import pandas as pd
import numpy as np
import python_functions_sp as pf
import warnings
warnings.filterwarnings("ignore")
import sqlalchemy
import psycopg2
import sql_functions_sp as sf

# Import

In [None]:
# Mathematics and reading live in the same workbook, so both sheets are read
# in a single call; passing a list to sheet_name returns a dict keyed by sheet
# name. header=11 makes the row at 0-based index 11 the column header.
math_reading_sheets = pd.read_excel(
    'data/2000-2022_m_r.xls',
    sheet_name=['mathematics', 'reading'],
    header=11,
)
PISA_m = math_reading_sheets['mathematics']
PISA_r = math_reading_sheets['reading']

# Science is stored in its own workbook with the same header offset.
PISA_sc = pd.read_excel(
    'data/2000-2022_sc.xls',
    sheet_name='science',
    header=11,
)

In [None]:
PISA_m.head()

In [None]:
PISA_r.head()

In [None]:
PISA_sc.head()

In [None]:
# Remove the 'Unnamed: 0' column from every subject table.
# NOTE(review): presumably an empty leading spreadsheet column that pandas
# auto-named - confirm against the workbook.
# errors='ignore' keeps this cell re-runnable: on a second run the column is
# already gone and a plain drop() would raise KeyError.
PISA_m = PISA_m.drop(columns='Unnamed: 0', errors='ignore')
PISA_r = PISA_r.drop(columns='Unnamed: 0', errors='ignore')
PISA_sc = PISA_sc.drop(columns='Unnamed: 0', errors='ignore')

---

# Put column names in lower case, snake case and rename

In [None]:
# Collect the three subject frames so the helper can process them in one call.
dataframes = [PISA_m, PISA_r, PISA_sc]

# NOTE(review): judging by its name and by the later use of columns such as
# 'average', 'standard_error', 'year' and 'jurisdiction', this helper
# presumably renames the columns of each frame in place - confirm in
# python_functions_sp. Being the last expression of the cell, any value it
# returns is displayed.
pf.columns_lower_snake_case(dataframes)

In [None]:
PISA_m.sample(5)

In [None]:
PISA_r.sample(5)

In [None]:
PISA_sc.sample(5)

## Change the types for two columns and round the numbers

In [None]:
dataframes = [PISA_m, PISA_r, PISA_sc]

# Convert both score columns to numbers and round them to two decimals.
# pd.to_numeric is applied to the whole Series at once (vectorised) instead of
# once per element through Series.apply; errors='coerce' turns every value
# that cannot be parsed as a number into NaN.
for df in dataframes:
    for column in ('average', 'standard_error'):
        df[column] = pd.to_numeric(df[column], errors='coerce').round(2)

In [None]:
PISA_m.sample(5)

In [None]:
PISA_r.sample(5)

In [None]:
PISA_sc.sample(5)

---

# Exploratory Data Analysis - EDA
- understand the data
- drop unneeded columns
- duplicates
- missing values
- descriptive statistics
- extreme values / outliers

In [None]:
PISA_m.info()

In [None]:
PISA_r.info()

In [None]:
PISA_sc.info()

## Check for duplicates

In [None]:
PISA_m.duplicated().value_counts()

In [None]:
PISA_r.duplicated().value_counts()

In [None]:
PISA_sc.duplicated().value_counts()

## Check for missing values

In [None]:
PISA_m.isnull().sum()

In [None]:
PISA_r.isnull().sum()

In [None]:
PISA_sc.isnull().sum()

In [None]:
# Keep everything except the first two and the last six rows of each table.
# NOTE(review): presumably these are non-data rows of the spreadsheet (spacer
# rows on top, footnotes at the bottom) - confirm against the workbooks.
# NOTE: this cell is not idempotent; running it again trims eight more rows,
# so re-run the notebook from the loading cell instead.
PISA_m = PISA_m.iloc[2:-6]
PISA_r = PISA_r.iloc[2:-6]
PISA_sc = PISA_sc.iloc[2:-6]

In [None]:
PISA_m

In [None]:
PISA_r

In [None]:
PISA_sc

---

## Change format of year column to integer

In [None]:
PISA_r.info()

In [None]:
# Apply the project helper to each subject table.
# NOTE(review): the results are not assigned, so the helper presumably
# converts the 'year' column to integers in place (the later filter compares
# 'year' against integer years) - confirm in python_functions_sp.
pf.year_to_int(PISA_m)
pf.year_to_int(PISA_r)
pf.year_to_int(PISA_sc)

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appended a stray "None"
# line after every report.
PISA_m.info()
PISA_r.info()
PISA_sc.info()

---

## Do some descriptive statistics

In [None]:
PISA_m.describe()

In [None]:
PISA_r.describe()

In [None]:
PISA_sc.describe()

---

# Create new DataFrames: one DataFrame per subject and year

In [None]:
# Years to split each subject table by.
years = [2000, 2003, 2006, 2009, 2012, 2015, 2018, 2022]

# Subject order matters: it fixes the position of every frame in the list below.
subject_frames = (PISA_m, PISA_r, PISA_sc)

# Flat list of independent copies, ordered year by year and, within each year,
# mathematics -> reading -> science.
new_dataframes = [
    frame[frame['year'] == year].copy()
    for year in years
    for frame in subject_frames
]

# Bind every copy to its own variable; the targets mirror the list order above.
(
    PISA_m_2000, PISA_r_2000, PISA_sc_2000,
    PISA_m_2003, PISA_r_2003, PISA_sc_2003,
    PISA_m_2006, PISA_r_2006, PISA_sc_2006,
    PISA_m_2009, PISA_r_2009, PISA_sc_2009,
    PISA_m_2012, PISA_r_2012, PISA_sc_2012,
    PISA_m_2015, PISA_r_2015, PISA_sc_2015,
    PISA_m_2018, PISA_r_2018, PISA_sc_2018,
    PISA_m_2022, PISA_r_2022, PISA_sc_2022,
) = new_dataframes

In [None]:
# Every per-year frame, one row of the listing per year.
dataframes = [
    PISA_m_2000, PISA_r_2000, PISA_sc_2000,
    PISA_m_2003, PISA_r_2003, PISA_sc_2003,
    PISA_m_2006, PISA_r_2006, PISA_sc_2006,
    PISA_m_2009, PISA_r_2009, PISA_sc_2009,
    PISA_m_2012, PISA_r_2012, PISA_sc_2012,
    PISA_m_2015, PISA_r_2015, PISA_sc_2015,
    PISA_m_2018, PISA_r_2018, PISA_sc_2018,
    PISA_m_2022, PISA_r_2022, PISA_sc_2022,
]

# Renumber the rows from 0 in place (the variables above must keep pointing at
# the same objects); drop=True discards the old index instead of keeping it as
# a column.
for year_frame in dataframes:
    year_frame.reset_index(drop=True, inplace=True)

In [None]:
PISA_m_2000

## Check that only one year is saved in each DataFrame

In [None]:
# Print the distinct 'year' values of every mathematics frame on one line;
# each array should contain only the year in the variable's name.
mathematics_by_year = (
    PISA_m_2000, PISA_m_2003, PISA_m_2006, PISA_m_2009,
    PISA_m_2012, PISA_m_2015, PISA_m_2018, PISA_m_2022,
)
print(*(frame['year'].unique() for frame in mathematics_by_year))

In [None]:
# Print the distinct 'year' values of every reading frame on one line;
# each array should contain only the year in the variable's name.
reading_by_year = (
    PISA_r_2000, PISA_r_2003, PISA_r_2006, PISA_r_2009,
    PISA_r_2012, PISA_r_2015, PISA_r_2018, PISA_r_2022,
)
print(*(frame['year'].unique() for frame in reading_by_year))

In [None]:
# Print the distinct 'year' values of every science frame on one line;
# each array should contain only the year in the variable's name.
science_by_year = (
    PISA_sc_2000, PISA_sc_2003, PISA_sc_2006, PISA_sc_2009,
    PISA_sc_2012, PISA_sc_2015, PISA_sc_2018, PISA_sc_2022,
)
print(*(frame['year'].unique() for frame in science_by_year))

---

# Save the DataFrames (pisa_x)
## Export tables to database

In [None]:
# NOTE(review): the return value is not assigned, so this cell only displays
# whatever the helper returns - presumably a sanity check that the database
# engine can be created; confirm in sql_functions_sp.
sf.get_engine()

In [None]:
# dataframes = [PISA_m_2000, PISA_r_2000, PISA_sc_2000,
#               PISA_m_2003, PISA_r_2003, PISA_sc_2003,
#               PISA_m_2006, PISA_r_2006, PISA_sc_2006,
#               PISA_m_2009, PISA_r_2009, PISA_sc_2009,
#               PISA_m_2012, PISA_r_2012, PISA_sc_2012,
#               PISA_m_2015, PISA_r_2015, PISA_sc_2015,
#               PISA_m_2018, PISA_r_2018, PISA_sc_2018,
#               PISA_m_2022, PISA_r_2022, PISA_sc_2022]

# table_names = ['pisa_m_2000', 'pisa_r_2000', 'pisa_sc_2000',
#               'pisa_m_2003', 'pisa_r_2003', 'pisa_sc_2003',
#               'pisa_m_2006', 'pisa_r_2006', 'pisa_sc_2006',
#               'pisa_m_2009', 'pisa_r_2009', 'pisa_sc_2009',
#               'pisa_m_2012', 'pisa_r_2012', 'pisa_sc_2012',
#               'pisa_m_2015', 'pisa_r_2015', 'pisa_sc_2015',
#               'pisa_m_2018', 'pisa_r_2018', 'pisa_sc_2018',
#               'pisa_m_2022', 'pisa_r_2022', 'pisa_sc_2022']

# for df, table_name in zip(dataframes, table_names):
#     sf.push_to_cloud(df, table_name)

One can also use a dictionary to name the DataFrames - but pay attention: the DataFrame has to be the value and can't be the key, because DataFrames are not hashable!

In [None]:
# Toy mapping of table name -> frame; plain strings stand in for DataFrames.
test = dict(name1='df1', name2='df2', name3='df3')

# Walk the mapping and derive a '<name>_pivot' label for every entry.
for table_name, df in test.items():
    testi = table_name + '_pivot'
    print(df, testi)

In [None]:
# sf.push_to_cloud(PISA_m, 'pisa_m')
# sf.push_to_cloud(PISA_r, 'pisa_r')
# sf.push_to_cloud(PISA_sc, 'pisa_sc')

## Store DataFrames

In [None]:
pisa_m = PISA_m
pisa_r = PISA_r
pisa_sc = PISA_sc

%store pisa_m
%store pisa_r
%store pisa_sc

---

-- Extra Credit: Change the types for two columns and round the numbers (redundant - we already did this at the beginning)

In [None]:
# dataframes = [PISA_m_2000, PISA_r_2000, PISA_sc_2000,
#               PISA_m_2003, PISA_r_2003, PISA_sc_2003,
#               PISA_m_2006, PISA_r_2006, PISA_sc_2006,
#               PISA_m_2009, PISA_r_2009, PISA_sc_2009,
#               PISA_m_2012, PISA_r_2012, PISA_sc_2012,
#               PISA_m_2015, PISA_r_2015, PISA_sc_2015,
#               PISA_m_2018, PISA_r_2018, PISA_sc_2018,
#               PISA_m_2022, PISA_r_2022, PISA_sc_2022]

# for df in dataframes:
#     df['average'] = df['average'].apply(pd.to_numeric, errors='coerce')
#     df['average'] = df['average'].round(2)

#     df['standard_error'] = df['standard_error'].apply(pd.to_numeric, errors='coerce')
#     df['standard_error'] = df['standard_error'].round(2)

Two other ways to do it:

In [None]:
# dataframes = [PISA_m_2000, PISA_r_2000, PISA_sc_2000,
#               PISA_m_2003, PISA_r_2003, PISA_sc_2003,
#               PISA_m_2006, PISA_r_2006, PISA_sc_2006,
#               PISA_m_2009, PISA_r_2009, PISA_sc_2009,
#               PISA_m_2012, PISA_r_2012, PISA_sc_2012,
#               PISA_m_2015, PISA_r_2015, PISA_sc_2015,
#               PISA_m_2018, PISA_r_2018, PISA_sc_2018,
#               PISA_m_2022, PISA_r_2022, PISA_sc_2022]

# for df in dataframes:
#     df['average'] = df['average'].astype(str).replace({'—': np.nan, '‡': np.nan, '†': np.nan})
#     df['average'] = df['average'].astype(float)
#     df['average'] = df['average'].round(2)

#     df['standard_error'] = df['standard_error'].astype(str).replace({'—': np.nan, '‡': np.nan, '†': np.nan})
#     df['standard_error'] = df['standard_error'].astype(float)
#     df['standard_error'] = df['standard_error'].round(2)

In [None]:
# df['average'] = df['average'].astype('str')
# df['average'] = df['average'].str.replace('-', '0')
# df['average'] = df['average'].astype

---
---

# Playground

In [None]:
PISA_m['year'].unique()

In [None]:
PISA_m['jurisdiction'].unique()

---
---

In [None]:
import python_functions_sp as pfsp

# Splitting the countries into the continents
- northern america
- southern america
- europe (4 parts for further analysis)
- africa
- asia
- pacific
- middle-east

Creating a copy with a meaningful name:

In [None]:
# Work on independent copies so the continent columns added below do not
# touch the original subject tables.
PISA_m_continents, PISA_r_continents, PISA_sc_continents = (
    subject_frame.copy() for subject_frame in (PISA_m, PISA_r, PISA_sc)
)

In [None]:
# Rename 'jurisdiction' to 'country' in place in each of the three copies.
for continents_frame in (PISA_m_continents, PISA_r_continents, PISA_sc_continents):
    continents_frame.rename(columns={'jurisdiction': 'country'}, inplace=True)

Check if it worked:

In [None]:
PISA_m_continents.head()

In [None]:
PISA_r_continents.head()

In [None]:
PISA_sc_continents.head()

Assigning the continent to the countries in a newly created column:

In [None]:
# Add a 'continent' column to every table by passing each country name
# through the project helper pfsp.assign_continent.
for continents_frame in (PISA_m_continents, PISA_r_continents, PISA_sc_continents):
    continents_frame['continent'] = continents_frame['country'].apply(pfsp.assign_continent)

Assigning the four parts of Europe to the countries in another newly created column:

In [None]:
# Add a 'continent_region' column to every table by passing each country name
# through the project helper pfsp.assign_europe_region.
for continents_frame in (PISA_m_continents, PISA_r_continents, PISA_sc_continents):
    continents_frame['continent_region'] = continents_frame['country'].apply(pfsp.assign_europe_region)

Changing the order of the columns:

In [None]:
# Target column order: location columns first, then year and the two scores.
columns = ['country', 'continent', 'continent_region', 'year', 'average', 'standard_error']

# Re-select the columns in that order; a missing column raises KeyError.
PISA_m_continents, PISA_r_continents, PISA_sc_continents = (
    continents_frame.loc[:, columns]
    for continents_frame in (PISA_m_continents, PISA_r_continents, PISA_sc_continents)
)

Checking the result:

In [None]:
PISA_m_continents.sample(10)

In [None]:
PISA_r_continents.sample(10)

In [None]:
PISA_sc_continents.sample(10)

Check if some countries couldn't be assigned to a continent:

In [None]:
# Distinct mathematics countries labelled 'Other' in 'continent'
# (presumably the helper's fallback label - verify in python_functions_sp).
PISA_m_continents.loc[PISA_m_continents['continent'] == 'Other', 'country'].unique()

In [None]:
# Distinct mathematics countries labelled 'Other' in 'continent_region'
# (presumably the helper's fallback label - verify in python_functions_sp).
PISA_m_continents.loc[PISA_m_continents['continent_region'] == 'Other', 'country'].unique()

In [None]:
# Distinct reading countries labelled 'Other' in 'continent'
# (presumably the helper's fallback label - verify in python_functions_sp).
PISA_r_continents.loc[PISA_r_continents['continent'] == 'Other', 'country'].unique()

In [None]:
# Distinct reading countries labelled 'Other' in 'continent_region'
# (presumably the helper's fallback label - verify in python_functions_sp).
PISA_r_continents.loc[PISA_r_continents['continent_region'] == 'Other', 'country'].unique()

In [None]:
# Distinct science countries labelled 'Other' in 'continent'
# (presumably the helper's fallback label - verify in python_functions_sp).
PISA_sc_continents.loc[PISA_sc_continents['continent'] == 'Other', 'country'].unique()

In [None]:
# Distinct science countries labelled 'Other' in 'continent_region'
# (presumably the helper's fallback label - verify in python_functions_sp).
PISA_sc_continents.loc[PISA_sc_continents['continent_region'] == 'Other', 'country'].unique()

---

Check if the newly created columns match:

In [None]:
# Run the project helper on each table.
# NOTE(review): presumably it cross-checks 'continent' against
# 'continent_region' and reports mismatches - confirm in python_functions_sp.
# Only the last call's return value (if any) is displayed by the cell.
pfsp.check_values(PISA_m_continents)
pfsp.check_values(PISA_r_continents)
pfsp.check_values(PISA_sc_continents)

---

# Saving pisa_x_continents

## Export tables to database

In [None]:
# sf.push_to_cloud(PISA_m_continents, 'pisa_m_continents')
# sf.push_to_cloud(PISA_r_continents, 'pisa_r_continents')
# sf.push_to_cloud(PISA_sc_continents, 'pisa_sc_continents')

## Storing the DataFrames

In [None]:
pisa_m_continents = PISA_m_continents
pisa_r_continents = PISA_r_continents
pisa_sc_continents = PISA_sc_continents

%store pisa_m_continents
%store pisa_r_continents
%store pisa_sc_continents

---
---

In [None]:
PISA_m_continents.sample(10)

In [None]:
PISA_r_continents.sample(10)

In [None]:
PISA_sc_continents.sample(10)