This notebook shows how to build scikit-learn preprocessing pipelines for numerical and categorical variables and combine them with a `ColumnTransformer`, using OECD PISA score data.

In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import make_column_selector

from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

from sklearn.compose import ColumnTransformer

In [2]:
# Load the raw OECD PISA export (long format: one 'Value' per
# LOCATION / INDICATOR / SUBJECT / TIME combination).
df = pd.read_csv(filepath_or_buffer='OECD PISA data 2.csv')

In [3]:
# Turn the year stored in TIME into a timestamp at 1 January of that year.
# The dtype guard keeps this cell idempotent: without it, re-running the cell
# would append '-01-01' to an already converted date and produce malformed
# strings such as '2003-01-01-01-01'.
if not pd.api.types.is_datetime64_any_dtype(df['TIME']):
    df['TIME'] = pd.to_datetime(df['TIME'].astype(str) + '-01-01')

# Reshape long -> wide: one row per (LOCATION, INDICATOR, TIME) and one column
# per SUBJECT value (BOY, GIRL, TOT), filled from 'Value'. Duplicate keys, if
# any, are combined with pivot_table's default aggregation (mean).
df_pivot = df.pivot_table(
    values='Value',
    index=['LOCATION', 'INDICATOR', 'TIME'],
    columns='SUBJECT',
)

# Move the three index levels back into ordinary columns so the column
# selectors defined below can see them.
df_reset = df_pivot.reset_index()

In [4]:
# Column selectors are resolved lazily against whatever frame the
# ColumnTransformer is fitted on, so no column names are hard-coded here.

# Everything that is not a numeric dtype.
# NOTE(review): in df_reset this also matches the datetime TIME column, so
# TIME is routed through the categorical pipeline - confirm this is intended.
categoricals = make_column_selector(
    dtype_exclude=np.number,
)

# Every numeric dtype (ints and floats).
numeric_cols = make_column_selector(
    dtype_include=np.number,
)

In [5]:
# Preprocessing for numeric columns: standardise first, then fill gaps.
numeric_pipeline = make_pipeline(
    # StandardScaler ignores NaNs when fitting and passes them through on
    # transform, so missing values survive this step untouched.
    StandardScaler(),
    # Fill the remaining NaNs with the column mean (the default strategy),
    # computed on the already-scaled values.
    SimpleImputer(strategy="mean"),
)

In [23]:
# Preprocessing for non-numeric columns: impute -> one-hot encode -> scale.
categorical_pipeline = make_pipeline(
    # Replace missing entries with the most common value of each column.
    SimpleImputer(strategy="most_frequent"),
    # One indicator column per distinct category.
    # NOTE(review): the default handle_unknown='error' raises on categories
    # not seen during fit - confirm that is acceptable for new data.
    OneHotEncoder(),
    # Scale indicator columns to unit variance only; with_mean=False skips
    # centering, which is required for the sparse output of OneHotEncoder.
    StandardScaler(with_mean=False),
)

In [24]:

# Route each column group through its own pipeline and concatenate the
# results side by side, numeric block first, then the categorical block.
transformers = ColumnTransformer(
    [
        ("numeric", numeric_pipeline, numeric_cols),
        ("categorical", categorical_pipeline, categoricals),
    ],
    # Columns matched by neither selector are discarded (the default).
    remainder="drop",
)

In [9]:
# Display the pivoted frame: one row per LOCATION / INDICATOR / TIME with
# BOY, GIRL and TOT score columns (pandas truncates the middle rows).
df_reset

SUBJECT,LOCATION,INDICATOR,TIME,BOY,GIRL,TOT
0,AUS,PISAMATH,2003-01-01,527.000,522.000,524.0
1,AUS,PISAMATH,2006-01-01,527.000,513.000,520.0
2,AUS,PISAMATH,2009-01-01,519.000,509.000,514.0
3,AUS,PISAMATH,2012-01-01,510.115,497.821,504.0
4,AUS,PISAMATH,2015-01-01,497.000,491.000,494.0
...,...,...,...,...,...,...
703,USA,PISASCIENCE,2006-01-01,489.000,489.000,489.0
704,USA,PISASCIENCE,2009-01-01,509.000,495.000,502.0
705,USA,PISASCIENCE,2012-01-01,496.529,498.325,497.0
706,USA,PISASCIENCE,2015-01-01,500.000,493.000,496.0


In [25]:
# Fit every pipeline on the full pivoted frame and transform it in one pass.
# X holds the numeric pipeline's output columns followed by the categorical
# pipeline's output columns.
# NOTE(review): the result may be a sparse matrix because of the one-hot step -
# confirm before passing it to code that expects a dense array.
X = transformers.fit_transform(df_reset)

In [28]:
# Inspect the column names of the raw long-format frame (before pivoting).
df.columns

Index(['index', 'LOCATION', 'INDICATOR', 'SUBJECT', 'TIME', 'Value'], dtype='object')

In [32]:
# Spot-check a single country: keep only the rows whose LOCATION is 'POL'.
df_reset.loc[df_reset['LOCATION'].eq('POL')]

SUBJECT,LOCATION,INDICATOR,TIME,BOY,GIRL,TOT
564,POL,PISAMATH,2003-01-01,493.0,487.0,490.0
565,POL,PISAMATH,2006-01-01,500.0,491.0,495.0
566,POL,PISAMATH,2009-01-01,497.0,493.0,495.0
567,POL,PISAMATH,2012-01-01,519.564,515.533,518.0
568,POL,PISAMATH,2015-01-01,510.0,499.0,504.0
569,POL,PISAMATH,2018-01-01,516.0,515.0,516.0
570,POL,PISAREAD,2000-01-01,461.0,497.0,479.0
571,POL,PISAREAD,2003-01-01,477.0,516.0,497.0
572,POL,PISAREAD,2006-01-01,487.0,528.0,508.0
573,POL,PISAREAD,2009-01-01,476.0,525.0,500.0
