In [3]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [6]:
# Load the student-performance CSV from the local ./data directory.
df = pd.read_csv(filepath_or_buffer="./data/StudentsPerformance.csv")

In [12]:
# Numeric features: fill missing values with the column median, then
# standardise with StandardScaler.
numerical_columns = ["writing_score", "reading_score"]
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode.
categorical_columns = [
    "gender",
    "race_ethnicity",
    "parental_level_of_education",
    "lunch",
    "test_preparation_course",
]
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        # handle_unknown="ignore": a category that was not present when the
        # encoder was fitted is encoded as an all-zero row at transform time
        # instead of raising, so the fitted preprocessor can be reused on
        # data other than the frame it was fitted on.
        ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Apply each pipeline to its own column subset. The transformed blocks are
# concatenated in the order listed here (numeric first, then one-hot), and
# columns not named in either list are dropped (ColumnTransformer's default
# remainder behaviour).
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipeline", num_pipeline, numerical_columns),
        ("cat_pipeline", cat_pipeline, categorical_columns),
    ]
)

In [13]:
# Feature frame: every column of df except math_score (presumably the
# prediction target -- confirm against the modelling cells).
# `columns=` already identifies the axis, so the redundant `axis=1` is omitted.
input_feature_train_df = df.drop(columns=["math_score"])

In [14]:
# Display the feature frame to check that math_score is gone and the
# remaining columns are the ones the preprocessor expects.
input_feature_train_df

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75
...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,99,95
996,male,group C,high school,free/reduced,none,55,55
997,female,group C,high school,free/reduced,completed,71,65
998,female,group D,some college,standard,completed,78,77


In [15]:
# Fit the preprocessor on the feature frame and transform it in one step,
# then show the resulting matrix as the cell output.
input_feature_train_arr = preprocessor.fit_transform(input_feature_train_df)
input_feature_train_arr

array([[ 0.39149181,  0.19399858,  1.        , ...,  1.        ,
         0.        ,  1.        ],
       [ 1.31326868,  1.42747598,  1.        , ...,  1.        ,
         1.        ,  0.        ],
       [ 1.64247471,  1.77010859,  1.        , ...,  1.        ,
         0.        ,  1.        ],
       ...,
       [-0.20107904,  0.12547206,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.58901542,  0.60515772,  1.        , ...,  1.        ,
         1.        ,  0.        ],
       [ 1.18158627,  1.15336989,  1.        , ...,  0.        ,
         0.        ,  1.        ]], shape=(1000, 19))