# Using numerical and categorical variables together
> We will show how to combine preprocessing steps on numerical and categorical variables.
- toc: true
- badges: false
- comments: true
- author: Cécile Gallioz
- categories: [sklearn]

# Preparation

In [1]:
import pandas as pd
import time
from sklearn.compose import make_column_selector as selector
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

In [2]:
# Load the adult census CSV (relative path into the scikit-learn-mooc datasets folder).
census_csv_path = "../../scikit-learn-mooc/datasets/adult-census.csv"
myDataFrame = pd.read_csv(census_csv_path)

In [3]:
# Remove the "education-num" column before building features and target.
# This cell reassigns `myDataFrame` from itself, so running it a second time
# would raise a KeyError (the column is already gone). errors="ignore" makes
# the cell idempotent and safe to re-run in any kernel state.
myDataFrame = myDataFrame.drop(columns="education-num", errors="ignore")

In [4]:
# Name of the label column, and the labels themselves pulled out of the frame.
target_column = "class"
target = myDataFrame[target_column]

In [5]:
# Relative frequency of each class label (normalize=True yields proportions
# rather than raw counts); displayed as the cell output.
target.value_counts(normalize=True)

 <=50K    0.760718
 >50K     0.239282
Name: class, dtype: float64

In [6]:
# Feature frame: every column of the dataset except the target.
data = myDataFrame.drop(columns=[target_column])

In [7]:
# Report the number of rows (samples) and columns (features) in `data`.
shape_message = f"The dataset data contains {data.shape[0]} samples and {data.shape[1]} features"
print(shape_message)

The dataset data contains 48842 samples and 12 features


In [8]:
# Show the dtype of every feature column; the int64/object split seen here is
# what the column selectors below rely on.
data.dtypes

age                int64
workclass         object
education         object
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
dtype: object

In [9]:
# Columns whose dtype is anything other than `object` are treated as numerical.
select_numerical = selector(dtype_exclude=object)
numerical_columns = select_numerical(data)

In [10]:
# Columns with `object` dtype are treated as categorical.
select_categorical = selector(dtype_include=object)
categorical_columns = select_categorical(data)

In [11]:
# Reorder the features so numerical columns come first, then categorical ones.
all_columns = [*numerical_columns, *categorical_columns]
data = data[all_columns]

In [12]:
# Sanity check after reordering: print the sample and feature counts again.
shape_message = f"The dataset data contains {data.shape[0]} samples and {data.shape[1]} features"
print(shape_message)

The dataset data contains 48842 samples and 12 features


In [13]:
# Two views of the features, one per kind of column, for the single-kind models.
data_numerical, data_categorical = (
    data[numerical_columns],
    data[categorical_columns],
)

# Numerical
## Normalization + Regression

In [14]:
# Numerical-only model: standardise the features, then fit a logistic regression.
model_normLin = make_pipeline(StandardScaler(), LogisticRegression())

In [15]:
# 10-fold cross-validation of the numerical-only pipeline.
cv_results_normLin = cross_validate(
    estimator=model_normLin,
    X=data_numerical,
    y=target,
    cv=10,
)

In [16]:
# Summarise the cross-validation: mean and std of the test scores, plus the
# mean time spent fitting one fold.
scores, fit_time = cv_results_normLin["test_score"], cv_results_normLin["fit_time"]
report = (
    "The accuracy is "
    f"{scores.mean():.3f} +/- {scores.std():.3f}, for {fit_time.mean():.3f} seconds"
)
print(report)

The accuracy is 0.800 +/- 0.004, for 0.076 seconds


# Categorical
## One-hot encoding + Regression = good

In [17]:
# Categorical-only model: one-hot encode, then fit a logistic regression.
# handle_unknown="ignore" keeps transform from failing on categories that were
# not present in the training fold.
model_oneHotLin = make_pipeline(
    OneHotEncoder(handle_unknown="ignore"),
    LogisticRegression(max_iter=500),
)

In [18]:
# 10-fold cross-validation of the one-hot + logistic regression pipeline.
cv_results_oneHotLin = cross_validate(
    estimator=model_oneHotLin,
    X=data_categorical,
    y=target,
    cv=10,
)

In [19]:
# Summarise the cross-validation: mean and std of the test scores, plus the
# mean time spent fitting one fold.
scores, fit_time = cv_results_oneHotLin["test_score"], cv_results_oneHotLin["fit_time"]
report = (
    "The accuracy is "
    f"{scores.mean():.3f} +/- {scores.std():.3f}, for {fit_time.mean():.3f} seconds"
)
print(report)

The accuracy is 0.833 +/- 0.003, for 0.749 seconds


## Ordinal encoding + Regression = not good

In [20]:
# Categorical-only model: ordinal-encode, then fit a logistic regression.
# Categories unseen during fit are mapped to the sentinel value -1.
model_ordLin = make_pipeline(
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
    LogisticRegression(max_iter=500),
)

In [21]:
# 10-fold cross-validation of the ordinal + logistic regression pipeline.
cv_results_ordLin = cross_validate(
    estimator=model_ordLin,
    X=data_categorical,
    y=target,
    cv=10,
)

In [22]:
# Summarise the cross-validation: mean and std of the test scores, plus the
# mean time spent fitting one fold.
scores, fit_time = cv_results_ordLin["test_score"], cv_results_ordLin["fit_time"]
report = (
    "The accuracy is "
    f"{scores.mean():.3f} +/- {scores.std():.3f}, for {fit_time.mean():.3f} seconds"
)
print(report)

The accuracy is 0.755 +/- 0.002, for 0.379 seconds


# Numerical & Categorical
## Normalize + One-hot encoding + Logistic Regression = good

In [23]:
# One preprocessor per kind of column: scaling for numerical features,
# one-hot encoding (ignoring unseen categories) for categorical ones.
numerical_preprocessor = StandardScaler()
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")

In [24]:
# Route each group of columns to its own preprocessor.
preprocessor = ColumnTransformer(
    transformers=[
        ("one-hot-encoder", categorical_preprocessor, categorical_columns),
        ("standard-scaler", numerical_preprocessor, numerical_columns),
    ]
)

In [25]:
# Full model: column-wise preprocessing followed by a logistic regression.
model_both = make_pipeline(preprocessor, LogisticRegression(max_iter=1500))

In [26]:
# 10-fold cross-validation on all features (numerical and categorical).
cv_results_both = cross_validate(
    estimator=model_both,
    X=data,
    y=target,
    cv=10,
)

In [27]:
# Summarise the cross-validation: mean and std of the test scores, plus the
# mean time spent fitting one fold.
scores, fit_time = cv_results_both["test_score"], cv_results_both["fit_time"]
report = (
    "The accuracy is "
    f"{scores.mean():.3f} +/- {scores.std():.3f}, for {fit_time.mean():.3f} seconds"
)
print(report)

The accuracy is 0.851 +/- 0.003, for 1.153 seconds


## Ordinal encoding + Gradient-boosting trees = best

In [28]:
# Ordinal encoding for the tree-based model; categories unseen during fit are
# mapped to the sentinel value -1.
categorical_preprocessor_tree = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)

In [29]:
# Encode only the categorical columns; remainder="passthrough" forwards every
# other column unchanged.
preprocessor_tree = ColumnTransformer(
    transformers=[
        ("categorical", categorical_preprocessor_tree, categorical_columns),
    ],
    remainder="passthrough",
)

In [30]:
# Tree-based model: ordinal-encoded categories feeding a histogram-based
# gradient boosting classifier with default hyper-parameters.
model_tree = make_pipeline(
    preprocessor_tree,
    HistGradientBoostingClassifier(),
)

In [31]:
# 10-fold cross-validation of the gradient-boosting pipeline on all features.
cv_results_tree = cross_validate(
    estimator=model_tree,
    X=data,
    y=target,
    cv=10,
)

In [32]:
# Summarise the cross-validation: mean and std of the test scores, plus the
# mean time spent fitting one fold.
scores, fit_time = cv_results_tree["test_score"], cv_results_tree["fit_time"]
report = (
    "The accuracy is "
    f"{scores.mean():.3f} +/- {scores.std():.3f}, for {fit_time.mean():.3f} seconds"
)
print(report)

The accuracy is 0.874 +/- 0.003, for 1.811 seconds
