In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [2]:
df = sns.load_dataset("titanic")
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
df=df[["pclass", "sibsp", "parch", "sex", "embarked","class", "survived","deck"]];df

Unnamed: 0,pclass,sibsp,parch,sex,embarked,class,survived,deck
0,3,1,0,male,S,Third,0,
1,1,1,0,female,C,First,1,C
2,3,0,0,female,S,Third,1,
3,1,1,0,female,S,First,1,C
4,3,0,0,male,S,Third,0,
...,...,...,...,...,...,...,...,...
886,2,0,0,male,S,Second,0,
887,1,0,0,female,S,First,1,B
888,3,1,2,female,S,Third,0,
889,1,0,0,male,C,First,1,C


In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    df.drop("survived", axis=1),  # predictors
    df["survived"],  # target
    test_size=0.3,  # percentage of obs in test set
    random_state=0,
)  # seed to ensure reproducibility

X_train.shape, X_test.shape

((623, 7), (268, 7))

In [5]:
X_train.isnull().sum()

pclass        0
sibsp         0
parch         0
sex           0
embarked      2
class         0
deck        472
dtype: int64

In [6]:
# X_train.fillna("Missing", inplace=True)
# X_test.fillna("Missing", inplace=True)

In [9]:
encoder = OneHotEncoder(categories="auto",
                        drop="first",
                        sparse_output=False,
                        handle_unknown="infrequent_if_exist").set_output(transform="pandas")

In [10]:
encoder.fit(X_train)

In [11]:
encoder.categories_

[array([1, 2, 3], dtype=int64),
 array([0, 1, 2, 3, 4, 5, 8], dtype=int64),
 array([0, 1, 2, 3, 4, 5, 6], dtype=int64),
 array(['female', 'male'], dtype=object),
 array(['C', 'Q', 'S', nan], dtype=object),
 array(['First', 'Second', 'Third'], dtype=object),
 array(['A', 'B', 'C', 'D', 'E', 'F', 'G', nan], dtype=object)]

In [12]:
encoder.transform(X_train)

Unnamed: 0,pclass_2,pclass_3,sibsp_1,sibsp_2,sibsp_3,sibsp_4,sibsp_5,sibsp_8,parch_1,parch_2,...,embarked_nan,class_Second,class_Third,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,deck_nan
857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
52,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
386,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
124,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
578,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
835,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
192,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
629,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
559,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [13]:
ct = ColumnTransformer(
    transformers=[
        ("encoder",encoder,["sex","embarked","class","deck"])
    ],remainder="passthrough"
).set_output(transform="pandas")
ct.fit(X_train)

In [14]:
ct.transform(X_train)

Unnamed: 0,encoder__sex_male,encoder__embarked_Q,encoder__embarked_S,encoder__embarked_nan,encoder__class_Second,encoder__class_Third,encoder__deck_B,encoder__deck_C,encoder__deck_D,encoder__deck_E,encoder__deck_F,encoder__deck_G,encoder__deck_nan,remainder__pclass,remainder__sibsp,remainder__parch
857,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,0,0
52,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1,1,0
386,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3,5,2
124,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1,0,1
578,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
835,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,1,1
192,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3,1,0
629,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3,0,0
559,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3,1,0


In [15]:
X_test.isnull().sum()

pclass        0
sibsp         0
parch         0
sex           0
embarked      0
class         0
deck        216
dtype: int64

In [16]:
ct.transform(X_test)

Unnamed: 0,encoder__sex_male,encoder__embarked_Q,encoder__embarked_S,encoder__embarked_nan,encoder__class_Second,encoder__class_Third,encoder__deck_B,encoder__deck_C,encoder__deck_D,encoder__deck_E,encoder__deck_F,encoder__deck_G,encoder__deck_nan,remainder__pclass,remainder__sibsp,remainder__parch
495,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3,0,0
648,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3,0,0
278,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3,4,1
31,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,0
255,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0
718,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3,0,0
620,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3,1,0
786,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3,0,0


In [17]:
encoder.transform(X_test)["deck_nan"].sum()

216.0

In [18]:
X_train.isnull().sum()

pclass        0
sibsp         0
parch         0
sex           0
embarked      2
class         0
deck        472
dtype: int64

In [19]:
encoder = OneHotEncoder(categories="auto",
                        drop="first",
                        sparse_output=False,
                        handle_unknown="infrequent_if_exist").set_output(transform="pandas")

In [22]:
pipe = Pipeline(steps=[
    ("imputer",SimpleImputer(strategy="constant",fill_value="missing")),
    ("ohe",encoder)
]).set_output(transform="pandas")

In [23]:
pipe.fit_transform(X_train[["sex","embarked","class","deck"]])

Unnamed: 0,sex_male,embarked_Q,embarked_S,embarked_missing,class_Second,class_Third,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,deck_missing
857,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
52,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
386,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
124,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
578,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
835,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
192,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
629,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
559,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [24]:
ct = ColumnTransformer(transformers=[
    ("encoder",pipe,["sex","embarked","deck"])
],remainder="passthrough").set_output(transform="pandas")

ct.fit_transform(X_train)

Unnamed: 0,encoder__sex_male,encoder__embarked_Q,encoder__embarked_S,encoder__embarked_missing,encoder__deck_B,encoder__deck_C,encoder__deck_D,encoder__deck_E,encoder__deck_F,encoder__deck_G,encoder__deck_missing,remainder__pclass,remainder__sibsp,remainder__parch,remainder__class
857,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,0,0,First
52,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1,1,0,First
386,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3,5,2,Third
124,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1,0,1,First
578,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3,1,0,Third
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
835,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,1,1,First
192,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3,1,0,Third
629,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3,0,0,Third
559,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3,1,0,Third


In [25]:
X_train.isnull().sum()

pclass        0
sibsp         0
parch         0
sex           0
embarked      2
class         0
deck        472
dtype: int64

In [27]:
from feature_engine.encoding import OneHotEncoder
ohe = OneHotEncoder(top_categories=None,
                    variables=["sex","class"],
                    drop_last=True)

ohe.fit_transform(X_train)

Unnamed: 0,pclass,sibsp,parch,embarked,deck,sex_male,class_First,class_Third
857,1,0,0,S,E,1,1,0
52,1,1,0,C,D,0,1,0
386,3,5,2,S,,1,0,1
124,1,0,1,S,D,1,1,0
578,3,1,0,C,,0,0,1
...,...,...,...,...,...,...,...,...
835,1,1,1,C,E,0,1,0
192,3,1,0,S,,0,0,1
629,3,0,0,Q,,1,0,1
559,3,1,0,S,,0,0,1


In [28]:
ohe = OneHotEncoder(top_categories=None,
                    variables=["sex","class"],
                    drop_last=True)

In [29]:
pipe = Pipeline(steps=[
    ("imputer",SimpleImputer(strategy="constant",fill_value="missing")),
    ("encoding",ohe)
]).set_output(transform="pandas")

In [34]:
import numpy as np

In [38]:
X_train.loc[X_train.pclass==1,"pclass"]= np.nan

In [39]:
X_train.loc[X_train.pclass==1]["pclass"]

Series([], Name: pclass, dtype: float64)

In [40]:
pipe.fit_transform(X_train)

Unnamed: 0,pclass,sibsp,parch,embarked,deck,sex_male,class_First,class_Third
857,missing,0,0,S,E,1,1,0
52,missing,1,0,C,D,0,1,0
386,3.0,5,2,S,missing,1,0,1
124,missing,0,1,S,D,1,1,0
578,3.0,1,0,C,missing,0,0,1
...,...,...,...,...,...,...,...,...
835,missing,1,1,C,E,0,1,0
192,3.0,1,0,S,missing,0,0,1
629,3.0,0,0,Q,missing,1,0,1
559,3.0,1,0,S,missing,0,0,1


In [41]:
pd.get_dummies(X_train["deck"],dummy_na=True).astype(int)

Unnamed: 0,A,B,C,D,E,F,G,NaN
857,0,0,0,0,1,0,0,0
52,0,0,0,1,0,0,0,0
386,0,0,0,0,0,0,0,1
124,0,0,0,1,0,0,0,0
578,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...
835,0,0,0,0,1,0,0,0
192,0,0,0,0,0,0,0,1
629,0,0,0,0,0,0,0,1
559,0,0,0,0,0,0,0,1


In [42]:
from category_encoders.one_hot import OneHotEncoder
ohe_c = OneHotEncoder(
    cols=["sex","deck"],
    use_cat_names=True
)

In [43]:
ohe_c.fit_transform(X_train)

Unnamed: 0,pclass,sibsp,parch,sex_male,sex_female,embarked,class,deck_E,deck_D,deck_nan,deck_B,deck_C,deck_A,deck_F,deck_G
857,,0,0,1,0,S,First,1,0,0,0,0,0,0,0
52,,1,0,0,1,C,First,0,1,0,0,0,0,0,0
386,3.0,5,2,1,0,S,Third,0,0,1,0,0,0,0,0
124,,0,1,1,0,S,First,0,1,0,0,0,0,0,0
578,3.0,1,0,0,1,C,Third,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
835,,1,1,0,1,C,First,1,0,0,0,0,0,0,0
192,3.0,1,0,0,1,S,Third,0,0,1,0,0,0,0,0
629,3.0,0,0,1,0,Q,Third,0,0,1,0,0,0,0,0
559,3.0,1,0,0,1,S,Third,0,0,1,0,0,0,0,0


In [44]:
ohe_c = OneHotEncoder(
    use_cat_names=True
)
ohe_c.fit_transform(X_train)

Unnamed: 0,pclass,sibsp,parch,sex_male,sex_female,embarked_S,embarked_C,embarked_Q,embarked_nan,class_First,class_Third,class_Second,deck_E,deck_D,deck_nan,deck_B,deck_C,deck_A,deck_F,deck_G
857,,0,0,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0
52,,1,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0
386,3.0,5,2,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0
124,,0,1,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0
578,3.0,1,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
835,,1,1,0,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0
192,3.0,1,0,0,1,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0
629,3.0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0
559,3.0,1,0,0,1,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0


In [45]:
ohe_c.mapping

[{'col': 'sex',
  'mapping':     sex_male  sex_female
   1         1           0
   2         0           1
  -1         0           0
  -2         0           0},
 {'col': 'embarked',
  'mapping':     embarked_S  embarked_C  embarked_Q  embarked_nan
   1           1           0           0             0
   2           0           1           0             0
   3           0           0           1             0
   4           0           0           0             1
  -1           0           0           0             0
  -2           0           0           0             0},
 {'col': 'class',
  'mapping':     class_First  class_Third  class_Second
   1            1            0             0
   2            0            1             0
   3            0            0             1
  -1            0            0             0
  -2            0            0             0},
 {'col': 'deck',
  'mapping':     deck_E  deck_D  deck_nan  deck_B  deck_C  deck_A  deck_F  deck_G
   1       1     

In [46]:
ohe_c.get_feature_names_out()

['pclass',
 'sibsp',
 'parch',
 'sex_male',
 'sex_female',
 'embarked_S',
 'embarked_C',
 'embarked_Q',
 'embarked_nan',
 'class_First',
 'class_Third',
 'class_Second',
 'deck_E',
 'deck_D',
 'deck_nan',
 'deck_B',
 'deck_C',
 'deck_A',
 'deck_F',
 'deck_G']

In [47]:
from sklearn.preprocessing import OrdinalEncoder

In [48]:
ord = OrdinalEncoder(encoded_missing_value=-1,unknown_value=-2,handle_unknown="use_encoded_value").set_output(transform="pandas")

In [50]:
ord.fit_transform(X_train[["deck"]])

Unnamed: 0,deck
857,4.0
52,3.0
386,-1.0
124,3.0
578,-1.0
...,...
835,4.0
192,-1.0
629,-1.0
559,-1.0


In [53]:
X_test.isnull().sum()

pclass        0
sibsp         0
parch         0
sex           0
embarked      0
class         0
deck        216
dtype: int64

In [54]:
x_tst = X_test.copy()
x_tst["deck"] = x_tst["deck"].str.replace("B","missingg")

In [60]:
x_tst["deck"]

495         NaN
648         NaN
278         NaN
31     missingg
255         NaN
         ...   
263    missingg
718         NaN
620         NaN
786         NaN
64          NaN
Name: deck, Length: 268, dtype: object

In [55]:
x_tst.deck.value_counts()

deck
C           17
missingg    16
D            7
E            5
F            4
G            2
A            1
Name: count, dtype: int64

In [58]:
ord.transform(x_tst[["deck"]])

Unnamed: 0,deck
495,-1.0
648,-1.0
278,-1.0
31,-2.0
255,-1.0
...,...
263,-2.0
718,-1.0
620,-1.0
786,-1.0


In [56]:
ohe_c.fit(X_train)

In [61]:
ohe_c.transform(x_tst).columns[ohe_c.transform(x_tst).columns.str.contains("deck")]

Index(['deck_E', 'deck_D', 'deck_nan', 'deck_B', 'deck_C', 'deck_A', 'deck_F',
       'deck_G'],
      dtype='object')

In [77]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(categories="auto",
                       
                        sparse_output=False,
                        handle_unknown="infrequent_if_exist").set_output(transform="pandas")

In [78]:
X_train.isnull().sum()

pclass      156
sibsp         0
parch         0
sex           0
embarked      2
class         0
deck        472
dtype: int64

In [79]:
encoder.fit_transform(X_train[["deck"]])

Unnamed: 0,deck_A,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,deck_nan
857,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
52,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
386,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
124,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
578,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...
835,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
629,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
559,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [80]:
X_train["deck"].value_counts()

deck
C    42
B    31
E    27
D    26
A    14
F     9
G     2
Name: count, dtype: int64

In [81]:
x_tst["deck"].value_counts()

deck
C           17
missingg    16
D            7
E            5
F            4
G            2
A            1
Name: count, dtype: int64

In [82]:
encoder.transform(x_tst[["deck"]])

Unnamed: 0,deck_A,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,deck_nan
495,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
648,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
278,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
255,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...
263,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
718,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
620,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
786,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [83]:
encoder.transform(x_tst[["deck"]])["deck_B"].value_counts()

deck_B
0.0    268
Name: count, dtype: int64