In [133]:
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
import pandas as pd

In [134]:
# Fetch version 1 of the "titanic" dataset from OpenML as a (features, target) pair.
X, y = fetch_openml(
    name="titanic",
    version=1,
    return_X_y=True,
)
# Preview the first rows of the feature table.
X.head()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [135]:
# Ensure the features are a DataFrame and discard the 'name' column.
# drop() returns a new frame instead of mutating in place, and errors='ignore'
# keeps this cell re-runnable once 'name' has already been removed
# (the inplace version raised KeyError on a second execution).
X = pd.DataFrame(X).drop(columns='name', errors='ignore')

In [136]:
# One-hot encoding of the 'sex' column.
# handle_unknown="ignore" makes categories unseen during fit encode as all zeros
# on later transform() calls instead of raising.
onehot = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
sex_encoded = onehot.fit_transform(X[['sex']])
sex_columns = onehot.get_feature_names_out(['sex'])

# Reuse X's index: pd.concat(axis=1) aligns rows on index labels, so a default
# RangeIndex here would misalign (and introduce NaNs) if X were ever filtered
# or re-indexed before this cell.
sex_df = pd.DataFrame(sex_encoded, columns=sex_columns, index=X.index)

# Swap the original 'sex' column for its indicator columns and store the
# indicators as integers. The cast covers whatever columns the encoder
# produced rather than hard-coding 'sex_female' / 'sex_male'.
X_one_hot = pd.concat([X.drop(columns='sex'), sex_df], axis=1)
X_one_hot = X_one_hot.astype({column: int for column in sex_columns})


In [138]:
# Binning: bucket the continuous 'age' column into ordered age groups.
# Intervals are left-closed / right-open (right=False): [0, 12), [12, 18), ...
bins = [0, 12, 18, 35, 60, 120]
labels = ['Child', 'Teenager', 'Young Adult', 'Adult', 'Senior']
# Integer code for each group, in the same order as `labels` (0 .. 4);
# these codes, not the text labels, are what gets stored in the column.
values = list(range(len(labels)))
X['age cat'] = pd.cut(X['age'], bins=bins, right=False, labels=values)

X.head(30)

Unnamed: 0,pclass,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,age cat
0,1,female,29.0,0,0,24160,211.3375,B5,S,2,,"St Louis, MO",2.0
1,1,male,0.9167,1,2,113781,151.55,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON",0.0
2,1,female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",0.0
3,1,male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON",2.0
4,1,female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",2.0
5,1,male,48.0,0,0,19952,26.55,E12,S,3,,"New York, NY",3.0
6,1,female,63.0,1,0,13502,77.9583,D7,S,10,,"Hudson, NY",4.0
7,1,male,39.0,0,0,112050,0.0,A36,S,,,"Belfast, NI",3.0
8,1,female,53.0,2,0,11769,51.4792,C101,S,D,,"Bayside, Queens, NY",3.0
9,1,male,71.0,0,0,PC 17609,49.5042,,C,,22.0,"Montevideo, Uruguay",4.0


In [148]:
# Substitute the placeholder "M" wherever the cabin is missing,
# then list the distinct cabin values that remain.
cabin_known = X['cabin'].notna()
X['cabin'] = X['cabin'].where(cabin_known, "M")
X['cabin'].unique()

array(['B5', 'C22 C26', 'E12', 'D7', 'A36', 'C101', 'M', 'C62 C64', 'B35',
       'A23', 'B58 B60', 'D15', 'C6', 'D35', 'C148', 'C97', 'B49', 'C99',
       'C52', 'T', 'A31', 'C7', 'C103', 'D22', 'E33', 'A21', 'B10', 'B4',
       'E40', 'B38', 'E24', 'B51 B53 B55', 'B96 B98', 'C46', 'E31', 'E8',
       'B61', 'B77', 'A9', 'C89', 'A14', 'E58', 'E49', 'E52', 'E45',
       'B22', 'B26', 'C85', 'E17', 'B71', 'B20', 'A34', 'C86', 'A16',
       'A20', 'A18', 'C54', 'C45', 'D20', 'A29', 'C95', 'E25', 'C111',
       'C23 C25 C27', 'E36', 'D34', 'D40', 'B39', 'B41', 'B102', 'C123',
       'E63', 'C130', 'B86', 'C92', 'A5', 'C51', 'B42', 'C91', 'C125',
       'D10 D12', 'B82 B84', 'E50', 'D33', 'C83', 'B94', 'D49', 'D45',
       'B69', 'B11', 'E46', 'C39', 'B18', 'D11', 'C93', 'B28', 'C49',
       'B52 B54 B56', 'E60', 'C132', 'B37', 'D21', 'D19', 'C124', 'D17',
       'B101', 'D28', 'D6', 'D9', 'B80', 'C106', 'B79', 'C47', 'D30',
       'C90', 'E38', 'C78', 'C30', 'C118', 'D36', 'D48', 'D47', '

In [158]:
# Ordinal encoding of the 'cabin' column.
# The category order is taken from the data itself (order of first appearance)
# instead of a hand-pasted list, so the encoder cannot go stale or fail on an
# unlisted value when the dataset changes. The resulting codes therefore only
# reflect first-appearance order, not any ranking of the cabins.
# Expects missing cabins to have been filled already (astype(int) cannot
# represent NaN).
cabin_categories = X['cabin'].unique().tolist()
ordin_encoder = OrdinalEncoder(categories=[cabin_categories])
cabin_encoded = ordin_encoder.fit_transform(X[['cabin']])
# fit_transform returns an (n_samples, 1) array; flatten it for column assignment.
X['cabin encoded'] = cabin_encoded.ravel().astype(int)
X.head(10)

Unnamed: 0,pclass,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,age cat,cabin encoded
0,1,female,29.0,0,0,24160,211.3375,B5,S,2,,"St Louis, MO",2,0
1,1,male,0.9167,1,2,113781,151.55,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON",0,1
2,1,female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",0,1
3,1,male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON",2,1
4,1,female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",2,1
5,1,male,48.0,0,0,19952,26.55,E12,S,3,,"New York, NY",3,2
6,1,female,63.0,1,0,13502,77.9583,D7,S,10,,"Hudson, NY",4,3
7,1,male,39.0,0,0,112050,0.0,A36,S,,,"Belfast, NI",3,4
8,1,female,53.0,2,0,11769,51.4792,C101,S,D,,"Bayside, Queens, NY",3,5
9,1,male,71.0,0,0,PC 17609,49.5042,M,C,,22.0,"Montevideo, Uruguay",4,6
