# Data Encoding

In [3]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Collecting patsy>=0.5.1 (from category_encoders)
  Downloading patsy-1.0.1-py2.py3-none-any.whl.metadata (3.3 kB)
Collecting statsmodels>=0.9.0 (from category_encoders)
  Downloading statsmodels-0.14.5-cp311-cp311-win_amd64.whl.metadata (9.8 kB)
Downloading category_encoders-2.8.1-py3-none-any.whl (85 kB)
Downloading patsy-1.0.1-py2.py3-none-any.whl (232 kB)
Downloading statsmodels-0.14.5-cp311-cp311-win_amd64.whl (9.6 MB)
   ---------------------------------------- 0.0/9.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.6 MB ? eta -:--:--
   - -------------------------------------- 0.3/9.6 MB ? eta -:--:--
   --- ------------------------------------ 0.8/9.6 MB 1.6 MB/s eta 0:00:06
   ----- ---------------------------------- 1.3/9.6 MB 1.9 MB/s eta 0:00:05
   ------ ---------------------------------


[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: C:\Users\ASUS\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [4]:
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split

In [5]:
data = sns.load_dataset("titanic")[['sex', 'class', 'embarked', 'survived']].dropna()
print("Dataset Shape:", data.shape)
print(data.head(10))

Dataset Shape: (889, 4)
      sex   class embarked  survived
0    male   Third        S         0
1  female   First        C         1
2  female   Third        S         1
3  female   First        S         1
4    male   Third        S         0
5    male   Third        Q         0
6    male   First        S         0
7    male   Third        S         0
8  female   Third        S         1
9  female  Second        C         1


## Label Encoding

In [6]:
le = LabelEncoder()
data['sex_label'] = le.fit_transform(data['sex'])
print("Label Encoded 'sex':")
print(data[['sex', 'sex_label']].head())


Label Encoded 'sex':
      sex  sex_label
0    male          1
1  female          0
2  female          0
3  female          0
4    male          1


## One-Hot Encoding (for nominal categorical)

In [7]:
ohe = OneHotEncoder(sparse_output=False)
embarked_encoded = ohe.fit_transform(data[['embarked']])
embarked_df = pd.DataFrame(embarked_encoded, columns=ohe.get_feature_names_out(['embarked']))
data_ohe = pd.concat([data.reset_index(drop=True), embarked_df], axis=1)

print("\nOne-Hot Encoded 'embarked':")
print(data_ohe.head())



One-Hot Encoded 'embarked':
      sex  class embarked  survived  sex_label  embarked_C  embarked_Q  \
0    male  Third        S         0          1         0.0         0.0   
1  female  First        C         1          0         1.0         0.0   
2  female  Third        S         1          0         0.0         0.0   
3  female  First        S         1          0         0.0         0.0   
4    male  Third        S         0          1         0.0         0.0   

   embarked_S  
0         1.0  
1         0.0  
2         1.0  
3         1.0  
4         1.0  


## Ordinal Encoding (for ordered categorical)

In [8]:
class_order = ['Third', 'Second', 'First']
ord_enc = OrdinalEncoder(categories=[class_order])
data['class_ordinal'] = ord_enc.fit_transform(data[['class']])

print("\nOrdinal Encoded 'class':")
print(data[['class', 'class_ordinal']].head())



Ordinal Encoded 'class':
   class  class_ordinal
0  Third            0.0
1  First            2.0
2  Third            0.0
3  First            2.0
4  Third            0.0


## Target Encoding

In [9]:
X = data[['embarked', 'class']]
y = data['survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

te = TargetEncoder()
X_train_te = te.fit_transform(X_train, y_train)
X_test_te = te.transform(X_test)

print("\nTarget Encoded Data (Advanced Encoding):")
print("Train Encoded:\n", X_train_te.head())
print("Test Encoded:\n", X_test_te.head())



Target Encoded Data (Advanced Encoding):
Train Encoded:
      embarked     class
486  0.339246  0.608107
293  0.339246  0.247126
173  0.339246  0.247126
450  0.339246  0.507933
361  0.572638  0.507933
Test Encoded:
      embarked     class
281  0.339246  0.247126
435  0.339246  0.608107
39   0.572638  0.247126
418  0.339246  0.507933
585  0.339246  0.608107


## Compare all encodings

In [10]:
print("Final Encoded Dataset Columns:")
print(data_ohe.columns.tolist())

Final Encoded Dataset Columns:
['sex', 'class', 'embarked', 'survived', 'sex_label', 'embarked_C', 'embarked_Q', 'embarked_S']
