**FEATURE ENGINEERING**


ORDINAL ENCODING AND LABEL ENCODING

In [1]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
tc = sns.load_dataset('titanic')[['embarked', 'class', 'alive']]
tc['embarked'].fillna('S', inplace=True)
x = tc[['embarked', 'class']]
y = tc['alive']
#x['embarked'].value_counts()
# x['class'].value_counts()
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
#OrdinalEncoding
oe = OrdinalEncoder(categories=[['S', 'C', 'Q'], ['Third', 'Second', 'First']])
oe.fit(X_train)
X_train_encoded = oe.transform(X_train)
X_test_encoded = oe.transform(X_test)
#LabelEncoding
le=LabelEncoder()
le.fit(y_train)
y_train_ed=le.transform(y_train)
y_test_ed=le.transform(y_test)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  tc['embarked'].fillna('S', inplace=True)


In [2]:
#Checking values
print(oe.categories_)
print(le.classes_)

[array(['S', 'C', 'Q'], dtype=object), array(['Third', 'Second', 'First'], dtype=object)]
['no' 'yes']


**ONE HOT ENCODER**

In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
tc = sns.load_dataset('titanic')[['survived', 'pclass', 'sex', 'age', 'fare', 'embarked', 'class', 'deck']]
tc['age'].fillna(tc['age'].mean(), inplace=True)
tc['deck'].fillna('C', inplace=True)
x = tc[['pclass', 'sex', 'age', 'fare', 'embarked', 'class', 'deck']]
y = tc['survived']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# OneHotEncoder instance for multiple categorical columns (sex, embarked, class, deck)
ohe = OneHotEncoder(drop='first', sparse_output=False)

# Fitting and transforming the categorical columns
X_train_ecd = ohe.fit_transform(X_train[['sex', 'embarked', 'class', 'deck']])
X_test_ecd = ohe.transform(X_test[['sex', 'embarked', 'class', 'deck']])

# Combine encoded categorical data with the numerical columns
X_train_new = np.hstack((X_train.drop(columns=['sex', 'embarked', 'class', 'deck']).values, X_train_ecd))
X_test_new = np.hstack((X_test.drop(columns=['sex', 'embarked', 'class', 'deck']).values, X_test_ecd))

# Now X_train_new and X_test_new are updated datasets with OneHotEncoding applied on all necessary columns.
print(ohe.categories_)

[array(['female', 'male'], dtype=object), array(['C', 'Q', 'S', nan], dtype=object), array(['First', 'Second', 'Third'], dtype=object), array(['A', 'B', 'C', 'D', 'E', 'F', 'G'], dtype=object)]


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  tc['age'].fillna(tc['age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  tc['deck'].fillna('C', inplace=True)


In [8]:
print(tc['deck'].isnull().sum())
tc['deck'].value_counts()
tc.sample(8)

0


Unnamed: 0,survived,pclass,sex,age,fare,embarked,class,deck
746,0,3,male,16.0,20.25,S,Third,C
351,0,1,male,29.699118,35.0,S,First,C
490,0,3,male,29.699118,19.9667,S,Third,C
9,1,2,female,14.0,30.0708,C,Second,C
441,0,3,male,20.0,9.5,S,Third,C
297,0,1,female,2.0,151.55,S,First,C
580,1,2,female,25.0,30.0,S,Second,C
858,1,3,female,24.0,19.2583,C,Third,C


**COLUMN TRANSFER**

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
tc=sns.load_dataset('titanic')[['survived','pclass','sex','age','fare']]
#preprocessing age
tc['age'].fillna(tc['age'].mean(), inplace=True)
#splitting into x and y
x=tc[['pclass','sex','age','fare']]
y=tc['survived']
X_train,X_test, y_train,y_test=train_test_split(x,y,test_size=0.2)
transformed=ColumnTransformer(transformers=[
    ('tnf1', OrdinalEncoder(categories=[[1,2,3]]),['pclass']),
    ('tnf2', OneHotEncoder(sparse_output=False, drop='first' ),['sex'])
], remainder='passthrough')
X_train_transformed=transformed.fit_transform(X_train)
X_test_transformed=transformed.transform(X_test)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  tc['age'].fillna(tc['age'].mean(), inplace=True)


In [4]:
print(X_train_transformed)

[[ 1.          0.         31.         26.25      ]
 [ 0.          1.         29.69911765 39.6       ]
 [ 0.          1.         50.         55.9       ]
 ...
 [ 0.          1.         27.         53.1       ]
 [ 0.          0.         60.         75.25      ]
 [ 1.          1.         21.         73.5       ]]


In [5]:
tc.isnull().sum()
print('Survived')
print(tc['survived'].value_counts())
print('pclass')
print(tc['pclass'].value_counts())
print('sex')
print(tc['sex'].value_counts())

Survived
survived
0    549
1    342
Name: count, dtype: int64
pclass
pclass
3    491
1    216
2    184
Name: count, dtype: int64
sex
sex
male      577
female    314
Name: count, dtype: int64
