# Feature Engineering Assignment (Titanic Dataset)



## Step 1: Import Libraries

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.decomposition import PCA
from scipy import stats

df = pd.read_csv('titanic.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Step 2: Handle Missing Values

In [5]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:

df['Age'].fillna(df['Age'].median(), inplace=True)
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
df.drop(columns=['Cabin'], inplace=True)


## Step 3: Handle Categorical Values

In [7]:
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])
df['Embarked'] = le.fit_transform(df['Embarked'])
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,2
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,2


## Step 4: Remove Outliers (Z-Score Method)

In [8]:
z = np.abs(stats.zscore(df['Fare']))
df = df[(z < 3)]
df.shape

(871, 11)

## Step 5: Feature Scaling

In [9]:
scaler = StandardScaler()
num_cols = ['Age','Fare','SibSp','Parch']
df[num_cols] = scaler.fit_transform(df[num_cols])
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,-0.562191,0.438043,-0.462507,A/5 21171,-0.657256,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,0.6654,0.438043,-0.462507,PC 17599,1.525623,0
2,3,1,3,"Heikkinen, Miss. Laina",0,-0.255293,-0.470374,-0.462507,STON/O2. 3101282,-0.634246,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,0.435227,0.438043,-0.462507,113803,0.905759,2
4,5,0,3,"Allen, Mr. William Henry",1,0.435227,-0.470374,-0.462507,373450,-0.629984,2


## Step 6: Feature Selection (SelectKBest)

In [10]:
X = df.drop(columns=['Survived','Name','Ticket','PassengerId'])
y = df['Survived']

selector = SelectKBest(score_func=chi2, k=5)
X_new = selector.fit_transform(abs(X), y)

selected_features = X.columns[selector.get_support()]
selected_features

Index(['Pclass', 'Sex', 'SibSp', 'Fare', 'Embarked'], dtype='object')

## Step 7: PCA (Dimensionality Reduction)

In [11]:
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X)
pca.explained_variance_ratio_

array([0.30654406, 0.28291024])

## Conclusion
We performed all major feature engineering steps successfully.