Feature engineering is the process of creating new features from existing ones, guided by the problem you are working on and by what you think is relevant to reaching the end goal.

In [51]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression


import warnings
# NOTE: 'ignore' with no category/module filter suppresses every warning for
# the rest of the session, so any warnings raised by later cells stay hidden.
warnings.filterwarnings('ignore')

In [52]:
# Read the training CSV and keep the four predictors followed by the target,
# in exactly this order (later cells index these columns by position).
selected_columns = ['Age', 'Pclass', 'SibSp', 'Parch', 'Survived']
df = pd.read_csv('train.csv')[selected_columns]
print(df.shape)
df.sample(6)

(891, 5)


Unnamed: 0,Age,Pclass,SibSp,Parch,Survived
138,16.0,3,0,0,0
843,34.5,3,0,0,0
820,52.0,1,1,1,1
3,35.0,1,1,0,1
270,,1,0,0,0
406,51.0,3,0,0,0


In [53]:
# Discard every row that has a missing value in any of the selected columns.
df = df.dropna()
df.shape

(714, 5)

In [54]:
# Select predictors and target by name rather than by position, and take an
# explicit copy: X gets new columns assigned to it in later cells, and writing
# into a slice of df is what triggers pandas' SettingWithCopyWarning.
X = df[['Age', 'Pclass', 'SibSp', 'Parch']].copy()
y = df['Survived']

# One shared estimator for the notebook; cross_val_score fits clones of it,
# so lr itself stays unfitted and can be reused for later comparisons.
lr = LogisticRegression()

# Baseline: mean accuracy over 20 cross-validation folds on the raw columns.
# The score is computed outside the f-string because a replacement field
# spanning several lines is a SyntaxError before Python 3.12.
baseline_accuracy = np.round(
    np.mean(cross_val_score(lr, X, y, scoring='accuracy', cv=20)), 2
) * 100
print(f"CROSS VAL SCORE : {baseline_accuracy} %")



CROSS VAL SCORE : 69.0 %


### **Using Feature Construction**

In [55]:
# Party size = siblings/spouses + parents/children + 1, where the extra 1
# counts the passenger themself. The two source columns are then dropped.
X['Total Passengers'] = X['SibSp'].add(X['Parch']).add(1)
X = X.drop(columns=['SibSp', 'Parch'])
X.sample(6)

Unnamed: 0,Age,Pclass,Total Passengers
190,32.0,2,1
463,48.0,2,1
500,17.0,3,1
8,27.0,3,3
555,62.0,1,1
122,32.5,2,2


In [56]:
# making a function to create a new feature, which holds travellers travelling with a small or big size family
def myfunc(num):
    """Encode a party size as a family-type category.

    Returns 0 when ``num`` is exactly 1 (travelling alone), 1 when ``num`` is
    greater than 1 and at most 4 (small family), and 2 for every other value
    (big family).
    """
    if num == 1:
        return 0
    if 1 < num <= 4:
        return 1
    return 2

# Translate each row's party size into its family-type code, element by element.
X['Family Type'] = X['Total Passengers'].map(myfunc)
X.sample(6)

Unnamed: 0,Age,Pclass,Total Passengers,Family Type
249,54.0,2,2,1
483,63.0,3,1,0
441,20.0,3,1,0
165,9.0,3,3,1
156,16.0,3,1,0
323,22.0,2,3,1


In [57]:
# Mean 20-fold accuracy with the constructed features, shown as a percentage.
# The recorded outputs show 70.0 % here against 69.0 % for the baseline run.
mean_accuracy = np.mean(cross_val_score(lr, X, y, scoring='accuracy', cv=20))
print(np.round(mean_accuracy, 2) * 100, '%')

70.0 %


---
---
---

### **Feature splitting** - *Splitting string data*

In [58]:
# Reload the full CSV (all columns this time) and report how many names are missing.
df = pd.read_csv('train.csv')
missing_names = df['Name'].isna().sum()
print(missing_names, '\n')
df['Name'].sample(6)

0 



483      Turkula, Mrs. (Hedwig)
303         Keane, Miss. Nora A
532        Elias, Mr. Joseph Jr
77     Moutal, Mr. Rahamin Haim
327     Ball, Mrs. (Ada E Hall)
401             Adams, Mr. John
Name: Name, dtype: object

In [59]:
# The sampled names look like "Surname, Title. Given names": keep the piece
# after the first ', ' and, from that, the piece before the first '.'.
# expand=True makes str.split return one column per piece (a DataFrame), so
# [1] and [0] pick a piece by column label; without it each row would hold a list.
after_surname = df['Name'].str.split(', ', expand=True)[1]
df['Title'] = after_surname.str.split('.', expand=True)[0]

title_count = df['Title'].nunique()
title_values = df['Title'].unique()
print(f"UNIQUE : {title_count} \n")
print(f"UNIQUE : {title_values} \n")
df[['Title', 'Name']].sample(6)

UNIQUE : 17 

UNIQUE : ['Mr' 'Mrs' 'Miss' 'Master' 'Don' 'Rev' 'Dr' 'Mme' 'Ms' 'Major' 'Lady'
 'Sir' 'Mlle' 'Col' 'Capt' 'the Countess' 'Jonkheer'] 



Unnamed: 0,Title,Name
679,Mr,"Cardeza, Mr. Thomas Drake Martinez"
533,Mrs,"Peter, Mrs. Catherine (Catherine Rizk)"
426,Mrs,"Clarke, Mrs. Charles V (Ada Maria Winfield)"
156,Miss,"Gilnagh, Miss. Katherine ""Katie"""
826,Mr,"Lam, Mr. Len"
296,Mr,"Hanna, Mr. Mansour"


In [64]:
# representing survival rate along with the title
# Mean of Survived per Title, highest first, returned as a two-column frame.
survival_by_title = df.groupby('Title')['Survived'].mean()
a = survival_by_title.sort_values(ascending=False).reset_index(name='Survival_Rate')
a

Unnamed: 0,Title,Survival_Rate
0,Lady,1.0
1,Ms,1.0
2,Sir,1.0
3,Mme,1.0
4,the Countess,1.0
5,Mlle,1.0
6,Mrs,0.792
7,Miss,0.697802
8,Master,0.575
9,Major,0.5
