In [2]:
# step-1 load dataset

import pandas as pd

dataset = pd.read_csv("Datasets/Cleaned_dataset_titanic.csv")

In [4]:
dataset.head(3)

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,age_group
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,Adult
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,Adult
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,Adult


In [6]:
# Step 2: apply() on a Column (Row Logic)
# Create age_group

def age_group(age):
    if age < 18:
        return 'child'
    elif age <= 60:
        return 'adult'
    else:
        return 'senior'

dataset['age_group'] = dataset['age'].apply(age_group)

In [8]:
dataset.head(3)

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,age_group
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,adult
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,adult
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,adult


In [12]:
# Step 3: lambda (Short Logic)

dataset['fare_level'] = dataset['fare'].apply(
    lambda x: 'High' if x > 100 else 'Low'
)

In [14]:
dataset.head(3)

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,age_group,fare_level
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,adult,Low
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,adult,Low
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,adult,Low


In [32]:
# Step 4: apply() on Rows (axis=1) 
# Use only when necessary (slower).

dataset['family_size'] = dataset.apply(
    lambda row : row['sibsp'] + row['parch'] + 1,
    axis = 1
)

In [30]:
dataset.head(3)

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,age_group,fare_level,family_size
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,adult,Low,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,adult,Low,2
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,adult,Low,1


In [36]:
# Step 5: Vectorized Operations (BEST PRACTICE)
# Faster
# Cleaner
# Preferred in interviews

dataset['family_size_vec'] = dataset['sibsp'] + dataset['parch']

In [38]:
dataset.head(2)

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,age_group,fare_level,family_size,family_size_vec
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,adult,Low,2,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,adult,Low,2,1


In [40]:
# Step 6: np.where() (Pro Trick)

import numpy as np

dataset['is_alone'] = np.where(dataset['family_size_vec'] == 1, 'Yes', 'No')

In [42]:
dataset.head(2)

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,age_group,fare_level,family_size,family_size_vec,is_alone
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,adult,Low,2,1,Yes
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,adult,Low,2,1,Yes
