In [78]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression


import warnings
warnings.filterwarnings('ignore')

In [79]:
df = pd.read_csv('train.csv')[['Age','Pclass','SibSp','Parch','Survived']]
print(df.shape)
df.sample(6)

(891, 5)


Unnamed: 0,Age,Pclass,SibSp,Parch,Survived
746,16.0,3,1,1,0
394,24.0,3,0,2,1
286,30.0,3,0,0,1
833,23.0,3,0,0,0
542,11.0,3,4,2,0
85,33.0,3,3,0,1


In [80]:
df.dropna(inplace=True)
df.shape

(714, 5)

In [81]:
X = df.iloc[:,[0,1,2,3]]
y = df.iloc[:,-1]


lr = LogisticRegression()
print(f"CROSS VAL SCORE : {np.round(
    np.mean(cross_val_score(LogisticRegression(),X,y,scoring='accuracy',cv=20)), 2
    )*100} %")



CROSS VAL SCORE : 69.0 %


### **Using Feature Construction**

In [82]:
X['Total Passengers'] = X['SibSp'] + X['Parch'] + 1
X.drop(columns=['SibSp', 'Parch'], inplace=True)
X

Unnamed: 0,Age,Pclass,Total Passengers
0,22.0,3,2
1,38.0,1,2
2,26.0,3,1
3,35.0,1,2
4,35.0,3,1
...,...,...,...
885,39.0,3,6
886,27.0,2,1
887,19.0,1,1
889,26.0,1,1


In [83]:
# making a function to create a new feature, which holds travellers travelling with a small or big size family
def myfunc(num):
    if num==1:
        # travelling alone = 0
        return 0
    elif num>1 and num<=4:
        # travelling with small family = 1
        return 1
    else:
        # travelling with a big family = 2
        return 2

X['Family Type'] = X['Total Passengers'].apply(myfunc)
X.sample(6)

Unnamed: 0,Age,Pclass,Total Passengers,Family Type
649,23.0,3,1,0
434,50.0,1,2,1
885,39.0,3,6,2
737,35.0,1,1,0
463,48.0,2,1,0
417,18.0,2,3,1


In [None]:
print(
    np.round(
        np.mean(cross_val_score(lr, X,y,scoring='accuracy', cv=20)), 2)*100, '%'
    )       # as you can see the accuracy has increased from 69 % to 70%

70.0 %


---
---
---

### **Feature splitting** - *Splitting string data*

In [104]:
df = pd.read_csv('train.csv')
print(df['Name'].isna().sum(),'\n')
df['Name'].sample(6)

0 



736    Ford, Mrs. Edward (Margaret Ann Watson)
256             Thorne, Mrs. Gertrude Maybelle
561                          Sivic, Mr. Husein
388                       Sadlier, Mr. Matthew
348     Coutts, Master. William Loch "William"
648                         Willey, Mr. Edward
Name: Name, dtype: object

In [157]:
df['Title'] = df['Name'].str.split(',', expand=True)[1].str.split('.', expand=True)[0]
# It splits the string into separate columns.
# Without expand=True, pandas returns a LIST inside a single column.
# With expand=True, pandas creates new columns for each split part.

print(f"UNIQUE : {df['Title'].nunique()} \n")
print(f"UNIQUE : {df['Title'].unique()} \n")
df[['Title', 'Name']].sample(6)

UNIQUE : 17 

UNIQUE : [' Mr' ' Mrs' ' Miss' ' Master' ' Don' ' Rev' ' Dr' ' Mme' ' Ms' ' Major'
 ' Lady' ' Sir' ' Mlle' ' Col' ' Capt' ' the Countess' ' Jonkheer'] 



Unnamed: 0,Title,Name
583,Mr,"Ross, Mr. John Hugo"
142,Mrs,"Hakkarainen, Mrs. Pekka Pietari (Elin Matilda ..."
714,Mr,"Greenberg, Mr. Samuel"
561,Mr,"Sivic, Mr. Husein"
513,Mrs,"Rothschild, Mrs. Martin (Elizabeth L. Barrett)"
846,Mr,"Sage, Mr. Douglas Bullen"
