In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the raw Kaggle Titanic CSVs: the labelled training set and the unlabelled test set.
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")

In [3]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


Let's combine data for preprocessing

In [5]:
# Stack train and test rows into one frame with a fresh 0..n-1 index.
# test_data has no Survived column, so its rows carry NaN there after the concat.
data = pd.concat((train_data, test_data), axis=0, ignore_index=True)

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 122.8+ KB


In [6]:
# Encode Sex as an integer flag: female -> 0, male -> 1.
# Values that are already 0/1 pass through unchanged, so re-running the cell is harmless.
data['Sex'] = data['Sex'].replace({'female': 0, 'male': 1}).astype(int)

Let's do some additional work with the data:

Let's extract the passenger's form of address (title) from the name. The title lets us identify boys, whose survival rate is higher than that of adult men. This is important when the passenger's age is missing.
Let's also extract the last name so that passengers can be grouped into families and the families used for forecasting.

Function for extracting a title from a name.

Apply the function to the "Name" column and create a new "Title" column.

In [7]:
def extract_title(name):
    """Return the form of address embedded in a passenger name.

    The title is the text between the first comma and the next period,
    with surrounding whitespace removed, e.g. "Braund, Mr. Owen Harris" -> "Mr".
    Raises IndexError when ``name`` contains no comma.
    """
    after_surname = name.split(',')[1]
    before_period = after_surname.split('.')[0]
    return before_period.strip()

# Derive the Title column element-wise from the passenger names.
data['Title'] = data['Name'].map(extract_title)

In [8]:
# Mean survival (over labelled rows) and passenger count per title, most frequent first.
title_stats = data.groupby('Title').agg({'Survived': 'mean', 'PassengerId': 'count'})
title_stats = title_stats.rename(columns={'PassengerId': 'Count'})
title_stats.sort_values(by='Count', ascending=False)

Unnamed: 0_level_0,Survived,Count
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Mr,0.156673,757
Miss,0.697802,260
Mrs,0.792,197
Master,0.575,61
Rev,0.0,8
Dr,0.428571,8
Col,0.5,4
Ms,1.0,2
Major,0.5,2
Mlle,1.0,2


We see that the form of address (Title) significantly affects survival.

In [9]:
# Rare titles are folded into two broader groups; every other title is left as is.
# NOTE(review): 'Mlle' and 'Ms' are merged into 'Mrs' here rather than 'Miss' — confirm this is intended.
Mrs = ['Mlle', 'Mme', 'Ms', 'Lady', 'the Countess', 'Dona']
Aristocrat = ['Dr', 'Don', 'Major', 'Sir', 'Col', 'Capt', 'Jonkheer']

data['Title'] = data['Title'].mask(data['Title'].isin(Mrs), 'Mrs')
data['Title'] = data['Title'].mask(data['Title'].isin(Aristocrat), 'Aristocrat')

In [10]:
data['Title'].value_counts()

Title
Mr            757
Miss          260
Mrs           205
Master         61
Aristocrat     18
Rev             8
Name: count, dtype: int64

Let's look at the missing data and fill in Cabin and Fare.

In [11]:
data.isna().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
Title             0
dtype: int64

In [12]:
data.loc[data['Fare'].isnull(), 'Pclass']

1043    3
Name: Pclass, dtype: int64

In [13]:
# Impute missing fares with the mean fare of third-class passengers
# (the only passenger without a fare travels in class 3, see the cell above).
third_class_mean_fare = data.loc[data['Pclass'] == 3, 'Fare'].mean()
data['Fare'] = data['Fare'].fillna(third_class_mean_fare)

Whether or not a cabin is recorded for a passenger is an important feature affecting survival.

In [14]:
train_data[train_data['Cabin'].notnull()].groupby(['Sex', 'Pclass'])['Survived'].mean()

Sex     Pclass
female  1         0.962963
        2         0.900000
        3         0.666667
male    1         0.410526
        2         0.666667
        3         0.333333
Name: Survived, dtype: float64

In [15]:
train_data[train_data['Cabin'].isnull()].groupby(['Sex', 'Pclass'])['Survived'].mean()

Sex     Pclass
female  1         1.000000
        2         0.924242
        3         0.492754
male    1         0.222222
        2         0.127451
        3         0.131965
Name: Survived, dtype: float64

Let's create a column with the presence or absence of information about the passenger cabin.

In [16]:
# Replace missing cabins with the placeholder 'NA', then flag rows that have a real cabin value.
data['Cabin'] = data['Cabin'].fillna('NA')
data['Is_cabin'] = (data['Cabin'] != 'NA').astype('int64')

In [17]:
data['Is_cabin'].value_counts()

Is_cabin
0    1014
1     295
Name: count, dtype: int64

Let's create a Family column. The family will contain the last name, number of family members and passenger class.

In [18]:
# Family size = siblings/spouses + parents/children + the passenger themselves.
data['Family_size'] = data['SibSp'].add(data['Parch']).add(1)

In [19]:
def extract_surname(name):
    """Return the part of ``name`` before the first comma, stripped of whitespace.

    If ``name`` has no comma the whole (stripped) string is returned.
    """
    surname, _, _ = name.partition(',')
    return surname.strip()

# Derive the Surname column element-wise from the passenger names.
data['Surname'] = data['Name'].map(extract_surname)


def family(df):
    """Build a family key of the form '<Surname>-<Family_size>-Pclass-<Pclass>'.

    ``df`` is a single row (anything indexable by 'Surname', 'Family_size'
    and 'Pclass'); the surname must already be a string.
    """
    key_parts = [df['Surname'], str(df['Family_size']), 'Pclass', str(df['Pclass'])]
    return '-'.join(key_parts)

# Build the family key row by row (each row is passed to `family`).
data['Family'] = data.apply(family, axis='columns')

In [20]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Is_cabin,Family_size,Surname,Family
0,1,0.0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S,Mr,0,2,Braund,Braund-2-Pclass-3
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,1,2,Cumings,Cumings-2-Pclass-1
2,3,1.0,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,0,1,Heikkinen,Heikkinen-1-Pclass-3
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S,Mrs,1,2,Futrelle,Futrelle-2-Pclass-1
4,5,0.0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S,Mr,0,1,Allen,Allen-1-Pclass-3


Let's remove features that are not valuable.

In [21]:
# Columns not used from here on; `columns=` already selects the axis, so no `axis` argument is needed.
unused_columns = ['Name', 'Age', 'SibSp', 'Parch', 'Ticket', 'Embarked', 'Surname']
data = data.drop(columns=unused_columns)

Converting categorical features to numbers using One-Hot Encoding

In [22]:
# Family_size is left out of the model features but stays in `data`, where the
# family-based correction further down still needs it. get_dummies then one-hot
# encodes every remaining text column.
# NOTE(review): PassengerId is still a column here and therefore ends up among the
# model features — confirm this is intended.
data_temp = pd.get_dummies(data.drop(columns=['Family_size']))

Let's divide the data into train_data and test_data

In [23]:
# Rows with a known label form the training set, the rest the test set.
# Note: this rebinds train_data / test_data from the raw CSV frames to the encoded ones.
has_label = data_temp['Survived'].notnull()
train_data = data_temp[has_label]
test_data = data_temp[~has_label]

Let's divide the train_data into features (matrix X) and target variable (y)

In [24]:
# Feature matrix: everything except the label; target: the label column.
X = train_data.drop(columns=['Survived'])
y = train_data['Survived']

Let's divide the data into training and validation samples

In [25]:
# 70/30 train/validation split, stratified on the label and seeded for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

Let's set the Random Forest algorithm for the model

In [26]:
# Random forest with a fixed seed, fitted on the training part of the split only.
rf_settings = dict(n_estimators=200, max_depth=20, random_state=1)
model_rf = RandomForestClassifier(**rf_settings).fit(X_train, y_train)

model_rf

Let's write a function to calculate metrics.

In [27]:
def metric_calculation(y_test, X_test, model):
    """Print accuracy, precision, recall and ROC AUC of ``model`` on a labelled set.

    Parameters
    ----------
    y_test : true labels for ``X_test``.
    X_test : feature matrix to predict on.
    model : fitted classifier exposing ``predict`` and ``predict_proba``.

    Hard predictions feed accuracy/precision/recall; the second column of
    ``predict_proba`` feeds ROC AUC. Nothing is returned.
    """
    predicted_labels = model.predict(X_test)
    positive_scores = model.predict_proba(X_test)[:, 1]
    print(f'Accuracy: {accuracy_score(y_test, predicted_labels):.2f}')
    print(f'Precision: {precision_score(y_test, predicted_labels):.2f}')
    print(f'Recall: {recall_score(y_test, predicted_labels):.2f}')
    print(f'ROC AUC: {roc_auc_score(y_test, positive_scores):.2f}')

We will get the accuracy, precision, recall and ROC AUC metrics for the model.

In [28]:
metric_calculation(y_val, X_val, model_rf)

Accuracy: 0.83
Precision: 0.84
Recall: 0.70
ROC AUC: 0.86


Let's make a forecast.

In [29]:
# Test features: same columns as X (the all-NaN label column is removed).
X_test = test_data.drop(columns='Survived')

In [30]:
# Train the submission model on the full labelled set.
# A fresh estimator with the same hyper-parameters (including random_state) is used
# instead of refitting `model_rf` in place: that keeps `model_rf` as the model that
# was validated above, so re-running the metrics cell never scores a model that has
# already seen the validation rows. The predictions are the same as with an
# in-place refit.
model_final = RandomForestClassifier(**model_rf.get_params())
model_final.fit(X, y)

predictions = model_final.predict(X_test)

# The labels were floats (NaN for test rows), so predictions come back as 0.0/1.0;
# cast them to int for the submission file.
output = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': predictions.astype(int),
})

output.to_csv('submission-f-1.csv', index=False)

In [31]:
output["Survived"].value_counts()

Survived
0    312
1    106
Name: count, dtype: int64

#### Kaggle Accuracy : 79.90

#### Let's improve the forecast by grouping passengers by family.
Principle: we remove adult men from the families, leaving women and children, and compute the mean survival rate of each family over the members whose outcome is known. If all of them survived, we assume that the unknown (Survived == NaN) family members survived too; if fewer than 34% survived, we assume they did not; families in between keep the model's prediction. Most likely, if the mother is alive, then the child is alive and vice versa.

In [32]:
# Keep passengers travelling with relatives whose title is not one of the
# adult-male groups ('Mr', 'Rev', 'Aristocrat').
# .copy() makes this an independent frame: columns are added to it below, which
# would otherwise raise SettingWithCopyWarning (hidden by the global warnings filter).
keeps_title = ~data['Title'].isin(['Mr', 'Rev', 'Aristocrat'])
has_relatives = data['Family_size'] > 1
data_family = data[keeps_title & has_relatives].copy()

In [33]:
data_family.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Fare,Cabin,Title,Is_cabin,Family_size,Family
1,2,1.0,1,0,71.2833,C85,Mrs,1,2,Cumings-2-Pclass-1
3,4,1.0,1,0,53.1,C123,Mrs,1,2,Futrelle-2-Pclass-1
7,8,0.0,3,1,21.075,,Master,0,5,Palsson-5-Pclass-3
8,9,1.0,3,0,11.1333,,Mrs,0,3,Johnson-3-Pclass-3
9,10,1.0,2,0,30.0708,,Mrs,0,2,Nasser-2-Pclass-2


In [34]:
# Mean survival per family over members with a known outcome
# (NaN labels are skipped; a family with no known outcome gets NaN).
mean_survived_by_family = data_family.groupby('Family')['Survived'].mean()

data_family['Survived_family'] = data_family['Family'].map(mean_survived_by_family)

# Families where fewer than 34% of the known members survived are treated as "did not survive".
data_family.loc[data_family['Survived_family'] < 0.34, 'Survived_family'] = 0

# Only clear-cut families are kept: all known members survived (1) or (almost) none did (0).
data_family = data_family.loc[data_family['Survived_family'].isin([0, 1])]

In [35]:
# Overwrite the model's prediction with the family outcome wherever one is available.
# The values are looked up explicitly by PassengerId instead of relying on `output`
# and `data_family` sharing row labels, so the override stays correct even if either
# frame is re-indexed.
family_outcome = data_family.set_index('PassengerId')['Survived_family']
in_family = output['PassengerId'].isin(family_outcome.index)
output.loc[in_family, 'Survived'] = output.loc[in_family, 'PassengerId'].map(family_outcome)

output["Survived"].value_counts()

Survived
0    303
1    115
Name: count, dtype: int64

In [36]:
# Make sure the label column is integer-typed before writing the second submission.
output = output.astype({'Survived': int})

output.to_csv('submission-f-2.csv', index=False)

#### Kaggle Accuracy : 81.58
#### # 194