<a href="https://www.kaggle.com/code/ahmedalycess/titanic-survival-classification?scriptVersionId=164415251" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Titanic Survival Prediction
## 1. Import libraries and load the dataset

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier

from sklearn.metrics import f1_score, precision_score, recall_score

from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

In [2]:
raw_data = pd.read_csv('/kaggle/input/titanic/train.csv')
raw_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
len(raw_data)

891

In [4]:
raw_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

## 2. Cleaning the dataset
### 2.1. Dealing with missing values: 

In [5]:
# get missing values
raw_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

#### 2.1.1 Option 1: Drop columns with alot of missing data

In [6]:
# decide to drop Cabin column as it is 
#    missing too many values (687/891)
# params:
#    -labels: single label or list-like
#    -axis:   1 for dropping columns, 0 for dropping rows
clean_data = raw_data.drop(labels='Cabin', axis=1)  

In [7]:
clean_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         2
dtype: int64

#### 2.1.2 Option 2: Filling in missing data
Method 1: fill with median

Method 2: fill by classifying them as Unkown

In [8]:
clean_data['Age']

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64

In [9]:
# Since age is importent for us and it is not missing alot
# we could fill empty values with the median of the ages
median_age = clean_data['Age'].median()
print('Median Age is ', median_age)
clean_data['Age'] = clean_data['Age'].fillna(median_age)
clean_data['Age']

Median Age is  28.0


0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888    28.0
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64

In [10]:
clean_data.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       2
dtype: int64

In [11]:
# Embarked definition is "Port of Embarkation"
clean_data['Embarked'].unique() # S: Southampton, C: Cherbourg, Q: Queenstown

array(['S', 'C', 'Q', nan], dtype=object)

In [12]:
# another method to fill empty data especially if they are relatively 
# small and won't really affect the output like our case here with 'Embarked'
# which only has two NaNs out of 891 input is Classifying them 'U' as Unknown

clean_data['Embarked'] = clean_data['Embarked'].fillna('U')

In [13]:
clean_data.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [14]:
clean_data.to_csv('./clean_titanic.csv', index=None)

## 3. Feature Engineering
### 3.1 Transform features

In [15]:
preprocessed_data = pd.read_csv('./clean_titanic.csv')
preprocessed_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,S
...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,28.0,1,2,W./C. 6607,23.4500,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C


In [16]:
preprocessed_data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Embarked        object
dtype: object

In [17]:
print('Sex classes: ', preprocessed_data['Sex'].unique())
print('Embarked classes: ', preprocessed_data['Embarked'].unique())
# print('Name: ', preprocessed_data['Name'].unique()) #Note: Names are very unique
# print('Ticket: ', preprocessed_data['Ticket'].unique()) #Note: Tickets are very unique
# conclusion --> can't one-hot encode features with many unique classes 
# (computetional cost >> reward)

Sex classes:  ['male' 'female']
Embarked classes:  ['S' 'C' 'Q' 'U']


In [18]:
# Turn every categorical feature to numerical one because 
# machine learning models performs alot of mathimatical operations
# Transforming method 1:

#     ONE-HOT ENCODING
# original column is 'gender' --> ['male', 'female']
# generated_columns 'gender_male', 'gender_female' --> 0 or 1 for each
gender_columns = pd.get_dummies(preprocessed_data['Sex'], prefix='Sex')
embarked_columns = pd.get_dummies(preprocessed_data['Embarked'], prefix='Embarked')

# concat the new columns to the dataset
#                                                               index=1 -->adding columns
preprocessed_data = pd.concat([preprocessed_data, gender_columns], axis=1) 
preprocessed_data = pd.concat([preprocessed_data, embarked_columns], axis=1)

# drop original columns
preprocessed_data = preprocessed_data.drop(['Sex', 'Embarked'], axis=1)
preprocessed_data.tail()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_U
886,887,0,2,"Montvila, Rev. Juozas",27.0,0,0,211536,13.0,False,True,False,False,True,False
887,888,1,1,"Graham, Miss. Margaret Edith",19.0,0,0,112053,30.0,True,False,False,False,True,False
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",28.0,1,2,W./C. 6607,23.45,True,False,False,False,True,False
889,890,1,1,"Behr, Mr. Karl Howell",26.0,0,0,111369,30.0,False,True,True,False,False,False
890,891,0,3,"Dooley, Mr. Patrick",32.0,0,0,370376,7.75,False,True,False,True,False,False


In [19]:
preprocessed_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_U
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.2500,False,True,False,False,True,False
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,True,False,True,False,False,False
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.9250,True,False,False,False,True,False
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1000,True,False,False,False,True,False
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.0500,False,True,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",27.0,0,0,211536,13.0000,False,True,False,False,True,False
887,888,1,1,"Graham, Miss. Margaret Edith",19.0,0,0,112053,30.0000,True,False,False,False,True,False
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",28.0,1,2,W./C. 6607,23.4500,True,False,False,False,True,False
889,890,1,1,"Behr, Mr. Karl Howell",26.0,0,0,111369,30.0000,False,True,True,False,False,False


In [20]:
# Deciding whether to one-hot encode Pclass
# since Pclass is catorgies of 1, 2, or 3 let's see 
# if increasing/decreasing it directly correlate with 
# survival, if not we one-hot encode it

class_survived = preprocessed_data[['Pclass', 'Survived']]
first_class = class_survived[class_survived['Pclass']==1]
second_class = class_survived[class_survived['Pclass']==2]
third_class = class_survived[class_survived['Pclass']==3]

print('First class survivors: ', sum(first_class['Survived'])/len(first_class) * 100, '%')
print('Second class survivors: ', sum(second_class['Survived'])/len(second_class) * 100, '%')
print('Third class survivors: ', sum(third_class['Survived'])/len(third_class) * 100, '%')
# decision: leave it since it looks like there is a trend of 
# decreasing of survival w.r.t. class type

First class survivors:  62.96296296296296 %
Second class survivors:  47.28260869565217 %
Third class survivors:  24.236252545824847 %


In [21]:
# split age to multiple buckets since the relation between age feature and survival
# is likely not linear --> ex. ages 20~30 are more likely to survive by nature than
# young and old people. thus, although a non-linear model could detect such pattern
# it is better to catorgrize age into buckets so linear models could catch
# the patterns if possible

print('Max age: ', preprocessed_data['Age'].max())
print('Min age: ', preprocessed_data['Age'].min())

Max age:  80.0
Min age:  0.42


In [22]:
# Buckets [0 to 10, 11 to 20, ..., older than 81 ]
bins = [0, 10, 20, 30, 40, 50, 60, 70, 80]
catogrized_age = pd.cut(preprocessed_data['Age'], bins)
preprocessed_data['Categorized_age'] = catogrized_age
preprocessed_data = preprocessed_data.drop(['Age'], axis=1)

In [23]:
categorized_age_columns = pd.get_dummies(preprocessed_data['Categorized_age'], prefix='Categorized_age')
preprocessed_data = pd.concat([preprocessed_data, categorized_age_columns], axis=1)
preprocessed_data = preprocessed_data.drop(['Categorized_age'], axis=1)

In [24]:
preprocessed_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,SibSp,Parch,Ticket,Fare,Sex_female,Sex_male,...,Embarked_S,Embarked_U,"Categorized_age_(0, 10]","Categorized_age_(10, 20]","Categorized_age_(20, 30]","Categorized_age_(30, 40]","Categorized_age_(40, 50]","Categorized_age_(50, 60]","Categorized_age_(60, 70]","Categorized_age_(70, 80]"
0,1,0,3,"Braund, Mr. Owen Harris",1,0,A/5 21171,7.25,False,True,...,True,False,False,False,True,False,False,False,False,False
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,0,PC 17599,71.2833,True,False,...,False,False,False,False,False,True,False,False,False,False
2,3,1,3,"Heikkinen, Miss. Laina",0,0,STON/O2. 3101282,7.925,True,False,...,True,False,False,False,True,False,False,False,False,False
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,0,113803,53.1,True,False,...,True,False,False,False,False,True,False,False,False,False
4,5,0,3,"Allen, Mr. William Henry",0,0,373450,8.05,False,True,...,True,False,False,False,False,True,False,False,False,False


### 3.2 Drop unnecessary features 

In [25]:
# Dropping decision is usually taken due to one of two reasons:
# 1- Too many missing values like the case above with 'Cabin'
# 2- If the feature would ruin the model's accuracy rather than help identifying patterns
preprocessed_data.info()
print('-------------------------------------------')
preprocessed_data.nunique()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   PassengerId               891 non-null    int64  
 1   Survived                  891 non-null    int64  
 2   Pclass                    891 non-null    int64  
 3   Name                      891 non-null    object 
 4   SibSp                     891 non-null    int64  
 5   Parch                     891 non-null    int64  
 6   Ticket                    891 non-null    object 
 7   Fare                      891 non-null    float64
 8   Sex_female                891 non-null    bool   
 9   Sex_male                  891 non-null    bool   
 10  Embarked_C                891 non-null    bool   
 11  Embarked_Q                891 non-null    bool   
 12  Embarked_S                891 non-null    bool   
 13  Embarked_U                891 non-null    bool   
 14  Categorize

PassengerId                 891
Survived                      2
Pclass                        3
Name                        891
SibSp                         7
Parch                         7
Ticket                      681
Fare                        248
Sex_female                    2
Sex_male                      2
Embarked_C                    2
Embarked_Q                    2
Embarked_S                    2
Embarked_U                    2
Categorized_age_(0, 10]       2
Categorized_age_(10, 20]      2
Categorized_age_(20, 30]      2
Categorized_age_(30, 40]      2
Categorized_age_(40, 50]      2
Categorized_age_(50, 60]      2
Categorized_age_(60, 70]      2
Categorized_age_(70, 80]      2
dtype: int64

In [26]:
# We see that PassengerId, Name,and Ticket features would be bad for the model
# (result to bad accuracy and overfitting) because of how unique they are.
preprocessed_data = preprocessed_data.drop(['Name', 'Ticket', 'PassengerId'], axis=1)

In [27]:
preprocessed_data

Unnamed: 0,Survived,Pclass,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_U,"Categorized_age_(0, 10]","Categorized_age_(10, 20]","Categorized_age_(20, 30]","Categorized_age_(30, 40]","Categorized_age_(40, 50]","Categorized_age_(50, 60]","Categorized_age_(60, 70]","Categorized_age_(70, 80]"
0,0,3,1,0,7.2500,False,True,False,False,True,False,False,False,True,False,False,False,False,False
1,1,1,1,0,71.2833,True,False,True,False,False,False,False,False,False,True,False,False,False,False
2,1,3,0,0,7.9250,True,False,False,False,True,False,False,False,True,False,False,False,False,False
3,1,1,1,0,53.1000,True,False,False,False,True,False,False,False,False,True,False,False,False,False
4,0,3,0,0,8.0500,False,True,False,False,True,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,0,0,13.0000,False,True,False,False,True,False,False,False,True,False,False,False,False,False
887,1,1,0,0,30.0000,True,False,False,False,True,False,False,True,False,False,False,False,False,False
888,0,3,1,2,23.4500,True,False,False,False,True,False,False,False,True,False,False,False,False,False
889,1,1,0,0,30.0000,False,True,True,False,False,False,False,False,True,False,False,False,False,False


In [28]:
# save the preprocessed data to be used directly later
preprocessed_data.to_csv('./preprocessed_titanic_data.csv', index=None)

In [29]:
preprocessed_data = pd.read_csv('./preprocessed_titanic_data.csv')
preprocessed_data

Unnamed: 0,Survived,Pclass,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_U,"Categorized_age_(0, 10]","Categorized_age_(10, 20]","Categorized_age_(20, 30]","Categorized_age_(30, 40]","Categorized_age_(40, 50]","Categorized_age_(50, 60]","Categorized_age_(60, 70]","Categorized_age_(70, 80]"
0,0,3,1,0,7.2500,False,True,False,False,True,False,False,False,True,False,False,False,False,False
1,1,1,1,0,71.2833,True,False,True,False,False,False,False,False,False,True,False,False,False,False
2,1,3,0,0,7.9250,True,False,False,False,True,False,False,False,True,False,False,False,False,False
3,1,1,1,0,53.1000,True,False,False,False,True,False,False,False,False,True,False,False,False,False
4,0,3,0,0,8.0500,False,True,False,False,True,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,0,0,13.0000,False,True,False,False,True,False,False,False,True,False,False,False,False,False
887,1,1,0,0,30.0000,True,False,False,False,True,False,False,True,False,False,False,False,False,False
888,0,3,1,2,23.4500,True,False,False,False,True,False,False,False,True,False,False,False,False,False
889,1,1,0,0,30.0000,False,True,True,False,False,False,False,False,True,False,False,False,False,False


In [30]:
features = preprocessed_data.drop(['Survived'], axis=1)
labels = preprocessed_data['Survived']

In [31]:
features_train, features_validation_test, labels_train, labels_validation_test = train_test_split(features,labels, test_size=0.4,random_state=100)
features_validation, features_test, labels_validation, labels_test = train_test_split(features_validation_test, labels_validation_test, test_size=0.5, random_state=100)

In [32]:
print('Features: ',len(features))
print('Train Split: ', len(features_train), 'Validation Split: ', len(features_validation), 'Test Split: ', len(features_test))

Features:  891
Train Split:  534 Validation Split:  178 Test Split:  179


## 4. Training models
### [ 
#### Logistic regression,
#### Decision Tree Classifier,
#### Naive Bayes,
#### Support Vector Machine,
#### Random Forest Classifier,
#### Gradient Boosting Classifier,
#### Ada Boosting Classifier,
#### XGBoosting Classifier
### ]

In [33]:
models = [
    LogisticRegression(),
    DecisionTreeClassifier(),
    GaussianNB(),
    SVC(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    AdaBoostClassifier()
]

In [34]:
for model in models:
    model.fit(features_train, labels_train)
    print(f'{type(model).__name__} trained successfully.')

LogisticRegression trained successfully.
DecisionTreeClassifier trained successfully.
GaussianNB trained successfully.
SVC trained successfully.
RandomForestClassifier trained successfully.
GradientBoostingClassifier trained successfully.
AdaBoostClassifier trained successfully.


### 5.0 Evaluating all models
#### 5.1 Accuracy

In [35]:
best_accuracy_index, best_accuracy = 0, 0
for i, model in enumerate(models):
    score = model.score(features_validation, labels_validation)
    if score > best_accuracy:
        best_accuracy = score
        best_accuracy_index = i
    print(f'{type(model).__name__}: {score}')
print('----------------------------------------------')
print(f'Highest accuracy {type(models[best_accuracy_index]).__name__} with {best_accuracy}')

LogisticRegression: 0.7696629213483146
DecisionTreeClassifier: 0.7696629213483146
GaussianNB: 0.7415730337078652
SVC: 0.6797752808988764
RandomForestClassifier: 0.7752808988764045
GradientBoostingClassifier: 0.8089887640449438
AdaBoostClassifier: 0.7640449438202247
----------------------------------------------
Highest accuracy GradientBoostingClassifier with 0.8089887640449438


#### 5.2 F1-score, Precision, Recall

In [36]:
# using different metrics to evaluate each model
best_f1_index, best_f1_score = 0, 0
best_precision_index, best_precision_score = 0, 0
best_recall_index, best_recall_score = 0, 0

for i, model in enumerate(models):
    # predict label of validation data
    model_predicted_label = model.predict(features_validation)
    
    # Calculate f1 score --> 2TP / (2 * TP + FP + FN)
    
    model_f1_score = f1_score(labels_validation, model_predicted_label)
    if model_f1_score > best_f1_score:
        best_f1_index = i
        best_f1_score = model_f1_score
    
    # Calculate precision score --> TP / (TP+FP) (disc: all true positive predicted / all positive whether true or not)
    # Intuitively: ability of the model not to label as positive a sample that is negative
    
    model_precision_score = precision_score(labels_validation, model_predicted_label)
    if model_precision_score > best_precision_score:
        best_precision_index = i
        best_precision_score = model_precision_score

    # Calculate recall score --> TP / (TP + FN) (disc: all true positive predicted / all true positive generally )
    model_recall_score = recall_score(labels_validation, model_predicted_label)
    if model_recall_score > best_recall_score:
        best_recall_index = i
        best_recall_score = model_recall_score
    
    print('{} \n f1: {:.2f}, precision: {:.2f}, recall: {:.2f}'.format(type(model).__name__, 
                                                                     model_f1_score,
                                                                     model_precision_score,
                                                                     model_recall_score
                                                                    ))
print('------------------------------------------------------')
print(f'Highest f1-score {type(models[best_f1_index]).__name__} with {best_f1_score}')
print(f'Highest precision-score {type(models[best_precision_index]).__name__} with {best_precision_score}')
print(f'Highest recall-score {type(models[best_recall_index]).__name__} with {best_recall_score}')
    

LogisticRegression 
 f1: 0.69, precision: 0.74, recall: 0.64
DecisionTreeClassifier 
 f1: 0.69, precision: 0.74, recall: 0.64
GaussianNB 
 f1: 0.66, precision: 0.68, recall: 0.64
SVC 
 f1: 0.40, precision: 0.76, recall: 0.27
RandomForestClassifier 
 f1: 0.70, precision: 0.73, recall: 0.67
GradientBoostingClassifier 
 f1: 0.74, precision: 0.80, recall: 0.69
AdaBoostClassifier 
 f1: 0.68, precision: 0.73, recall: 0.63
------------------------------------------------------
Highest f1-score GradientBoostingClassifier with 0.7384615384615385
Highest precision-score GradientBoostingClassifier with 0.8
Highest recall-score GradientBoostingClassifier with 0.6857142857142857


##### Conclusion: Gradient Boosting Classifier is the best model for this dataset <br> if we didn't tune the hyperparameters
#### Testing the model

In [37]:
gb_model = models[best_f1_index]
gb_model.score(features_test, labels_test)

0.8324022346368715

In [38]:
gb_predicted_test_labels = gb_model.predict(features_test)
f1_score(labels_test, gb_predicted_test_labels)

0.8026315789473685

In [39]:
precision_score(labels_test, gb_predicted_test_labels)

0.8840579710144928

In [40]:
recall_score(labels_test, gb_predicted_test_labels)

0.7349397590361446

### Loading test.csv and predicting labels

In [41]:
test_data = pd.read_csv('/kaggle/input/titanic/test.csv')

In [42]:
test_data.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [43]:
# saving PassengerId to create submission.csv later
passenger_id = test_data['PassengerId']

# Cleaning the data
test_data = test_data.drop('Cabin', axis=1)
average_age = test_data["Age"].median()
test_data["Age"] = test_data["Age"].fillna(average_age)

average_fare = test_data["Fare"].median()
test_data['Fare'] = test_data['Fare'].fillna(average_fare)

# Preprocessing the data
test_gender_columns = pd.get_dummies(test_data['Sex'], prefix='Sex')
test_embarked_columns = pd.get_dummies(test_data["Embarked"], prefix="Embarked")
test_data = pd.concat([test_data, test_gender_columns], axis=1)
test_data = pd.concat([test_data, test_embarked_columns], axis=1)
test_data = test_data.drop(['Sex', 'Embarked'], axis=1)

# Artificially adding 'Unknown' class to 'Embarked' feature since it was needed in
# the training data. Add it and fill it with zeros.
test_data['Embarked_U'] = pd.DataFrame([0 for i in range(len(test_data))])

# test_categorized_pclass_columns = pd.get_dummies(test_data['Pclass'], prefix='Pclass')
# test_data = pd.concat([test_data, test_categorized_pclass_columns], axis=1)
# test_data = test_data.drop(['Pclass'], axis=1)

bins = [0, 10, 20, 30, 40, 50, 60, 70, 80]
test_categorized_age = pd.cut(test_data['Age'], bins)
test_data['Categorized_age'] = test_categorized_age
test_data = test_data.drop(["Age"], axis=1)

test_cagegorized_age_columns = pd.get_dummies(test_data['Categorized_age'], prefix='Categorized_age')
test_data = pd.concat([test_data, test_cagegorized_age_columns], axis=1)
test_data = test_data.drop(['Categorized_age'], axis=1)
test_data = test_data.drop(['Name', 'Ticket', 'PassengerId'], axis=1)
test_data

Unnamed: 0,Pclass,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_U,"Categorized_age_(0, 10]","Categorized_age_(10, 20]","Categorized_age_(20, 30]","Categorized_age_(30, 40]","Categorized_age_(40, 50]","Categorized_age_(50, 60]","Categorized_age_(60, 70]","Categorized_age_(70, 80]"
0,3,0,0,7.8292,False,True,False,True,False,0,False,False,False,True,False,False,False,False
1,3,1,0,7.0000,True,False,False,False,True,0,False,False,False,False,True,False,False,False
2,2,0,0,9.6875,False,True,False,True,False,0,False,False,False,False,False,False,True,False
3,3,0,0,8.6625,False,True,False,False,True,0,False,False,True,False,False,False,False,False
4,3,1,1,12.2875,True,False,False,False,True,0,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,3,0,0,8.0500,False,True,False,False,True,0,False,False,True,False,False,False,False,False
414,1,0,0,108.9000,True,False,True,False,False,0,False,False,False,True,False,False,False,False
415,3,0,0,7.2500,False,True,False,False,True,0,False,False,False,True,False,False,False,False
416,3,0,0,8.0500,False,True,False,False,True,0,False,False,True,False,False,False,False,False


In [44]:
for model in models:
    print('{}: {}'.format(type(model).__name__,sum(model.predict(test_data))))

LogisticRegression: 144
DecisionTreeClassifier: 142
GaussianNB: 178
SVC: 61
RandomForestClassifier: 142
GradientBoostingClassifier: 150
AdaBoostClassifier: 150


#### Go for GradientBoostingClassifier

In [45]:
gb_index = 5
predicitions = models[gb_index].predict(test_data)

In [46]:
submission_df = pd.DataFrame({'PassengerId': passenger_id, 'Survived': predicitions})
submission_df.to_csv('./submission.csv', index=None)