In [254]:
#The goal is to predict whether or not a passenger survived based on attributes such 
#as their age, sex, passenger class, where they embarked and so on.

In [255]:


import os
import urllib.request

TITANIC_PATH = os.path.join("datasets", "titanic")
DOWNLOAD_URL = "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/titanic/"

def fetch_titanic_data(url=DOWNLOAD_URL, path=TITANIC_PATH):
    """Download the Titanic train/test CSV files into ``path`` if they are missing.

    Parameters
    ----------
    url : str
        Base URL that hosts ``train.csv`` and ``test.csv``. A trailing slash
        is optional.
    path : str
        Local directory for the files; created (with parents) when absent.

    Files that already exist in ``path`` are left untouched, so calling this
    function repeatedly is cheap.
    """
    os.makedirs(path, exist_ok=True)
    base_url = url if url.endswith("/") else url + "/"
    for filename in ("train.csv", "test.csv"):
        filepath = os.path.join(path, filename)
        if os.path.isfile(filepath):
            continue
        print("Downloading", filename)
        # Download to a temporary name and rename on success, so that an
        # interrupted transfer never leaves a truncated CSV that later runs
        # would mistake for a complete file.
        partial_path = filepath + ".part"
        try:
            urllib.request.urlretrieve(base_url + filename, partial_path)
            os.replace(partial_path, filepath)
        finally:
            if os.path.exists(partial_path):
                os.remove(partial_path)

fetch_titanic_data()    



In [256]:


import pandas as pd

def load_titanic_data(filename, titanic_path=TITANIC_PATH):
    """Read the CSV file ``filename`` located in ``titanic_path`` into a DataFrame."""
    return pd.read_csv(os.path.join(titanic_path, filename))



In [257]:
train_data = load_titanic_data("train.csv")
test_data = load_titanic_data("test.csv")


    PassengerId: a unique identifier for each passenger
    Survived: the target; 0 means the passenger did not survive, 1 means the passenger survived.
    Pclass: passenger class.
    Name, Sex, Age: self-explanatory
    SibSp: how many siblings & spouses of the passenger aboard the Titanic.
    Parch: how many children & parents of the passenger aboard the Titanic.
    Ticket: ticket id
    Fare: price paid (in pounds)
    Cabin: passenger's cabin number
    Embarked: the port where the passenger boarded the Titanic


In [258]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [259]:
train_data.isna()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,False,False,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,True,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
886,False,False,False,False,False,False,False,False,False,False,True,False
887,False,False,False,False,False,False,False,False,False,False,False,False
888,False,False,False,False,False,True,False,False,False,False,True,False
889,False,False,False,False,False,False,False,False,False,False,False,False


In [260]:
train_data[train_data == 0].count()

PassengerId      0
Survived       549
Pclass           0
Name             0
Sex              0
Age              0
SibSp          608
Parch          678
Ticket           0
Fare            15
Cabin            0
Embarked         0
dtype: int64

In [261]:
train_data.describe() #before changes

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699113,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526507,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.4167,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [262]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [263]:
train_data = train_data.set_index("PassengerId")
test_data = test_data.set_index("PassengerId")

In [264]:
train_data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [265]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [266]:
train_data["Survived"].value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [267]:
train_data["Pclass"].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [268]:
train_data["Sex"].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [269]:
train_data["Embarked"].value_counts()

#The Embarked attribute tells us where the passenger embarked: C=Cherbourg, Q=Queenstown, S=Southampton.


S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [270]:

train_data[train_data["Sex"] == "female"]["Age"].median()

27.0

In [271]:
train_data[train_data["Sex"] == "male"]["Age"].median()

29.0

In [272]:

#train_data[(train_data["Sex"] == "female") & (train_data["Age"] == 0)].fillna({'Age': 27})
#train_data

In [273]:
# Replace missing ages with the median age of passengers of the same sex.
# The medians are computed from the data instead of being hard-coded, so the
# cell stays correct if the dataset changes; re-running it is a no-op because
# no missing ages remain after the first run.
age_median_by_sex = train_data.groupby("Sex")["Age"].transform("median")
train_data["Age"] = train_data["Age"].fillna(age_median_by_sex)

In [274]:
import numpy as np

# NOTE(review): astype() returns a new Series and the result is not assigned,
# so this cell only displays the cast values; train_data["Age"] itself keeps
# its float64 dtype for the rest of the notebook.
train_data["Age"].astype(np.uint8) # preview of Age cast to unsigned 8-bit integers

PassengerId
1      22
2      38
3      26
4      35
5      35
       ..
887    27
888    19
889    27
890    26
891    32
Name: Age, Length: 891, dtype: uint8

In [275]:
# Select the numeric columns explicitly: pandas >= 2.0 no longer drops text
# columns (Name, Sex, Ticket, ...) silently in corr() and raises instead.
train_data.select_dtypes(include="number").corr()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
Survived,1.0,-0.338481,-0.073297,-0.035322,0.081629,0.257307
Pclass,-0.338481,1.0,-0.338056,0.083081,0.018443,-0.5495
Age,-0.073297,-0.338056,1.0,-0.236376,-0.176038,0.09416
SibSp,-0.035322,0.083081,-0.236376,1.0,0.414838,0.159651
Parch,0.081629,0.018443,-0.176038,0.414838,1.0,0.216225
Fare,0.257307,-0.5495,0.09416,0.159651,0.216225,1.0


In [283]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
# Numeric features: fill missing values with the column median, then standardize.
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# OneHotEncoder's `sparse` keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4. Try the new name first and fall back
# to the old one so the notebook runs on both older and current releases.
try:
    cat_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    cat_encoder = OneHotEncoder(sparse=False)

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode into a dense array.
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("cat_encoder", cat_encoder),
])

In [284]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       891 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [361]:
from sklearn.compose import ColumnTransformer
# Columns handled by num_pipeline.
num_attributes = ["Age" ,"SibSp", "Fare", "Parch", "Pclass"]
# Columns handled by cat_pipeline.
cat_attributes = ["Sex", "Embarked"]


# Combine both sub-pipelines; each one is applied only to its own column list
# and the results are concatenated in the order given here ("num" then "cat").
preprocess_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attributes),
    ("cat", cat_pipeline, cat_attributes),
])



In [362]:
X_train = preprocess_pipeline.fit_transform(train_data[num_attributes + cat_attributes])
X_train

array([[-0.57190135,  0.43279337, -0.50244517, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.65778503,  0.43279337,  0.78684529, ...,  1.        ,
         0.        ,  0.        ],
       [-0.26447976, -0.4745452 , -0.48885426, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.18762436,  0.43279337, -0.17626324, ...,  0.        ,
         0.        ,  1.        ],
       [-0.26447976, -0.4745452 , -0.04438104, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.19665263, -0.4745452 , -0.49237783, ...,  0.        ,
         1.        ,  0.        ]])

In [363]:
y_train = train_data["Survived"]

In [364]:
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(n_estimators=200, random_state=42)
forest_clf.fit(X_train, y_train)

In [365]:
X_test = preprocess_pipeline.transform(test_data[num_attributes + cat_attributes])
y_pred = forest_clf.predict(X_test)


In [366]:
from sklearn.model_selection import cross_val_score
forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=20)
forest_scores.mean()

0.8181818181818181

In [367]:

from sklearn.svm import SVC
svc_model = SVC(gamma="auto")

In [368]:
svc_scores = cross_val_score(svc_model, X_train, y_train, cv=30)


In [None]:
svc_scores.mean()

0.8283524904214559

In [None]:
import matplotlib.pyplot as plt

# Compare the per-fold cross-validation accuracies of the two models.
plt.figure(figsize=(8, 4))
# One column of dots per model. The x positions are sized from the actual
# number of folds so x and y always have matching lengths, whatever `cv`
# value was used for each model.
plt.plot([1] * len(svc_scores), svc_scores, ".")
plt.plot([2] * len(forest_scores), forest_scores, ".")
plt.boxplot([svc_scores, forest_scores])
# Boxes are drawn at x = 1 and x = 2 by default; label those ticks directly
# rather than through boxplot's `labels` keyword, which newer matplotlib
# releases deprecate in favour of `tick_labels`.
plt.xticks([1, 2], ["SVM", "Random Forest"])
plt.ylabel("Accuracy", fontsize=14)
plt.show()