# Import Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load in Data

In [2]:
Titanic = sns.load_dataset('titanic')

In [3]:
Titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


# Data Wrangling

### Drop any variables that are redundant and will add to multicollinearity.

In [4]:
Titanic1= Titanic.drop('fare', axis= 1)

In [5]:
Titanic1

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,C,First,man,True,C,Cherbourg,yes,True


In [6]:
Titanic2= Titanic1.drop('adult_male', axis= 1)

In [7]:
Titanic2

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,embarked,class,who,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,S,Third,man,,Southampton,no,False
1,1,1,female,38.0,1,0,C,First,woman,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,S,Third,woman,,Southampton,yes,True
3,1,1,female,35.0,1,0,S,First,woman,C,Southampton,yes,False
4,0,3,male,35.0,0,0,S,Third,man,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,S,Second,man,,Southampton,no,True
887,1,1,female,19.0,0,0,S,First,woman,B,Southampton,yes,True
888,0,3,female,,1,2,S,Third,woman,,Southampton,no,False
889,1,1,male,26.0,0,0,C,First,man,C,Cherbourg,yes,True


In [8]:
Titanic3= Titanic2.drop('alone', axis= 1)

In [9]:
Titanic3

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,embarked,class,who,deck,embark_town,alive
0,0,3,male,22.0,1,0,S,Third,man,,Southampton,no
1,1,1,female,38.0,1,0,C,First,woman,C,Cherbourg,yes
2,1,3,female,26.0,0,0,S,Third,woman,,Southampton,yes
3,1,1,female,35.0,1,0,S,First,woman,C,Southampton,yes
4,0,3,male,35.0,0,0,S,Third,man,,Southampton,no
...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,S,Second,man,,Southampton,no
887,1,1,female,19.0,0,0,S,First,woman,B,Southampton,yes
888,0,3,female,,1,2,S,Third,woman,,Southampton,no
889,1,1,male,26.0,0,0,C,First,man,C,Cherbourg,yes


In [10]:
Titanic.pclass.value_counts() 

3    491
1    216
2    184
Name: pclass, dtype: int64

In [11]:
Titanic.embarked.value_counts()

S    644
C    168
Q     77
Name: embarked, dtype: int64

In [13]:
Titanic.embark_town.value_counts()

Southampton    644
Cherbourg      168
Queenstown      77
Name: embark_town, dtype: int64

#Embark_town and embarked are the basically the same thing will drop embarked

In [14]:
Titanic4= Titanic3.drop('embarked', axis= 1)

In [15]:
Titanic4

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,class,who,deck,embark_town,alive
0,0,3,male,22.0,1,0,Third,man,,Southampton,no
1,1,1,female,38.0,1,0,First,woman,C,Cherbourg,yes
2,1,3,female,26.0,0,0,Third,woman,,Southampton,yes
3,1,1,female,35.0,1,0,First,woman,C,Southampton,yes
4,0,3,male,35.0,0,0,Third,man,,Southampton,no
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,Second,man,,Southampton,no
887,1,1,female,19.0,0,0,First,woman,B,Southampton,yes
888,0,3,female,,1,2,Third,woman,,Southampton,no
889,1,1,male,26.0,0,0,First,man,C,Cherbourg,yes


In [16]:
Titanic["class"].value_counts()

Third     491
First     216
Second    184
Name: class, dtype: int64

#Pclass and class are basically the same things will drop pclass 

In [17]:
Titanic5= Titanic4.drop('pclass', axis= 1)

In [18]:
Titanic5

Unnamed: 0,survived,sex,age,sibsp,parch,class,who,deck,embark_town,alive
0,0,male,22.0,1,0,Third,man,,Southampton,no
1,1,female,38.0,1,0,First,woman,C,Cherbourg,yes
2,1,female,26.0,0,0,Third,woman,,Southampton,yes
3,1,female,35.0,1,0,First,woman,C,Southampton,yes
4,0,male,35.0,0,0,Third,man,,Southampton,no
...,...,...,...,...,...,...,...,...,...,...
886,0,male,27.0,0,0,Second,man,,Southampton,no
887,1,female,19.0,0,0,First,woman,B,Southampton,yes
888,0,female,,1,2,Third,woman,,Southampton,no
889,1,male,26.0,0,0,First,man,C,Cherbourg,yes


### Recode string data

In [25]:
Titanic5.sex.value_counts() 

male      577
female    314
Name: sex, dtype: int64

In [28]:
def sex(series):
    if series == "male":
        return "0"
    if series == "female":
        return "1"
Titanic5["sexR"]= Titanic5["sex"].apply(sex)

In [24]:
Titanic5["class"].value_counts()

Third     491
First     216
Second    184
Name: class, dtype: int64

In [29]:
def class1(series):
    if series == "First":
        return "0"
    if series == "Second":
        return "1"
    if series == "Third":
        return "2"
Titanic5["classR"]= Titanic5["class"].apply(class1)

In [23]:
Titanic5.who.value_counts()

man      537
woman    271
child     83
Name: who, dtype: int64

In [30]:
def who(series):
    if series == "man":
        return "0"
    if series == "woman":
        return "1"
    if series == "child":
        return "2"
Titanic5["whoR"]=Titanic5["who"].apply(who)

In [31]:
Titanic5

Unnamed: 0,survived,sex,age,sibsp,parch,class,who,deck,embark_town,alive,sexR,classR,whoR
0,0,male,22.0,1,0,Third,man,,Southampton,no,0,2,0
1,1,female,38.0,1,0,First,woman,C,Cherbourg,yes,1,0,1
2,1,female,26.0,0,0,Third,woman,,Southampton,yes,1,2,1
3,1,female,35.0,1,0,First,woman,C,Southampton,yes,1,0,1
4,0,male,35.0,0,0,Third,man,,Southampton,no,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,male,27.0,0,0,Second,man,,Southampton,no,0,1,0
887,1,female,19.0,0,0,First,woman,B,Southampton,yes,1,0,1
888,0,female,,1,2,Third,woman,,Southampton,no,1,2,1
889,1,male,26.0,0,0,First,man,C,Cherbourg,yes,0,0,0


In [32]:
Titanic5.survived.value_counts()

0    549
1    342
Name: survived, dtype: int64

In [33]:
Titanic5.alive.value_counts()

no     549
yes    342
Name: alive, dtype: int64

In [50]:
Titanic5

Unnamed: 0,survived,sex,age,sibsp,parch,class,who,deck,embark_town,alive,sexR,classR,whoR
0,0,male,22.0,1,0,Third,man,,Southampton,no,0,2,0
1,1,female,38.0,1,0,First,woman,C,Cherbourg,yes,1,0,1
2,1,female,26.0,0,0,Third,woman,,Southampton,yes,1,2,1
3,1,female,35.0,1,0,First,woman,C,Southampton,yes,1,0,1
4,0,male,35.0,0,0,Third,man,,Southampton,no,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,male,27.0,0,0,Second,man,,Southampton,no,0,1,0
887,1,female,19.0,0,0,First,woman,B,Southampton,yes,1,0,1
888,0,female,,1,2,Third,woman,,Southampton,no,1,2,1
889,1,male,26.0,0,0,First,man,C,Cherbourg,yes,0,0,0


In [51]:
Titanic5= Titanic5.drop('alive', axis= 1)

In [52]:
Titanic5

Unnamed: 0,survived,sex,age,sibsp,parch,class,who,deck,embark_town,sexR,classR,whoR
0,0,male,22.0,1,0,Third,man,,Southampton,0,2,0
1,1,female,38.0,1,0,First,woman,C,Cherbourg,1,0,1
2,1,female,26.0,0,0,Third,woman,,Southampton,1,2,1
3,1,female,35.0,1,0,First,woman,C,Southampton,1,0,1
4,0,male,35.0,0,0,Third,man,,Southampton,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,male,27.0,0,0,Second,man,,Southampton,0,1,0
887,1,female,19.0,0,0,First,woman,B,Southampton,1,0,1
888,0,female,,1,2,Third,woman,,Southampton,1,2,1
889,1,male,26.0,0,0,First,man,C,Cherbourg,0,0,0


In [53]:
Titanic5.deck.value_counts()

C    59
B    47
D    33
E    32
A    15
F    13
G     4
Name: deck, dtype: int64

In [54]:
def deck(series):
    if series == "C":
        return "0"
    if series == "B":
        return "1"
    if series == "D":
        return "2"
    if series == "E":
        return "3"
    if series == "A":
        return "4"
    if series == "F":
        return "5"
    if series == "G":
        return "6"
Titanic5["deckR"]= Titanic5["deck"].apply(deck)

In [55]:
Titanic5

Unnamed: 0,survived,sex,age,sibsp,parch,class,who,deck,embark_town,sexR,classR,whoR,deckR
0,0,male,22.0,1,0,Third,man,,Southampton,0,2,0,
1,1,female,38.0,1,0,First,woman,C,Cherbourg,1,0,1,0
2,1,female,26.0,0,0,Third,woman,,Southampton,1,2,1,
3,1,female,35.0,1,0,First,woman,C,Southampton,1,0,1,0
4,0,male,35.0,0,0,Third,man,,Southampton,0,2,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,male,27.0,0,0,Second,man,,Southampton,0,1,0,
887,1,female,19.0,0,0,First,woman,B,Southampton,1,0,1,1
888,0,female,,1,2,Third,woman,,Southampton,1,2,1,
889,1,male,26.0,0,0,First,man,C,Cherbourg,0,0,0,0


In [56]:
Titanic5.embark_town.value_counts()

Southampton    644
Cherbourg      168
Queenstown      77
Name: embark_town, dtype: int64

In [57]:
def town(series):
    if series == "Southampton":
        return "0"
    if series == "Cherbourg":
        return "1"
    if series == "Queenstown":
        return "2"
Titanic5["embark_townR"]= Titanic5["embark_town"].apply(town)

In [58]:
Titanic5

Unnamed: 0,survived,sex,age,sibsp,parch,class,who,deck,embark_town,sexR,classR,whoR,deckR,embark_townR
0,0,male,22.0,1,0,Third,man,,Southampton,0,2,0,,0
1,1,female,38.0,1,0,First,woman,C,Cherbourg,1,0,1,0,1
2,1,female,26.0,0,0,Third,woman,,Southampton,1,2,1,,0
3,1,female,35.0,1,0,First,woman,C,Southampton,1,0,1,0,0
4,0,male,35.0,0,0,Third,man,,Southampton,0,2,0,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,male,27.0,0,0,Second,man,,Southampton,0,1,0,,0
887,1,female,19.0,0,0,First,woman,B,Southampton,1,0,1,1,0
888,0,female,,1,2,Third,woman,,Southampton,1,2,1,,0
889,1,male,26.0,0,0,First,man,C,Cherbourg,0,0,0,0,1


### Making sure the data is in the right format 

In [59]:
Titanic5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   survived      891 non-null    int64   
 1   sex           891 non-null    object  
 2   age           714 non-null    float64 
 3   sibsp         891 non-null    int64   
 4   parch         891 non-null    int64   
 5   class         891 non-null    category
 6   who           891 non-null    object  
 7   deck          203 non-null    category
 8   embark_town   889 non-null    object  
 9   sexR          891 non-null    object  
 10  classR        891 non-null    category
 11  whoR          891 non-null    object  
 12  deckR         203 non-null    category
 13  embark_townR  889 non-null    object  
dtypes: category(4), float64(1), int64(3), object(6)
memory usage: 74.2+ KB


In [61]:
Titanic5['deckR']= pd.to_numeric(Titanic5['deckR'], errors='coerce')

In [62]:
Titanic5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   survived      891 non-null    int64   
 1   sex           891 non-null    object  
 2   age           714 non-null    float64 
 3   sibsp         891 non-null    int64   
 4   parch         891 non-null    int64   
 5   class         891 non-null    category
 6   who           891 non-null    object  
 7   deck          203 non-null    category
 8   embark_town   889 non-null    object  
 9   sexR          891 non-null    object  
 10  classR        891 non-null    category
 11  whoR          891 non-null    object  
 12  deckR         203 non-null    float64 
 13  embark_townR  889 non-null    object  
dtypes: category(3), float64(2), int64(3), object(6)
memory usage: 79.9+ KB


In [63]:
Titanic5

Unnamed: 0,survived,sex,age,sibsp,parch,class,who,deck,embark_town,sexR,classR,whoR,deckR,embark_townR
0,0,male,22.0,1,0,Third,man,,Southampton,0,2,0,,0
1,1,female,38.0,1,0,First,woman,C,Cherbourg,1,0,1,0.0,1
2,1,female,26.0,0,0,Third,woman,,Southampton,1,2,1,,0
3,1,female,35.0,1,0,First,woman,C,Southampton,1,0,1,0.0,0
4,0,male,35.0,0,0,Third,man,,Southampton,0,2,0,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,male,27.0,0,0,Second,man,,Southampton,0,1,0,,0
887,1,female,19.0,0,0,First,woman,B,Southampton,1,0,1,1.0,0
888,0,female,,1,2,Third,woman,,Southampton,1,2,1,,0
889,1,male,26.0,0,0,First,man,C,Cherbourg,0,0,0,0.0,1


# will try dropping missing data 

In [64]:
Titanic5.dropna(inplace=True)

In [65]:
Titanic5

Unnamed: 0,survived,sex,age,sibsp,parch,class,who,deck,embark_town,sexR,classR,whoR,deckR,embark_townR
1,1,female,38.0,1,0,First,woman,C,Cherbourg,1,0,1,0.0,1
3,1,female,35.0,1,0,First,woman,C,Southampton,1,0,1,0.0,0
6,0,male,54.0,0,0,First,man,E,Southampton,0,0,0,3.0,0
10,1,female,4.0,1,1,Third,child,G,Southampton,1,2,2,6.0,0
11,1,female,58.0,0,0,First,woman,C,Southampton,1,0,1,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,1,female,47.0,1,1,First,woman,D,Southampton,1,0,1,2.0,0
872,0,male,33.0,0,0,First,man,B,Southampton,0,0,0,1.0,0
879,1,female,56.0,0,1,First,woman,C,Cherbourg,1,0,1,0.0,1
887,1,female,19.0,0,0,First,woman,B,Southampton,1,0,1,1.0,0


# Identitfy x and y 

In [66]:
x = Titanic5[['sexR','age', 'sibsp', 'parch', 'classR','whoR','deckR','embark_townR']]
y = Titanic5['survived']

# Train Test Split 

In [67]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3)

In [68]:
decisionTree = DecisionTreeClassifier()
decisionTree.fit(x_train, y_train)

# Assess the Model

In [69]:
treePredictions = decisionTree.predict(x_test)

# Reading the Confusion Matrix

In [70]:
print(confusion_matrix(y_test, treePredictions))

[[ 9  9]
 [ 7 30]]


In [71]:
print(classification_report(y_test, treePredictions))

              precision    recall  f1-score   support

           0       0.56      0.50      0.53        18
           1       0.77      0.81      0.79        37

    accuracy                           0.71        55
   macro avg       0.67      0.66      0.66        55
weighted avg       0.70      0.71      0.70        55



The weighted average tells us that 70% of the time the predictions we make are accurate which is not too bad. We do very well in predicting who survived not the best in who did not. 

# This was really hard on to the next :)

# PART 2 - Random Forest 

# Initial Random Forest Model

In [73]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [74]:
forest = RandomForestClassifier()
forest.fit(x_train, y_train)

In [75]:
forestPredictions = forest.predict(x_test)
print(confusion_matrix(y_test, forestPredictions))
print(classification_report(y_test, forestPredictions))

[[ 8 10]
 [ 8 29]]
              precision    recall  f1-score   support

           0       0.50      0.44      0.47        18
           1       0.74      0.78      0.76        37

    accuracy                           0.67        55
   macro avg       0.62      0.61      0.62        55
weighted avg       0.66      0.67      0.67        55



Looking at this again we see that better predicition are made with those who survived then those who died and that is reflected with the precision, recall, and F1 score. The weighted average depects a decreased precentage from the decision tree model. Now, there is a 66% accuracy which is not that good. 

# Finally completed!! :)