# Titanic

In [102]:
#Pandas is for dataframe manipulation
import pandas as pd 
#Scikit learn is used to build the model. It has various libraries like linear regression, decision tree etc.
from sklearn.linear_model import LogisticRegression
import numpy as np 
# import matplotlib.pyplot as plt #I did not use this
from sklearn.preprocessing import StandardScaler #used for standardization (sometimes called Z-score normalization)
# from sklearn.model_selection import train_test_split # Do not need this with the multiple datasets kaggle has given

from sklearn.svm import SVC

from sklearn.metrics import accuracy_score


In [103]:
# Read the three Kaggle Titanic competition files from the working directory:
# the training set, the test set, and the gender_submission file.
train, test_df, gender_submission = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "gender_submission.csv")
)

In [104]:
# Preview the first ten rows of gender_submission (PassengerId and Survived).
gender_submission.head(10)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


In [105]:
# test.csv has no Survived column, unlike train.csv. Attach the Survived column
# from gender_submission by matching rows on PassengerId, so that `test` can be
# processed with the same steps as `train`.
# NOTE(review): gender_submission.csv is presumably Kaggle's sample submission
# rather than recorded outcomes -- confirm before treating Survived as ground truth.
test = test_df.merge(gender_submission, how="inner", on="PassengerId")
test.head()


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1


Any column changes will need to be applied to both `train` and `test`. `gender_submission` is what we will compare against at the end, with rows matched on passenger ID.

In [106]:
# Pclass is a category code (1, 2, 3), so expand it into one 0/1 indicator
# column per class. This cell adds FirstClass: 1 when Pclass == 1, otherwise 0.
train['FirstClass'] = train['Pclass'].eq(1).astype('int64')
test['FirstClass'] = test['Pclass'].eq(1).astype('int64')
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FirstClass
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


In [107]:
# Check that the FirstClass column was added to the test frame as well.
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived,FirstClass
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,1,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1,0


In [108]:
# Same idea as FirstClass: SecondClass is 1 when Pclass == 2, otherwise 0.
train['SecondClass'] = train['Pclass'].eq(2).astype('int64')
test['SecondClass'] = test['Pclass'].eq(2).astype('int64')
train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FirstClass,SecondClass
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,0
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,0,0
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,1,0
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,0,0
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,0,0
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,0,1


In [109]:
# Check that the SecondClass column was added to the test frame as well.
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived,FirstClass,SecondClass
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0,0,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,1,0,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0,0,1
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0,0,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1,0,0


In [110]:
# Last class indicator: ThirdClass is 1 when Pclass == 3, otherwise 0.
train['ThirdClass'] = train['Pclass'].eq(3).astype('int64')
test['ThirdClass'] = test['Pclass'].eq(3).astype('int64')
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FirstClass,SecondClass,ThirdClass
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,0,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,0,1


In [111]:
# Check that the ThirdClass column was added to the test frame as well.
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived,FirstClass,SecondClass,ThirdClass
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0,0,0,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,1,0,0,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0,0,1,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0,0,0,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1,0,0,1


We know from stories of the Titanic that women and children were given priority for the lifeboats, but what about the men?
I think we should address gender and age next, as they may help our model.

In [112]:
# Encode the Sex column as a numeric column: male -> 0, female -> 1.
train['Sex_binary'] = train['Sex'].map(dict(male=0, female=1))
test['Sex_binary'] = test['Sex'].map(dict(male=0, female=1))
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FirstClass,SecondClass,ThirdClass,Sex_binary
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,0,1,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,0,0,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,0,1,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,0,0,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,0,1,0


I could have done this earlier, but I am not likely to use the following columns right now. I may need them later for further analysis, but at present they are just cluttering up my dataset. Let's just drop them for now from each dataframe.

Pclass, Name, Sex, Ticket, Cabin, Embarked (SibSp, Parch and Fare are kept for now; Pclass and Sex are already captured by the class indicator columns and Sex_binary)

In [113]:
# Pairwise correlations between the numeric columns only. At this point the
# frame still contains text columns (Name, Sex, Ticket, Cabin, Embarked), and
# recent pandas versions raise on non-numeric data in corr() instead of
# silently skipping it, so select the numeric columns explicitly.
train.select_dtypes(include="number").corr()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,FirstClass,SecondClass,ThirdClass,Sex_binary
PassengerId,1.0,-0.005007,-0.035144,0.036847,-0.057527,-0.001652,0.012658,0.034303,-8.6e-05,-0.029486,-0.042939
Survived,-0.005007,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307,0.285904,0.093349,-0.322308,0.543351
Pclass,-0.035144,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495,-0.885924,-0.188432,0.916673,-0.1319
Age,0.036847,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067,0.348941,0.006954,-0.312271,-0.093254
SibSp,-0.057527,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651,-0.054582,-0.055932,0.092548,0.114631
Parch,-0.001652,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225,-0.017633,-0.000734,0.01579,0.245489
Fare,0.012658,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0,0.591711,-0.118557,-0.413333,0.182333
FirstClass,0.034303,0.285904,-0.885924,0.348941,-0.054582,-0.017633,0.591711,1.0,-0.288585,-0.626738,0.098013
SecondClass,-8.6e-05,0.093349,-0.188432,0.006954,-0.055932,-0.000734,-0.118557,-0.288585,1.0,-0.56521,0.064746
ThirdClass,-0.029486,-0.322308,0.916673,-0.312271,0.092548,0.01579,-0.413333,-0.626738,-0.56521,1.0,-0.137143


In [114]:
# Columns removed from both frames. Pclass and Sex are already represented by
# the FirstClass/SecondClass/ThirdClass and Sex_binary columns; Name, Ticket,
# Cabin and Embarked are not used. SibSp, Parch and Fare are deliberately kept.
columns_to_drop = ["Pclass", "Name", "Sex", "Ticket", "Cabin", "Embarked"]

train = train.drop(columns=columns_to_drop)
test = test.drop(columns=columns_to_drop)
train.head()

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare,FirstClass,SecondClass,ThirdClass,Sex_binary
0,1,0,22.0,1,0,7.25,0,0,1,0
1,2,1,38.0,1,0,71.2833,1,0,0,1
2,3,1,26.0,0,0,7.925,0,0,1,1
3,4,1,35.0,1,0,53.1,1,0,0,1
4,5,0,35.0,0,0,8.05,0,0,1,0


In [115]:
# Summary statistics for the remaining columns; the count row shows which
# columns have missing values.
train.describe()

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare,FirstClass,SecondClass,ThirdClass,Sex_binary
count,891.0,891.0,714.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,29.699118,0.523008,0.381594,32.204208,0.242424,0.20651,0.551066,0.352413
std,257.353842,0.486592,14.526497,1.102743,0.806057,49.693429,0.42879,0.405028,0.497665,0.47799
min,1.0,0.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,223.5,0.0,20.125,0.0,0.0,7.9104,0.0,0.0,0.0,0.0
50%,446.0,0.0,28.0,0.0,0.0,14.4542,0.0,0.0,1.0,0.0
75%,668.5,1.0,38.0,1.0,0.0,31.0,0.0,0.0,1.0,1.0
max,891.0,1.0,80.0,8.0,6.0,512.3292,1.0,1.0,1.0,1.0


In [116]:
# Count the non-null Age values to see how much age data is missing.
train["Age"].count()
# Only 714 of the 891 rows have an age, so 177 ages are missing.

714

## Types of missing data:
- **Missing Completely at Random (MCAR):** No pattern to the missingness.
- **Missing at Random (MAR):** Missingness might be related to other observed variables, but not the missing value itself.
- **Missing Not at Random (MNAR):** The missingness depends on the value of the missing data itself.

## Strategies for Handling Missing Values

- **Listwise Deletion:** Remove rows containing missing values. Use with caution, especially if you have a large number of missing entries, as you could lose valuable information.
- **Pairwise Deletion:** Utilize all available data, but computations might involve different subsets of data.
- **Imputation:** Fill in missing values with estimated substitutes. **Common methods:**
  - **Mean/Median Imputation:** Replace missing values with the mean/median of the column. Suitable for numerical data.
  - **Mode Imputation:** Replace missing categorical values with the most frequent category.
  - **Predictive Modeling:** Create a model to predict missing values based on other variables. This can be more sophisticated.


This is an excerpt of a summary from Google's Gemini.

In [117]:
# Fill the missing ages with the (rounded) mean age.
# The fill value is computed from the training set only and reused for the
# test set, so no statistic of the test data leaks into preprocessing.
age_fill_value = round(train['Age'].mean())

# Assign the result back instead of calling fillna(..., inplace=True) on a
# column selection: that chained in-place form triggers a FutureWarning in
# recent pandas and does not modify the frame under Copy-on-Write.
train['Age'] = train['Age'].fillna(age_fill_value)
test['Age'] = test['Age'].fillna(age_fill_value)
train["Age"].count()  # every row now has an age

891

In [118]:
# Re-check the summary statistics now that the missing ages are filled in.
train.describe() 

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare,FirstClass,SecondClass,ThirdClass,Sex_binary
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,29.758889,0.523008,0.381594,32.204208,0.242424,0.20651,0.551066,0.352413
std,257.353842,0.486592,13.00257,1.102743,0.806057,49.693429,0.42879,0.405028,0.497665,0.47799
min,1.0,0.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,223.5,0.0,22.0,0.0,0.0,7.9104,0.0,0.0,0.0,0.0
50%,446.0,0.0,30.0,0.0,0.0,14.4542,0.0,0.0,1.0,0.0
75%,668.5,1.0,35.0,1.0,0.0,31.0,0.0,0.0,1.0,1.0
max,891.0,1.0,80.0,8.0,6.0,512.3292,1.0,1.0,1.0,1.0


# Prepare the data
 We were given two datasets, so we will use them as the train and test sets directly. With a single dataset we would instead use the `train_test_split` function from sklearn's `model_selection` module.

In [119]:
# Train on Age, Sex_binary and the three class indicator columns.
# (SibSp, Parch and Fare are still present in the frames but are not used as
# model inputs here.) The goal is to predict whether a passenger survived.
feature_columns = ["Age", "Sex_binary", "FirstClass", "SecondClass", "ThirdClass"]

train_features = train[feature_columns]
train_labels = train["Survived"]

test_features = test[feature_columns]
# Take the labels from the merged `test` frame (not from gender_submission
# directly) so they are guaranteed to be row-aligned with test_features.
# NOTE(review): these Survived values originate from gender_submission.csv,
# presumably Kaggle's sample submission -- confirm they are real outcomes
# before reading the scores below as true accuracy.
test_labels = test["Survived"]

Now we will standardize the data in preparation for training the model. Putting every feature on the same scale keeps any one feature from dominating the weights simply because of its units.

In [120]:
# Standardize each feature to mean 0 and standard deviation 1.
# The scaler learns its statistics from the training features only and then
# applies the same transformation to both the training and the test features.
scaler = StandardScaler().fit(train_features)
train_features = scaler.transform(train_features)
test_features = scaler.transform(test_features)

# Start Logistic Regression 

In [121]:
# Fit a logistic regression classifier (default settings) on the scaled
# training features, then display the fitted estimator.
model = LogisticRegression().fit(train_features, train_labels)
model

In [135]:
# Predict survival for the test passengers and report the fraction of
# predictions that match test_labels (using sklearn.metrics.accuracy_score).
y_predict = model.predict(test_features)
LR_model_acc_score = accuracy_score(y_true=test_labels, y_pred=y_predict)
print(f"Accuracy: {LR_model_acc_score}")

Accuracy: 0.9641148325358851


In [136]:
# Show the fitted coefficients, in feature order:
# Age, Sex_binary, FirstClass, SecondClass, ThirdClass
print(model.coef_)


[[-0.42603339  1.23609343  0.54127538  0.062455   -0.51719478]]


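Coefficients represent the relationship between each feature (variable) and the log-odds of the target event (here, survival). Because the features were standardized before fitting, each coefficient is the change in log-odds for a one-standard-deviation increase in that feature.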

A positive coefficient means that an increase in the feature's value is associated with a higher likelihood of survival.

A negative coefficient means that an increase in the feature's value is associated with a lower likelihood of survival.

- Age = -0.42603339 
- Sex_binary = 1.23609343      
- FirstClass = 0.54127538 
- SecondClass = 0.062455   
- ThirdClass = -0.51719478


In [137]:
# Hand-made passengers to run through the models. Each array follows the
# feature order used for training:
# [Age, Sex_binary, FirstClass, SecondClass, ThirdClass]
Jack = np.asarray([20.0, 0.0, 0.0, 0.0, 1.0], dtype=np.float64)  # 20-year-old male, third class
Rose = np.asarray([17.0, 1.0, 1.0, 0.0, 0.0], dtype=np.float64)  # 17-year-old female, first class
Dom = np.asarray([29.0, 0.0, 0.0, 1.0, 0.0], dtype=np.float64)   # 29-year-old male, second class

In [138]:
# Stack the three passengers into one 2-D array, one row per passenger.
passenger_predict = np.vstack((Jack, Rose, Dom))

In [139]:
# Scale the hand-made passengers with the scaler fitted on the training data.
# The rows are rebuilt from the raw Jack/Rose/Dom arrays each time, so
# re-running this cell cannot scale already-scaled values a second time.
# They are wrapped in a DataFrame with the same column names the scaler was
# fitted on; passing a bare NumPy array is presumably what triggered the
# feature-name warning seen here before.
passenger_predict = scaler.transform(
    pd.DataFrame(
        [Jack, Rose, Dom],
        columns=["Age", "Sex_binary", "FirstClass", "SecondClass", "ThirdClass"],
    )
)



That warning again?..

In [140]:
# Prediction time! My favorite part.
# Make survival predictions for Jack, Rose and Dom (in that order).
print(model.predict(passenger_predict)) # one value per passenger: 1 = survived, 0 = did not survive
print(model.predict_proba(passenger_predict)) # per passenger: [probability of not surviving, probability of surviving]

[0 1 0]
[[0.88542238 0.11457762]
 [0.05002647 0.94997353]
 [0.75873431 0.24126569]]


- Jack had an 88.5% chance of NOT surviving based on the data.
- Rose had a 95% chance of surviving.
- Dom would have had a 75.9% chance of NOT surviving.

# Start Decision Tree model

We have the option of using DecisionTreeClassifier or DecisionTreeRegressor.

We know that we are classifying whether or not someone survived so we should use the DecisionTreeClassifier. 

In [141]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [142]:
# Shallow decision tree (at most 3 levels) that splits on information gain.
# random_state is fixed because scikit-learn permutes the features at every
# split, so ties between equally good splits could otherwise be broken
# differently from run to run.
tree_model = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=42)

In [143]:
# Train the decision tree on the same scaled training features and labels.
tree_model.fit(train_features, train_labels)

In [144]:
# Predict survival for the test passengers with the decision tree and report
# the fraction of predictions that match test_labels.
y_predict = tree_model.predict(test_features)
tree_model_acc_score = accuracy_score(y_true=test_labels, y_pred=y_predict)
print(f"Accuracy: {tree_model_acc_score}")

Accuracy: 0.9569377990430622


In [145]:
# Decision tree predictions for the already-scaled passenger_predict rows
# (Jack, Rose, Dom): predicted class first, then the class probabilities.
print(tree_model.predict(passenger_predict))
print(tree_model.predict_proba(passenger_predict))

[0 1 0]
[[0.88941176 0.11058824]
 [0.04761905 0.95238095]
 [0.88941176 0.11058824]]


We will interpret and compare all results at the end.

# Start Random Forest Classifier

In [146]:
from sklearn.ensemble import RandomForestClassifier

In [149]:
# Random forest of 100 trees, each limited to depth 4, splitting on Gini impurity.
# random_state is fixed so that the random bootstrap samples and feature
# subsets -- and therefore the reported accuracy -- are reproducible.
RFC_model = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=4, random_state=42)

In [150]:
# Train the random forest on the scaled training features and labels.
RFC_model.fit(train_features, train_labels)

In [153]:
# Predict survival for every passenger in the scaled test features.
y_predict_RFC = RFC_model.predict(test_features)

In [154]:
# The model has now predicted whether each passenger in test_features survived.
# Compare those predictions with the Survived values held in test_labels.
print("Accuracy:", accuracy_score(test_labels, y_predict_RFC))

Accuracy: 0.9593301435406698


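This was pretty accurate – about 96%. Note that this measures agreement with the `Survived` column of `gender_submission.csv`; if that file is Kaggle's sample submission rather than the recorded outcomes (worth confirming), the score shows how closely the model matches that baseline rather than its true accuracy.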

# Next we will go over Naive Bayes for Classification


In [155]:
from sklearn.naive_bayes import GaussianNB

In [157]:
# Gaussian Naive Bayes is created with its default settings; no
# hyperparameters are set here.
nb_model = GaussianNB()

In [158]:
# Train the Naive Bayes model on the scaled training features and labels.
nb_model.fit(train_features, train_labels)

In [161]:
# Predict survival for the test passengers with Naive Bayes and report the
# fraction of predictions that match test_labels.
y_predict_NB = nb_model.predict(test_features)
NB_acc_score = accuracy_score(y_true=test_labels, y_pred=y_predict_NB)
print(f"Accuracy: {NB_acc_score}")

Accuracy: 0.8636363636363636
