In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
# Apply seaborn's default theme to every figure drawn below.
sns.set()

import warnings
# NOTE(review): this silences *all* warnings for the rest of the notebook,
# including pandas/seaborn deprecation notices - consider narrowing the filter.
warnings.filterwarnings("ignore") 

# Input data files are available in the read-only "../input/" directory
# For example, running this will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print them one per line.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)



### Loading Data

We can start by loading the 'training data', then by printing the first 5 lines of data or 'head' to see what we are working with. Next, we are going to load the test data using the same method. This will later be used to test our machine learning model.

In [None]:
# Read the Titanic training split from the Kaggle input directory and preview it.
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
train_data.head()

### Exploring the data

From the above we can see that the data available is made up of 12 columns:

1. PassengerID: Unique ID assigned to each passenger
2. Survived: Binary, 1 is for survived, 0 for not survived
3. Pclass: Ticket Class, 1 is 1st class, 2 is 2nd class, 3 is 3rd class
4. Name: Passenger name, also contains title
5. Sex: Male or Female
6. Age: Passenger's age, "NaN" for values not recorded
7. SibSp: Number of siblings/spouses travelling with each passenger
8. Parch: Number of parents/children travelling with each passenger
9. Ticket: Ticket number
10. Fare: Money paid by each passenger for their ticket
11. Cabin: Cabin number, "NaN" for values not recorded
12. Embarked: Port from where passenger embarked

In [None]:
# (rows, columns) of the training set.
train_data.shape

We can also see from above that the train_data has a shape of (891,12), meaning we have 891 rows and 12 columns.

In [None]:
# Read the Titanic test split from the Kaggle input directory and preview it.
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
test_data.head()

In [None]:
# (rows, columns) of the test set.
test_data.shape

Test data contains the same columns minus the Survived column, as this is the data we will use to test our model. This dataset also contains only 418 rows, less than half the size of our training dataset.

In [None]:
train_data.describe()  # Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns

In [None]:
train_data.describe(include=['O'])  # Descriptive stats (count, unique, top, freq) for the object-dtype columns

In [None]:
train_data.info()  # Column dtypes, non-null counts and memory usage

In [None]:
# Number of missing values in each training column.
train_data.isnull().sum()

#### Key Notes:
- From the 891 rows, 577 are male meaning 314 are female
- Most passengers departed from port S at 644 people
- Some passengers shared ticket numbers and cabins. 4 people shared cabin B96 B98, and 7 people shared ticket 347082
- There are missing or null values. 177 missing age values, 687 missing cabin values, and 2 missing embarked values.

In [None]:
# Summary statistics for the numeric columns of the test set.
test_data.describe()

In [None]:
# Descriptive stats for the object-dtype columns of the test set.
test_data.describe(include=['O'])

In [None]:
# Column dtypes and non-null counts for the test set.
test_data.info()

In [None]:
# Number of missing values in each test column.
test_data.isnull().sum()

#### Key Notes:
- This dataset is missing the Survived column; it is the dataset we will be making predictions on.
- There are null values present in this dataset as well: Age is missing 86 values, Cabin is missing 327 values, and Fare is missing 1 value.

### Relationships Between Features and Survival

We will see if we can find any strong relationships between the features and survival rates. First, let's find the percentages of who survived and who didn't on the Titanic, just to get an initial sense of what to expect.

In [None]:
# Split the passengers by outcome, then report each group's size and its
# share of the whole training set.
survived_flag = train_data['Survived']
not_survive = train_data[survived_flag == 0]
did_survive = train_data[survived_flag == 1]

died_count = len(not_survive)
lived_count = len(did_survive)
total_count = len(train_data)

print ("Didn't Survive: %i (%.1f%%)"%(died_count, float(died_count)/total_count*100.0))
print ("Did Survive: %i (%.1f%%)"%(lived_count, float(lived_count)/total_count*100.0))
print("Total: %i" %total_count)

#### Class (PClass) and Survival Rates

In [None]:
# Number of passengers in each ticket class.
train_data.Pclass.value_counts()

In [None]:
# Survived / not-survived counts within each ticket class.
survival_by_class = train_data.groupby('Pclass')['Survived']
Pclass_survived = survival_by_class.value_counts()
Pclass_survived

In [None]:
# Plot Pclass vs Survived: unstack moves Pclass from the index into columns,
# so the x-axis is Survived (0/1) with one bar per class, all on a single axes.
Pclass_survived.unstack(level=0).plot(kind='bar', subplots=False)

##### Note: From this information higher class passengers seem to have a much higher survival chance

In [None]:
# Mean of Survived (i.e. the survival rate) for each ticket class.
pclass_survived_avg = (
    train_data.loc[:, ['Pclass', 'Survived']]
    .groupby(['Pclass'], as_index=False)
    .mean()
)
pclass_survived_avg

##### Note: Passengers in a higher class (1) have a higher average survival rate than those in lower classes (2 & 3)
#### Gender (Sex) and Survival Rates

In [None]:
# Number of passengers of each sex.
train_data.Sex.value_counts()

In [None]:
# Survived / not-survived counts within each sex.
survival_by_sex = train_data.groupby('Sex')['Survived']
sex_survive = survival_by_sex.value_counts()
sex_survive

In [None]:
# Grouped bars: x-axis is Survived (0/1), one bar per sex.
sex_survive.unstack(level=0).plot(kind='bar')

##### Note: Female passengers had a much higher survival rate than their male counterparts
### Gender Breakdown of Classes

Below I will compare classes against gender to see if any class is made up of more of one gender than the other.

In [None]:
# Passenger counts per ticket class within each sex.
class_by_sex = train_data.groupby('Sex')['Pclass']
gender_breakdown = class_by_sex.value_counts()
print (gender_breakdown)

In [None]:
# Grouped bars: x-axis is Pclass, one bar per sex.
gender_breakdown.unstack(level=0).plot(kind='bar')

In [None]:
# Violin plots of Survived by Pclass, coloured by Sex, with one panel per port of embarkation.
sns.catplot(data=train_data, x="Pclass", y="Survived", hue="Sex", kind="violin", col="Embarked")

##### A Couple of Notes from the above:
- Females have a high survival chance in PClass 1 and 2, with most females passing away in PClass 3
- Males also have the highest survival rate if they were in PClass 1

### Embarked vs Survived
I will now have a look into the 'embarked' and 'survived' columns and how they relate to each other.

In [None]:
# Number of passengers per port of embarkation.
train_data.Embarked.value_counts()

In [None]:
# Survived / not-survived counts for each port of embarkation.
train_data.groupby('Embarked').Survived.value_counts()

In [None]:
# Bar height is the mean of Survived (the survival rate) for each port.
sns.barplot(x='Embarked', y='Survived', data=train_data)

It's clear that passengers who departed from port C have a higher survival percentage, and it is the only port where more passengers survived than passed away.

### Parch and survival

In [None]:
# Number of passengers for each Parch value.
train_data.Parch.value_counts()

In [None]:
# Survived / not-survived counts for each Parch value.
train_data.groupby('Parch').Survived.value_counts()

In [None]:
# Mean of Survived (the survival rate) for each Parch value.
train_data[['Parch', 'Survived']].groupby(['Parch'], as_index=False).mean()

In [None]:
# Survival rate per Parch value; ci=None turns off the error bars.
# NOTE(review): newer seaborn releases spell this errorbar=None - confirm against the installed version.
sns.barplot(x='Parch', y='Survived', ci=None, data=train_data)

### SibSp and Survival

In [None]:
# Number of passengers for each SibSp value.
train_data.SibSp.value_counts()

In [None]:
# Survived / not-survived counts for each SibSp value.
train_data.groupby('SibSp').Survived.value_counts()

In [None]:
# Mean of Survived (the survival rate) for each SibSp value.
train_data[['SibSp', 'Survived']].groupby(['SibSp'], as_index=False).mean()

In [None]:
# Survival rate per SibSp value; ci=None turns off the error bars.
# NOTE(review): newer seaborn releases spell this errorbar=None - confirm against the installed version.
sns.barplot(x='SibSp', y='Survived', ci=None, data=train_data)

### Age and Survival

In [None]:
# One row of three panels: age distribution split by survival, grouped by
# port of embarkation, ticket class and sex respectively.
fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(15,5))

# Arguments shared by all three panels.
violin_options = dict(y="Age", hue="Survived", data=train_data, split=True)

sns.violinplot(x="Embarked", ax=ax1, **violin_options)
sns.violinplot(x="Pclass", ax=ax2, **violin_options)
sns.violinplot(x="Sex", ax=ax3, **violin_options)

##### Notes:

- The majority of passengers were between the ages of 20 and 40
- Pclass 1 has fewer children and more elderly compared to PClass 2 and 3
- Almost all children in PClass 2 survived, as well as most children from PClass 3
- Of those who embarked from port Q only passengers between the ages of about 3 to 41 survived
- Females between the age of 18 and 40 have the highest survival rates and most Males between the ages of 0 to 16 survived

### Distribution Plots Based on Survivors' Sex

In [None]:
# Boolean masks reused to build every outcome / sex subset below.
is_survivor = train_data['Survived']==1
is_casualty = train_data['Survived']==0
is_male = train_data['Sex']=="male"
is_female = train_data['Sex']=="female"

totalSurvived = train_data[is_survivor]
totalNotSurvived = train_data[is_casualty]

menSurvived = train_data[is_survivor & is_male]
womenSurvived = train_data[is_survivor & is_female]
menNotSurvived = train_data[is_casualty & is_male]
womenNotSurvived = train_data[is_casualty & is_female]

# Overall age distribution in one-year bins: survivors in blue, casualties in red.
one_year_bins = range(0, 81, 1)
plt.figure(figsize=[15,5])
plt.subplot(111)
sns.distplot(totalSurvived['Age'].dropna().values, bins=one_year_bins, kde=True, color='blue')
sns.distplot(totalNotSurvived['Age'].dropna().values, bins=one_year_bins, kde=True, color='red', axlabel='Age')

In [None]:
# Age distributions in one-year bins, one panel per sex:
# survivors in blue, casualties in red.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases -
# confirm it is still available in the installed version.
plt.figure(figsize=[15,5])

# Left panel: female passengers.
plt.subplot(121)
sns.distplot(womenSurvived['Age'].dropna().values, bins=range(0, 81, 1), kde=True, color='blue')
sns.distplot(womenNotSurvived['Age'].dropna().values, bins=range(0, 81, 1), kde=True, color='red', axlabel='Female Age')

# Right panel: male passengers.
plt.subplot(122)
sns.distplot(menSurvived['Age'].dropna().values, bins=range(0, 81, 1), kde=True, color='blue')
sns.distplot(menNotSurvived['Age'].dropna().values, bins=range(0, 81, 1), kde=True, color='red', axlabel='Male Age')

##### Notes:
- Male passengers were more likely to survive if between the ages of 0 and 15
- Children below the age of 10 had a higher survival rate
- Females overall had a higher survival rate with those aged 20 to 40 and 50+ showing high chance of survival
- Men aged 16 and over had quite a low survival rate compared to the other sex and age groups.

### Feature Extraction and Creation 

Here I will select the appropriate features to train our model; we will also create new features based on existing ones.

In [None]:
 # combining train and test dataset
# Both frames are kept in one list so every feature-engineering step below
# can be applied to train and test in the same loop.
train_test_data = [train_data, test_data]

# extracting titles from Name column: the title is the run of letters that
# follows a space and is terminated by a period.
# - The pattern is a raw string so that "\." reaches the regex engine as an
#   escaped period instead of being an invalid Python string escape.
# - expand=False makes str.extract return a Series (one capture group), which
#   is what a single-column assignment expects.
for dataset in train_test_data:
    dataset['Title'] = dataset['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# displays head with new Title column added
train_data.head()

In [None]:
# Cross-tabulate Title against Sex to show how many passengers of each sex carry each title.
pd.crosstab(train_data['Title'], train_data['Sex'])

I am going to replace some of the less common titles with 'Other', as well as map specific titles to their more common equivalents (e.g. 'Mlle' to 'Miss').

In [None]:
# Single lookup table: rare titles collapse to 'Other', and the
# French / abbreviated forms map to their common equivalents.
rare_titles = ['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_replacements = {rare_title: 'Other' for rare_title in rare_titles}
title_replacements.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

for dataset in train_test_data:
    dataset['Title'] = dataset['Title'].replace(title_replacements)

# Will use to check if worked
train_data[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

In [None]:
# Survival rate per consolidated title; ci=None turns off the error bars.
# NOTE(review): newer seaborn releases spell this errorbar=None - confirm against the installed version.
sns.barplot(x='Title', y='Survived', ci=None, data=train_data)

For purposes apparent later we will assign titles a numerical form

In [None]:
# Integer code for each consolidated title; any title missing from the
# mapping comes back as NaN from .map and is then encoded as 0.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Other": 5}
for dataset in train_test_data:
    dataset['Title'] = dataset['Title'].map(title_mapping).fillna(0)

In [None]:
# Preview the training set after encoding Title.
train_data.head()

We will assign a numerical value for sex as well

In [None]:
# Encode Sex as an integer: female -> 1, male -> 0.
# Any value that is not a key of the mapping becomes NaN, so re-running this
# cell on already-encoded data would blank the column.
sex_mapping = {"female": 1, "male": 0}
for dataset in train_test_data:
    dataset['Sex'] = dataset['Sex'].map(sex_mapping)

In [None]:
# Preview the training set after encoding Sex.
train_data.head()


We will also replace all "NaN" Embarked values with S, as this is clearly the dominant port; then we will assign a numerical value to each port as well.

In [None]:
# Fill missing ports of embarkation with 'S' in both frames.
for dataset in train_test_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
train_data.head()

In [None]:
# Encode the port of embarkation as an integer: S -> 0, C -> 1, Q -> 2.
embarked_mapping = {'S': 0, 'C': 1, 'Q': 2}
for dataset in train_test_data:
    dataset["Embarked"] = dataset["Embarked"].map(embarked_mapping)
    
train_data.head()

We will then fill each missing age with a random whole number drawn from within one standard deviation of the mean age, and afterwards assign each age to an age band for our analysis.

In [None]:
# Seeded generator so the imputed ages - and every model score that depends
# on them - are reproducible across runs.
age_rng = np.random.default_rng(42)

for dataset in train_test_data:
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_is_null = dataset['Age'].isnull()
    age_null_count = int(age_is_null.sum())

    # Random whole-number ages in [mean - std, mean + std); the bounds are
    # truncated to int explicitly rather than relying on implicit float casting.
    age_null_random_list = age_rng.integers(int(age_avg - age_std), int(age_avg + age_std), size=age_null_count)

    # A single .loc write updates the frame itself; chained indexing
    # (dataset['Age'][mask] = ...) may write to a temporary copy instead.
    dataset.loc[age_is_null, 'Age'] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)

# Five equal-width age bands, used only to inspect survival rate by age.
train_data['AgeBand'] = pd.cut(train_data['Age'], 5)

print (train_data[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean())

In [None]:
# Preview the training set with imputed ages and the new AgeBand column.
train_data.head()

In [None]:
# (lower, upper] age intervals; an interval's position in the list is the
# ordinal code written back into 'Age'.
age_intervals = [
    (float('-inf'), 16),
    (16, 32),
    (32, 48),
    (48, 64),
    (64, float('inf')),
]

for dataset in train_test_data:
    # Masks are computed on a snapshot so codes already written are never re-binned.
    age_before = dataset['Age'].copy()
    for age_code, (lower, upper) in enumerate(age_intervals):
        dataset.loc[(age_before > lower) & (age_before <= upper), 'Age'] = age_code

train_data.head()

We will create a similar band for the fare feature

In [None]:
# Missing fares in both frames are filled with the median fare of the training set.
train_fare_median = train_data['Fare'].median()
for dataset in train_test_data:
    dataset['Fare'] = dataset['Fare'].fillna(train_fare_median)

In [None]:
# Split Fare into four quantile-based bands and show the survival rate in each.
train_data['FareBand'] = pd.qcut(train_data['Fare'], 4)
print (train_data[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False).mean())

In [None]:
# (lower, upper] fare intervals; an interval's position in the list is the
# ordinal code written back into 'Fare'.
fare_intervals = [
    (float('-inf'), 7.91),
    (7.91, 14.454),
    (14.454, 31),
    (31, float('inf')),
]

for dataset in train_test_data:
    # Masks are computed on a snapshot so codes already written are never re-binned.
    fare_before = dataset['Fare'].copy()
    for fare_code, (lower, upper) in enumerate(fare_intervals):
        dataset.loc[(fare_before > lower) & (fare_before <= upper), 'Fare'] = fare_code
    dataset['Fare'] = dataset['Fare'].astype(int)

In [None]:
# Preview the training set after banding Fare.
train_data.head()

Next we will combine the siblings and parents feature to get an idea of family sizes

In [None]:
# FamilySize = siblings/spouses + parents/children + 1 (the passenger themself).
for dataset in train_test_data:
    dataset['FamilySize'] = dataset['SibSp'] +  dataset['Parch'] + 1

# Survival rate for each family size.
print (train_data[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean())

In [None]:
# Survival rate per family size; ci=None turns off the error bars.
# NOTE(review): newer seaborn releases spell this errorbar=None - confirm against the installed version.
sns.barplot(x='FamilySize', y='Survived', ci=None, data=train_data)

##### Notes:
- Traveling alone (family size of 1) has a lower survival chance
- Travelling with a family size of 2 to 4 has the highest likelihood of survival

From this we can create an is alone feature to determine by how much travelling alone impacts survival.

In [None]:
# IsAlone is 1 when the passenger has no family aboard (FamilySize == 1), else 0.
for dataset in train_test_data:
    travels_alone = dataset['FamilySize'] == 1
    dataset['IsAlone'] = travels_alone.astype(int)

print (train_data[['IsAlone', 'Survived']].groupby(['IsAlone'], as_index=False).mean())

Let's check that our test and train datasets both contain the new features.

In [None]:
# Preview the training set with all engineered features.
train_data.head()

In [None]:
# Preview the test set with all engineered features.
test_data.head()

### Feature Selection

We will drop the unnecessary columns and features, keeping only the ones we need for our model.

In [None]:
# Columns removed from both frames, plus the extra columns removed from the
# training frame only (test keeps PassengerId; AgeBand / FareBand exist only in train).
features_drop = ['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'FamilySize']
train_only_drop = ['PassengerId', 'AgeBand', 'FareBand']

train_data = train_data.drop(columns=features_drop + train_only_drop)
test_data = test_data.drop(columns=features_drop)

In [None]:
# Preview the training set after dropping the unused columns.
train_data.head()

In [None]:
# Preview the test set after dropping the unused columns.
test_data.head()

### Training our models

Define training and testing datasets

In [None]:
# Features / target for fitting, and the feature matrix to predict on.
# PassengerId is an identifier, not a feature, so it is removed from X_test.
X_train = train_data.drop('Survived', axis=1)
y_train = train_data['Survived']
X_test = test_data.drop('PassengerId', axis=1).copy()

# The models below require train and test to expose the same features in the same order.
assert list(X_train.columns) == list(X_test.columns), "train/test feature columns differ"

# Only the last expression of a cell is rendered, so the shapes are printed explicitly.
print(X_train.shape, y_train.shape, X_test.shape)
X_train.isnull().sum()


##### Models

We are going to run our data through the models below, and whilst doing so we will test which model is most accurate.
- Logistic Regression
- Support Vector Machines (SVC)
- Linear SVC
- k-Nearest Neighbor (KNN)
- Decision Tree
- Random Forest
- Naive Bayes (GaussianNB)
- Perceptron
- Stochastic Gradient Descent (SGD)

In [None]:
# Importing Classifier Modules/Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier

In [None]:
# Logisitic Regression Model
# Logistic Regression: fit on the training set, predict the test set and
# report accuracy on the training set as a percentage.
model = LogisticRegression().fit(X_train, y_train)
acc_log_reg = round(model.score(X_train, y_train) * 100, 2)
y_pred_log_reg = model.predict(X_test)
print("Train Accuracy: " + str(acc_log_reg) + '%')

In [None]:
# Support Vector Machines (SVC) Model
# Support Vector Classifier: fit on the training set, predict the test set and
# report accuracy on the training set as a percentage.
model = SVC().fit(X_train, y_train)
acc_svc = round(model.score(X_train, y_train) * 100, 2)
y_pred_svc = model.predict(X_test)
print("Train Accuracy: " + str(acc_svc) + '%')

In [None]:
# Linear SVC Model
# Linear SVC: fit on the training set, predict the test set and report
# accuracy on the training set as a percentage.
# random_state pins the solver's internal randomness so the score and
# predictions are the same on every run.
model = LinearSVC(random_state=42)
model.fit(X_train, y_train)
y_pred_linear_svc = model.predict(X_test)
acc_linear_svc = round(model.score(X_train, y_train) * 100, 2)
print ("Train Accuracy: " + str(acc_linear_svc) + '%')

In [None]:
# k-Nearest Neighbor (KNN) Model
# k-Nearest Neighbours (k = 3): fit on the training set, predict the test set
# and report accuracy on the training set as a percentage.
model = KNeighborsClassifier(n_neighbors = 3).fit(X_train, y_train)
acc_knn = round(model.score(X_train, y_train) * 100, 2)
y_pred_knn = model.predict(X_test)
print("Train Accuracy: " + str(acc_knn) + '%')

In [None]:
# Decision Tree Model
# Decision Tree: fit on the training set, predict the test set and report
# accuracy on the training set as a percentage.
# random_state pins the tree's internal randomness so that the score and
# y_pred_decision_tree (which is written to the submission file) are
# reproducible across runs.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred_decision_tree = model.predict(X_test)
acc_decision_tree = round(model.score(X_train, y_train) * 100, 2)
print ("Train Accuracy: " + str(acc_decision_tree) + '%')

In [None]:
# Random Forest Model
# Random Forest (100 trees): fit on the training set, predict the test set and
# report accuracy on the training set as a percentage.
# random_state pins the bootstrap sampling and feature selection so the score
# and predictions are the same on every run.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred_random_forest = model.predict(X_test)
acc_random_forest = round(model.score(X_train, y_train) * 100, 2)
print ("Train Accuracy: " + str(acc_random_forest) + '%')

In [None]:
# Naive Bayes (GaussianNB) Model
# Gaussian Naive Bayes: fit on the training set, predict the test set and
# report accuracy on the training set as a percentage.
model = GaussianNB().fit(X_train, y_train)
acc_gnb = round(model.score(X_train, y_train) * 100, 2)
y_pred_gnb = model.predict(X_test)
print("Train Accuracy: " + str(acc_gnb) + '%')

In [None]:
# Perceptron Model
# Perceptron: exactly 5 passes over the data (tol=None disables early
# stopping); fit on the training set, predict the test set and report
# accuracy on the training set as a percentage.
# random_state pins the per-epoch shuffling so the score and predictions are
# the same on every run.
model = Perceptron(max_iter=5, tol=None, random_state=42)
model.fit(X_train, y_train)
y_pred_perceptron = model.predict(X_test)
acc_perceptron = round(model.score(X_train, y_train) * 100, 2)
print ("Train Accuracy: " + str(acc_perceptron) + '%')


In [None]:
#Stochastic Gradient Descent (SGD) Model
# SGD classifier: exactly 5 passes over the data (tol=None disables early
# stopping); fit on the training set, predict the test set and report
# accuracy on the training set as a percentage.
# random_state pins the per-epoch shuffling so the score and predictions are
# the same on every run.
model = SGDClassifier(max_iter=5, tol=None, random_state=42)
model.fit(X_train, y_train)
y_pred_sgd = model.predict(X_test)
acc_sgd = round(model.score(X_train, y_train) * 100, 2)
print ("Train Accuracy: " + str(acc_sgd) + '%')

##### Compare the Models

In [None]:
# (model label, training accuracy in %) pairs, tabulated and ranked best-first.
model_scores = [
    ('LR', acc_log_reg),
    ('SVM', acc_svc),
    ('L-SVC', acc_linear_svc),
    ('KNN', acc_knn),
    ('DTree', acc_decision_tree),
    ('RF', acc_random_forest),
    ('NB', acc_gnb),
    ('Perceptron', acc_perceptron),
    ('SGD', acc_sgd),
]
models = pd.DataFrame(model_scores, columns=['Model', 'Score'])

models = models.sort_values(by='Score', ascending=False)
models

### Creating my Submission

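Based on the above model comparison we can see that the best results come from a Decision Tree or a Random Forest model. Note that these scores are accuracies measured on the training data itself, so they favour models that can memorise it and may overstate performance on unseen passengers. For my submission I will be picking the Decision Tree results.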

In [None]:
# Pair each test passenger's id with the Decision Tree prediction for that passenger.
my_submission = pd.DataFrame({"PassengerId": test_data["PassengerId"],
                             "Survived": y_pred_decision_tree})

# Write the submission without the DataFrame index column.
# NOTE(review): the file name suggests a gender-only baseline, but the contents
# are Decision Tree predictions - consider a more descriptive name.
my_submission.to_csv('gender_submission.csv', index=False)