In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

In [None]:
# Notebook-wide figure defaults: 300 dpi and a wide, short 9x3 canvas.
plt.rcParams.update({"figure.dpi": 300, "figure.figsize": (9, 3)})

<h1 align="center">Titanic Survival Estimation via Naïve Bayes</h1>

    Angela Cao
    Shane McQuarrie
    DRP Fall 2020

## Contents

1. Introduction: The Titanic Problem
2. Visualizing the Data
3. The Naïve Bayes Algorithm
4. Applying the Algorithm to the Problem

## Introduction: The Titanic Problem

**TEXT HERE INTRODUCING THE PROBLEM** (where does this data come from? what is our objective? How will we do it?)

On April 15, 1912, the RMS Titanic sank, leaving about 1,496 people dead. Many of the victims (and survivors) shared certain characteristics, such as passenger class (first, second, or third, reflecting social status), sex, and age. The goal of this project is therefore to investigate which factors were common among the survivors and among the victims of the sinking.



[This is how you do a link](https://www.example.com)

In [None]:
# Load the raw passenger table from the CSV file next to the notebook.
titanic_original = pd.read_csv(filepath_or_buffer="titanic.csv")
# Preview five randomly selected rows.
titanic_original.sample(n=5)

In [None]:
# Extract the relevant columns: the label (Survived) and the candidate features.
# The explicit .copy() makes `titanic` an independent frame, so later
# assignments into it (such as filling in missing ages) do not raise
# SettingWithCopyWarning.
titanic = titanic_original[["Survived", "Sex", "Age", "Pclass", "Fare"]].copy()
titanic.sample(5)

**Write a little about why we're keeping these columns**

In [None]:
# Other preprocessing step: replace NaN age values (and other NaN values).

## Visualizing the Data

**Principles to follow here**:
- Tell a story. That means only use tables/visualizations that serve the story.
    - How many people survived?
    - How does survival appear to depend on the other factors (subgroups?)
- Visualize single distributions with histograms / kernel density estimate (`kind="kde"`)
- Label everything!


Ideas for visualizations
- Proportion of people who died
- Distribution of ages?
- Distribution of Fare?
- Proportion of males / females
- Any of these grouped by survival

(go see what we did in the Pandas 3 lab)

In [None]:
# Mean of the remaining columns within every (Survived, Sex) subgroup.
titanic.groupby(["Survived", "Sex"]).agg("mean")

Wherever you have a table, explain the table beneath (write a caption).

In [None]:
# Number of passengers per survival outcome. size() counts every row in each
# group, whereas count()["Age"] only counts rows whose Age is not missing and
# would therefore under-report each group while ages are still un-imputed.
ax = titanic.groupby("Survived").size().plot(kind="barh")
ax.set(title="Passengers by survival outcome", xlabel="Number of passengers", ylabel="Survived")
plt.show()

In [None]:
# Distribution of passenger ages (80 narrow bins), labeled so the figure can
# be read on its own.
ax = titanic["Age"].plot(kind="hist", bins=80)
ax.set(title="Distribution of passenger ages", xlabel="Age", ylabel="Number of passengers")
plt.show()

Write about it. "This shows the distribution of the ages of the passengers. You can see that there were a lot of young adults in their 20s and 30s and a good number of children...."

In [None]:
# Summary statistics of the table's columns, used to spot missing values
# (a lower count) and outliers.
titanic.describe()

In [None]:
# How many passengers have no recorded age? Summing the boolean mask gives a
# single readable number instead of a per-row True/False listing.
titanic["Age"].isnull().sum()

In [None]:
# Show the passengers whose age is missing.
titanic[titanic["Age"].isna()]

In [None]:
# Median of the recorded ages; this is the value used to fill in missing ages.
titanic["Age"].median()

In [None]:
# Replace missing ages with the median age. Building a new frame with
# assign()/fillna() instead of writing through .loc keeps the cell idempotent
# and avoids SettingWithCopyWarning, because `titanic` was created by
# selecting columns from `titanic_original`.
titanic = titanic.assign(Age=titanic["Age"].fillna(titanic["Age"].median()))

In [None]:
# Keep only the first 1309 rows (selected by position).
# NOTE(review): 1309 is presumably the number of genuine passenger records in
# the file -- confirm, and consider naming the constant.
titanic = titanic.iloc[:1309]

In [None]:
# Age distribution after the missing ages were filled with the median.
# Using an explicit Axes lets us label the figure, and plt.show() keeps the
# raw (counts, bins, patches) tuple out of the cell output.
fig, ax = plt.subplots()
ax.hist(titanic["Age"])
ax.set(title="Passenger ages (missing values filled with the median)", xlabel="Age", ylabel="Number of passengers")
plt.show()

In [None]:
# Distribution of ticket fares. dropna() guards against missing fares, which
# a histogram cannot bin (NOTE(review): confirm whether Fare has any NaNs).
# plt.show() keeps the raw (counts, bins, patches) tuple out of the output.
fig, ax = plt.subplots()
ax.hist(titanic["Fare"].dropna())
ax.set(title="Distribution of ticket fares", xlabel="Fare", ylabel="Number of passengers")
plt.show()

In [None]:
# Partition the passengers by passenger class for the per-class plots.
groups = titanic.groupby(by="Pclass")

In [None]:
# Box plots of the columns, drawn separately for each passenger class.
# These almost look nice, but we have to be careful not to compare apples and
# oranges (presumably because the columns are measured in different units --
# confirm).
groups.boxplot(grid=False)

In [None]:
# One fare histogram per passenger class. Iterating over the groups lets each
# figure carry a title naming its class; otherwise the figures are untitled
# and cannot be told apart.
for pclass, group in groups:
    ax = group.plot(kind="hist", y="Fare", legend=False)
    ax.set(title=f"Fare distribution, passenger class {pclass}", xlabel="Fare", ylabel="Number of passengers")
plt.show()

In [None]:
# Horizontal box plots of age and of fare, each in its own figure.
horizontal_box = dict(kind='box', vert=False)
titanic.plot(y=['Age'], **horizontal_box)
titanic.plot(y=['Fare'], **horizontal_box)

In [None]:
# Fare against age; the low alpha makes overlapping points visible as darker regions.
titanic.plot.scatter(x='Age', y='Fare', alpha=.2)

In [None]:
# Repeat the age / fare plots for first-class passengers only.
first_class = titanic.loc[titanic['Pclass'].eq(1.0)]
first_class.plot.box(y=['Age'], vert=False)
first_class.plot.box(y=['Fare'], vert=False)
first_class.plot.scatter(x='Age', y='Fare')

## The Naïve Bayes Algorithm

Explanation of the algorithm here :D

- Bayes' rule
- What it means in our context (what is $A$ and what is $B$)
- How do we use it to make a classifier?

$$
P(A|B) = \frac{P(B|A)P(A)}{P(B)}
$$



#### Implementation

In [None]:
class TitanicNaiveBayes:
    """Naive Bayes classifier for a binary label (0 / 1) using sex and age.

    The model assumes that, given the label, a passenger's sex and age are
    independent. Sex is modeled as a Bernoulli variable (male / not male) and
    age as a Gaussian, so for each label c

        P(c | sex, age)  is proportional to  P(c) * P(sex | c) * N(age; mean_c, std_c).

    Features are read by column position: column 0 must hold the sex (the
    string 'male' marks males; anything else counts as not male) and column 1
    must hold the age. Any further columns are ignored.

    Attributes
    ----------
    priors_ : dict
        Fraction of training samples carrying each label.
    male_probs_ : dict
        Fraction of males among the training samples of each label.
    age_means_ : dict
        Mean age of the training samples of each label.
    age_stds_ : dict
        Population standard deviation (divide by n) of the ages of each label.

    All attributes are keyed by the label (0 or 1) and are set by fit().
    """

    # Lower bound on the standard deviation used in the Gaussian density, so
    # a class whose training ages are all identical does not divide by zero.
    _MIN_STD = 1e-9

    @staticmethod
    def _split_features(data):
        """Return (is_male, ages) as NumPy arrays read from columns 0 and 1."""
        is_male = (data.iloc[:, 0] == 'male').to_numpy()
        ages = data.iloc[:, 1].to_numpy(dtype=float)
        if np.isnan(ages).any():
            raise ValueError("ages contain NaN; impute or drop them first")
        return is_male, ages

    def fit(self, data, labels, verbose=True):
        """Estimate the per-label prior, male fraction, and age mean / std.

        Parameters
        ----------
        data : pd.DataFrame
            Data to train on. Column 0 is the sex, column 1 is the age.
        labels : sequence of {0, 1}
            Label of each row of `data`, matched to the rows by position
            (so a pd.Series with a non-default index is fine).
        verbose : bool
            If True (default), print the fitted statistics for each label.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If `data` and `labels` differ in length, if an age is NaN, or if
            one of the labels 0 / 1 has no training samples.
        """
        is_male, ages = self._split_features(data)
        # Convert to an array so labels are matched by position, not by index.
        targets = np.asarray(labels)
        if len(targets) != len(ages):
            raise ValueError("data and labels must have the same length")

        priors, male_probs, age_means, age_stds = {}, {}, {}, {}
        for i in range(0, 2):
            in_group = targets == i
            total = int(in_group.sum())
            if total == 0:
                raise ValueError("no training samples with label " + str(i))
            probability = total / len(targets)
            mean = float(ages[in_group].mean())
            # np.std defaults to the population formula: sqrt(mean((x - mean)^2)).
            std = float(ages[in_group].std())

            priors[i] = probability
            male_probs[i] = float(is_male[in_group].mean())
            age_means[i] = mean
            age_stds[i] = std

            if verbose:
                print("Group "+str(i))
                print("Probability = "+str(probability))
                print("Mean = "+str(mean))
                print("Standard Deviation = "+str(std))

        # Assign only after every label was processed, so a failed fit never
        # leaves the model half-updated.
        self.priors_ = priors
        self.male_probs_ = male_probs
        self.age_means_ = age_means
        self.age_stds_ = age_stds
        return self

    def predict(self, data):
        """Predict the most probable label for each row.

        Parameters
        ----------
        data : pd.DataFrame
            Data to classify, laid out like the training data (column 0 is
            the sex, column 1 is the age).

        Returns
        -------
        np.ndarray of int
            Predicted label (0 or 1) for each row, in row order. Ties go to
            label 0.

        Raises
        ------
        RuntimeError
            If called before fit().
        ValueError
            If an age is NaN.
        """
        if not hasattr(self, "priors_"):
            raise RuntimeError("call fit() before predict()")
        is_male, ages = self._split_features(data)

        # scores[c, k] is proportional to P(label c | row k). The shared
        # denominator P(sex, age) is omitted because it does not change
        # which label wins.
        scores = np.empty((2, len(ages)))
        for i in range(0, 2):
            p_male = self.male_probs_[i]
            sex_likelihood = np.where(is_male, p_male, 1.0 - p_male)
            std = max(self.age_stds_[i], self._MIN_STD)
            z = (ages - self.age_means_[i]) / std
            age_likelihood = np.exp(-0.5 * z ** 2) / (std * math.sqrt(2.0 * math.pi))
            scores[i] = self.priors_[i] * sex_likelihood * age_likelihood
        return scores.argmax(axis=0)

## Applying the Algorithm to the Problem

In [None]:
# Are these variables correlated?
# Correlation is only defined for numeric columns. Sex holds strings, and
# recent pandas versions raise an error when a non-numeric column reaches
# corr(), so restrict the frame to its numeric columns first.
titanic.drop(columns="Survived").select_dtypes(include="number").corr()

In [None]:
# Separate the target column from the feature columns.
labels = titanic["Survived"]
data = titanic.drop(columns=["Survived"])

In [None]:
# Will need to split the data into training / testing groups for validation