# Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random

# Loading the CSV data into DataFrames

In [2]:
# Read both Titanic splits; the relative paths resolve against the
# notebook's working directory.
train_csv = pd.read_csv(filepath_or_buffer='train.csv')
test_csv = pd.read_csv(filepath_or_buffer='test.csv')

# Preview the first rows of the training split.
train_csv.head()

# Data cleaning and preprocessing

## The PassengerId column is not needed for building the model, because it only serves as a unique identifier for each passenger and has no bearing on whether or not they will live or die.

In [3]:
# Remove the PassengerId column from the training and test frames alike.
train_csv, test_csv = (
    split.drop("PassengerId", axis="columns") for split in (train_csv, test_csv)
)

# Show the first rows of the training frame after the drop.
train_csv.head()

## The Ticket column also holds no importance: like PassengerId, it is essentially an identifier and gives no reliable indication of whether or not the passenger will live or die. It is also very difficult to process, because its values mix letters and digits, and in special cases a passenger has 3 tickets, which would mean even more data to normalize.

In [4]:
# Drop the Ticket column from both frames (axis=1 selects columns).
train_csv = train_csv.drop("Ticket", axis=1)
test_csv = test_csv.drop("Ticket", axis=1)

# Show the first rows of the training frame after the drop.
train_csv.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,,S


## The last column we decided to remove is the Cabin column. While we believe it might help in determining who lives or not, most of its rows are empty. Because of this, we cannot simply fill in the empty rows, as doing so would give erroneous results that are not in line with the given data set. Rather than normalizing the data, we will simply remove the column.

In [5]:
# Drop the Cabin column from both frames, using the same `columns=` form
# as the PassengerId and Ticket cells.
train_csv, test_csv = (
    train_csv.drop(columns=["Cabin"]),
    test_csv.drop(columns=["Cabin"]),
)

# Show the first rows of the training frame after the drop.
train_csv.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S


## After removing all columns that are irrelevant or difficult to process, we encode the remaining categorical columns (Sex and Embarked) as numbers and fill in the missing values with relevant data.

In [42]:
# Integer codes for the categorical columns (same codes the cell used before).
SEX_CODES = {'male': 1, 'female': 2}
EMBARKED_CODES = {'S': 1, 'C': 2, 'Q': 3}


def encode_labels(column, codes):
    """Replace the string labels in `column` with their integer codes.

    Parameters
    ----------
    column : pd.Series
        Column holding the raw labels (possibly with missing values).
    codes : dict
        Mapping from label to integer code.

    Returns
    -------
    pd.Series
        A numeric column. Values that are not keys of `codes` (missing
        values, or codes written by a previous run of this cell) pass through
        unchanged, so the cell can be re-run without corrupting the data.

    Raises
    ------
    ValueError
        If a non-numeric label that is not in `codes` is left over
        (raised by `pd.to_numeric`).
    """
    return pd.to_numeric(column.map(lambda label: codes.get(label, label)))


train_csv['Sex'] = encode_labels(train_csv['Sex'], SEX_CODES)
test_csv['Sex'] = encode_labels(test_csv['Sex'], SEX_CODES)

train_csv['Embarked'] = encode_labels(train_csv['Embarked'], EMBARKED_CODES)
test_csv['Embarked'] = encode_labels(test_csv['Embarked'], EMBARKED_CODES)

# Make sure Age is numeric. The integer downcast only takes effect when every
# age is a whole number and none is missing; otherwise the column stays float.
train_csv["Age"] = pd.to_numeric(train_csv["Age"], downcast = 'integer')
test_csv["Age"] = pd.to_numeric(test_csv["Age"], downcast = 'integer')

# All imputation statistics are computed on the training split only and then
# reused for the test split, so both are filled consistently and nothing
# about the test data influences the preprocessing.
train_age_filler = train_csv["Age"].mean()
test_age_filler = train_age_filler

# Missing ports are filled with the most frequent port in the training data.
# The previous random draw was unseeded, so every run could produce a
# different dataset, and it could assign a rare port to every gap.
embarked_filler = train_csv['Embarked'].mode().iloc[0]

# Not used for imputation any more; kept in case a later cell refers to it.
embarked_choices = sorted(EMBARKED_CODES.values())

# Fare is filled as well in case either split has gaps in it (TODO confirm
# against the CSVs); this is a no-op when the column is complete.
fare_filler = train_csv['Fare'].median()

train_fill_values = {'Age': train_age_filler, 'Embarked': embarked_filler, 'Fare': fare_filler}
test_fill_values = {'Age': test_age_filler, 'Embarked': embarked_filler, 'Fare': fare_filler}

train_csv = train_csv.fillna(train_fill_values)
test_csv = test_csv.fillna(test_fill_values)

train_csv.head(10)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,7.25,1
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,71.2833,2
2,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,7.925,1
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,53.1,1
4,0,3,"Allen, Mr. William Henry",0,35.0,0,0,8.05,1
5,0,3,"Moran, Mr. James",0,29.699118,0,0,8.4583,3
6,0,1,"McCarthy, Mr. Timothy J",0,54.0,0,0,51.8625,1
7,0,3,"Palsson, Master. Gosta Leonard",0,2.0,3,1,21.075,1
8,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",1,27.0,0,2,11.1333,1
9,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",1,14.0,1,0,30.0708,2


## Now that all the empty cells are filled and all the data has been normalized, we can now proceed to creating the models.

# Creation of Learning Models
