# Load and clean data

In [20]:
from pathlib import Path
import pandas as pd
from pymongo import MongoClient
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st
import json

In [2]:
# Load Titanic csv file
titanic_data_to_load = Path("Resources/titanic.csv")


In [3]:
# Read Data File and store into Pandas DataFrames
titanicDF = pd.read_csv(titanic_data_to_load)
titanicDF.head()

Unnamed: 0,name,gender,age,class,embarked,country,ticketno,fare,sibsp,parch,survived
0,"Abbing, Mr. Anthony",male,42.0,3rd,S,United States,5547.0,7.11,0.0,0.0,no
1,"Abbott, Mr. Eugene Joseph",male,13.0,3rd,S,United States,2673.0,20.05,0.0,2.0,no
2,"Abbott, Mr. Rossmore Edward",male,16.0,3rd,S,United States,2673.0,20.05,1.0,1.0,no
3,"Abbott, Mrs. Rhoda Mary 'Rosa'",female,39.0,3rd,S,England,2673.0,20.05,1.0,1.0,yes
4,"Abelseth, Miss. Karen Marie",female,16.0,3rd,S,Norway,348125.0,7.13,0.0,0.0,yes


In [4]:
#cleaned data by dropping unwanted columns
titanicDF = titanicDF.drop(columns=["ticketno", "country"], axis=1)

titanicDF.head()

Unnamed: 0,name,gender,age,class,embarked,fare,sibsp,parch,survived
0,"Abbing, Mr. Anthony",male,42.0,3rd,S,7.11,0.0,0.0,no
1,"Abbott, Mr. Eugene Joseph",male,13.0,3rd,S,20.05,0.0,2.0,no
2,"Abbott, Mr. Rossmore Edward",male,16.0,3rd,S,20.05,1.0,1.0,no
3,"Abbott, Mrs. Rhoda Mary 'Rosa'",female,39.0,3rd,S,20.05,1.0,1.0,yes
4,"Abelseth, Miss. Karen Marie",female,16.0,3rd,S,7.13,0.0,0.0,yes


In [5]:
# Count NaN values in all columns
nan_count = titanicDF.isna().sum()

print(nan_count)

name          0
gender        0
age           2
class         0
embarked      0
fare        916
sibsp       900
parch       900
survived      0
dtype: int64


In [6]:
# create new DF
no_nan_titanic_DF = pd.DataFrame(titanicDF)

# fill NaN with 0 for "age", fare", "sibsp", "parch"
no_nan_titanic_DF[['age', 'fare', 'sibsp', 'parch']] = titanicDF[['age', 'fare', 'sibsp', 'parch']].fillna('0')

In [7]:
nan_count = no_nan_titanic_DF.isna().sum()

print(nan_count)

name        0
gender      0
age         0
class       0
embarked    0
fare        0
sibsp       0
parch       0
survived    0
dtype: int64


In [9]:
embarkNameFixDF = no_nan_titanic_DF.replace({'embarked' : { 'C' : 'Cherbourg', 'Q' : 'Queenstown',
                                                           'S' : 'Southampton', 'B' : 'Unknown' }})
embarkNameFixDF

Unnamed: 0,name,gender,age,class,embarked,fare,sibsp,parch,survived
0,"Abbing, Mr. Anthony",male,42.0,3rd,Southampton,7.11,0.0,0.0,no
1,"Abbott, Mr. Eugene Joseph",male,13.0,3rd,Southampton,20.05,0.0,2.0,no
2,"Abbott, Mr. Rossmore Edward",male,16.0,3rd,Southampton,20.05,1.0,1.0,no
3,"Abbott, Mrs. Rhoda Mary 'Rosa'",female,39.0,3rd,Southampton,20.05,1.0,1.0,yes
4,"Abelseth, Miss. Karen Marie",female,16.0,3rd,Southampton,7.13,0.0,0.0,yes
...,...,...,...,...,...,...,...,...,...
2202,"Wynn, Mr. Walter",male,41.0,deck crew,Unknown,0,0,0,yes
2203,"Yearsley, Mr. Harry",male,40.0,victualling crew,Southampton,0,0,0,yes
2204,"Young, Mr. Francis James",male,32.0,engineering crew,Southampton,0,0,0,no
2205,"Zanetti, Sig. Minio",male,20.0,restaurant staff,Southampton,0,0,0,no


In [10]:
embarkNameFixDF.dtypes

name        object
gender      object
age         object
class       object
embarked    object
fare        object
sibsp       object
parch       object
survived    object
dtype: object

In [11]:
# create new DF for final cleaned dataframe
cleanTitanic_DF = pd.DataFrame(embarkNameFixDF)

cleanTitanic_DF = embarkNameFixDF.astype({"age": float, "fare": float, "sibsp": float, "parch": float}, errors='raise')
cleanTitanic_DF.dtypes

name         object
gender       object
age         float64
class        object
embarked     object
fare        float64
sibsp       float64
parch       float64
survived     object
dtype: object

In [12]:
cleanTitanic_DF.head(5)

Unnamed: 0,name,gender,age,class,embarked,fare,sibsp,parch,survived
0,"Abbing, Mr. Anthony",male,42.0,3rd,Southampton,7.11,0.0,0.0,no
1,"Abbott, Mr. Eugene Joseph",male,13.0,3rd,Southampton,20.05,0.0,2.0,no
2,"Abbott, Mr. Rossmore Edward",male,16.0,3rd,Southampton,20.05,1.0,1.0,no
3,"Abbott, Mrs. Rhoda Mary 'Rosa'",female,39.0,3rd,Southampton,20.05,1.0,1.0,yes
4,"Abelseth, Miss. Karen Marie",female,16.0,3rd,Southampton,7.13,0.0,0.0,yes


# Details for HTML Map

In [17]:
# insert lons and lats for each 'embark' site
cleanTitanic_DF.loc[cleanTitanic_DF['embarked']=='Southampton','longitude']='50.909698'
cleanTitanic_DF.loc[cleanTitanic_DF['embarked']=='Southampton','latitude']='-1.404351'

cleanTitanic_DF.loc[cleanTitanic_DF['embarked']=='Cherbourg','longitude']='49.6457'
cleanTitanic_DF.loc[cleanTitanic_DF['embarked']=='Cherbourg','latitude']='-1.6115'

cleanTitanic_DF.loc[cleanTitanic_DF['embarked']=='Queenstown','longitude']='51.851'
cleanTitanic_DF.loc[cleanTitanic_DF['embarked']=='Queenstown','latitude']='-8.2967'

cleanTitanic_DF

Unnamed: 0,name,gender,age,class,embarked,fare,sibsp,parch,survived,longitude,latitude
0,"Abbing, Mr. Anthony",male,42.0,3rd,Southampton,7.11,0.0,0.0,no,50.909698,-1.404351
1,"Abbott, Mr. Eugene Joseph",male,13.0,3rd,Southampton,20.05,0.0,2.0,no,50.909698,-1.404351
2,"Abbott, Mr. Rossmore Edward",male,16.0,3rd,Southampton,20.05,1.0,1.0,no,50.909698,-1.404351
3,"Abbott, Mrs. Rhoda Mary 'Rosa'",female,39.0,3rd,Southampton,20.05,1.0,1.0,yes,50.909698,-1.404351
4,"Abelseth, Miss. Karen Marie",female,16.0,3rd,Southampton,7.13,0.0,0.0,yes,50.909698,-1.404351
...,...,...,...,...,...,...,...,...,...,...,...
2202,"Wynn, Mr. Walter",male,41.0,deck crew,Unknown,0.00,0.0,0.0,yes,,
2203,"Yearsley, Mr. Harry",male,40.0,victualling crew,Southampton,0.00,0.0,0.0,yes,50.909698,-1.404351
2204,"Young, Mr. Francis James",male,32.0,engineering crew,Southampton,0.00,0.0,0.0,no,50.909698,-1.404351
2205,"Zanetti, Sig. Minio",male,20.0,restaurant staff,Southampton,0.00,0.0,0.0,no,50.909698,-1.404351


In [18]:
# get number of loaded passengers per embarked dock (C = Cherbourg; Q = Queenstown; S = Southampton; B = Unknown)
embarked_dock_counts_df = cleanTitanic_DF['embarked']
embarked_dock_counts_df.value_counts()

embarked
Southampton    1616
Cherbourg       271
Unknown         197
Queenstown      123
Name: count, dtype: int64

In [37]:
# create copy of data
working_copy = cleanTitanic_DF.copy()
x = working_copy.to_dict("records")
#pprint.pprint(x)

In [38]:
# save dataframe to formatted json file
with open("cleanTitanic_DF.json", "w") as outfile:
    json.dump(x, outfile, indent=4, sort_keys=False)