In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import re

# Show up to 500 columns when a DataFrame is displayed, so wide frames are not truncated.
pd.options.display.max_columns = 500

In [89]:
# Load the raw Titanic passenger table from CSV.
# The path is relative, so it resolves against the notebook's working directory.
titanic = pd.read_csv('../data/Titanic-Dataset.csv')

In [90]:
# Add a column with the number of passengers travelling on each ticket.
# The count includes the passenger themselves, so 1 means the ticket is not shared.
#
# A grouped transform is used instead of value_counts() + rename + merge because:
#   * the column names produced by value_counts().reset_index() differ between
#     pandas versions, which breaks the rename and the merge on 'Ticket';
#   * re-running a merge appends ticket_count_x / ticket_count_y columns, whereas
#     this assignment simply overwrites the column (the cell is idempotent).
titanic['ticket_count'] = titanic.groupby('Ticket')['Ticket'].transform('count')

titanic.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,ticket_count
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1


In [91]:
# Find married couples travelling together and married women travelling without
# their husband. Relies on wives being listed under the husband's name with the
# maiden name in parentheses, e.g. "Cumings, Mrs. John Bradley (Florence Briggs Thayer)".
maiden_name_pattern = r'\s\(\D+\)'

married_women = [name for name in titanic['Name'] if re.search('Mrs', name)]

# Derive each expected husband's name: swap the title, then remove the maiden name
husbands = [
    re.sub(maiden_name_pattern, '', re.sub('Mrs', 'Mr', name))
    for name in married_women
]

# Husbands that actually appear on the passenger list (set lookup instead of list scan)
husband_names = set(husbands)
present_husbands_list = [name for name in titanic['Name'] if name in husband_names]

# Husbands of women aboard who are NOT on the passenger list themselves.
# The comparison must be against the present husbands; comparing `husbands`
# with itself always produced an empty list.
present_husband_names = set(present_husbands_list)
absent_husbands_list = [name for name in husbands if name not in present_husband_names]

# Tie husbands back to wives on the ship: rewrite each wife's name into the
# husband's form and keep her PassengerId when that husband is aboard
is_married_woman = (titanic['Sex'] == 'female') & titanic['Name'].str.contains('Mrs')
# .copy() so the column assignment below does not write into a slice of `titanic`
titanic_mrs = titanic.loc[is_married_woman, ['PassengerId', 'Name']].copy()
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave the maiden names in place
titanic_mrs['Name'] = (
    titanic_mrs['Name']
    .str.replace(maiden_name_pattern, '', regex=True)
    .str.replace('Mrs', 'Mr', regex=False)
)
mrs_acc_id = titanic_mrs[titanic_mrs['Name'].isin(present_husbands_list)]['PassengerId'].tolist()

accompanied_wives_list = titanic[titanic['PassengerId'].isin(mrs_acc_id)]['Name'].tolist()

# Check that the numbers are plausible: married women, husbands aboard, wives with husband aboard
display(len(husbands))
display(len(present_husbands_list))
display(len(accompanied_wives_list))

  titanic_mrs['Name'] = titanic_mrs['Name'].str.replace(r'\s\(\D+\)', '').str.replace('Mrs', 'Mr')


129

40

40

In [92]:
# Flag passengers whose spouse is also aboard:
# 1 for every matched husband or wife, 0 for everyone else.
spouse_names = present_husbands_list + accompanied_wives_list
has_spouse_aboard = titanic['Name'].isin(spouse_names)
titanic.loc[has_spouse_aboard, 'spouse_present'] = 1
titanic.loc[~has_spouse_aboard, 'spouse_present'] = 0

In [93]:
# Split Cabin into a leading-letter column and a number column.
cabin = titanic['Cabin']

# First character of the cabin string; passengers without a cabin get 'none'.
titanic['cabin_letter'] = cabin.str.get(0).fillna('none')

# First run of digits in the cabin string; 0 when the cabin is missing or has no digits.
first_cabin_digits = cabin.str.findall(r'[0-9]+').str.get(0)
titanic['cabin_number'] = first_cabin_digits.fillna(0).astype('int64')

In [94]:
# Separate each ticket into its text prefix and its trailing number, e.g.
#   'A/5 21171' -> ('A/5', 21171)   'PC 17599' -> ('PC', 17599)   '113803' -> ('none', 113803)
#
# The prefix is everything before the final run of digits, with surrounding
# whitespace removed. Taking only the first run of non-digit characters would
# cut prefixes at their first digit ('A/5' -> 'A/') and keep the separating
# space ('PC '), creating truncated or duplicate-looking categories.
ticket = titanic['Ticket'].str.strip()
ticket_prefix = ticket.str.replace(r'\s*\d+$', '', regex=True).str.strip()
# Purely numeric (empty prefix) and missing tickets both get the 'none' label
titanic['ticket_letter'] = ticket_prefix.fillna('').replace('', 'none')

# Trailing digits as an integer; 0 when the ticket does not end in a number
ticket_digits = ticket.str.extract(r'(\d+)$', expand=False)
titanic['ticket_number'] = pd.to_numeric(ticket_digits, errors='coerce').fillna(0).astype('int')

In [95]:
# Flag passengers travelling with at least one sibling (as opposed to a spouse).
# SibSp counts siblings and spouses together, so a spouse found aboard is
# subtracted before testing. Testing only SibSp == 1 would leave passengers
# with several siblings aboard flagged as 0.
# Assigned in one step (no .loc + fillna) so re-running the cell cannot keep
# stale flags. Stored as float 0.0/1.0, the same representation as spouse_present.
siblings_aboard = titanic['SibSp'] - titanic['spouse_present']
titanic['sibling_present'] = (siblings_aboard > 0).astype(float)

In [96]:
# Impute missing values: Age with the median of the known ages,
# Embarked with an explicit 'unknown' label.
median_age = titanic['Age'].median()
titanic['Age'] = titanic['Age'].fillna(median_age)
titanic['Embarked'] = titanic['Embarked'].fillna('unknown')

In [97]:
# Drop the raw text columns and save the result as the classification dataset.
raw_text_columns = ['Name', 'Ticket', 'Cabin']
titanic_final = titanic.drop(raw_text_columns, axis=1)

# Written with the default index=True, so the row index is stored as the
# first (unnamed) column of the file.
titanic_final.to_csv('../data/classification_dataset.csv')

In [70]:
# Sanity check: list any rows that still have a missing Age (expected: none).
titanic.loc[titanic['Age'].isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,ticket_count,spouse_present,cabin_letter,cabin_number,ticket_letter,ticket_number,sibling_present


In [77]:
# Sanity check: list any rows of the final frame with a missing ticket_number (expected: none).
titanic_final.loc[titanic_final['ticket_number'].isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,ticket_count,spouse_present,cabin_letter,cabin_number,ticket_letter,ticket_number,sibling_present


In [98]:
# Inspect the column dtypes of the final frame.
# NOTE(review): the recorded output lists Survived as object rather than an
# integer type — confirm whether the raw file contains non-numeric values there.
titanic_final.dtypes

PassengerId          int64
Survived            object
Pclass               int64
Sex                 object
Age                float64
SibSp                int64
Parch                int64
Fare               float64
Embarked            object
ticket_count         int64
spouse_present     float64
cabin_letter        object
cabin_number         int64
ticket_letter       object
ticket_number        int32
sibling_present    float64
dtype: object