### The following markdown performs an exploratory analysis of the Titanic dataset and implements a strategy to impute missing values in the Age column. 

##### Please note that for questions 5 and 6, while more advanced functions (such as crosstab) may have made the code more efficient, only indexing and functions that were a part of the class have been utilised for the purposes of the Assignment. Also, multiple intermediate datasets have been created for readability, even though a single chained expression could have been used.

In [1]:
# Import the relevant libraries

import pandas as pd
import os

##### Q1: Read the dataset correctly using pandas, using an appropriate column as index

In [2]:
# Folder containing the assignment data; adjust to the local machine.
directory = 'C:/Users/anups/Google Drive/TERM 2/Python/Assignments/Assignment_2/'  # Update as relevant

# Load the training file and index the rows by PassengerId, the unique
# identifier column of the dataset.
train_csv_path = os.path.join(directory, 'train.csv')
titanic = pd.read_csv(train_csv_path, index_col='PassengerId')

# Preview the first rows to confirm the file was parsed as expected.
titanic.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


##### Q2: Compute the percentage of survivors out of total number of passengers

In [3]:
# 'Survived' holds 0/1 flags, so its mean is the fraction of passengers
# who survived.
overall_survival_rate = titanic['Survived'].mean()

# Scale to a percentage and keep two decimals for the printed summary.
print(round(overall_survival_rate * 100, 2), 'percent of people survived')

38.38 percent of people survived


##### Q3: Compute the percentage of men and women out of total number of passengers 

In [4]:
# Number of passengers of each sex, divided by the total number of rows,
# gives the share of each sex among all passengers.
passengers_per_sex = titanic.groupby('Sex').size()
men_women_proportion = passengers_per_sex / len(titanic)

# Shown as percentages with two decimals for readability.
(men_women_proportion * 100).round(2)

Sex
female    35.24
male      64.76
dtype: float64

##### Q3: Compute the percentage of survivors by sex (i.e. the percentage of male passengers that survived and female passengers that survived) 

In [5]:
# Averaging the 0/1 'Survived' flag within each sex yields the survival
# rate per sex. The double brackets keep the result a one-column DataFrame.
survivors_by_sex_total = titanic.groupby('Sex')[['Survived']].mean()

# Printed as percentages with two decimals for readability.
print((survivors_by_sex_total * 100).round(2))

        Survived
Sex             
female     74.20
male       18.89


##### Q4: Compute the percentage of survivors that were men and women

In [6]:
# Restrict the data to passengers who survived, then compute what share of
# those survivors belongs to each sex.

survivors = titanic[titanic['Survived'] == 1]
survivors_by_sex = survivors.groupby('Sex').size() / len(survivors)

# Shown as percentages with two decimals for readability.
(survivors_by_sex * 100).round(2)

Sex
female    68.13
male      31.87
dtype: float64

##### Q5: Display in a 2 x 2 DataFrame the probability of being male/female and surviving/not surviving

In [7]:
# men_women_proportion is a Series, so it is wrapped in a DataFrame before
# being merged with the per-sex survival rates. Merging on 'Sex' puts both
# quantities side by side in one table.
sex_share_frame = pd.DataFrame(men_women_proportion)
probability_gender = pd.merge(survivors_by_sex_total, sex_share_frame, on='Sex')

# Descriptive column names make the later arithmetic easier to follow.
probability_gender.columns = ['Survivors_by_Sex_Total', 'Proportion_by_Sex']

In [8]:
# Joint probabilities: P(sex and outcome) = P(sex) * P(outcome | sex).
sex_share = probability_gender['Proportion_by_Sex']
survival_rate_by_sex = probability_gender['Survivors_by_Sex_Total']

# Probability of being of a given sex and surviving.
probability_gender['Survival_Prob'] = sex_share * survival_rate_by_sex

# Probability of being of a given sex and not surviving.
probability_gender['Non_Survival_Prob'] = sex_share * (1 - survival_rate_by_sex)

In [9]:
# 2 x 2 table of joint probabilities, shown as percentages with two decimals.
gender_joint_probs = probability_gender[['Survival_Prob', 'Non_Survival_Prob']]
(gender_joint_probs * 100).round(2)

Unnamed: 0_level_0,Survival_Prob,Non_Survival_Prob
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,26.15,9.09
male,12.23,52.53


In [10]:
# Sanity check: the four joint probabilities should add up to 1.
# Sum each column first, then add the column totals together.
gender_column_totals = probability_gender[['Survival_Prob', 'Non_Survival_Prob']].sum()
gender_column_totals.sum()

1.0

##### Q6: Display in a DataFrame the probability of survival/not survival of all combinations of sex and class

In [11]:
# Share of all passengers that falls into each (sex, class) combination.
passengers_per_sex_class = titanic.groupby(['Sex', 'Pclass']).size()
proportion_sex_class = passengers_per_sex_class / len(titanic)

# Survival rate within each (sex, class) combination.
proportion_survived_sex_class = titanic.groupby(['Sex', 'Pclass'])['Survived'].mean()

# The joint probabilities are the product of the two quantities above, so
# they are placed in one table. Both are Series and are therefore wrapped
# in DataFrames before merging.
survival_rate_frame = pd.DataFrame(proportion_survived_sex_class)
class_share_frame = pd.DataFrame(proportion_sex_class)
probability_class = pd.merge(survival_rate_frame, class_share_frame, on=['Sex', 'Pclass'])

# Descriptive column names make the later arithmetic easier to follow.
probability_class.columns = ['Survived', 'Proportion_Sex']

In [12]:
# Joint probabilities: P(sex, class and outcome) = P(sex, class) * P(outcome | sex, class).
sex_class_share = probability_class['Proportion_Sex']
survival_rate_by_sex_class = probability_class['Survived']

# Probability of belonging to a (sex, class) group and surviving.
probability_class['Survived_Prob'] = sex_class_share * survival_rate_by_sex_class

# Probability of belonging to a (sex, class) group and not surviving.
probability_class['Not_Survived_Prob'] = sex_class_share * (1 - survival_rate_by_sex_class)

In [13]:
# Joint probabilities per (sex, class), shown as percentages with two decimals.
class_joint_probs = probability_class[['Survived_Prob', 'Not_Survived_Prob']]
(class_joint_probs * 100).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Survived_Prob,Not_Survived_Prob
Sex,Pclass,Unnamed: 2_level_1,Unnamed: 3_level_1
female,1,10.21,0.34
female,2,7.86,0.67
female,3,8.08,8.08
male,1,5.05,8.64
male,2,1.91,10.21
male,3,5.27,33.67


In [14]:
# Sanity check: the joint probabilities over all groups should add up to 1.
# Sum each column first, then add the column totals together.
class_column_totals = probability_class[['Survived_Prob', 'Not_Survived_Prob']].sum()
class_column_totals.sum()

1.0

##### Q7: Devise some strategy to impute the missing values in the Age column.

In [15]:
# Imputation strategy: a missing age is replaced by the mean age of the
# passengers who share the same sex, class and survival outcome.
# The group mean was chosen on the assumption that age is not a strongly
# skewed variable.
# transform() suits this because it returns a result aligned with the input
# rows (here the Age column), so it can be assigned straight back.


def _fill_with_group_mean(ages):
    """Return the ages of one group with missing entries set to the group mean."""
    # If a group had no known ages at all, the mean is NaN and the gaps stay.
    return ages.fillna(ages.mean())


ages_by_group = titanic.groupby(['Sex', 'Pclass', 'Survived'])['Age']
titanic['Age'] = ages_by_group.transform(_fill_with_group_mean)

In [16]:
# Verification: list any rows whose Age is still missing. An empty table
# means the imputation filled every gap.
titanic.loc[titanic['Age'].isna()]

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
