In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


### Load Data

In [2]:
# Location of the Titanic training set, relative to the notebook's working directory.
titanic_file_path = '../input/titanic/train.csv'

# Read the CSV, then promote `PassengerId` from an ordinary column to the row index.
survivor_data = pd.read_csv(titanic_file_path).set_index('PassengerId')

# Display the column labels that were loaded.
survivor_data.columns

Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [3]:
# Dimensions of the freshly loaded training frame as (rows, columns).
survivor_data.shape

(891, 11)

In [4]:
# Preview the first rows to see what kind of values each column holds.
survivor_data.head()


Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Take the `Survived` column (0/1 values) as the prediction target.
# At this point no rows have been filtered, so `y` covers every passenger
# in the loaded file.
y = survivor_data['Survived']
y

PassengerId
1      0
2      1
3      1
4      1
5      0
      ..
887    0
888    1
889    0
890    1
891    0
Name: Survived, Length: 891, dtype: int64

### Pre-processing data

In [6]:
# Remove the `Name` and `Ticket` columns in a single step.
# `errors='ignore'` keeps the cell idempotent: re-running it after the columns
# are already gone is a no-op instead of raising a KeyError.
survivor_data = survivor_data.drop(columns=['Name', 'Ticket'], errors='ignore')

In [7]:
# Confirm that `Name` and `Ticket` no longer appear among the columns.
survivor_data.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin',
       'Embarked'],
      dtype='object')

### Handling Missing Data & Categorical Variables

In [8]:
# Report the frame's dimensions as (rows, columns).
print(survivor_data.shape)

# Count the missing entries in each column, then express every count as a
# percentage of the total number of rows.
missing_val_count_by_column = survivor_data.isna().sum()
total_rows = len(survivor_data)
print(missing_val_count_by_column)
print(100 * missing_val_count_by_column / total_rows)

(891, 9)
Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
dtype: int64
Survived     0.000000
Pclass       0.000000
Sex          0.000000
Age         19.865320
SibSp        0.000000
Parch        0.000000
Fare         0.000000
Cabin       77.104377
Embarked     0.224467
dtype: float64


In [9]:
# Drop the entire `Cabin` column, since it's missing well over 20% of its values.
# `errors='ignore'` keeps the cell idempotent: re-running it once `Cabin` is
# already gone is a no-op instead of raising a KeyError.
survivor_data = survivor_data.drop(columns=['Cabin'], errors='ignore')

In [10]:
# Only 2 rows lack an `Embarked` value, so dropping those rows removes the
# gaps without much effect on the data.
survivor_data = survivor_data.dropna(subset=['Embarked'])

# `y` was captured before this row filter and still contains the rows that
# were just removed. Re-derive it from the filtered frame so the target
# stays aligned, row for row, with the features.
y = survivor_data.Survived

In [11]:
# Re-check the dimensions after removing the `Cabin` column and the rows
# that had no `Embarked` value.
survivor_data.shape

(889, 8)

In [12]:
# Flag the columns stored with the `object` dtype; these are the text-valued
# (categorical) columns that must be converted to numbers.
s = survivor_data.dtypes == 'object'
object_cols = [column for column, is_object in s.items() if is_object]

print("Categorical variables:", object_cols, sep="\n")

Categorical variables:
['Sex', 'Embarked']


In [13]:
# List the distinct values that occur in the `Sex` column.
survivor_data.Sex.unique()

array(['male', 'female'], dtype=object)

In [14]:
from sklearn.preprocessing import OrdinalEncoder

# Fit a single encoder across all categorical columns and convert their text
# values into numeric codes.
ordinal_encoder = OrdinalEncoder()
encoded_values = ordinal_encoder.fit_transform(survivor_data[object_cols])

# Write the codes into a copy, leaving the un-encoded frame untouched.
label_survivor_data = survivor_data.copy()
label_survivor_data[object_cols] = encoded_values

In [15]:
# Preview the encoded frame: `Sex` and `Embarked` now hold numeric codes
# instead of text.
label_survivor_data.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,1.0,22.0,1,0,7.25,2.0
2,1,1,0.0,38.0,1,0,71.2833,0.0
3,1,3,0.0,26.0,0,0,7.925,2.0
4,1,1,0.0,35.0,1,0,53.1,2.0
5,0,3,1.0,35.0,0,0,8.05,2.0


In [16]:
# Show the categories the encoder learned, one array per encoded column;
# a category's position in its array is the numeric code it was given.
print("Category to Label Mapping:", ordinal_encoder.categories_, sep="\n")

Category to Label Mapping:
[array(['female', 'male'], dtype=object), array(['C', 'Q', 'S'], dtype=object)]


In [17]:
from sklearn.impute import SimpleImputer

# Fill the remaining missing values using SimpleImputer's default strategy.
my_imputer = SimpleImputer()

# `fit_transform` returns a bare array without row or column labels.
# Rebuild the DataFrame with the original `PassengerId` index and column
# names; otherwise the rows are renumbered from 0 and no longer line up by
# label with `label_survivor_data` or the target series.
# NOTE: the `Survived` target column is still part of this frame, so it is
# passed through the imputer along with the features.
imputed_survivor_data = pd.DataFrame(
    my_imputer.fit_transform(label_survivor_data),
    index=label_survivor_data.index,
    columns=label_survivor_data.columns,
)

In [18]:
# Verify the imputation: count how many `Age` values are still missing.
imputed_survivor_data['Age'].isna().sum()

0