The Titanic competion is a classification problem: given some info on a passenger, we determine if they survived.

So, two categories represented by the "Survived" boolean columns in the dataset: Yes=1 and Dead=0.

In [1]:
import pandas as pd
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
print("Setup Complete")

Setup Complete


# Step 1: Gather the Data

### Checking the format of the data

In [2]:
train_data = pd.read_csv("../input/train.csv", index_col="PassengerId")
test_data = pd.read_csv("../input/test.csv", index_col="PassengerId")

train_data.head()
# test_data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
train_data.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


# Step 2: Prepare the Data

In [4]:
# Separate target from predictors
y = train_data['Survived']
X_full = train_data.drop(['Survived'], axis=1)

In [5]:
(y == 1).sum()

342

In [6]:
# Drop columns with missing values (simplest approach)
# cols_with_na = [col for col in X_full.columns if X_full[col].isnull().any()]
cols_with_na = []
X_reduced = X_full.drop(cols_with_na, axis=1)

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
def has_low_cardinality(df, col):
    is_categorical = (df[col].dtype == "object")
    has_low_card = (df[col].nunique() < 10)
    return (is_categorical and has_low_card)

categorical_cols = [col for col in X_reduced.columns if has_low_cardinality(X_reduced, col)]

# Select numerical columns
numerical_cols = [col for col in X_reduced.columns if X_reduced[col].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = categorical_cols + numerical_cols

X = X_reduced[my_cols].copy()
X_test = test_data[my_cols].copy()

print("Predictors are : ", my_cols)
print("Numerical cols : ", numerical_cols)
print("Categorical cols : ", categorical_cols)

Predictors are :  ['Sex', 'Embarked', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
Numerical cols :  ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
Categorical cols :  ['Sex', 'Embarked']


In [7]:
X.head()

Unnamed: 0_level_0,Sex,Embarked,Pclass,Age,SibSp,Parch,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,male,S,3,22.0,1,0,7.25
2,female,C,1,38.0,1,0,71.2833
3,female,S,3,26.0,0,0,7.925
4,female,S,1,35.0,1,0,53.1
5,male,S,3,35.0,0,0,8.05


In [8]:
# Divide data into training and validation subsets
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                                random_state=0)

### Defining the Preprocessing Pipeline

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
#     ('onehot', OneHotEncoder(handle_unknown='error', drop='if_binary'))
    ('onehot', OneHotEncoder(handle_unknown='error'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

Checking the preprocessing pipeline

In [10]:
X_train_transformed = preprocessor.fit_transform(X_train)
X_valid_transformed = preprocessor.transform(X_valid)

from aux_functions import get_column_names_from_ColumnTransformer
feature_names = get_column_names_from_ColumnTransformer(preprocessor)

print(X_train.head())
print(X_train_transformed[:5])

                Sex Embarked  Pclass   Age  SibSp  Parch     Fare
PassengerId                                                      
141          female        C       3   NaN      0      2  15.2458
440            male        S       2  31.0      0      0  10.5000
818            male        C       2  31.0      1      1  37.0042
379            male        C       3  20.0      0      0   4.0125
492            male        S       3  21.0      0      0   7.2500
[[ 3.      0.      0.      2.     15.2458  1.      0.      1.      0.
   0.    ]
 [ 2.     31.      0.      0.     10.5     0.      1.      0.      0.
   1.    ]
 [ 2.     31.      1.      1.     37.0042  0.      1.      1.      0.
   0.    ]
 [ 3.     20.      0.      0.      4.0125  0.      1.      1.      0.
   0.    ]
 [ 3.     21.      0.      0.      7.25    0.      1.      0.      0.
   1.    ]]


In [11]:
[feature_names[i] for i in range(len(feature_names)) if i in (0, 1, 4)]

['Pclass', 'Age', 'Fare']

In [12]:
%matplotlib qt

In [13]:
X_train_transformed = pd.DataFrame(X_train_transformed, columns=feature_names, index=y_train.index)
A = X_train_transformed.join(y_train)
A.head()

Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
141,3.0,0.0,0.0,2.0,15.2458,1.0,0.0,1.0,0.0,0.0,0
440,2.0,31.0,0.0,0.0,10.5,0.0,1.0,0.0,0.0,1.0,0
818,2.0,31.0,1.0,1.0,37.0042,0.0,1.0,1.0,0.0,0.0,0
379,3.0,20.0,0.0,0.0,4.0125,0.0,1.0,1.0,0.0,0.0,0
492,3.0,21.0,0.0,0.0,7.25,0.0,1.0,0.0,0.0,1.0,0


In [14]:
import seaborn as sns
g = sns.pairplot(A, hue='Survived')
g.map_lower(sns.kdeplot, levels=4, color=".2")
plt.tight_layout()
plt.show()

LinAlgError: singular matrix