In [None]:
# Import required modules
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
%matplotlib inline

In [None]:
# Read the training and test CSV files from the local data directory
train_data, test_data = map(pd.read_csv, ('./data/train.csv', './data/test.csv'))

# Preview the first five training rows as the cell output
train_data.head(5)

In [None]:
# Describe numerical and categorical data
# A notebook cell only renders its final bare expression, so two consecutive
# describe() calls would silently drop the first (numeric) summary. A single
# include='all' summary keeps both visible: numeric columns fill the
# mean/std/quantile rows, object columns fill unique/top/freq, and the
# statistics that do not apply to a column are shown as NaN.
train_data.describe(include='all')

In [None]:
# Plot survival rate per gender
# sum / count over the non-null values of a group is exactly the group mean,
# so one groupby().mean() replaces the two separate aggregations.
# pct_gender stays a one-column frame indexed by "Sex".
pct_gender = train_data.groupby("Sex")[["Survived"]].mean()

# Explicit figure/axes so the labels are attached to this plot only
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=pct_gender.reset_index(), x='Sex', y="Survived", ax=ax)
# The plotted value is a rate, not the raw "Survived" column; the trailing
# semicolon suppresses the textual repr of the returned artists.
ax.set(title="Survival rate by sex", xlabel="Sex", ylabel="Survival rate");

In [None]:
# Select features and find numerical and categorical columns
X_cols = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Embarked"]
y_col = ["Survived"]

# Numeric columns are detected from the training frame's dtypes; every other
# selected feature is treated as categorical.
X_cols_num = train_data[X_cols].select_dtypes(include=np.number).columns.tolist()
X_cols_cat = [col for col in X_cols if col not in X_cols_num]

X_train = train_data[X_cols]
# Selecting with the list y_col yields a single-column DataFrame; squeeze it
# to a 1-D Series because scikit-learn classifiers expect y with shape
# (n_samples,) and warn (DataConversionWarning) on a column vector.
y_train = train_data[y_col].squeeze(axis="columns")

X_test = test_data[X_cols]

In [None]:
# Create transformer that replaces the raw age with an ordinal age-group column
class AgeTransformer(BaseEstimator, TransformerMixin):
    """Bin the ``Age`` column into equal-width, ordinally labelled groups.

    The bin edges are learned from the frame passed to ``fit`` and re-applied
    to whatever frame is passed to ``transform``. The transformer therefore
    depends only on its own input (no notebook globals), and training and
    test data are binned with the same cut points.

    Parameters
    ----------
    n_bins : int, default=5
        Number of equal-width age groups. Groups are labelled ``1..n_bins``.

    Attributes
    ----------
    bin_edges_ : numpy.ndarray
        The ``n_bins + 1`` cut points learned in ``fit``. The two outer edges
        are opened to -inf / +inf so ages outside the fitted range fall into
        the first or last group instead of becoming missing.
    """

    def __init__(self, n_bins=5):
        # Stored under the parameter's own name so BaseEstimator's
        # get_params / clone machinery can see it.
        self.n_bins = n_bins

    def fit(self, X, y=None):
        """Learn equal-width age bin edges from ``X["Age"]``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing an ``Age`` column.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        # retbins=True returns the computed edges alongside the binned values
        _, edges = pd.cut(X["Age"], bins=self.n_bins, retbins=True)
        # Interior edges are kept as learned; only the outer ones are opened
        # so unseen ages beyond the fitted min/max still receive a group.
        edges[0], edges[-1] = -np.inf, np.inf
        self.bin_edges_ = edges
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose ``Age`` column holds the age group.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing an ``Age`` column.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` with ``Age`` replaced by a categorical group label
            in ``1..n_bins``; missing ages stay missing.
        """
        X_ = X.copy()
        group_labels = list(range(1, self.n_bins + 1))
        # Bin the frame that was passed in, using the edges learned in fit
        X_["Age"] = pd.cut(X_["Age"], bins=self.bin_edges_, labels=group_labels)
        return X_


In [None]:
# Assemble per-dtype preprocessing, pick the classifier, and chain them together

# Classifier that consumes the preprocessed feature matrix
model = RandomForestClassifier(n_estimators=100, max_features=8, random_state=0)

# Numeric columns: age grouping, then median imputation, then standardisation
num_pipeline = Pipeline(steps=[
    ("transformer", AgeTransformer()),
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: most-frequent imputation, then one-hot encoding
# (categories unseen during fit are ignored at transform time)
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its own sub-pipeline
preprocessor = ColumnTransformer(transformers=[
    ("num", num_pipeline, X_cols_num),
    ("cat", cat_pipeline, X_cols_cat),
])

# End-to-end estimator: preprocessing followed by the model
final_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model),
])

In [None]:
# Train the complete pipeline (preprocessing + model) on the training features and labels
final_pipeline.fit(X=X_train, y=y_train)

In [None]:
# Predict survival for the test passengers with the fitted pipeline
predictions = final_pipeline.predict(X_test)

# Pair each test PassengerId with its predicted label
final_predictions = test_data[["PassengerId"]].assign(Survived=predictions)

# Preview the first five rows as the cell output
final_predictions.head(5)

In [None]:
# Save output
output_path = Path("./output/titanic_predictions.csv")
# to_csv does not create missing directories, so make sure the target folder
# exists; exist_ok keeps re-runs of this cell from failing.
output_path.parent.mkdir(parents=True, exist_ok=True)
final_predictions.to_csv(output_path, index=False)