In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def load_passenger_data(csv_path=Path("./data/train.csv")):
  """Read a passenger CSV file into a DataFrame.

  Args:
    csv_path: Location of the CSV file (str or Path). Defaults to the
      training file at ./data/train.csv, so calling the function with no
      arguments behaves exactly as before.

  Returns:
    The DataFrame produced by pd.read_csv for that file.
  """
  return pd.read_csv(Path(csv_path))

# Load the labelled data once; the next cell splits it into train and test parts.
passenger_data = load_passenger_data()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
# NOTE(review): despite the "strat_" prefix no `stratify=` argument is passed,
# so this is a plain random split, not a stratified one.
strat_train_set, strat_test_set = train_test_split(
    passenger_data,
    test_size=0.2,
    random_state=42,
)

# Explore on a copy so the training split itself is never modified.
passenger = strat_train_set.copy()

In [None]:
# Column dtypes and non-null counts of the training copy (shows which columns have gaps).
passenger.info()

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 331 to 102
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  712 non-null    int64  
 1   Survived     712 non-null    int64  
 2   Pclass       712 non-null    int64  
 3   Name         712 non-null    object 
 4   Sex          712 non-null    object 
 5   Age          572 non-null    float64
 6   SibSp        712 non-null    int64  
 7   Parch        712 non-null    int64  
 8   Ticket       712 non-null    object 
 9   Fare         712 non-null    float64
 10  Cabin        159 non-null    object 
 11  Embarked     710 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 72.3+ KB


In [None]:
# Summary statistics (count, mean, std, quartiles) for the training copy.
passenger.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,712.0,712.0,712.0,572.0,712.0,712.0,712.0
mean,448.234551,0.376404,2.330056,29.498846,0.553371,0.379213,32.586276
std,256.731423,0.484824,0.824584,14.500059,1.176404,0.791669,51.969529
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,224.75,0.0,2.0,21.0,0.0,0.0,7.925
50%,453.5,0.0,3.0,28.0,0.0,0.0,14.4542
75%,673.5,1.0,3.0,38.0,1.0,0.0,30.5
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [None]:
# Correlation of every remaining column with the target.
# The free-text columns are removed before one-hot encoding the categoricals.
passenger_filtered = passenger.drop(columns=["Name", "Cabin", "Ticket"])
passenger_sparse = pd.get_dummies(passenger_filtered)
corr_matrix = passenger_sparse.corr()
corr_matrix["Survived"]

PassengerId    0.019979
Survived       1.000000
Pclass        -0.321750
Age           -0.059695
SibSp         -0.047602
Parch          0.078311
Fare           0.246641
Sex_female     0.541750
Sex_male      -0.541750
Embarked_C     0.159632
Embarked_Q    -0.006097
Embarked_S    -0.142371
Name: Survived, dtype: float64

In [None]:
# Separate predictors from the target for the training split ...
passenger = strat_train_set.drop(columns="Survived")
passenger_labels = strat_train_set["Survived"].copy()

# ... and for the held-out split.
test_passenger = strat_test_set.drop(columns="Survived")
test_passenger_labels = strat_test_set["Survived"].copy()

passenger.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
331,332,1,"Partner, Mr. Austen",male,45.5,0,0,113043,28.5,C124,S
733,734,2,"Berriman, Mr. William John",male,23.0,0,0,28425,13.0,,S
382,383,3,"Tikkanen, Mr. Juho",male,32.0,0,0,STON/O 2. 3101293,7.925,,S
704,705,3,"Hansen, Mr. Henrik Juul",male,26.0,1,0,350025,7.8542,,S
813,814,3,"Andersson, Miss. Ebba Iris Alfrida",female,6.0,4,2,347082,31.275,,S


In [None]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer

In [None]:
# Column groups by type.
# NOTE(review): neither list is referenced by the cells visible here; the
# ColumnTransformer below spells its column lists out inline (with the numeric
# columns in a different order than `num_attribs`).
num_attribs = ["Pclass", "Age", "SibSp", "Parch", "Fare"]
cat_attribs = ["Sex", "Embarked"]

def sum_columns(X):
  """Add the first two columns of X row by row.

  Indexing with one-element lists keeps the column axis, so the result is
  2-D with a single column rather than a flat vector.
  """
  first_col = X[:, [0]]
  second_col = X[:, [1]]
  return first_col + second_col

def sum_name(function_transformer, feature_names_in):
  """Name the single output column produced by `sum_columns`.

  Used as the `feature_names_out` callable of a FunctionTransformer, which
  must return an array-like of strings (one per output column). A bare
  string is turned into an unsized 0-d array, and
  ColumnTransformer.get_feature_names_out() then fails with
  "len() of unsized object", so a one-element list is returned instead.

  Args:
    function_transformer: The FunctionTransformer asking for names (unused).
    feature_names_in: Names of the input columns (unused).

  Returns:
    A list holding the one output feature name.
  """
  return ["sum"]

def sum_pipeline():
  """Build the pipeline: median imputation, column sum, then standardization.

  Returns:
    A new, unfitted pipeline each time it is called.
  """
  impute = SimpleImputer(strategy="median")
  add_columns = FunctionTransformer(sum_columns, feature_names_out=sum_name)
  scale = StandardScaler()
  return make_pipeline(impute, add_columns, scale)

def is_child(X):
  """Replace the first column of X with a 0/1 flag for values below 13.

  Works on a copy, so the caller's array is left untouched; any other
  columns are passed through unchanged.
  """
  flagged = X.copy()
  below_thirteen = flagged[:, 0] < 13
  flagged[:, 0] = below_thirteen.astype(int)
  return flagged

def is_alone(X):
  """Return a single-column 0/1 flag: 1 where the first two columns sum to 0.

  The result keeps the column axis (shape (n, 1)) and the dtype of the sum.
  """
  source = X.copy()
  relatives = source[:, [0]] + source[:, [1]]
  no_relatives = relatives[:, 0] == 0
  relatives[:, 0] = no_relatives.astype(int)
  return relatives

def bool_name(function_transformer, feature_names_in):
  """Name the single output column of a boolean-flag transformer.

  Used as the `feature_names_out` callable of a FunctionTransformer, which
  must return an array-like of strings. A bare string would become an
  unsized 0-d array and break get_feature_names_out() on any
  ColumnTransformer containing it, so a one-element list is returned.

  Args:
    function_transformer: The FunctionTransformer asking for names (unused).
    feature_names_in: Names of the input columns (unused).

  Returns:
    A list holding the one output feature name.
  """
  return ["bool"]

def bool_pipeline(bool_func):
  """Build the pipeline: median imputation, `bool_func`, then standardization.

  Args:
    bool_func: Callable applied to the imputed array by a FunctionTransformer.

  Returns:
    A new, unfitted pipeline.
  """
  impute = SimpleImputer(strategy="median")
  to_flag = FunctionTransformer(bool_func, feature_names_out=bool_name)
  scale = StandardScaler()
  return make_pipeline(impute, to_flag, scale)

# Numeric columns: fill gaps with the column median, then standardize.
num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
cat_pipeline = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("1hot", OneHotEncoder(handle_unknown="ignore")),
])

# Full preprocessing: summed SibSp + Parch feature, encoded categoricals,
# and the scaled numeric columns (SibSp and Parch also appear individually).
preprocessing = ColumnTransformer(transformers=[
    ("family", sum_pipeline(), ["SibSp", "Parch"]),
    ("cat", cat_pipeline, ["Sex", "Embarked"]),
    ("num", num_pipeline, ["Pclass", "Age", "Fare", "SibSp", "Parch"]),
])
preprocessing

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Baseline: k-nearest neighbours with default settings on the preprocessed features.
# NOTE(review): the same `preprocessing` object is handed to every model
# pipeline in this notebook — presumably each fit re-fits that shared
# transformer; confirm this is intended rather than cloning it per model.
k_nearest_class = make_pipeline(preprocessing, KNeighborsClassifier())
k_nearest_class.fit(passenger, passenger_labels)

In [None]:
from sklearn.metrics import accuracy_score, precision_score


# Score the k-NN pipeline on the held-out split.
test_passenger_predictions = k_nearest_class.predict(test_passenger)

# accuracy_score expects (y_true, y_pred); the value is the same either way
# round, but the documented order is used here.
k_nearest_acc = accuracy_score(test_passenger_labels, test_passenger_predictions)

k_nearest_acc

0.8100558659217877

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree on the preprocessed features.
# The seed makes the fitted tree (and therefore its score) reproducible,
# matching the random_state=42 used for the other estimators here.
tree_class = make_pipeline(preprocessing, DecisionTreeClassifier(random_state=42))
tree_class.fit(passenger, passenger_labels)

In [None]:
# Score the decision-tree pipeline on the held-out split.
test_passenger_predictions = tree_class.predict(test_passenger)

# accuracy_score expects (y_true, y_pred).
tree_acc = accuracy_score(test_passenger_labels, test_passenger_predictions)

tree_acc

0.7821229050279329

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default settings and a fixed seed.
forest_class = make_pipeline(preprocessing, RandomForestClassifier(random_state=42))
forest_class.fit(passenger, passenger_labels)

test_passenger_predictions = forest_class.predict(test_passenger)

# accuracy_score expects (y_true, y_pred).
forest_acc = accuracy_score(test_passenger_labels, test_passenger_predictions)

forest_acc

0.8156424581005587

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees ensemble with 20 trees and a fixed seed.
x_trees_class = make_pipeline(
    preprocessing,
    ExtraTreesClassifier(n_estimators=20, random_state=42),
)
x_trees_class.fit(passenger, passenger_labels)

test_passenger_predictions = x_trees_class.predict(test_passenger)

# accuracy_score expects (y_true, y_pred).
x_trees_acc = accuracy_score(test_passenger_labels, test_passenger_predictions)

x_trees_acc

0.7988826815642458

In [None]:
# Named steps so the search below can address the classifier's parameters
# as "extra_trees__<param>".
full_pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessing),
        ("extra_trees", ExtraTreesClassifier(random_state=42)),
    ]
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Sample the number of trees uniformly from the integers 1..9999.
param_distribs = {"extra_trees__n_estimators": randint(1, 10000)}

# 10 sampled candidates, each scored by 3-fold cross-validated accuracy.
rnd_search = RandomizedSearchCV(
    estimator=full_pipeline,
    param_distributions=param_distribs,
    n_iter=10,
    cv=3,
    scoring="accuracy",
    random_state=42,
)

rnd_search.fit(passenger, passenger_labels)

In [None]:
# Tabulate the search results, best mean cross-validated accuracy first.
cv_res = pd.DataFrame(rnd_search.cv_results_).sort_values(
    by="mean_test_score", ascending=False
)

# Keep only the sampled parameter and the per-fold / mean scores. The explicit
# copy makes the column assignments below operate on an independent frame.
cv_res = cv_res[["param_extra_trees__n_estimators", "split0_test_score",
                 "split1_test_score", "split2_test_score", "mean_test_score"]].copy()

# The search was run with scoring='accuracy', so the mean column is labelled
# as an accuracy (it was previously mislabelled as an RMSE).
score_cols = ["split0", "split1", "split2", "mean_test_accuracy"]
cv_res.columns = ["n_estimators"] + score_cols
cv_res[score_cols] = cv_res[score_cols].round(5)

cv_res.head(20)

Unnamed: 0,n_estimators,split0,split1,split2,mean_test_rmse
0,7271,0.7437,0.77215,0.81013,0.77533
1,861,0.7437,0.77215,0.81013,0.77533
5,6266,0.7437,0.77215,0.81013,0.77533
9,8323,0.7437,0.77215,0.81013,0.77533
2,5391,0.7437,0.76793,0.81013,0.77392
3,5192,0.7437,0.76793,0.81013,0.77392
4,5735,0.7437,0.76793,0.81013,0.77392
7,4427,0.7437,0.76793,0.81013,0.77392
8,5579,0.7437,0.76793,0.81013,0.77392
6,467,0.7437,0.76793,0.80591,0.77251


In [None]:
# Best pipeline found by the search (preprocessing + classifier).
final_model = rnd_search.best_estimator_

# Importances come from the fitted classifier step, one value per
# preprocessed feature column.
feature_importances = final_model.named_steps["extra_trees"].feature_importances_
feature_importances.round(2)

array([0.04, 0.04, 0.15, 0.15, 0.01, 0.01, 0.01, 0.1 , 0.2 , 0.23, 0.04,
       0.02])

In [None]:
# Pair each importance with the name of its preprocessed column, largest first.
sorted(
    zip(
        feature_importances,
        final_model["preprocessing"].get_feature_names_out(),
    ),
    reverse=True,
)

TypeError: len() of unsized object

In [None]:
# Evaluate the tuned pipeline on the held-out split.
X_test = strat_test_set.drop(columns="Survived")
y_test = strat_test_set["Survived"].copy()

final_predictions = final_model.predict(X_test)

final_accuracy = accuracy_score(y_true=y_test, y_pred=final_predictions)
print(final_accuracy)

0.8212290502793296


In [None]:
from math import nan
# Rows to predict for the submission file, read from ./data/test.csv.
X_to_be_predicted = pd.read_csv(Path("./data/test.csv"))

In [None]:
# Predict with the tuned pipeline and assemble the two-column submission table.
result = final_model.predict(X_to_be_predicted)
submission = pd.DataFrame(
    {
        "PassengerId": X_to_be_predicted["PassengerId"],
        "Survived": result,
    }
)
print(submission)

# Ensure the label column is stored as integers before writing.
submission["Survived"] = submission["Survived"].astype(int)
print(submission.shape)

filename = "Titanic Predictions.csv"
submission.to_csv(filename, index=False)
print("Saved file: " + filename)

     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         1
4            896         0
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         1

[418 rows x 2 columns]
(418, 2)
Saved file: Titanic Predictions.csv
