In [32]:
import pandas as pd
import numpy as np
# Read the training and test feature tables (presumably the Kaggle Titanic
# splits -- confirm against the data directory).
train_data = pd.read_csv('data/titanic/train.csv')
test_data = pd.read_csv('data/titanic/test.csv')

# The labels used to score test-set predictions live in a separate file; only
# its 'Survived' column is kept.
# NOTE(review): rows are assumed to be in the same order as test.csv -- verify.
test_augmented = pd.read_csv('data/test_augmented.csv')
test_result = test_augmented['Survived']

# Column names, dtypes and non-null counts of the training table.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [33]:
# Print the test table's columns, dtypes and non-null counts so it can be
# compared with the training summary (e.g. to see which feature columns
# contain missing values).
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.1+ KB


In [34]:
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import recall_score, precision_score

# Feature groups. `nums` are imputed with the column mean and standardized;
# `strs` are imputed with the most frequent value and one-hot encoded. Note
# that the integer columns Pclass, SibSp and Parch are deliberately placed in
# the categorical group.
nums = ['Age', 'Fare']
strs = ['Pclass','Embarked', 'Sex', 'SibSp', 'Parch']
feature_columns = nums + strs

# Training features / target.
X = train_data[feature_columns]
Y = train_data['Survived']

# Held-out features and the labels they are scored against.
X_test = test_data[feature_columns]
Y_test = test_result

# Per-group preprocessing pipelines.
num_pipeline = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
)
# handle_unknown='ignore': a category value not seen during fit is encoded
# as an all-zero row instead of raising at predict time.
str_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore'),
)

# Route each column group through its pipeline. Columns not listed in either
# group never reach the model because X / X_test only contain feature_columns.
titanic_preprocessing = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, nums),
        ('str', str_pipeline, strs),
    ]
)

# RandomForest
# Chain the shared preprocessing with a random forest classifier; the fixed
# random_state keeps the fitted model (and the scores below) reproducible.
train_model = make_pipeline(
    titanic_preprocessing,
    RandomForestClassifier(random_state=42)
)
train_model.fit(X, Y)

# Score the predictions for the held-out passengers against Y_test.
# mse and r2 are regression metrics: on 0/1 labels mse is just the
# misclassification rate (1 - accuracy), so accuracy / recall / precision are
# the more informative numbers for this classifier.
Y_pred = train_model.predict(X_test)
print(f"mse: {mean_squared_error(Y_test, Y_pred):.4f}")
print(f"r2: {r2_score(Y_test, Y_pred)}")
print(f"accuracy: {accuracy_score(Y_test, Y_pred)}")
print(f"recall: {recall_score(Y_test, Y_pred)}")
print(f"precision: {precision_score(Y_test, Y_pred)}")

mse: 0.2560
r2: -0.08875365141187919
accuracy: 0.7440191387559809
recall: 0.6455696202531646
precision0.6666666666666666


In [35]:
# SVM
from sklearn.svm import SVC

# Same preprocessing as the other models, followed by an RBF-kernel support
# vector classifier.
train_model2 = make_pipeline(
    titanic_preprocessing,
    SVC(kernel="rbf", random_state=42)
)
train_model2.fit(X, Y)

# Score the predictions for the held-out passengers against Y_test.
# mse and r2 are regression metrics: on 0/1 labels mse is just the
# misclassification rate (1 - accuracy).
Y_pred = train_model2.predict(X_test)
print(f"mse: {mean_squared_error(Y_test, Y_pred):.4f}")
print(f"r2: {r2_score(Y_test, Y_pred)}")
print(f"accuracy: {accuracy_score(Y_test, Y_pred)}")
print(f"recall: {recall_score(Y_test, Y_pred)}")
print(f"precision: {precision_score(Y_test, Y_pred)}")

mse: 0.2177
r2: 0.07405063291139247
accuracy: 0.7822966507177034
recall: 0.6582278481012658
precision0.7375886524822695


In [36]:
# LogisticRegression
from sklearn.linear_model import LogisticRegression

# Same preprocessing as the other models, followed by a logistic regression
# with scikit-learn's default settings.
train_model3 = make_pipeline(
    titanic_preprocessing,
    LogisticRegression()
)
train_model3.fit(X, Y)

# Score the predictions for the held-out passengers against Y_test.
# mse and r2 are regression metrics: on 0/1 labels mse is just the
# misclassification rate (1 - accuracy).
Y_pred = train_model3.predict(X_test)
print(f"mse: {mean_squared_error(Y_test, Y_pred):.4f}")
print(f"r2: {r2_score(Y_test, Y_pred)}")
print(f"accuracy: {accuracy_score(Y_test, Y_pred)}")
print(f"recall: {recall_score(Y_test, Y_pred)}")
print(f"precision: {precision_score(Y_test, Y_pred)}")

mse: 0.2297
r2: 0.023174294060370082
accuracy: 0.7703349282296651
recall: 0.7088607594936709
precision0.691358024691358


In [37]:
# VotingClassifier
from sklearn.ensemble import VotingClassifier

# Ensemble of a random forest and a logistic regression behind the shared
# preprocessing. voting='soft' combines the members' predicted class
# probabilities rather than their hard class votes.
voting_clf = make_pipeline(
    titanic_preprocessing,
    VotingClassifier(
        estimators=[
            ('rf', RandomForestClassifier(random_state=42)),
            ('lr', LogisticRegression())
        ],
        voting='soft'
    )
)
voting_clf.fit(X, Y)

# Score the predictions for the held-out passengers against Y_test.
# mse and r2 are regression metrics: on 0/1 labels mse is just the
# misclassification rate (1 - accuracy).
Y_pred = voting_clf.predict(X_test)
print(f"mse: {mean_squared_error(Y_test, Y_pred):.4f}")
print(f"r2: {r2_score(Y_test, Y_pred)}")
print(f"accuracy: {accuracy_score(Y_test, Y_pred)}")
print(f"recall: {recall_score(Y_test, Y_pred)}")
print(f"precision: {precision_score(Y_test, Y_pred)}")

mse: 0.2225
r2: 0.05370009737098347
accuracy: 0.777511961722488
recall: 0.6518987341772152
precision0.7304964539007093
