In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

## Pemanggilan data dan pengecekan data

In [2]:
data = pd.read_csv("train.csv")
display(data)
data = data.loc[:, ["Survived","Pclass", "Sex", "Age", "SibSp", "Parch", "Embarked", "Fare", "Cabin"]]
print(data.isna().sum())

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Embarked      2
Fare          0
Cabin       687
dtype: int64


## Pembersihan data

In [3]:
# data = data.dropna()
data["Age"].fillna(data["Age"].median(), inplace=True)
data["Embarked"].fillna(data["Embarked"].mode(), inplace=True)
data["Cabin"].fillna("Unknown", inplace=True)
data["Cabin"] = data["Cabin"].apply(lambda x: x[0] if x != 'Unknown' else x)
print(data.isna().sum())

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Embarked    2
Fare        0
Cabin       0
dtype: int64


## Melakukan one hot encoding

In [4]:
X = data.loc[:,["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked", "Cabin"]]
y = data.loc[:,["Survived"]]
X = pd.get_dummies(X)
display(X)
display(y)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_Unknown
0,3,22.0,1,0,7.2500,0,1,0,0,1,0,0,0,0,0,0,0,0,1
1,1,38.0,1,0,71.2833,1,0,1,0,0,0,0,1,0,0,0,0,0,0
2,3,26.0,0,0,7.9250,1,0,0,0,1,0,0,0,0,0,0,0,0,1
3,1,35.0,1,0,53.1000,1,0,0,0,1,0,0,1,0,0,0,0,0,0
4,3,35.0,0,0,8.0500,0,1,0,0,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,2,27.0,0,0,13.0000,0,1,0,0,1,0,0,0,0,0,0,0,0,1
887,1,19.0,0,0,30.0000,1,0,0,0,1,0,1,0,0,0,0,0,0,0
888,3,28.0,1,2,23.4500,1,0,0,0,1,0,0,0,0,0,0,0,0,1
889,1,26.0,0,0,30.0000,0,1,1,0,0,0,0,1,0,0,0,0,0,0


Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0
...,...
886,0
887,1
888,0
889,1


## Scalling data

In [5]:
scaler = StandardScaler()
X = scaler.fit_transform(X)
y = np.ravel(y)

## Mempersiapkan 5 folds validation

In [6]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

## Memanggil Model

In [7]:
log_reg = LogisticRegression(random_state=42, max_iter=10000)
naive_bayes = GaussianNB()
knn = KNeighborsClassifier(n_neighbors=5)
linear_svm = LinearSVC(random_state=42, max_iter=10000)
svm_rbf = SVC(kernel='rbf', random_state=42)
mlp_1 = MLPClassifier(hidden_layer_sizes=(10, ), max_iter=1000, random_state=42)
mlp_2 = MLPClassifier(hidden_layer_sizes=(10, 10), max_iter=3000, random_state=42)
decision_tree = DecisionTreeClassifier(random_state=42)
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
gradient_boosting = GradientBoostingClassifier(random_state=42)

models = [
    ('Logistic Regression', log_reg),
    ('Naive Bayes', naive_bayes),
    ('KNN', knn),
    ('Linear SVM', linear_svm),
    ('SVM (RBF Kernel)', svm_rbf),
    ('MLP (1 hidden layer)', mlp_1),
    ('MLP (2 hidden layers)', mlp_2),
    ('Decision Tree', decision_tree),
    ('Random Forest', random_forest),
    ('Gradient Boosting', gradient_boosting)
]

## Melakukan Pelatihan dan menampilkan hasil

In [8]:
result = {
    'Model': [],
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1': []
}

warnings.simplefilter(action='ignore', category=FutureWarning)

for model in models:
    scoring = ['accuracy', 'precision', 'recall', 'f1']
    cv_results = cross_validate(model[1], X, y, cv=skf, scoring=scoring)
    
    accuracy = np.mean(cv_results['test_accuracy'])
    precision = np.mean(cv_results['test_precision'])
    recall = np.mean(cv_results['test_recall'])
    f1 = np.mean(cv_results['test_f1'])
    
    result["Model"].append(model[0])
    result["Accuracy"].append(accuracy)
    result["Precision"].append(precision)
    result["Recall"].append(recall)
    result["F1"].append(f1)
    
warnings.resetwarnings()

result = pd.DataFrame(result)
display(result)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,Logistic Regression,0.797991,0.745291,0.722038,0.732431
1,Naive Bayes,0.479298,0.454484,0.905925,0.575368
2,KNN,0.800213,0.759511,0.704604,0.73069
3,Linear SVM,0.794608,0.743595,0.710315,0.72576
4,SVM (RBF Kernel),0.804702,0.77055,0.701577,0.733585
5,MLP (1 hidden layer),0.811418,0.808409,0.669608,0.731747
6,MLP (2 hidden layers),0.829352,0.816115,0.719352,0.762972
7,Decision Tree,0.790114,0.727545,0.73393,0.72978
8,Random Forest,0.813653,0.773061,0.73393,0.752066
9,Gradient Boosting,0.837236,0.843658,0.707545,0.769274


In [9]:
best_accuracy = (result.loc[result["Accuracy"].idxmax(), "Model"],result.loc[result["Accuracy"].idxmax(), "Accuracy"])
best_precision = (result.loc[result["Precision"].idxmax(), "Model"],result.loc[result["Precision"].idxmax(), "Precision"])
best_recall = (result.loc[result["Recall"].idxmax(), "Model"],result.loc[result["Recall"].idxmax(), "Recall"])
best_f1 = (result.loc[result["F1"].idxmax(), "Model"],result.loc[result["F1"].idxmax(), "F1"])

print(f"Model with the best accuracy score: {best_accuracy[0]}, with {best_accuracy[1]:.2f}")
print(f"Model with the best precision score: {best_precision[0]}, with {best_precision[1]:.2f}")
print(f"Model with the best recall score: {best_recall[0]}, with {best_recall[1]:.2f}")
print(f"Model with the best f1 score: {best_f1[0]}, with {best_f1[1]:.2f}","\n")

worst_accuracy = (result.loc[result["Accuracy"].idxmin(), "Model"],result.loc[result["Accuracy"].idxmin(), "Accuracy"])
worst_precision = (result.loc[result["Precision"].idxmin(), "Model"],result.loc[result["Precision"].idxmin(), "Precision"])
best_recall = (result.loc[result["Recall"].idxmax(), "Model"],result.loc[result["Recall"].idxmax(), "Recall"])
worst_recall = (result.loc[result["Recall"].idxmin(), "Model"],result.loc[result["Recall"].idxmin(), "Recall"])
worst_f1 = (result.loc[result["F1"].idxmin(), "Model"],result.loc[result["F1"].idxmin(), "F1"])

print(f"Model with the worst accuracy score: {worst_accuracy[0]}, with {worst_accuracy[1]:.2f}")
print(f"Model with the worst precision score: {worst_precision[0]}, with {worst_precision[1]:.2f}")
print(f"Model with the worst recall score: {worst_recall[0]}, with {worst_recall[1]:.2f}")
print(f"Model with the worst f1 score: {worst_f1[0]}, with {worst_f1[1]:.2f}")

Model with the best accuracy score: Gradient Boosting, with 0.84
Model with the best precision score: Gradient Boosting, with 0.84
Model with the best recall score: Naive Bayes, with 0.91
Model with the best f1 score: Gradient Boosting, with 0.77 

Model with the worst accuracy score: Naive Bayes, with 0.48
Model with the worst precision score: Naive Bayes, with 0.45
Model with the worst recall score: MLP (1 hidden layer), with 0.67
Model with the worst f1 score: Naive Bayes, with 0.58


## Kesimpulan

Dari nilai metrik evaluasi yang didapat terlihat bahwa semua Model selain Naive Bayes memiliki tingkat performa yang lumayan mirip satu dengan yang lain. <br>
<br>
Model Gradient descent memiliki tingkat performa yang lebih tinggi dibandingkan yang lain dengan accuracy sebesar 83.7%, precision sebesar 84.3%, recall 70.7%, dan f1 score 76.9%. <br>
<br>
Sebaliknya model Naive Bayes memiliki tingkat performa yang lebih rendah dibandingkan model lain, dengan accuracy sebesar 47.9%, precision sebesar 45.4%, recall 90.5%, dan f1 score 57.5%.  <br>
<br>
Sehingga dapat disimpulkan bahwa model Gradient Descent menjadi model terbaik untuk memprediksi keselamatan penumpang, dan model Naive Bayes menjadi model yang terburuk untuk memprediksi keselamatan penumpang. 