# Titanic ML Project
This notebook includes all steps: loading data, preprocessing, training 8 models, saving predictions, and visualizing accuracy.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# Load and preprocess data
#
# Downloads the Titanic CSV, keeps the target plus seven feature columns,
# fills missing Age/Embarked values, integer-encodes the two categorical
# columns, then produces a stratified 80/20 split and standardized copies of
# the feature matrices.
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)
df = df[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]

# Assign the filled columns back instead of calling fillna(inplace=True) on
# df[col]: the in-place form operates on an intermediate Series, which
# pandas 2.x warns about and which leaves df unchanged under Copy-on-Write.
# NOTE(review): the median/mode are computed on the full dataset, i.e. before
# the train/test split below - confirm this is acceptable for the evaluation.
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# The same encoder object is refitted for each column, so after these two
# lines `le` holds the Embarked classes only.
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])
df['Embarked'] = le.fit_transform(df['Embarked'])

X = df.drop("Survived", axis=1)
y = df["Survived"]
# Split the data into training and testing sets (80% train, 20% test)
# Stratify ensures class balance (same ratio of survived/not survived)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Fit the scaler on the training split only and reuse its statistics for the
# test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Output directory for the per-model prediction files.
os.makedirs("Data/Results", exist_ok=True)

In [None]:
# Train models and save predictions
#
# Every model is fitted on the training split and scored on the held-out test
# split; its test-set predictions are written to
# Data/Results/predictions_<tag>_model.csv and its accuracy is stored in
# `accuracies` under the model's display name.
accuracies = {}

# Instantiate the classifiers up front. The decision tree and random forest
# get the same fixed seed already used for the split and the MLP, so the
# reported accuracies and the saved prediction files are reproducible.
lr = LogisticRegression(max_iter=1000)
svm = SVC()
knn = KNeighborsClassifier()
tree = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)
nb = GaussianNB()
ann = MLPClassifier(hidden_layer_sizes=(50,), max_iter=1000, random_state=42)

# (display name, file tag, estimator, training features, test features)
# Logistic regression, SVM, KNN and the MLP use the standardized features;
# the decision tree, random forest and naive Bayes use the raw features.
classifier_runs = [
    ("Logistic Regression", "LogisticRegression", lr, X_train_scaled, X_test_scaled),
    ("SVM", "SVM", svm, X_train_scaled, X_test_scaled),
    ("KNN", "KNN", knn, X_train_scaled, X_test_scaled),
    ("Decision Tree", "DecisionTree", tree, X_train, X_test),
    ("Random Forest", "RandomForest", rf, X_train, X_test),
    ("Naive Bayes", "NaiveBayes", nb, X_train, X_test),
    ("ANN", "ANN", ann, X_train_scaled, X_test_scaled),
]

for label, file_tag, estimator, train_features, test_features in classifier_runs:
    estimator.fit(train_features, y_train)
    preds = estimator.predict(test_features)
    accuracies[label] = accuracy_score(y_test, preds)
    pd.DataFrame(preds, columns=["Predicted"]).to_csv(
        f"Data/Results/predictions_{file_tag}_model.csv", index=False
    )

# Linear Regression
# A regressor outputs continuous values, so its predictions are thresholded
# at 0.5 to obtain 0/1 labels before scoring and saving.
linreg = LinearRegression()
linreg.fit(X_train_scaled, y_train)
preds = linreg.predict(X_test_scaled)
binary_preds = (preds > 0.5).astype(int)
accuracies["Linear Regression"] = accuracy_score(y_test, binary_preds)
pd.DataFrame(binary_preds, columns=["Predicted"]).to_csv("Data/Results/predictions_LinearRegression_model.csv", index=False)

In [None]:
# Plotting model accuracy
#
# Draws one bar per model (highest accuracy first), labels each bar with its
# accuracy, places a table of the same values under the chart, and saves the
# figure to model_accuracy_chart.png before showing it.
df_acc = pd.DataFrame(list(accuracies.items()), columns=["Model", "Accuracy"]).sort_values(by="Accuracy", ascending=False)

plt.figure(figsize=(12, 8))
sns.set(style="whitegrid")
barplot = sns.barplot(x="Model", y="Accuracy", data=df_acc, palette="Set2")
plt.title("Model Accuracy Comparison", fontsize=16)
plt.xlabel("Model")
plt.ylabel("Accuracy")

# Annotate accuracy values just above the top of each bar
for i, acc in enumerate(df_acc["Accuracy"]):
    barplot.text(i, acc + 0.002, f"{acc:.4f}", ha='center', color='black')

# Add table below chart (bbox is in axes coordinates, so negative y places it
# underneath the plotting area)
plt.table(cellText=df_acc.values,
          colLabels=df_acc.columns,
          cellLoc='center',
          loc='bottom',
          bbox=[0.0, -0.4, 1, 0.3])

plt.xticks(rotation=15)
# tight_layout() recomputes the subplot margins, so it has to run before the
# manual adjustment; calling it afterwards overwrote the bottom=0.35 margin
# that reserves room for the table.
plt.tight_layout()
plt.subplots_adjust(bottom=0.35)
plt.savefig("model_accuracy_chart.png", dpi=300)
plt.show()