In [7]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the dataset
titanic_data = pd.read_csv("titanic.csv")

# Select relevant features and handle missing values
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
y = titanic_data['Survived']
X = titanic_data[features]
X['Sex'] = X['Sex'].map({'male': 0, 'female': 1})  # Encode categorical feature
X = X.fillna(X.mean())  # Fill missing values with mean

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features for SVM and logistic regression
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)  # Use same scaling for test set


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Sex'] = X['Sex'].map({'male': 0, 'female': 1})  # Encode categorical feature


In [8]:
log_reg = LogisticRegression()

# Train the model using scaled features
log_reg.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred_logreg = log_reg.predict(X_test_scaled)

# Evaluate model performance (e.g., using accuracy, precision, recall, F1-score)
from sklearn.metrics import accuracy_score
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_logreg))

Logistic Regression Accuracy: 0.7988826815642458


In [9]:
svm = SVC(kernel='linear')  # Consider using 'rbf' or other kernels for non-linear data

# Train the model using scaled features
svm.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred_svm = svm.predict(X_test_scaled)

# Evaluate model performance
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))

SVM Accuracy: 0.7821229050279329


In [10]:
dt = DecisionTreeClassifier(max_depth=3)

# Train the model using unscaled features
dt.fit(X_train, y_train)  # Decision trees don't require feature scaling

# Make predictions on the test set
y_pred_dt = dt.predict(X_test)

# Evaluate model performance
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))


Decision Tree Accuracy: 0.7988826815642458


In [11]:
# Compare the accuracy scores of the three models and choose the best one
best_accuracy = max(accuracy_score(y_test, y_pred_logreg), accuracy_score(y_test, y_pred_svm), accuracy_score(y_test, y_pred_dt))
best_model = "Logistic Regression" if best_accuracy == accuracy_score(y_test, y_pred_logreg) else "SVM" if best_accuracy == accuracy_score(y_test, y_pred_svm) else "Decision Tree"
print("Best model:", best_model)

Best model: Logistic Regression
