In [1]:
# Import libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load processed data
df = pd.read_csv("/content/drive/MyDrive/Data Science and ML course/Titanic_Dataset/train.csv")
df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,FamilySize,IsAlone,Sex_male,Embarked_Q,Embarked_S,Title_Miss,Title_Mr,Title_Mrs,Title_Rare
0,0,3,22.0,1,0,7.25,2,0,True,False,True,False,True,False,False
1,1,1,38.0,1,0,71.2833,2,0,False,False,False,False,False,True,False
2,1,3,26.0,0,0,7.925,1,1,False,False,True,True,False,False,False
3,1,1,35.0,1,0,53.1,2,0,False,False,True,False,False,True,False
4,0,3,35.0,0,0,8.05,1,1,True,False,True,False,True,False,False


In [4]:
# Split data into features and target
X = df.drop("Survived", axis=1)
y = df["Survived"]
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
# Define and train models
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Support Vector Machine": SVC(probability=True)
}

results = {}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc
    print(f"✅ {name} Accuracy: {acc:.4f}")
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print("-" * 50)

✅ Logistic Regression Accuracy: 0.8156
[[89 16]
 [17 57]]
              precision    recall  f1-score   support

           0       0.84      0.85      0.84       105
           1       0.78      0.77      0.78        74

    accuracy                           0.82       179
   macro avg       0.81      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179

--------------------------------------------------
✅ Random Forest Accuracy: 0.8380
[[90 15]
 [14 60]]
              precision    recall  f1-score   support

           0       0.87      0.86      0.86       105
           1       0.80      0.81      0.81        74

    accuracy                           0.84       179
   macro avg       0.83      0.83      0.83       179
weighted avg       0.84      0.84      0.84       179

--------------------------------------------------
✅ Decision Tree Accuracy: 0.7654
[[84 21]
 [21 53]]
              precision    recall  f1-score   support

           0       0.80    

In [8]:
# Cross-validation for model comparison
cv_results = {}
for name, model in models.items():
    score = cross_val_score(model, X, y, cv=5, scoring="accuracy").mean()
    cv_results[name] = score

print("📊 Cross-Validation Scores:")
for name, score in cv_results.items():
    print(f"{name}: {score:.4f}")


📊 Cross-Validation Scores:
Logistic Regression: 0.8114
Random Forest: 0.8014
Decision Tree: 0.7710
Support Vector Machine: 0.6713


In [15]:
import os

# Create the models directory if it doesn't exist
os.makedirs("/content/drive/MyDrive/Data Science and ML course/Titanic_Dataset/models", exist_ok=True)


In [16]:
# Save best model
# Import the necessary library
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Reload the full processed data
df = pd.read_csv("/content/drive/MyDrive/Data Science and ML course/Titanic_Dataset/train.csv")

# Separate features and target
X = df.drop("Survived", axis=1)
y = df["Survived"]

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train the best model (Logistic Regression)
best_model = LogisticRegression()
best_model.fit(X_scaled, y)

# Save the trained model and scaler
joblib.dump(best_model, "/content/drive/MyDrive/Data Science and ML course/Titanic_Dataset/models/best_model_logistic_regression.pkl")
joblib.dump(scaler, "/content/drive/MyDrive/Data Science and ML course/Titanic_Dataset/models/scaler.pkl")

print("✅ Best model (Logistic Regression) and scaler saved successfully!")



✅ Best model (Logistic Regression) and scaler saved successfully!
