In [1]:
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# 1. Load the Titanic passenger table that ships as a seaborn example dataset.
#    NOTE: sns.load_dataset() fetches the data from seaborn's online repository,
#    so the first run of this cell needs internet access.
df = sns.load_dataset("titanic")

# Sanity check: print the first five rows to see the columns and their values.
print(df.head(n=5))

   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  


In [5]:
# 2. Choose the label column and the predictor columns
# Label:
#   survived - the column the model is trained to predict.
# Predictors:
#   pclass   - passenger class (1st, 2nd, 3rd).
#   sex      - passenger sex.
#   age      - passenger age.
#   sibsp    - number of siblings/spouses aboard.
#   parch    - number of parents/children aboard.
#   fare     - ticket fare.
#   embarked - port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton).

target = "survived"
features = [
    "pclass",
    "sex",
    "age",
    "sibsp",
    "parch",
    "fare",
    "embarked",
]

y = df[target]
X = df[features]

In [None]:
# 3. Impute missing values in the selected features
# age      -> filled with the median age.
# embarked -> filled with the most frequent port (first entry returned by mode()).
# NOTE(review): both fill values are computed on the full table, i.e. before the
# train/test split performed later in the notebook, so the test rows contribute
# to them.
X = X.assign(
    age=X["age"].fillna(X["age"].median()),
    embarked=X["embarked"].fillna(X["embarked"].mode()[0]),
)

In [None]:
# 4. One-hot encode the text-valued features
# The model needs numeric inputs, so category labels are expanded into
# indicator (dummy) columns.
# Called without `columns=`, pd.get_dummies() only encodes object/string/category
# columns - here that is sex and embarked. pclass appears to be stored as an
# integer (see the preview output), so it is NOT expanded and stays a single
# numeric feature.
# drop_first=True drops the first level of every encoded column; that level is
# implied by the remaining indicators, which avoids redundant (collinear) columns.
# Resulting columns:
#   sex      -> sex_male (set for male, unset for female).
#   embarked -> embarked_Q, embarked_S (C is the dropped level).
X = pd.get_dummies(data=X, drop_first=True)

In [None]:
# 5. Hold out 20% of the rows for testing; random_state pins the shuffle so the
#    split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Fit a logistic regression classifier on the training rows.
#    max_iter=500 sets the solver's iteration cap.
model = LogisticRegression(max_iter=500).fit(X_train, y_train)

# 7. Predict labels for the held-out rows.
y_pred = model.predict(X_test)

# 8. Report accuracy, the confusion matrix and the per-class
#    classification report for the held-out predictions.
print("✅ Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred))
print("\nClassification Report:\n", classification_report(y_true=y_test, y_pred=y_pred))
