
# Titanic Dataset: Baseline Machine Learning Model

This notebook builds a **baseline ML pipeline** on the Titanic dataset.
Goal: establish a simple, interpretable benchmark before advanced modeling.


In [None]:

# Core libraries
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# ML
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Global plotting theme for every figure in this notebook.
# `sns.set` is only a legacy alias of `sns.set_theme`; call the preferred API directly.
sns.set_theme(style="whitegrid")


## 1. Load Dataset

In [None]:

# Location of the raw Titanic CSV inside the Kaggle input directory
DATA_PATH = "/kaggle/input/titanic-dataset/Titanic-Dataset.csv"

# Read the raw passenger table and preview the first rows
df = pd.read_csv(DATA_PATH)
df.head()


## 2. Basic Cleaning

In [None]:

# Drop high-missing / non-informative columns.
# PassengerId is an arbitrary row identifier; leaving it in would feed it to the
# model as a numeric feature further down, so it is removed here as well.
# Reassigning (instead of inplace=True) with errors='ignore' keeps this cell
# re-runnable: a second run no longer raises KeyError on already-dropped columns.
df = df.drop(columns=['PassengerId', 'Cabin', 'Ticket', 'Name'], errors='ignore')

# Fill missing Age with the median Age of the passenger's class (Pclass)
df['Age'] = df['Age'].fillna(
    df.groupby('Pclass')['Age'].transform('median')
)

# Fill Embarked with its most frequent value.
# Assign the result back: calling fillna(..., inplace=True) on the selected
# column is a chained in-place operation, which emits a FutureWarning on
# pandas 2.x and does not modify `df` at all under Copy-on-Write.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# NOTE: both imputations use statistics computed on the full dataset, i.e.
# before the train/test split below. Acceptable for a quick baseline, but
# fit the imputers on the training split only for a leakage-free estimate.

# Remaining missing values per column (should all be zero for the filled columns)
df.isnull().sum()


## 3. Feature Engineering

In [None]:

# Number of people travelling together: siblings/spouses + parents/children + the passenger
family_size = df['SibSp'].add(df['Parch']).add(1)
df['FamilySize'] = family_size

# Flag passengers travelling on their own (1 = alone, 0 = with family)
df['IsAlone'] = family_size.eq(1).astype(int)

df.head()


## 4. Encoding Categorical Variables

In [None]:

# One-hot encode the categorical columns; drop_first=True omits the first
# level of each column so the dummy columns are not perfectly collinear.
categorical_cols = ['Sex', 'Embarked']
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

df_encoded.head()


## 5. Train / Test Split

In [None]:

# Separate the label column from the feature matrix
target = 'Survived'
y = df_encoded[target]
X = df_encoded.drop(columns=target)

# Hold out 20% of the rows for testing; stratify on the label so both splits
# keep the same survival ratio, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

X_train.shape, X_test.shape


## 6. Feature Scaling

In [None]:

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test information leaks into the scaler.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


## 7. Baseline Model: Logistic Regression

In [None]:

# Fit the baseline classifier on the scaled training features.
# max_iter is raised to 1000 to give the solver more iterations to converge.
model = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

# Predicted labels for the held-out split
y_pred = model.predict(X_test_scaled)


## 8. Model Evaluation

In [None]:

# Compute the metrics first, then report them
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nClassification Report:\n")
print(report_text)


## 9. Confusion Matrix

In [None]:

# Rows are the actual labels, columns the predicted labels
cm = confusion_matrix(y_test, y_pred)

# Draw the matrix as an annotated heatmap on an explicit Axes
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()


## 10. Feature Importance (Coefficients)

In [None]:

# Pair each fitted coefficient with its feature name and order them from most
# negative to most positive. The model was fit on standardized features, so
# the coefficients are on a common per-standard-deviation scale.
coefficients = model.coef_[0]
importance = pd.Series(coefficients, index=X.columns).sort_values()

# Horizontal bar chart of the signed coefficients
ax = importance.plot.barh(figsize=(8, 6))
ax.set_title("Logistic Regression Feature Importance")
plt.show()



## Conclusion

This baseline Logistic Regression model provides:
- A clear performance benchmark
- Interpretable feature importance
- A strong foundation for advanced models (Random Forest, XGBoost)

Typical baseline accuracy ranges between **78% – 82%**.
