In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree

In [None]:
# Load the passenger table. Only this one CSV is read; the hold-out set used
# for evaluation is carved out of it later with train_test_split.
DATA_PATH = "titanic.csv"
df = pd.read_csv(DATA_PATH)

In [None]:

# Quick look at the raw data: the first rows, then dtypes and non-null counts.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()

# Data Preprocessing

In [None]:
# Impute missing values by assigning the filled Series back to its column.
# Calling fillna(..., inplace=True) on df['col'] operates on an intermediate
# object: it triggers a chained-assignment warning on pandas 2.x and leaves
# `df` unchanged once Copy-on-Write is active.
df['Age'] = df['Age'].fillna(df['Age'].median())  # numeric: use the median
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])  # categorical: use the most frequent value

In [None]:
# Cabin is discarded rather than imputed because too many of its values are missing.
df = df.drop(columns='Cabin')

In [None]:
# Turn the string-valued categorical columns into integer codes.
# The same encoder object is refit for each column, so after the loop it only
# remembers the classes of the last column processed.
label = LabelEncoder()
for column in ('Sex', 'Embarked'):
    df[column] = label.fit_transform(df[column])

# Feature Engineering

In [None]:
# Family size = siblings/spouses + parents/children + the passenger themselves.
family_size = df['SibSp'].add(df['Parch']).add(1)
df['FamilySize'] = family_size
# IsAlone is 1 when the passenger travels with no relatives, else 0.
df['IsAlone'] = family_size.eq(1).astype(int)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Pull the honorific out of each name: the run of letters that follows a space
# and is terminated by a period.
title_pattern = r' ([A-Za-z]+)\.'
titles = df['Name'].str.extract(title_pattern, expand=False)

# One lookup table handles both clean-up steps: uncommon titles collapse into
# 'Rare', and variant spellings are folded into their common form.
rare_titles = ['Lady','Countess','Capt','Col','Don','Dr',
               'Major','Rev','Sir','Jonkheer','Dona']
title_map = {rare: 'Rare' for rare in rare_titles}
title_map.update({'Mlle':'Miss','Ms':'Miss','Mme':'Mrs'})
titles = titles.replace(title_map)

# Store the titles as integer codes.
le = LabelEncoder()
df['Title'] = le.fit_transform(titles)


In [None]:
# These columns are not used as model inputs (Title has already been derived from Name).
unused_columns = ['Name', 'Ticket', 'PassengerId']
df = df.drop(columns=unused_columns)

In [None]:
# Show the first five rows of the fully preprocessed frame.
print(df.head(n=5))

### Train-Test Split

In [None]:
# Separate the label from the features, then hold out 20% of the rows for
# validation. The fixed seed keeps the split identical across runs.
TARGET = 'Survived'
y = df[TARGET]
X = df.drop(columns=[TARGET])

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply that
# same transform to both splits so no validation information leaks in.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
# Despite its name, X_test_scaled holds the scaled *validation* split (X_val).
X_test_scaled = scaler.transform(X_val)



In [None]:
### Logistic Regression

In [None]:
# Logistic regression is sensitive to feature scale (regularisation and solver
# convergence), so it is trained and evaluated on the standardized matrices
# produced by the scaling cell rather than on the raw features.
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_scaled, y_train)
# X_test_scaled is the scaled validation split, aligned row-for-row with y_val.
y_pred_lr = log_reg.predict(X_test_scaled)

In [None]:

# Validation accuracy plus per-class precision/recall/F1 for logistic regression.
lr_accuracy = accuracy_score(y_val, y_pred_lr)
lr_report = classification_report(y_val, y_pred_lr)
print("Logistic Regression Accuracy:", lr_accuracy)
print(lr_report)


In [None]:
### Decision Tree

In [None]:
# A shallow tree (depth capped at 4) with a fixed seed; fit() returns the
# estimator itself, so construction and training are chained.
dt = DecisionTreeClassifier(max_depth=4, random_state=42).fit(X_train, y_train)
y_pred_dt = dt.predict(X_val)


In [None]:

# Validation accuracy plus per-class precision/recall/F1 for the decision tree.
dt_accuracy = accuracy_score(y_val, y_pred_dt)
dt_report = classification_report(y_val, y_pred_dt)
print("Decision Tree Accuracy:", dt_accuracy)
print(dt_report)


In [None]:
#### Plot Decision Tree

In [None]:
# Draw the fitted decision tree. Nodes are colour-filled by majority class.
plt.figure(figsize=(16,10))
# feature_names is given as a plain list: scikit-learn releases that validate
# this parameter as a list reject a pandas Index. class_names follow the
# ascending label order (0 -> Died, 1 -> Survived).
plot_tree(dt, feature_names=list(X.columns), class_names=['Died','Survived'], filled=True)
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest of 100 trees, seeded for reproducibility, trained on the raw
# (unscaled) training features and evaluated on the validation split.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_val)

In [None]:
# Validation accuracy plus per-class precision/recall/F1 for the random forest.
rf_accuracy = accuracy_score(y_val, y_pred_rf)
rf_report = classification_report(y_val, y_pred_rf)
print("Accuracy:", rf_accuracy)
print(rf_report)

In [None]:
### Linear Regression

In [None]:
from sklearn.linear_model import  LinearRegression

In [None]:
# Linear regression used as a classifier: fit on the 0/1 target and turn the
# continuous output into a class label.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
# The regression output is unbounded, so plain rounding can yield labels such
# as -1 or 2. Clip the rounded value into {0, 1} and cast to int so the metrics
# and confusion matrix only ever see the two real classes.
y_pred_lin = np.clip(np.round(lin_reg.predict(X_val)), 0, 1).astype(int)


In [None]:
# Validation accuracy plus per-class precision/recall/F1 for the rounded
# linear-regression predictions.
lin_accuracy = accuracy_score(y_val, y_pred_lin)
lin_report = classification_report(y_val, y_pred_lin)
print("Accuracy:", lin_accuracy)
print(lin_report)

In [None]:
from xgboost import XGBClassifier

In [None]:
# Gradient-boosted trees with log-loss as the evaluation metric and a fixed seed.
xgb_params = dict(eval_metric='logloss', random_state=42)
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_val)


In [None]:
# Validation accuracy plus per-class precision/recall/F1 for XGBoost.
xgb_accuracy = accuracy_score(y_val, y_pred_xgb)
xgb_report = classification_report(y_val, y_pred_xgb)
print("Accuracy:", xgb_accuracy)
print(xgb_report)

In [None]:
### Confusion Matrices

In [None]:
# One confusion-matrix heatmap per model on the validation split.
# Each entry: (panel title, validation predictions, colour map).
panels = [
    ("Logistic Regression", y_pred_lr, "Blues"),
    ("Decision Tree", y_pred_dt, "Greens"),
    ("Random Forest", y_pred_rf, "Oranges"),
    ("XGBoost", y_pred_xgb, "Purples"),
    ("Linear Regression", y_pred_lin, "Reds"),
]

fig, axes = plt.subplots(3, 2, figsize=(14,12))
axes = axes.flatten()  # 1-D array so panels can be paired with axes in order

for ax, (title, y_pred, cmap) in zip(axes, panels):
    sns.heatmap(confusion_matrix(y_val, y_pred), annot=True, fmt="d", cmap=cmap, ax=ax)
    ax.set_title(title)
    # confusion_matrix puts true labels on rows and predictions on columns.
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")

# Remove whatever grid cells were not needed (the 3x2 grid has one spare).
for unused_ax in axes[len(panels):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()



In [None]:
### Cross Validation

In [None]:
# Mean 5-fold cross-validated score of each classifier over the full feature
# matrix; cross_val_score is called with its default scoring.
cv_models = {
    "Logistic Regression": log_reg,
    "Decision Tree": dt,
    "Random Forest": rf,
    "XGBoost": xgb,
}
cv_results = {
    model_name: cross_val_score(estimator, X, y, cv=5).mean()
    for model_name, estimator in cv_models.items()
}

print("Cross Validation Scores")
for model_name, mean_score in cv_results.items():
    print(f"{model_name}: {mean_score:.4f}")


In [None]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Metrics to collect, referenced by scikit-learn's built-in scorer names.
# The built-in 'roc_auc' scorer ranks samples by the model's continuous output
# (decision_function / predict_proba). Wrapping roc_auc_score in a plain
# make_scorer would instead score the hard 0/1 predictions and misreport AUC.
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1',
    'roc_auc': 'roc_auc',
}

# Fresh, unfitted estimators for the comparison. The stochastic models are
# seeded so the resulting table is reproducible from run to run.
models = {
    "Logistic Regression": LogisticRegression(max_iter=500),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42)
}

# results maps model name -> {metric label -> mean score over the 5 folds}.
results = {}

for name, model in models.items():
    # Cross-validate on the training split only; the validation split is untouched.
    scores = cross_validate(model, X_train, y_train, cv=5, scoring=scoring)
    results[name] = {
        'Accuracy': scores['test_accuracy'].mean(),
        'Precision': scores['test_precision'].mean(),
        'Recall': scores['test_recall'].mean(),
        'F1-score': scores['test_f1'].mean(),
        'ROC-AUC': scores['test_roc_auc'].mean()
    }
results

In [None]:
import pandas as pd

# Build the comparison table in one chain: models become rows, metrics become
# columns, and values are rounded to three decimals for readability.
results_df = pd.DataFrame(results).transpose().round(3)
print(results_df)


In [None]:
# Grouped bar chart: one group per model, one bar per metric.
ax = results_df.plot.bar(figsize=(12,6))
ax.set_title("Model Comparison")
ax.set_ylabel("Score")
plt.show()


In [None]:
# Pair each importance score from the fitted XGBoost model with its feature
# name, then chart the ten largest as horizontal bars.
feature_importances = pd.Series(xgb.feature_importances_, index=X_train.columns)
top_features = feature_importances.nlargest(10)
ax = top_features.plot.barh()
ax.set_title("Top 10 Features (XGBoost)")
plt.show()
