In [4]:
# Decision Tree (entropy criterion, ID3-style) and KNN classification on the Titanic dataset
# https://www.kaggle.com/datasets/yasserh/titanic-dataset/data
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,  roc_curve, auc
from sklearn.neighbors import KNeighborsClassifier
%matplotlib inline

# Read the Titanic CSV from the working directory and preview its first rows
data = pd.read_csv(filepath_or_buffer='titanic.csv')
data.head(n=5)

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Column names, non-null counts and dtypes. DataFrame.info() writes its report
# to stdout itself and returns None, so wrapping it in print() would emit a
# stray "None" line after the report.
data.info()
# Number of null values in each column.
print(data.isnull().sum())

# Feature Selection/Creation

Feature creation/selection is handled first because missing values in the 'Age' column will be filled based on the 'Title' column (created below).

### Why?
Because the passenger's title, which appears in the 'Name' column, lets me estimate a missing age better: filling it with the median age of passengers who share that title is more accurate than filling it with the median of the whole column.

In [None]:
# Work on an independent (deep) copy so the raw `data` frame stays untouched.
df = data.copy(deep=True)

In [None]:
# A title is the first run of letters that follows a space and ends with a
# period inside the passenger's name.
title_pattern = r' ([A-Za-z]+\.)'
passenger_names = df['Name']
df['Title'] = passenger_names.str.extract(title_pattern, expand=False)
df['Title'].unique()

In [None]:
# Collapse equivalent and infrequent titles into a handful of groups.
# No replacement value is itself a key, so one mapping pass gives the same
# result as applying the replacements one after another.
title_groups = {
    'Mr.': ['Sir.'],
    'Mrs.': ['Mme.', 'Lady.', 'Countess.'],
    'Miss.': ['Ms.', 'Mlle.'],
    'Rare': ['Dr.', 'Rev.', 'Major.', 'Col.', 'Capt.', 'Jonkheer.', 'Don.'],
}
title_mapping = {
    variant: group
    for group, variants in title_groups.items()
    for variant in variants
}
df['Title'] = df['Title'].replace(title_mapping)
df['Title'].unique()

In [None]:
# Drop columns that are not used as features (original note: not needed, and mostly null).
unused_columns = ['PassengerId', 'Cabin', 'Name', 'Ticket']
df = df.drop(columns=unused_columns)

In [None]:
# Preview the first rows after the feature selection step.
df.head(n=5)

# Handle Missing Values


In [None]:
# Per-column count of missing values still to be handled.
df.isna().sum()

In [None]:
# Fill missing ages with the median age of passengers sharing the same Title
# (median rather than mean because the mean is sensitive to outliers).
# NOTE(review): these statistics are computed on the full frame, before the
# train/test split further down, so test rows influence the imputed values.
title_median_age = df.groupby('Title')['Age'].transform('median')
df['Age'] = df['Age'].fillna(title_median_age)

# A Title group with no observed age (or a row without a Title) has no group
# median and would stay NaN; fall back to the overall median so no missing
# value reaches the estimators.
df['Age'] = df['Age'].fillna(df['Age'].median())

# Fill missing embarkation ports with the most frequent port.
imputer_embarked = SimpleImputer(strategy='most_frequent')
df[['Embarked']] = imputer_embarked.fit_transform(df[['Embarked']])

df.isnull().sum()

# Encoding


Based on the unique values, I'll choose the encoding method.



In [None]:
# List the distinct values of every object-typed column to decide how each
# one should be encoded.
object_columns = df.select_dtypes(include='object').columns
for name in object_columns:
    distinct_values = df[name].unique()
    print(f"'{name}': {distinct_values}")

Binary mapping for 'Sex' column.

In [None]:
# Binary encoding: female -> 1, male -> 0.
sex_codes = {'female': 1, 'male': 0}
encoded_sex = df['Sex'].map(sex_codes)
df['Sex'] = encoded_sex.astype(int)

One-hot Encoding

In [None]:
# One-hot encode the nominal columns, dropping the first level of each.
nominal_columns = ['Title', 'Embarked']
df = pd.get_dummies(data=df, columns=nominal_columns, drop_first=True)

In [None]:
# Preview the first rows of the encoded frame.
df.head(n=5)

# Splitting



In [None]:
# Separate the target from the features.
target_col = 'Survived'
y = df[target_col]
X = df.drop(columns=[target_col])

# 80/20 split, stratified on the target, with a fixed seed.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")
print(y.unique())

# Scaling


In [None]:
# Standardise the continuous columns. The scaler is fitted on the training
# split only and then applied unchanged to the test split.
sc = StandardScaler()
cols_to_scale = ['Age', 'Fare']

# Take explicit copies before assigning: the frames returned by
# train_test_split are derived from X, and writing columns into them can
# trigger pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Caution: re-running this cell rescales columns that are already scaled.
X_train[cols_to_scale] = sc.fit_transform(X_train[cols_to_scale])
X_test[cols_to_scale] = sc.transform(X_test[cols_to_scale])

In [None]:
# Preview the first rows of the scaled training features.
X_train.head(n=5)

# OVER SAMPLING

In [None]:
# Class counts before balancing (class 1 first, then class 0).
for label in (1, 0):
    print(int((y_train == label).sum()))

# Randomly oversample the training split only, with a fixed seed.
ros = RandomOverSampler(random_state=42)
X_train, y_train = ros.fit_resample(X_train, y_train)

# Class counts after balancing, in the same order.
for label in (1, 0):
    print(int((y_train == label).sum()))

# Training

## DT

In [None]:
# Entropy-criterion decision tree, depth-limited, with a fixed seed.
dt_params = {'criterion': 'entropy', 'random_state': 42, 'max_depth': 5}
DT_model = DecisionTreeClassifier(**dt_params).fit(X_train, y_train)

# Hard class predictions and class probabilities on the test split.
DT_y_pred_proba = DT_model.predict_proba(X_test)
DT_y_pred = DT_model.predict(X_test)

## KNN

In [None]:
# Number of neighbours; also shown in the plot titles further down.
k = 10
knn_model = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)

# Hard class predictions and class probabilities on the test split.
KNN_y_pred_proba = knn_model.predict_proba(X_test)
KNN_y_pred = knn_model.predict(X_test)

### Actual vs Predicted comparison


In [None]:
def _plot_actual_vs_predicted(comparison_df, predicted_title, predicted_colors, out_path):
    """Draw side-by-side bar charts of actual and predicted class counts.

    Parameters
    ----------
    comparison_df : DataFrame with 'Actual' and 'Predicted' columns holding
        the class labels 0 and 1.
    predicted_title : title of the right-hand (predicted) panel.
    predicted_colors : two bar colours for the predicted panel, for class 0
        and class 1 respectively.
    out_path : file name the figure is saved to.
    """
    fig, axes = plt.subplots(1, 2, figsize=(12, 6))
    panels = [
        ('Actual', 'Actual', ['skyblue', 'lightcoral']),
        ('Predicted', predicted_title, predicted_colors),
    ]
    for ax, (column, title, colors) in zip(axes, panels):
        # value_counts() orders by frequency, so the first bar is not
        # necessarily class 0. Reindexing pins bar 0 to class 0 ('No') and
        # bar 1 to class 1 ('Yes'), and keeps a zero-height bar for a class
        # that never occurs, so the tick labels are always correct.
        counts = comparison_df[column].value_counts().reindex([0, 1], fill_value=0)
        counts.plot(kind='bar', color=colors, ax=ax)
        ax.set_title(title)
        ax.set_xlabel('Survived')
        ax.set_ylabel('Count')
        ax.set_xticks([0, 1])
        ax.set_xticklabels(['No', 'Yes'], rotation=15)

    fig.tight_layout()
    fig.savefig(out_path)
    plt.show()


# =================================DT========================================
DT_comparison_df = pd.DataFrame({
    'Actual': y_test.values,
    'Predicted': DT_y_pred,
    'Correct': y_test.values == DT_y_pred
})
_plot_actual_vs_predicted(
    DT_comparison_df,
    'Predicted - DT',
    ['blue', 'skyblue'],
    'dt_actual_vs_predicted.png',
)

# =================================KNN========================================
KNN_comparison_df = pd.DataFrame({
    'Actual': y_test.values,
    'Predicted': KNN_y_pred,
    'Correct': y_test.values == KNN_y_pred
})
_plot_actual_vs_predicted(
    KNN_comparison_df,
    f'Predicted - KNN\n k={k}',
    ['red', 'lightcoral'],
    'knn_actual_vs_predicted.png',
)

## Accuracy

In [None]:
# Shared pieces of the two text reports.
class_labels = ['Did Not Survive', 'Survived']
thin_rule = "-" * 30
thick_rule = "=" * 60

# Decision tree: overall accuracy followed by the per-class report.
DT_acc = accuracy_score(y_test, DT_y_pred)
dt_report = classification_report(y_test, DT_y_pred, target_names=class_labels)
print(f"DT Accuracy: {DT_acc * 100:.2f}%", thin_rule, dt_report, sep="\n")

print(thick_rule)

# KNN: same layout.
KNN_acc = accuracy_score(y_test, KNN_y_pred)
knn_report = classification_report(y_test, KNN_y_pred, target_names=class_labels)
print(f"KNN Accuracy: {KNN_acc * 100:.2f}%", thin_rule, knn_report, sep="\n")

In [None]:
# Receiver Operating Characteristic curves for both models, side by side.

# Curve points and area under the curve, scored on the probability of class 1.
dt_fpr, dt_tpr, dt_thresholds = roc_curve(y_test, DT_y_pred_proba[:, 1])
dt_roc_auc = auc(dt_fpr, dt_tpr)

knn_fpr, knn_tpr, knn_thresholds = roc_curve(y_test, KNN_y_pred_proba[:, 1])
knn_roc_auc = auc(knn_fpr, knn_tpr)

dt_label = f'ROC curve (area = {dt_roc_auc:.2f})'
knn_label = f'ROC curve (area = {knn_roc_auc:.2f})'

# One panel per model: (fpr, tpr, line colour, legend label, panel title).
roc_panels = [
    (dt_fpr, dt_tpr, 'darkblue', dt_label, 'ROC - DT'),
    (knn_fpr, knn_tpr, 'darkred', knn_label, f'ROC - KNN\n k={k}'),
]

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
for ax, (fpr, tpr, line_color, legend_label, panel_title) in zip(axes, roc_panels):
    ax.plot(fpr, tpr, color=line_color, lw=2, label=legend_label)
    # Diagonal reference line from (0, 0) to (1, 1).
    ax.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(panel_title)
    ax.legend(loc="lower right")

fig.tight_layout()
fig.savefig('roc_curves.png')
plt.show()

### Confusion Matrix


In [None]:
# Confusion matrices for both models on the test split.
cm_dt = confusion_matrix(y_test, DT_y_pred)
cm_knn = confusion_matrix(y_test, KNN_y_pred)

outcome_labels = ['Did Not Survive', 'Survived']

# One heatmap per model: (matrix, colour map, panel title).
heatmap_panels = [
    (cm_dt, 'Blues', 'Confusion Matrix - Decision Tree'),
    (cm_knn, 'Reds', f'Confusion Matrix - KNN\n k={k}'),
]

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
for ax, (matrix, color_map, panel_title) in zip(axes, heatmap_panels):
    sns.heatmap(matrix, annot=True, fmt='d', cmap=color_map,
                xticklabels=outcome_labels,
                yticklabels=outcome_labels,
                ax=ax)
    ax.set_title(panel_title)
    ax.set_ylabel('Actual Label')
    ax.set_xlabel('Predicted Label')

fig.tight_layout()
fig.savefig('confusion_matrices.png')
plt.show()