In [1]:
import pandas as pd

def clean_titanic(filepath):
    """Load the Titanic CSV, encode categorical columns and drop incomplete rows.

    A short report (initial shape, per-column NaN counts, final shape) is
    printed along the way.

    Args:
        filepath: Path (or any source accepted by ``pd.read_csv``) of the CSV.

    Returns:
        A DataFrame holding the columns Survived, Pclass, Sex, Age, SibSp,
        Parch, Fare and Embarked, with Sex mapped to 0/1 (male/female),
        Embarked mapped to 0/1/2 (S/C/Q) and every row that contains a
        missing value removed.
    """
    banner = "=" * 60
    kept_columns = [
        "Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"
    ]
    sex_codes = {"male": 0, "female": 1}
    port_codes = {"S": 0, "C": 1, "Q": 2}

    raw = pd.read_csv(filepath)

    print(banner)
    print("TITANIC DATA CLEANING REPORT")
    print(banner)

    # Keep only the columns the model uses.
    selected = raw[kept_columns]
    print("Initial shape:", selected.shape)

    # Values missing from a code map become NaN and are dropped below.
    encoded = selected.assign(
        Sex=selected["Sex"].map(sex_codes),
        Embarked=selected["Embarked"].map(port_codes),
    )

    print("\nNaN counts:")
    print(encoded.isnull().sum())

    # Discard every row that still has a missing value.
    cleaned = encoded.dropna()

    print("\nFinal shape after dropna:", cleaned.shape)
    print(banner)

    return cleaned


# Run the cleaning pipeline on the Titanic CSV and preview the first rows.
# `df` is the cleaned frame that the following cells read from.
df = clean_titanic("Titanic-Dataset.csv")
print(df.head())


TITANIC DATA CLEANING REPORT
Initial shape: (891, 8)

NaN counts:
Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

Final shape after dropna: (712, 8)
   Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked
0         0       3    0  22.0      1      0   7.2500       0.0
1         1       1    1  38.0      1      0  71.2833       1.0
2         1       3    1  26.0      0      0   7.9250       0.0
3         1       1    1  35.0      1      0  53.1000       0.0
4         0       3    0  35.0      0      0   8.0500       0.0


In [2]:
# ===============================
# HELPER FUNCTIONS
# ===============================

def find_mean(dataset):
    """Return the arithmetic mean of ``dataset``.

    Args:
        dataset: A sized iterable of numbers.

    Returns:
        The sum of the values divided by how many there are.

    Raises:
        ZeroDivisionError: If ``dataset`` is empty.
    """
    total = sum(dataset)
    count = len(dataset)
    return total / count

def find_standard_deviation(dataset, mean):
    """Return the population standard deviation of ``dataset``.

    Args:
        dataset: A sized iterable of numbers.
        mean: The value to measure deviations from (normally the mean of
            ``dataset``).

    Returns:
        The square root of the average squared deviation from ``mean``
        (divides by ``len(dataset)``, not ``len(dataset) - 1``).
    """
    squared_deviations = [(value - mean) ** 2 for value in dataset]
    variance = sum(squared_deviations) / len(dataset)
    return variance ** 0.5


# ===============================
# COMPUTE FEATURE MEANS & STDS
# ===============================

# Model input columns; the order here fixes the order of every per-feature list.
FEATURE_COLUMNS = [
    "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"
]

feature_means = []
feature_stds = []

for col in FEATURE_COLUMNS:
    values = df[col].tolist()
    mean = find_mean(values)
    # A constant column has zero spread; fall back to 1.0 so that dividing
    # by the standard deviation during normalisation stays safe.
    std = find_standard_deviation(values, mean) or 1.0

    feature_means.append(mean)
    feature_stds.append(std)

print(len(feature_means), len(feature_stds))  # MUST be 7 7


# ===============================
# NORMALIZE
# ===============================

# Standardise each feature column as (x - mean) / std on a copy, leaving `df` untouched.
df_normalized = df.copy()

for col, col_mean, col_std in zip(FEATURE_COLUMNS, feature_means, feature_stds):
    df_normalized[col] = [(x - col_mean) / col_std for x in df[col]]


# ===============================
# CREATE X AND y
# ===============================

# Plain nested lists: one row of normalised features per sample, plus the raw labels.
X = df_normalized[FEATURE_COLUMNS].to_numpy().tolist()
y = df["Survived"].tolist()

n_features = len(FEATURE_COLUMNS)
n_samples = len(X)

print("Samples:", n_samples, "Features:", n_features)


7 7
Samples: 712 Features: 7


In [3]:
import math
import random

# Gradient-descent hyperparameters.
learning_rate = 0.1   # step size applied to every gradient
lambda_ = 0.001       # strength of the L2 penalty added to each weight gradient

# Fix the seed so the initial weights, and therefore every number printed by
# the training cells, are identical after Restart Kernel -> Run All.
SEED = 42
random.seed(SEED)

# Small random starting weights, one per feature; the bias starts at zero.
weight = [random.uniform(-0.01, 0.01) for _ in range(n_features)]
b = 0.0


def sigmoid(z):
    """Return the logistic function 1 / (1 + e^-z) of a scalar ``z``.

    The two branches are algebraically equal; each one only ever calls
    ``math.exp`` with a non-positive argument, so it cannot overflow.
    """
    if z < 0:
        exp_z = math.exp(z)
        return exp_z / (1 + exp_z)
    return 1 / (1 + math.exp(-z))


def find_weighted_sum_for_row(row):
    """Return the linear score of ``row``: the weight/feature dot product plus the bias.

    Reads the module-level ``weight``, ``b`` and ``n_features``; only the
    first ``n_features`` entries of ``row`` are used.
    """
    products = [weight[k] * row[k] for k in range(n_features)]
    return sum(products) + b


def train_one_epoch():
    """Run one full-batch gradient-descent step on the global model.

    Predictions for all samples are computed once, from the parameters as
    they stand on entry, and every weight and the bias are then updated from
    those same predictions. Weight gradients include the L2 term
    ``lambda_ * weight``; the bias is not regularised.

    Updates the module-level ``weight`` list in place and rebinds ``b``.
    """
    global b

    probabilities = [
        sigmoid(find_weighted_sum_for_row(X[k])) for k in range(n_samples)
    ]
    # Prediction error per sample, shared by every gradient below.
    residuals = [probabilities[k] - y[k] for k in range(n_samples)]

    for col in range(n_features):
        data_term = sum(residuals[k] * X[k][col] for k in range(n_samples)) / n_samples
        penalty_term = lambda_ * weight[col]
        weight[col] -= learning_rate * (data_term + penalty_term)

    bias_gradient = sum(residuals) / n_samples
    b -= learning_rate * bias_gradient


def train_model(epochs):
    """Call ``train_one_epoch`` ``epochs`` times, printing progress every 1000th epoch.

    Args:
        epochs: Number of training passes to run.
    """
    log_interval = 1000
    for epoch_index in range(epochs):
        train_one_epoch()
        if not epoch_index % log_interval:
            print("Epoch", epoch_index)


# Train for 1000 epochs. Progress is logged only when epoch % 1000 == 0,
# so "Epoch 0" is the single progress line for this run.
train_model(1000)

# Show the learned parameters.
print("Weights:", weight)
print("Bias:", b)

# Sanity check: predicted probabilities for the first five training rows.
for i in range(5):
    print(sigmoid(find_weighted_sum_for_row(X[i])))


Epoch 0
Weights: [-1.0123529119715262, 1.2449341311988682, -0.6205869514855857, -0.33620898504098995, -0.05013161422590291, 0.11097357038848987, 0.05142423530932826]
Bias: -0.5159778752663623
0.09253860964423902
0.9066390078481562
0.6215825938254498
0.9059375065916172
0.07749369296713754


In [5]:
def find_sigmoid_for_sample(sample):
    """Print the linear score of ``sample`` and return its sigmoid.

    Args:
        sample: Feature values, indexable by position and at least as long
            as the module-level ``weight`` list.

    Returns:
        ``sigmoid`` applied to the weight/feature dot product plus ``b``.
    """
    terms = (w * sample[k] for k, w in enumerate(weight))
    weighted_sum = sum(terms) + b
    print("Weighted sum:", weighted_sum)
    return sigmoid(weighted_sum)

def normalize_sample(sample):
    """Standardise a raw feature vector with the stored training statistics.

    Args:
        sample: Raw feature values, in the same order as the module-level
            ``feature_means`` / ``feature_stds`` lists.

    Returns:
        A new list where entry ``k`` is
        ``(sample[k] - feature_means[k]) / feature_stds[k]``.
    """
    return [
        (value - feature_means[position]) / feature_stds[position]
        for position, value in enumerate(sample)
    ]


# Hand-built passenger in raw (un-normalised) units. Values follow the
# FEATURE_COLUMNS order and use the same Sex / Embarked codes as clean_titanic.
sample = [
    3,      # Pclass
    0,      # Sex (male)
    25,     # Age
    0,      # SibSp
    0,      # Parch
    7.25,   # Fare
    0       # Embarked (S)
]


# Scale with the training means/stds before scoring; the model was fit on
# normalised features.
normalized_sample = normalize_sample(sample)

print("Normalized sample:", normalized_sample)
print("Survival probability:", find_sigmoid_for_sample(normalized_sample))

# Second hand-built passenger, again in raw units.
sample2 = [
    1,      # Pclass
    1,      # Sex (female)
    30,     # Age
    1,      # SibSp
    1,      # Parch
    100.0,  # Fare
    1       # Embarked (C)
]

normalized_sample2 = normalize_sample(sample2)

print("\nNormalized sample 2:", normalized_sample2)
print("Survival probability:", find_sigmoid_for_sample(normalized_sample2))

Normalized sample: [0.9085997380242598, -0.7561375069124817, -0.32052560120428997, -0.5527137238424564, -0.5067873731938901, -0.5163799235734484, -0.5012257320191038]
Weighted sum: -2.05007513496581
Survival probability: 0.11404478952972678

Normalized sample 2: [-1.4829825668676924, 1.3225107746384335, 0.024712658941866902, 0.5225107881133605, 0.664747073929648, 1.2368798463153154, 1.4174448120540244]
Weighted sum: 2.617580615882589
Survival probability: 0.931984503335202


In [None]:
def evaluate_model(y_true, y_pred, y_proba=None, set_name="Test Set",
                   negative_label="Not Survived", positive_label="Survived"):
    """
    Evaluate binary predictions and print a formatted report.

    Labels are compared position by position, so ``y_true``, ``y_pred`` and
    ``y_proba`` may be lists, tuples, numpy arrays or pandas Series (a Series
    index is ignored). Only the label values 0 and 1 contribute to the
    confusion matrix; accuracy counts any position where the two labels are
    equal.

    Args:
        y_true: True labels (0 = negative class, 1 = positive class).
        y_pred: Predicted labels, same length as ``y_true``.
        y_proba: Predicted probabilities (optional). When given and
            non-empty, min / max / mean are added to the report.
        set_name: Name of the dataset being evaluated (used in the heading).
        negative_label: Display name of class 0 in the report.
        positive_label: Display name of class 1 in the report.

    Returns:
        dict with keys ``"accuracy"``, ``"precision"``, ``"recall"``,
        ``"f1"`` and ``"confusion_matrix"`` (itself a dict with keys
        ``"TP"``, ``"TN"``, ``"FP"``, ``"FN"``). A metric whose denominator
        is zero (including accuracy on empty input) is reported as 0.0.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` differ in length.
    """
    # Plain lists give uniform positional access for every supported input
    # type, instead of requiring the pandas-only ``.iloc`` accessor.
    actual = list(y_true)
    predicted = list(y_pred)
    if len(actual) != len(predicted):
        raise ValueError(
            f"y_true and y_pred must have the same length "
            f"(got {len(actual)} and {len(predicted)})"
        )

    # One pass collects everything the metrics need.
    tp = tn = fp = fn = correct = 0
    for truth, guess in zip(actual, predicted):
        if truth == guess:
            correct += 1
        if truth == 1 and guess == 1:
            tp += 1
        elif truth == 0 and guess == 0:
            tn += 1
        elif truth == 0 and guess == 1:
            fp += 1
        elif truth == 1 and guess == 0:
            fn += 1
    cm = {"TP": tp, "TN": tn, "FP": fp, "FN": fn}

    total = len(actual)
    acc = correct / total if total > 0 else 0.0
    prec = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    rec = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * (prec * rec) / (prec + rec) if (prec + rec) > 0 else 0.0

    # Display results
    print("\n" + "=" * 60)
    print(f"MODEL EVALUATION - {set_name.upper()}")
    print("=" * 60)

    print("\nMetrics:")
    print(f"  Accuracy:  {acc:.4f} ({acc*100:.2f}%)")
    print(f"  Precision: {prec:.4f}")
    print(f"  Recall:    {rec:.4f}")
    print(f"  F1-Score:  {f1:.4f}")

    # Column widths follow the label names so the table stays aligned for
    # whatever class names the caller passes.
    row_width = max(len(negative_label), len(positive_label))
    col_width = max(len(negative_label), len(str(tn)), len(str(fn)))
    header_indent = " " * (len("  Actual ") + row_width + len(" | "))

    print("\nConfusion Matrix:")
    print(f"{header_indent}Predicted")
    print(f"{header_indent}{negative_label:<{col_width}} | {positive_label}")
    print(f"  Actual {negative_label:<{row_width}} | {tn:<{col_width}} | {fp}")
    print(f"         {positive_label:<{row_width}} | {fn:<{col_width}} | {tp}")

    print("\nDetailed Breakdown:")
    print(f"  True Positives (TP):  {tp} - Correctly predicted {positive_label}")
    print(f"  True Negatives (TN):  {tn} - Correctly predicted {negative_label}")
    print(f"  False Positives (FP): {fp} - Incorrectly predicted {positive_label}")
    print(f"  False Negatives (FN): {fn} - Incorrectly predicted {negative_label}")

    # ``if y_proba:`` would raise for numpy arrays / pandas Series, whose
    # truth value is ambiguous, so test for None and emptiness explicitly.
    if y_proba is not None:
        probabilities = list(y_proba)
        if probabilities:
            print("\nProbability Statistics:")
            print(f"  Min probability: {min(probabilities):.4f}")
            print(f"  Max probability: {max(probabilities):.4f}")
            print(f"  Mean probability: {sum(probabilities)/len(probabilities):.4f}")

    print("=" * 60)

    return {
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1,
        "confusion_matrix": cm
    }