In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the Titanic dataset from a CSV file into a DataFrame.
# `file_path` is a relative path, so it is resolved against the current working directory.
file_path = 'titanic.csv'  # Adjust this if the file lives elsewhere
titanic_data = pd.read_csv(file_path)

In [40]:
# Step 1: Preprocessing
# Drop the columns that are not used as model features
titanic_data_cleaned = titanic_data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

# Handle missing values.
# fillna() is applied to the whole frame and the result is re-assigned, instead of
# calling `df[col].fillna(..., inplace=True)`: that chained form acts on a temporary
# column object, raises a FutureWarning on pandas 2.x and leaves the frame unchanged
# once Copy-on-Write is active (the default from pandas 3.0), so NaNs would survive.
titanic_data_cleaned = titanic_data_cleaned.fillna({
    'Age': titanic_data_cleaned['Age'].mean(),  # Fill Age with the mean of the observed ages
    'Embarked': 'missing',                      # Fill Embarked with a placeholder category
})

# Encode categorical variables as dummy columns; drop_first removes one level per
# variable so the dummies are not perfectly collinear.
titanic_data_encoded = pd.get_dummies(titanic_data_cleaned, drop_first=True)

# Separate features and target variable
X = titanic_data_encoded.drop(columns=['Survived'])
y = titanic_data_encoded['Survived']

In [42]:
# Step 2: Standardize the data (zero mean, unit sample standard deviation per feature)
X_mean = X.mean()
X_std = X.std()
# A constant feature has std == 0; dividing by it would turn the whole column into NaN
# and poison the covariance matrix computed next. Scale such columns by 1 instead, so
# they simply become all zeros after centering.
X_standardized = (X - X_mean) / X_std.replace(0, 1)

In [44]:
# Step 3: Compute the covariance matrix
# The standardized data is transposed before the call because np.cov, by default,
# treats each row of its input as one variable.
cov_matrix = np.cov(X_standardized.T)

In [46]:
# Step 4: Compute eigenvalues and eigenvectors
# A covariance matrix is symmetric, so use eigh rather than the general-purpose eig:
# eigh always returns real eigenvalues and orthonormal eigenvectors (as columns),
# whereas eig can return complex values with spurious imaginary parts.
# eigh returns eigenvalues in ascending order; the descending sort is done separately.
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

In [48]:
# Step 5: Sort eigenvalues and eigenvectors in descending order
# np.argsort sorts ascending, so the index array is reversed to put the largest first.
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[sorted_indices]
# Reorder the columns of `eigenvectors` with the same indices so that column i stays
# paired with eigenvalues[i].
eigenvectors = eigenvectors[:, sorted_indices]

In [50]:
# Step 6: Project data onto the top k principal components
k = 5  # Number of principal components kept
# The first k columns correspond to the k largest eigenvalues after the descending sort.
top_eigenvectors = eigenvectors[:, :k]
X_pca = np.dot(X_standardized, top_eigenvectors)
# Step 7: Explained variance ratio
# Each eigenvalue divided by the sum of all eigenvalues.
explained_variance_ratio = eigenvalues / np.sum(eigenvalues)
# Print the explained variance ratio of the k retained components
print("Explained Variance Ratio:", explained_variance_ratio[:k])

Explained Variance Ratio: [0.2052705  0.19121186 0.17205714 0.10949081 0.09226028]


In [52]:
# Use the PCA projection as the feature matrix for the logistic regression below.
# NOTE(review): this rebinds `X`, so re-running the standardization cell after this one
# would operate on the projected data instead of the original features.
X=X_pca

In [54]:
import numpy as np

# Sigmoid function
def sigmoid(z):
    """Numerically stable logistic function, 1 / (1 + exp(-z)).

    Parameters
    ----------
    z : scalar or array-like
        Input value(s); converted to a float NumPy array.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Element-wise sigmoid of ``z`` with the same shape as the input
        (a NumPy scalar for scalar input).

    Notes
    -----
    The naive formula evaluates ``exp(-z)``, which overflows for large negative
    ``z``. Here only ``exp(-|z|)`` is evaluated, which always lies in (0, 1]:
    for ``z >= 0`` the result is ``1 / (1 + exp(-z))`` and for ``z < 0`` the
    algebraically identical ``exp(z) / (1 + exp(z))``.
    """
    z = np.asarray(z, dtype=float)
    decay = np.exp(-np.abs(z))  # never overflows; may harmlessly underflow to 0
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () unwraps a 0-d array to a NumPy scalar and leaves n-d arrays as they are.
    return result[()]

# Loss function (Binary Cross-Entropy)
def binary_cross_entropy(y, y_pred, eps=1e-15):
    """Mean binary cross-entropy between labels and predicted probabilities.

    Parameters
    ----------
    y : array-like
        True labels (0 or 1).
    y_pred : array-like
        Predicted probabilities of the positive class.
    eps : float, optional
        Predictions are clipped to ``[eps, 1 - eps]`` before taking logarithms.
        Without this, a prediction of exactly 0 or 1 produces ``log(0)`` and the
        loss becomes ``inf`` or ``nan``.

    Returns
    -------
    float
        The average loss over all samples.
    """
    y = np.asarray(y, dtype=float)
    y_pred = np.clip(y_pred, eps, 1 - eps)
    return -np.mean(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred))

# Gradient Descent for Logistic Regression
def train_logistic_regression(X, y, learning_rate=0.01, epochs=1000):
    """Fit logistic-regression parameters with full-batch gradient descent.

    Parameters
    ----------
    X : 2-D array
        Feature matrix; its ``shape`` is unpacked as (samples, features).
    y : array-like
        Target labels, one per row of ``X``.
    learning_rate : float
        Step size applied to each gradient update.
    epochs : int
        Number of passes over the full data set.

    Returns
    -------
    tuple
        ``(weights, bias)`` after the final update. Both start at zero.
    """
    n_samples, n_features = X.shape
    weights = np.zeros(n_features)
    bias = 0

    for _ in range(epochs):
        # Difference between the predicted probabilities and the labels
        residual = sigmoid(np.dot(X, weights) + bias) - y

        # Gradients of the mean cross-entropy loss w.r.t. weights and bias
        grad_w = np.dot(X.T, residual) / n_samples
        grad_b = np.sum(residual) / n_samples

        # Step against the gradient
        weights -= learning_rate * grad_w
        bias -= learning_rate * grad_b

    return weights, bias

# Prediction
def predict(X, weights, bias, threshold=0.5):
    """Classify each row of ``X`` with a fitted logistic-regression model.

    The linear score ``X . weights + bias`` is passed through ``sigmoid`` and
    compared with ``threshold``; rows at or above it are labelled 1, the rest 0.

    Returns an integer array of 0/1 labels.
    """
    probabilities = sigmoid(np.dot(X, weights) + bias)
    is_positive = probabilities >= threshold
    return is_positive.astype(int)

# Metrics
def calculate_metrics(y_true, y_pred):
    """Compute accuracy, precision, recall and F1 for binary 0/1 labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels (0 or 1).
    y_pred : array-like
        Predicted labels (0 or 1).

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1_score)``. Any metric whose
        denominator is zero (including accuracy on empty input) is reported as 0.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of labels.
    """
    # Coerce to flat arrays: plain lists would otherwise compare as a whole
    # (`[1, 0] == 1` is just False), and an (n, 1) column against an (n,) vector
    # would silently broadcast to an n x n grid and inflate every count.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {y_true.size} and {y_pred.size}"
        )

    # Confusion-matrix counts
    TP = np.sum((y_pred == 1) & (y_true == 1))
    TN = np.sum((y_pred == 0) & (y_true == 0))
    FP = np.sum((y_pred == 1) & (y_true == 0))
    FN = np.sum((y_pred == 0) & (y_true == 1))

    total = TP + TN + FP + FN
    accuracy = (TP + TN) / total if total != 0 else 0
    precision = TP / (TP + FP) if (TP + FP) != 0 else 0
    recall = TP / (TP + FN) if (TP + FN) != 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

    return accuracy, precision, recall, f1_score

# Example usage (assumes `X` and `y` have already been prepared by earlier cells)
# Fit the model on the full feature matrix.
weights, bias = train_logistic_regression(X, y, learning_rate=0.01, epochs=1000)
# NOTE(review): predictions are made on the same rows that were used for fitting, so the
# metrics printed below are training-set scores, not an estimate of performance on unseen data.
y_pred = predict(X, weights, bias)

# Metrics are fractions in [0, 1]; they are scaled by 100 for display as percentages.
accuracy, precision, recall, f1_score = calculate_metrics(y, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1_score * 100:.2f}%")


Accuracy: 79.12%
Precision: 74.84%
Recall: 68.71%
F1 Score: 71.65%
