In [11]:
import numpy as np
import pandas as pd
import requests
from io import StringIO

# Load the dataset from URL
def load_data_from_url(url):
    response = requests.get(url)
    data = StringIO(response.text)
    df = pd.read_csv(data, header=None)
    X = df.iloc[:, :-1].values
    y = df.iloc[:, -1].values
    return X, y

# Sigmoid function
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

# Initialize parameters
def initialize_parameters(n_features):
    return np.zeros((n_features, 1)), 0

# Compute cost and gradients
def compute_cost_and_gradients(X, y, W, b):
    m = X.shape[0]

    # Manually implemented sigmoid function
    def custom_sigmoid(z):
        return 1 / (1 + np.exp(-z))

    A = custom_sigmoid(np.dot(X, W) + b)

    # Add epsilon to avoid taking the logarithm of zero
    epsilon = 1e-10
    cost = -1/m * np.sum(y * np.log(A + epsilon) + (1 - y) * np.log(1 - A + epsilon))

    dW = 1/m * np.dot(X.T, (A - y))
    db = 1/m * np.sum(A - y)

    return cost, dW, db

# Train the model
def train_model(X, y, learning_rate, num_iterations):
    n_features = X.shape[1]
    W, b = initialize_parameters(n_features)

    for i in range(num_iterations):
        cost, dW, db = compute_cost_and_gradients(X, y, W, b)

        # Update parameters
        W -= learning_rate * dW
        b -= learning_rate * db

        if i % 100 == 0:
            print(f'Cost after iteration {i}: {cost}')

    return W, b

# Predict function
def predict(X, W, b):
    # Manually implemented sigmoid function
    def custom_sigmoid(z):
        return 1 / (1 + np.exp(-z))

    A = custom_sigmoid(np.dot(X, W) + b)
    return (A > 0.5).astype(int)

# Feature scaling
def scale_features(X):
    mean = np.mean(X, axis=0)
    std = np.std(X, axis=0)
    X_scaled = (X - mean) / std
    return X_scaled

# Define learning rate and epochs
learning_rate = 0.01
num_iterations = 1000

# Define the URL of the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"

# Load the data from the URL
X, y = load_data_from_url(url)

# Split data into training and testing sets
split_index = 4000  # Adjust this according to your dataset size and split ratio
X_train, X_test = X[:split_index], X[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

# Scale features
X_train_scaled = scale_features(X_train)
X_test_scaled = scale_features(X_test)

# Train the model using scaled features
W, b = train_model(X_train_scaled, y_train.reshape(-1, 1), learning_rate, num_iterations)

# Make predictions using scaled features
y_pred_train = predict(X_train_scaled, W, b)
y_pred_test = predict(X_test_scaled, W, b)

# Evaluate the model
accuracy_train = np.mean(y_pred_train == y_train.reshape(-1, 1)) * 100
accuracy_test = np.mean(y_pred_test == y_test.reshape(-1, 1)) * 100

# Print summary of results
print("Model Evaluation:")
print(f"Training Accuracy: {accuracy_train:.2f}%")
print(f"Testing Accuracy: {accuracy_test:.2f}%")

# Print summary of predicted and actual values
print("Summary of Predicted and Actual Values:")
print("Training Set:")
print("Predicted   Actual")
for pred, actual in zip(y_pred_train[:5], y_train[:5]):
    print(f"{pred[0]}           {actual}")
print("...")
for pred, actual in zip(y_pred_train[-5:], y_train[-5:]):
    print(f"{pred[0]}           {actual}")

print("\nTesting Set:")
print("Predicted   Actual")
for pred, actual in zip(y_pred_test[:5], y_test[:5]):
    print(f"{pred[0]}           {actual}")
print("...")
for pred, actual in zip(y_pred_test[-5:], y_test[-5:]):
    print(f"{pred[0]}           {actual}")


Cost after iteration 0: 0.6931471803599452
Cost after iteration 100: 0.44230451922986835
Cost after iteration 200: 0.3693142629092678
Cost after iteration 300: 0.33389346068100284
Cost after iteration 400: 0.31243836721222806
Cost after iteration 500: 0.2977328577588806
Cost after iteration 600: 0.2868352380012465
Cost after iteration 700: 0.2783211332270439
Cost after iteration 800: 0.27141538984796443
Cost after iteration 900: 0.26565813031476715
Model Evaluation:
Training Accuracy: 91.70%
Testing Accuracy: 58.07%
Summary of Predicted and Actual Values:
Training Set:
Predicted   Actual
1           1
1           1
1           1
1           1
1           1
...
0           0
0           0
0           0
1           0
0           0

Testing Set:
Predicted   Actual
0           0
0           0
0           0
0           0
0           0
...
0           0
1           0
0           0
0           0
0           0
