In [10]:
import pandas as pd
import numpy as np

# Loading The Dataset

In [11]:
data = pd.read_csv('your_dataset.csv')

# Preparing data:
1. Seperating features and labels
2. Normalizing the dataset

In [12]:
def DataNormalize(inputData):
    mean = np.mean(inputData, axis = 0)
    std = np.std(inputData, axis = 0)
    normalizedData = (inputData - mean) / std
    
    return normalizedData

In [13]:
def DataShuffle(dataSize):
    indices = np.arange(dataSize)
    np.random.shuffle(indices)
    
    return indices

In [14]:
def DataSplit(inputData, outputData):
    # Define the split ratio
    trainRatio = 0.8
    trainSize = int(len(inputData) * trainRatio)
    
    # Split the data
    trainData = inputData[:trainSize]
    trainLabel = outputData[:trainSize]
    testData = inputData[trainSize:]
    testLabel = outputData[trainSize:]
    
    return trainData, trainLabel, testData, testLabel


In [15]:
X = data.iloc[:, :-1].values
Y = data.iloc[:, -1].values

X_normalized = DataNormalize(X)

# Shuffle the data
indices = DataShuffle(X.shape[0])
X_normalized = X_normalized[indices]
Y = Y[indices]
    
    
# Split the data
trainData, trainLabel, testData, testLabel = DataSplit(X_normalized, Y)

In [16]:
def zeroOneLoss(trueLabels, predLabels):
    return np.sum(trueLabels != predLabels) / len(trueLabels)

In [35]:
class LogisticRegression:
    def __init__(self, lambda_param, max_iter):
        self.lambda_param = lambda_param
        self.max_iter = max_iter
        self.weights = None
        
    def LogisticLossGradient(self, W, X, Y):
        z = Y * np.dot(X, W)
        gradient = (-Y * X) / (1 + np.exp(z))
        return gradient
    
    def fit(self, X, Y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        
        # Draw a random training example
        for t in range(1, self.max_iter + 1):
            i = np.random.randint(n_samples)
            x_i = X[i]
            y_i = Y[i]
            
            # Compute learning rate for this iteration
            eta_t = 1 / (self.lambda_param * t)
            
            # Compute gradient and update weight
            gradient = self.LogisticLossGradient(self.weights, x_i, y_i)
            self.weights = (1 - eta_t * self.lambda_param) * self.weights - eta_t * gradient
    
    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))
    
    def predict(self, X):
        if self.weights is None:
            raise ValueError("Model has not been trained yet.")
        
        z = np.dot(X, self.weights)
        probs = self.sigmoid(z)
        return np.where(probs >= 0.5, 1, -1)

# Cross Validation Implementation

In [36]:
# Grid Search using Cross Validateion
def crossValScore(X, y, params, k = 5):
    fold_size = len(X) // k
    accuracies = []
    
    for i in range(k):
        X_test = X[i * fold_size:(i + 1) * fold_size]
        y_test = y[i * fold_size:(i + 1) * fold_size]
        X_train = np.concatenate((X[:i * fold_size], X[(i + 1) * fold_size:]), axis=0)
        y_train = np.concatenate((y[:i * fold_size], y[(i + 1) * fold_size:]), axis=0)
        
        model = LogisticRegression(lambda_param = params['lambda'], max_iter = params['iters'])
        
        model.fit(X_train, y_train)
        
        predictions = model.predict(X_test)
        accuracy = zeroOneLoss(predictions, y_test)
        accuracies.append(accuracy)
        
    return np.mean(accuracies)

# Tuning Hyperparameters

In [37]:
# Tuning Hyperparameters
lambda_list = [0.001, 0.01, 0.1]
n_iters_list = [1000, 1500, 2000]

best_loss = 2
best_params = {}
for lambdas in lambda_list:
    for n_iters in n_iters_list:
        params = {'lambda': lambdas,'iters': n_iters}
        mean_loss = crossValScore(trainData, trainLabel,params, k = 5)
        print(f"lambda: {lambdas}, n_iters: {n_iters}, Cross Validation Loss:{mean_loss:.4f}")

        if mean_loss < best_loss:
            best_loss = mean_loss
            best_params = params
            
print("\nBest Hyperparameters:")
print(f"lambda: {best_params['lambda']}")
print(f"n_iters: {best_params['iters']}")
print(f"Best Cross-Validation loss: {best_loss:.4f}")

  gradient = (-Y * X) / (1 + np.exp(z))


lambda: 0.001, n_iters: 1000, Cross Validation Loss:0.3464
lambda: 0.001, n_iters: 1500, Cross Validation Loss:0.3226
lambda: 0.001, n_iters: 2000, Cross Validation Loss:0.3196
lambda: 0.01, n_iters: 1000, Cross Validation Loss:0.3034
lambda: 0.01, n_iters: 1500, Cross Validation Loss:0.3061
lambda: 0.01, n_iters: 2000, Cross Validation Loss:0.2983
lambda: 0.1, n_iters: 1000, Cross Validation Loss:0.2823
lambda: 0.1, n_iters: 1500, Cross Validation Loss:0.2819
lambda: 0.1, n_iters: 2000, Cross Validation Loss:0.2794

Best Hyperparameters:
lambda: 0.1
n_iters: 2000
Best Cross-Validation loss: 0.2794


In [40]:
iterations = 10
losses = []
for i in range(iterations):

    # Shuffle the data
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)
    X_normalized = X_normalized[indices]
    Y = Y[indices]

    # Define the split ratio
    trainRatio = 0.8
    trainSize = int(len(X_normalized) * trainRatio)

    # Split the data
    trainData = X_normalized[:trainSize]
    trainLabel = Y[:trainSize]

    testData = X_normalized[trainSize:]
    testLabel = Y[trainSize:]

    model = LogisticRegression(lambda_param = best_params['lambda'], max_iter = best_params['iters'])
        
    model.fit(trainData, trainLabel)

    predictions = model.predict(testData)
    
    loss = zeroOneLoss(testLabel, predictions)
    print(f"Iteration {i + 1} - Zero-One Loss: {loss:.4f}")
    losses.append(loss)

averageLoss = np.mean(losses)
print(f"Average Zero-One Loss: {averageLoss:.4f}")

Iteration 1 - Zero-One Loss: 0.2870
Iteration 2 - Zero-One Loss: 0.2665
Iteration 3 - Zero-One Loss: 0.2765
Iteration 4 - Zero-One Loss: 0.2785
Iteration 5 - Zero-One Loss: 0.2765
Iteration 6 - Zero-One Loss: 0.2875
Iteration 7 - Zero-One Loss: 0.2750
Iteration 8 - Zero-One Loss: 0.2885
Iteration 9 - Zero-One Loss: 0.2710
Iteration 10 - Zero-One Loss: 0.2685
Average Zero-One Loss: 0.2775


In [41]:
features = data.columns[:-1]
output = data.columns[-1]

for feature in features:
    data[feature + '_squared'] = data[feature] ** 2
    
columns_order = [col for col in data.columns if col != output] + [output]
data = data[columns_order]

In [42]:
X = data.iloc[:, :-1].values
Y = data.iloc[:, -1].values

mean = np.mean(X, axis = 0)
std = np.std(X, axis = 0)

X_normalized = (X - mean) / std

# Split the data
trainData, trainLabel, testData, testLabel = DataSplit(X_normalized, Y)