<a href="https://colab.research.google.com/github/AndyDengFKu/DPA1/blob/main/DPL_A1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import pandas as pd
import numpy as np
from google.colab import drive
from sklearn.metrics import accuracy_score
# Mount Google Drive at /content/drive; the dataset files read further down
# live under /content/drive/MyDrive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Load data from LIBSVM format
def load_libsvm_format(file_path, num_features=None):
    """Read a LIBSVM-style text file into a dense matrix and a label vector.

    Every non-blank line is expected to look like
    ``<label> <index>:<value> <index>:<value> ...`` with 1-based feature
    indices; the value for feature ``index`` is stored in column
    ``index - 1``. Features that a line does not mention stay at 0.

    Parameters
    ----------
    file_path : str or path-like
        Text file to read.
    num_features : int, optional
        Number of columns of the returned matrix. When omitted, the largest
        feature index found in the file is used, so two files with different
        largest indices produce matrices of different widths.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``(X, y)`` where ``X`` is a float array of shape
        ``(n_samples, num_features)`` and ``y`` is an integer array of shape
        ``(n_samples,)``.

    Raises
    ------
    ValueError
        If a feature index is smaller than 1 (it would otherwise wrap around
        to the last column), if a feature index exceeds ``num_features``, or
        if a label / index / value token cannot be parsed as a number.
    """
    rows = []      # one {column: value} dict per sample
    labels = []
    max_feature_index = 0

    # Stream the file line by line instead of materialising it with readlines().
    with open(file_path, 'r') as f:
        for line_number, line in enumerate(f, start=1):
            items = line.split()
            if not items:
                # Blank lines (e.g. a trailing newline at end of file) carry no sample.
                continue

            labels.append(int(items[0]))

            features = {}
            for item in items[1:]:
                index, value = item.split(":")
                index = int(index)
                if index < 1:
                    raise ValueError(
                        f"{file_path}, line {line_number}: feature index "
                        f"{index} is not a valid 1-based index"
                    )
                features[index - 1] = float(value)
                if index > max_feature_index:
                    max_feature_index = index

            rows.append(features)

    if num_features is None:
        num_features = max_feature_index
    elif max_feature_index > num_features:
        raise ValueError(
            f"{file_path}: feature index {max_feature_index} exceeds "
            f"num_features={num_features}"
        )

    # Scatter the sparse rows into a dense, zero-initialised matrix.
    matrix_data = np.zeros((len(rows), num_features))
    for i, features in enumerate(rows):
        for column, value in features.items():
            matrix_data[i, column] = value

    return matrix_data, np.array(labels, dtype=int)

# Ensure that X_train and X_val have the same number of features
def align_features(X_train, X_val):
    """Give two 2-D arrays the same number of columns by zero-padding.

    Whichever array has fewer columns gets all-zero columns appended on the
    right until the widths match. An array that is already wide enough is
    returned unchanged.

    Parameters
    ----------
    X_train, X_val : numpy.ndarray
        2-D arrays whose column counts may differ.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``(X_train, X_val)`` with equal ``shape[1]``.
    """
    # Positive: X_val is the narrower one; negative: X_train is.
    width_gap = X_train.shape[1] - X_val.shape[1]

    if width_gap > 0:
        padding = np.zeros((X_val.shape[0], width_gap))
        X_val = np.hstack((X_val, padding))
    elif width_gap < 0:
        padding = np.zeros((X_train.shape[0], -width_gap))
        X_train = np.hstack((X_train, padding))

    return X_train, X_val


# Locations of the two LIBSVM-format files on the mounted Drive
train_path = "/content/drive/MyDrive/Colab Notebooks/DeepLearning/A1/a1.txt"
val_path = "/content/drive/MyDrive/Colab Notebooks/DeepLearning/A1/a1a.txt"

# Parse each file into a feature matrix and a label vector
X_train, y_train = load_libsvm_format(train_path)
X_val, y_val = load_libsvm_format(val_path)

# Each file's width comes from its own largest feature index, so pad the
# narrower matrix with zero columns until both have the same width
X_train_aligned, X_val_aligned = align_features(X_train, X_val)

print(f"Training data shape: {X_train_aligned.shape}")
print(f"Validation data shape: {X_val_aligned.shape}")

Training data shape: (1605, 123)
Validation data shape: (30956, 123)


In [8]:
class Perceptron:
    """Perceptron binary classifier for labels encoded as -1 / +1.

    The model is ``sign(X @ weights + bias)``, where a score of exactly 0 is
    assigned to the -1 class. Training sweeps over the samples in their given
    order and, for each one, adds ``learning_rate * (y - prediction)`` times
    the sample to the weights (and the same amount to the bias), which is a
    no-op for correctly classified samples.

    Note: weights and bias start at zero and every update is proportional to
    ``learning_rate``, so in exact arithmetic a different positive learning
    rate only rescales the parameters and leaves the predicted signs
    unchanged; any differences between learning rates come from
    floating-point rounding.

    Attributes set by ``train``:
        weights: float array of shape ``(n_features,)``.
        bias: scalar offset.
    """

    def __init__(self, learning_rate=0.01, epochs=1000):
        """Store the hyperparameters; the model is untrained until ``train``.

        Parameters
        ----------
        learning_rate : float
            Multiplier applied to every update.
        epochs : int
            Maximum number of full passes over the training data.
        """
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.weights = None
        self.bias = None

    def predict(self, X):
        """Return +1 where ``X @ weights + bias > 0`` and -1 elsewhere.

        Parameters
        ----------
        X : numpy.ndarray
            Either a 2-D ``(n_samples, n_features)`` array or a single 1-D
            sample of length ``n_features``.

        Returns
        -------
        numpy.ndarray
            Integer array of -1 / +1 (0-d for a single sample).

        Raises
        ------
        RuntimeError
            If called before ``train``.
        """
        if self.weights is None or self.bias is None:
            raise RuntimeError("Perceptron.predict() called before train()")
        linear_output = np.dot(X, self.weights) + self.bias
        return np.where(linear_output > 0, 1, -1)

    def train(self, X, y):
        """Fit weights and bias with the perceptron update rule.

        Parameters are reset to zero on every call. Training stops after
        ``epochs`` passes, or earlier as soon as one full pass produces no
        update: from that point on no later pass could change the
        parameters, so stopping gives the same result as running all epochs.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of length n_samples
            Target labels, expected to be -1 or +1.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or ``y`` does not have one entry per row of ``X``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-D, got an array with {X.ndim} dimension(s)")
        num_samples, num_features = X.shape
        if y.shape[:1] != (num_samples,):
            raise ValueError(
                f"X has {num_samples} samples but y has shape {y.shape}"
            )

        # Initialize weights and bias
        self.weights = np.zeros(num_features)
        self.bias = 0.0

        # Training loop
        for _ in range(self.epochs):
            num_updates = 0
            for idx, xi in enumerate(X):
                # Zero when the sample is already classified correctly.
                update = self.learning_rate * (y[idx] - self.predict(xi))
                if update != 0:
                    self.weights += update * xi
                    self.bias += update
                    num_updates += 1
            if num_updates == 0:
                # A clean pass means the parameters are a fixed point.
                break

# Build a perceptron (learning rate 0.01, 1000 epochs) and fit it on the
# aligned training data
perceptron = Perceptron(learning_rate=0.01, epochs=1000)
perceptron.train(X_train_aligned, y_train)

# Score on the validation data: accuracy is the fraction of predictions
# that equal the true labels
y_pred_val = perceptron.predict(X_val_aligned)
accuracy = (y_pred_val == y_val).mean()

accuracy

0.8019446956971185

In [9]:
from sklearn.metrics import accuracy_score

# Candidate values for each hyperparameter
learning_rates = [0.001, 0.01, 0.1]
epochs_list = [500, 1000, 2000]

# Best configuration found so far (nothing evaluated yet)
best_lr, best_epochs, best_accuracy = None, None, 0

# Every (learning rate, epochs) pair; learning rate varies slowest
candidates = [(rate, n_epochs) for rate in learning_rates for n_epochs in epochs_list]

# Train on the full training set and score each candidate on the validation set
for lr, epochs in candidates:
    perceptron = Perceptron(learning_rate=lr, epochs=epochs)
    perceptron.train(X_train_aligned, y_train)
    y_pred_val = perceptron.predict(X_val_aligned)
    accuracy = accuracy_score(y_val, y_pred_val)

    # Strict '>' keeps the earliest candidate when accuracies tie
    if accuracy > best_accuracy:
        best_lr, best_epochs, best_accuracy = lr, epochs, accuracy

best_lr, best_epochs, best_accuracy


(0.1, 500, 0.8243636128698798)

In [10]:
# Keep only the first 10% of the training rows (taken in file order, no shuffling)
subset_size = int(0.1 * X_train_aligned.shape[0])
X_train_subset, y_train_subset = X_train_aligned[:subset_size], y_train[:subset_size]

# Best configuration found so far (nothing evaluated yet)
best_lr, best_epochs, best_accuracy = None, None, 0

# Every (learning rate, epochs) pair; learning rate varies slowest
candidates = [(rate, n_epochs) for rate in learning_rates for n_epochs in epochs_list]

# Train on the subset only, but still score on the whole validation set
for lr, epochs in candidates:
    perceptron = Perceptron(learning_rate=lr, epochs=epochs)
    perceptron.train(X_train_subset, y_train_subset)
    y_pred_val = perceptron.predict(X_val_aligned)
    accuracy = accuracy_score(y_val, y_pred_val)

    # Strict '>' keeps the earliest candidate when accuracies tie
    if accuracy > best_accuracy:
        best_lr, best_epochs, best_accuracy = lr, epochs, accuracy

best_lr, best_epochs, best_accuracy


(0.01, 2000, 0.7467050006460783)