<a href="https://colab.research.google.com/github/AndyDengFKu/DPA1/blob/main/DPL_A1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from google.colab import drive
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
# Mount Google Drive at /content/drive; the dataset paths used by later cells live under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Load data from LIBSVM format
def load_libsvm_format(file_path, n_features=None):
    """Read a LIBSVM-format text file into a dense matrix and a label vector.

    Every non-empty line is parsed as ``<label> <index>:<value> ...`` where
    feature indices are 1-based; features not listed on a line stay at zero.
    Blank lines are skipped.

    Parameters
    ----------
    file_path : str or path-like
        Location of the text file to read.
    n_features : int, optional
        Number of columns of the returned matrix. When omitted, the largest
        feature index found in the file is used.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        Feature matrix of shape ``(n_samples, n_features)`` (float) and the
        integer label array of shape ``(n_samples,)``.

    Raises
    ------
    ValueError
        If a feature index is smaller than 1, or if ``n_features`` is smaller
        than the largest feature index present in the file.
    """
    rows = []
    labels = []
    max_feature_index = 0

    # Stream the file line by line instead of materialising it with readlines().
    with open(file_path, 'r') as f:
        for line_number, line in enumerate(f, start=1):
            items = line.split()
            if not items:
                # Blank (or whitespace-only) line: nothing to parse.
                continue

            labels.append(int(items[0]))

            features = {}
            for item in items[1:]:
                index, value = item.split(":")
                index = int(index)
                if index < 1:
                    # index - 1 would be negative and silently wrap around to
                    # the last column(s) of the matrix.
                    raise ValueError(
                        f"line {line_number}: feature index must be >= 1, got {index}"
                    )
                features[index] = float(value)
                if index > max_feature_index:
                    max_feature_index = index

            rows.append(features)

    if n_features is None:
        n_features = max_feature_index
    elif n_features < max_feature_index:
        raise ValueError(
            f"n_features={n_features} is smaller than the largest feature "
            f"index in the file ({max_feature_index})"
        )

    # Convert to matrix format (1-based file index -> 0-based column)
    matrix_data = np.zeros((len(rows), n_features))
    for i, features in enumerate(rows):
        for index, value in features.items():
            matrix_data[i, index - 1] = value

    return matrix_data, np.array(labels, dtype=int)

# Ensure that X_train and X_val have the same number of features
def align_features(X_train, X_val):
    """Right-pad the narrower matrix with zero columns so both widths match.

    Parameters
    ----------
    X_train, X_val : numpy.ndarray
        Two-dimensional arrays whose column counts may differ.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``(X_train, X_val)`` with equal column counts. An input that needs no
        padding is returned as the same object that was passed in.
    """
    # Positive gap: validation is narrower; negative gap: training is narrower.
    width_gap = X_train.shape[1] - X_val.shape[1]

    if width_gap > 0:
        padding = np.zeros((X_val.shape[0], width_gap))
        X_val = np.hstack((X_val, padding))
    elif width_gap < 0:
        padding = np.zeros((X_train.shape[0], -width_gap))
        X_train = np.hstack((X_train, padding))

    return X_train, X_val


# Directory on the mounted Drive holding the LIBSVM-format files; defined once
# so the location can be changed in a single place.
DATA_DIR = "/content/drive/MyDrive/Colab Notebooks/DeepLearning/A1"

# Load training data
X_train, y_train = load_libsvm_format(f"{DATA_DIR}/a1.txt")

# Load validation data
X_val, y_val = load_libsvm_format(f"{DATA_DIR}/a1a.txt")

# Align the features of training and validation data
X_train_aligned, X_val_aligned = align_features(X_train, X_val)

# Invariants the later cells rely on: equal feature counts across the two
# splits and one label per row.
assert X_train_aligned.shape[1] == X_val_aligned.shape[1], "feature counts differ after alignment"
assert X_train_aligned.shape[0] == y_train.shape[0], "training rows and labels differ in length"
assert X_val_aligned.shape[0] == y_val.shape[0], "validation rows and labels differ in length"

print(f"Training data shape: {X_train_aligned.shape}")
print(f"Validation data shape: {X_val_aligned.shape}")

Training data shape: (1605, 123)
Validation data shape: (30956, 123)


In [11]:
class Perceptron:
    """Single-layer perceptron producing -1 / +1 predictions.

    Training sweeps the samples in their given order and, for every sample,
    adds ``learning_rate * (y - prediction)`` times the sample to the weights
    (and the same scalar to the bias).

    Because weights and bias start at zero and predictions depend only on the
    sign of ``w . x + b``, the learning rate merely rescales the learned
    parameters (up to floating-point rounding); it does not change which
    samples are classified as +1 or -1.

    Parameters
    ----------
    learning_rate : float
        Step size applied to each update.
    epochs : int
        Maximum number of passes over the training data.
    """

    def __init__(self, learning_rate=0.01, epochs=1000):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.weights = None
        self.bias = None

    def predict(self, X):
        """Return +1 where ``X . weights + bias > 0`` and -1 elsewhere.

        ``X`` may be a single sample (1-D) or a matrix of samples (2-D).

        Raises
        ------
        RuntimeError
            If the model has not been trained yet.
        """
        if self.weights is None:
            raise RuntimeError("Perceptron must be trained before calling predict().")
        linear_output = np.dot(X, self.weights) + self.bias
        return np.where(linear_output > 0, 1, -1)

    def train(self, X, y):
        """Fit weights and bias on ``X`` (n_samples, n_features) and labels ``y``.

        Weights and bias are re-initialised to zero on every call, so the same
        instance can be trained repeatedly. Training stops as soon as a full
        pass produces no update: later passes would repeat that pass exactly,
        so the result is the same as running every epoch.

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional or ``y`` has a different number of
            entries than ``X`` has rows.
        """
        X = np.asarray(X, dtype=float)
        y = np.ravel(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        num_samples, num_features = X.shape
        if y.shape[0] != num_samples:
            raise ValueError(
                f"X has {num_samples} samples but y has {y.shape[0]} labels"
            )

        # Initialize weights and bias
        self.weights = np.zeros(num_features)
        self.bias = 0.0

        # Training loop
        for _ in range(self.epochs):
            updates_this_epoch = 0
            for xi, target in zip(X, y):
                prediction = 1 if np.dot(xi, self.weights) + self.bias > 0 else -1
                error = target - prediction
                if error == 0:
                    # Correctly classified sample: the update would be zero.
                    continue
                update = self.learning_rate * error
                self.weights += update * xi
                self.bias += update
                updates_this_epoch += 1

            if updates_this_epoch == 0:
                # Converged: nothing changed during this pass.
                break

# Fit a perceptron on the aligned training split
perceptron = Perceptron(epochs=5000, learning_rate=0.01)
perceptron.train(X_train_aligned, y_train)

# Score it on the aligned validation split
y_pred_val = perceptron.predict(X_val_aligned)

# Accuracy = fraction of validation labels predicted correctly
accuracy = (y_pred_val == y_val).mean()

accuracy

0.8220054270577594

In [4]:
from sklearn.model_selection import KFold

def cross_validate(model, X, y, n_splits=5, random_state=42):
    """
    Perform shuffled K-fold cross-validation on the provided model and data.

    The same model object is re-trained on every fold, so its training method
    must re-initialise the model's parameters on each call.

    Parameters:
    - model: Object exposing `predict(X)` plus either `train(X, y)` (used when
      present) or `fit(X, y)`.
    - X: Features, array-like of shape (n_samples, n_features).
    - y: Labels, array-like of shape (n_samples,).
    - n_splits: Number of splits for cross-validation.
    - random_state: Seed for the fold shuffling (default 42).

    Returns:
    - List of accuracy scores for each fold.
    """
    # Index arrays from KFold require numpy-style fancy indexing.
    X = np.asarray(X)
    y = np.asarray(y)

    # Prefer this notebook's `train` API; fall back to scikit-learn's `fit`.
    fit_model = getattr(model, "train", None) or model.fit

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    scores = []

    for train_index, val_index in kf.split(X):
        X_train_fold, X_val_fold = X[train_index], X[val_index]
        y_train_fold, y_val_fold = y[train_index], y[val_index]

        fit_model(X_train_fold, y_train_fold)
        y_pred = model.predict(X_val_fold)

        # Fraction of held-out samples predicted correctly.
        accuracy = np.mean(y_pred == y_val_fold)
        scores.append(accuracy)

    return scores

# Run 5-fold cross-validation of a fresh perceptron on the training split
cv_scores = cross_validate(
    Perceptron(learning_rate=0.01, epochs=1000),
    X_train_aligned,
    y_train,
    n_splits=5,
)

# Show the per-fold scores together with their mean
(cv_scores, np.mean(cv_scores))

([0.8099688473520249,
  0.8255451713395638,
  0.8130841121495327,
  0.7757009345794392,
  0.8348909657320872],
 0.8118380062305295)

In [12]:
# Using the original Perceptron model to experiment with different learning rates
# NOTE(review): this class repeats the algorithm of the `Perceptron` class
# defined in an earlier cell; consider reusing that class instead.
class SimplePerceptron:
    """Single-layer perceptron producing -1 / +1 predictions.

    For every sample, training adds ``learning_rate * (y - prediction)`` times
    the sample to the weights (and the same scalar to the bias).

    Because weights and bias start at zero and predictions depend only on the
    sign of ``w . x + b``, the learning rate merely rescales the learned
    parameters (up to floating-point rounding); it does not change the
    predicted classes.

    Parameters
    ----------
    learning_rate : float
        Step size applied to each update.
    epochs : int
        Maximum number of passes over the training data.
    """

    def __init__(self, learning_rate=0.01, epochs=1000):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.weights = None
        self.bias = None

    def predict(self, X):
        """Return +1 where ``X . weights + bias > 0`` and -1 elsewhere.

        Raises
        ------
        RuntimeError
            If the model has not been trained yet.
        """
        if self.weights is None:
            raise RuntimeError("SimplePerceptron must be trained before calling predict().")
        linear_output = np.dot(X, self.weights) + self.bias
        return np.where(linear_output > 0, 1, -1)

    def train(self, X, y):
        """Fit weights and bias on ``X`` (n_samples, n_features) and labels ``y``.

        Parameters are reset to zero on every call. Training stops as soon as
        a full pass produces no update: later passes would repeat that pass
        exactly, so the result is the same as running every epoch.

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional or ``y`` has a different number of
            entries than ``X`` has rows.
        """
        X = np.asarray(X, dtype=float)
        y = np.ravel(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        num_samples, num_features = X.shape
        if y.shape[0] != num_samples:
            raise ValueError(
                f"X has {num_samples} samples but y has {y.shape[0]} labels"
            )

        self.weights = np.zeros(num_features)
        self.bias = 0.0
        for _ in range(self.epochs):
            updates_this_epoch = 0
            for xi, target in zip(X, y):
                prediction = 1 if np.dot(xi, self.weights) + self.bias > 0 else -1
                error = target - prediction
                if error == 0:
                    # Correctly classified sample: the update would be zero.
                    continue
                update = self.learning_rate * error
                self.weights += update * xi
                self.bias += update
                updates_this_epoch += 1

            if updates_this_epoch == 0:
                # Converged: nothing changed during this pass.
                break

# Standardize the features. The scaler is fitted on the training split only and
# then applied unchanged to the validation split, so no validation statistics
# leak into training.
# NOTE(review): no cell in this notebook defines X_train_scaled / X_val_scaled,
# so this and the following cells failed with NameError on a fresh kernel.
# StandardScaler is an assumption about the scaling originally used — confirm.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_aligned)
X_val_scaled = scaler.transform(X_val_aligned)

# Experiment with different learning rates
# Note: SimplePerceptron starts from zero weights/bias and thresholds at 0, so
# the learning rate only rescales the parameters; identical accuracies across
# the sweep are expected rather than a sign of a bug.
learning_rates = [0.001, 0.01, 0.05, 0.1, 0.5]
accuracies = []

for lr in learning_rates:
    perceptron = SimplePerceptron(learning_rate=lr, epochs=1000)
    perceptron.train(X_train_scaled, y_train)
    y_pred_val = perceptron.predict(X_val_scaled)
    accuracy = np.mean(y_pred_val == y_val)
    accuracies.append(accuracy)

accuracies


[0.7911874919240212,
 0.7911874919240212,
 0.7911874919240212,
 0.7911874919240212,
 0.7911874919240212]

In [13]:
from sklearn.neural_network import MLPClassifier

# Baseline MLP: one hidden layer of 100 ReLU units, trained with Adam
mlp = MLPClassifier(
    hidden_layer_sizes=(100,),
    activation='relu',
    solver='adam',
    max_iter=1000,
    random_state=1,
)

# Fit on the scaled training split
mlp.fit(X_train_scaled, y_train)

# Score on the scaled validation split: fraction of matching labels
y_pred_mlp = mlp.predict(X_val_scaled)
accuracy_mlp = (y_pred_mlp == y_val).mean()
print("MLP Accuracy:", accuracy_mlp)

MLP Accuracy: 0.8150277813671017


In [14]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

# Hyper-parameter candidates explored by the grid search
param_grid = {
    'hidden_layer_sizes': [(100,), (50, 50), (30, 30, 30)],
    'activation': ['relu', 'tanh', 'logistic'],
    'solver': ['adam'],
    'learning_rate_init': [0.001, 0.01],
    'alpha': [0.0001, 0.001]  # L2 regularization term parameter
}

# Exhaustive search over the grid, scored with 3-fold cross-validation on the
# scaled training split
grid_search = GridSearchCV(
    estimator=MLPClassifier(max_iter=1000, random_state=1),
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_scaled, y_train)

# Winning parameter combination and its cross-validated score
best_params, best_score = grid_search.best_params_, grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)


Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best Parameters: {'activation': 'logistic', 'alpha': 0.001, 'hidden_layer_sizes': (100,), 'learning_rate_init': 0.01, 'solver': 'adam'}
Best Score: 0.8280373831775701


In [15]:
from sklearn.neural_network import MLPClassifier

# Initialize MLP model with the best parameters found by the grid search.
# The values are taken from `best_params` (set by the grid-search cell) rather
# than copied by hand, so this cell cannot drift out of sync with the search.
mlp_best = MLPClassifier(max_iter=1000, random_state=1, **best_params)

# Train the model with the best parameters on the entire training set
mlp_best.fit(X_train_scaled, y_train)

# Predict on validation data
y_pred_mlp_best = mlp_best.predict(X_val_scaled)

# Calculate accuracy (fraction of validation labels predicted correctly)
accuracy_mlp_best = np.mean(y_pred_mlp_best == y_val)

print("MLP (Best Parameters) Accuracy on Validation Set:", accuracy_mlp_best)


MLP (Best Parameters) Accuracy on Validation Set: 0.8219408192272903
