# Task 2: Classification and PCA on mobile robot sensory data

In [1]:
import pandas as pd
import numpy as np

Download the data from here:

https://www.kaggle.com/competitions/career-con-2019/data

You only need the original training set; use it for both training and testing. (The X_test wasn't published.)

In [2]:
# Load the data gathered during robot navigation over different surfaces
path = "../Task2/"
dfX = pd.read_csv(path + "X_train.csv")  # sensor measurements
dfy = pd.read_csv(path + "y_train.csv")  # labels

# Prepare the feature space
data = dfX.to_numpy()  # convert the data frame into a two-dimensional numpy array
# Drop the first 3 identifier columns. to_numpy() can yield an object array when
# the identifier columns are non-numeric, so force a numeric dtype explicitly.
data = data[:, 3:].astype(float)
# Tile the 128 timestamps of each trial into a single row (128 timestamps * 10 channels)
X = np.reshape(data, (data.shape[0] // 128, 10 * 128))

# Prepare the labels vector
surfaces = dfy['surface']  # label of every trial, as a string
# Sort the unique labels: iterating a set of strings has no stable order between
# kernel sessions, so an unsorted list would map the same index to a different
# surface on every run.
types = sorted(set(surfaces))
y = [types.index(s) for s in surfaces]  # numerical list of labels

# Every row of X needs exactly one label.
# NOTE(review): this also assumes the rows of X_train.csv are grouped by trial in
# the same order as the rows of y_train.csv - confirm against the data files.
assert X.shape[0] == len(y), "number of trials in X and number of labels differ"

Task 2

A. Change the network structure (number of layers and neurons) and parameters (transfer functions, learning rate, algorithms, stop conditions): how does classification accuracy change? What about training time?

B. Apply PCA on the dataset, varying the number of dimensions you keep. Perform classification on the reduced datasets. How does accuracy change with different dimensions? What does that tell you about the original dataset?

Perform systematic tests on appropriate values and ranges and report your results, answering the questions.

TASK2-A

In [None]:
# Quick sanity check of the prepared data.
print(f"Shape of X: {X.shape}")
print(f"Shape of y: {len(y)}")
print(f"X: ,{X}")  # numpy abbreviates large arrays on its own
# y is a plain Python list, so printing it whole would dump every label into the
# output; show only a short preview instead.
print(f"y (first 20 of {len(y)}): {y[:20]}")

## Split the data into train and test sets:

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the trials for testing; the fixed seed keeps the split reproducible.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=42, test_size=0.2)

## Import the necessary libraries and define the hyperparameters:


In [5]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
import time

# Baseline MLP configuration used by the cells below
hidden_layer_sizes = (200, 100, 50)  # one entry per hidden layer
activation, solver = 'relu', 'adam'
learning_rate = 1e-3  # handed to the classifier as learning_rate_init
max_iter = 1000  # upper bound on the number of training iterations


## Initialize the MLP classifier and call partial_fit once to register the classes:


In [None]:
# Build the baseline MLP. Training is driven manually through partial_fit, one
# call per iteration, in the training loop that follows. The fixed random_state
# makes the run reproducible instead of changing on every execution.
clf = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation,
                    solver=solver, learning_rate_init=learning_rate, max_iter=max_iter,
                    random_state=42)

# The first partial_fit call has to be given the full set of class labels.
# Note that this call already performs one training update on the data.
clf.partial_fit(X_train, y_train, classes=np.unique(y_train))


## Training and evaluation loop, and collecting results:


In [None]:
# Hyperparameters of the baseline run. In this cell they only label the results:
# `clf` was configured when it was created, so keep these values in sync with it.
hidden_layer_sizes = (200, 100, 50)
activation = 'relu'
solver = 'adam'
learning_rate = 0.001
max_iter = 1000

# Per-iteration history: training accuracy and cumulative training time
accuracies = []
times = []

# Training and evaluation loop.
# Only the partial_fit call is timed, so that scoring and printing do not
# inflate the reported training time.
elapsed_time = 0.0
for i in range(max_iter):
    step_start = time.perf_counter()
    clf.partial_fit(X_train, y_train)
    elapsed_time += time.perf_counter() - step_start

    train_accuracy = clf.score(X_train, y_train)
    accuracies.append(train_accuracy)
    times.append(elapsed_time)

    # Print accuracy and time each time they are updated
    print(f"Iteration {i + 1} - Training Accuracy: {train_accuracy:.3f}, Elapsed Time: {elapsed_time:.3f} seconds")

    # Stop condition: the training accuracy reached the desired threshold
    if train_accuracy >= 0.99:
        print("Training accuracy reached 0.99. Stopping training.")
        break

# Get final training time and accuracy
training_time = times[-1]
final_accuracy = accuracies[-1]
# Accuracy on the held-out split; training accuracy alone cannot reveal overfitting.
test_accuracy = clf.score(X_test, y_test)

# Print final results
print(f"\nFinal Training Time: {training_time:.3f} seconds")
print(f"Final Training Accuracy: {final_accuracy:.3f}")
print(f"Final Test Accuracy: {test_accuracy:.3f}")



## Print final results and plot accuracy over training time:


In [None]:
# Print final results
print(f"Hidden layers: {hidden_layer_sizes}, Activation: {activation}, Solver: {solver}, "
      f"Learning rate: {learning_rate}, Max iter: {max_iter}, Training Time: {training_time:.3f} seconds, "
      f"Final Accuracy: {final_accuracy:.3f}")

# Plot accuracy over training time
plt.figure(figsize=(8, 6))
plt.plot(times, accuracies)
plt.xlabel('Training Time (seconds)')
plt.ylabel('Accuracy')
plt.title(f'Accuracy Over Training Time for MLPClassifier\n{hidden_layer_sizes}, {activation}, {solver}, lr={learning_rate}, max_iter={max_iter}')
plt.grid(True)
plt.show()


## Experiment: Changing Hidden Layer Sizes

In [None]:
# Hyperparameters for this experiment: a deeper network than the baseline
hidden_layer_sizes = (200, 200, 100, 100, 50, 50)
activation = 'relu'
solver = 'adam'
learning_rate = 0.001
max_iter = 1000

# Per-iteration history: training accuracy and cumulative training time
accuracies = []
times = []

# Initialize the MLP classifier. Training is driven manually through partial_fit,
# so the loop below (not the constructor's max_iter) decides how long it trains.
# The fixed random_state makes the run reproducible.
clf = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation,
                    solver=solver, learning_rate_init=learning_rate, max_iter=1,
                    random_state=42)
# The first partial_fit call has to be given the full set of class labels.
# It already performs one training update, which is not part of the timings below.
clf.partial_fit(X_train, y_train, classes=np.unique(y_train))

# Training and evaluation loop.
# Only the partial_fit call is timed, so that scoring and printing do not
# inflate the reported training time.
elapsed_time = 0.0
for i in range(max_iter):
    step_start = time.perf_counter()
    clf.partial_fit(X_train, y_train)
    elapsed_time += time.perf_counter() - step_start

    train_accuracy = clf.score(X_train, y_train)
    accuracies.append(train_accuracy)
    times.append(elapsed_time)

    # Print accuracy and time each time they are updated
    print(f"Iteration {i + 1} - Training Accuracy: {train_accuracy:.3f}, Elapsed Time: {elapsed_time:.3f} seconds")

    # Stop condition: the training accuracy reached the desired threshold
    if train_accuracy >= 0.99:
        print("Training accuracy reached 0.99. Stopping training.")
        break

# Get final training time and accuracy
training_time = times[-1]
final_accuracy = accuracies[-1]
# Accuracy on the held-out split; training accuracy alone cannot reveal overfitting.
test_accuracy = clf.score(X_test, y_test)

# Print final results
print(f"\nFinal Training Time: {training_time:.3f} seconds")
print(f"Final Training Accuracy: {final_accuracy:.3f}")
print(f"Final Test Accuracy: {test_accuracy:.3f}")

# Plot accuracy over training time
plt.figure(figsize=(8, 6))
plt.plot(times, accuracies)
plt.xlabel('Training Time (seconds)')
plt.ylabel('Accuracy')
plt.title(f'Accuracy Over Training Time for MLPClassifier\n{hidden_layer_sizes}, {activation}, {solver}, lr={learning_rate}, max_iter={max_iter}')
plt.grid(True)
plt.show()




## Experiment: Trying Different Activation Functions


In [None]:
# Hyperparameters for this experiment: baseline network with a linear ('identity') activation
hidden_layer_sizes = (200, 100, 50)
activation = 'identity'
solver = 'adam'
learning_rate = 0.001
max_iter = 1000

# Per-iteration history: training accuracy and cumulative training time
accuracies = []
times = []

# Initialize the MLP classifier. Training is driven manually through partial_fit,
# so the loop below (not the constructor's max_iter) decides how long it trains.
# The fixed random_state makes the run reproducible.
clf = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation,
                    solver=solver, learning_rate_init=learning_rate, max_iter=1,
                    random_state=42)
# The first partial_fit call has to be given the full set of class labels.
# It already performs one training update, which is not part of the timings below.
clf.partial_fit(X_train, y_train, classes=np.unique(y_train))

# Training and evaluation loop.
# Only the partial_fit call is timed, so that scoring and printing do not
# inflate the reported training time.
elapsed_time = 0.0
for i in range(max_iter):
    step_start = time.perf_counter()
    clf.partial_fit(X_train, y_train)
    elapsed_time += time.perf_counter() - step_start

    train_accuracy = clf.score(X_train, y_train)
    accuracies.append(train_accuracy)
    times.append(elapsed_time)

    # Print accuracy and time each time they are updated
    print(f"Iteration {i + 1} - Training Accuracy: {train_accuracy:.3f}, Elapsed Time: {elapsed_time:.3f} seconds")

    # Stop condition: the training accuracy reached the desired threshold
    if train_accuracy >= 0.99:
        print("Training accuracy reached 0.99. Stopping training.")
        break

# Get final training time and accuracy
training_time = times[-1]
final_accuracy = accuracies[-1]
# Accuracy on the held-out split; training accuracy alone cannot reveal overfitting.
test_accuracy = clf.score(X_test, y_test)

# Print final results
print(f"\nFinal Training Time: {training_time:.3f} seconds")
print(f"Final Training Accuracy: {final_accuracy:.3f}")
print(f"Final Test Accuracy: {test_accuracy:.3f}")

# Plot accuracy over training time
plt.figure(figsize=(8, 6))
plt.plot(times, accuracies)
plt.xlabel('Training Time (seconds)')
plt.ylabel('Accuracy')
plt.title(f'Accuracy Over Training Time for MLPClassifier\n{hidden_layer_sizes}, {activation}, {solver}, lr={learning_rate}, max_iter={max_iter}')
plt.grid(True)
plt.show()




## Experiment: Adjusting Learning Rates


In [None]:
# Hyperparameters for this experiment: baseline network with a 10x larger learning rate
hidden_layer_sizes = (200, 100, 50)
activation = 'relu'
solver = 'adam'
learning_rate = 0.01
max_iter = 1000

# Per-iteration history: training accuracy and cumulative training time
accuracies = []
times = []

# Initialize the MLP classifier. Training is driven manually through partial_fit,
# so the loop below (not the constructor's max_iter) decides how long it trains.
# The fixed random_state makes the run reproducible.
clf = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation,
                    solver=solver, learning_rate_init=learning_rate, max_iter=1,
                    random_state=42)
# The first partial_fit call has to be given the full set of class labels.
# It already performs one training update, which is not part of the timings below.
clf.partial_fit(X_train, y_train, classes=np.unique(y_train))

# Training and evaluation loop.
# Only the partial_fit call is timed, so that scoring and printing do not
# inflate the reported training time.
elapsed_time = 0.0
for i in range(max_iter):
    step_start = time.perf_counter()
    clf.partial_fit(X_train, y_train)
    elapsed_time += time.perf_counter() - step_start

    train_accuracy = clf.score(X_train, y_train)
    accuracies.append(train_accuracy)
    times.append(elapsed_time)

    # Print accuracy and time each time they are updated
    print(f"Iteration {i + 1} - Training Accuracy: {train_accuracy:.3f}, Elapsed Time: {elapsed_time:.3f} seconds")

    # Stop condition: the training accuracy reached the desired threshold
    if train_accuracy >= 0.99:
        print("Training accuracy reached 0.99. Stopping training.")
        break

# Get final training time and accuracy
training_time = times[-1]
final_accuracy = accuracies[-1]
# Accuracy on the held-out split; training accuracy alone cannot reveal overfitting.
test_accuracy = clf.score(X_test, y_test)

# Print final results
print(f"\nFinal Training Time: {training_time:.3f} seconds")
print(f"Final Training Accuracy: {final_accuracy:.3f}")
print(f"Final Test Accuracy: {test_accuracy:.3f}")

# Plot accuracy over training time
plt.figure(figsize=(8, 6))
plt.plot(times, accuracies)
plt.xlabel('Training Time (seconds)')
plt.ylabel('Accuracy')
plt.title(f'Accuracy Over Training Time for MLPClassifier\n{hidden_layer_sizes}, {activation}, {solver}, lr={learning_rate}, max_iter={max_iter}')
plt.grid(True)
plt.show()


## Experiment: Exploring Different Solvers


In [None]:
# Hyperparameters for this experiment: baseline network trained with plain SGD instead of Adam
hidden_layer_sizes = (200, 100, 50)
activation = 'relu'
solver = 'sgd'
learning_rate = 0.001
max_iter = 1000

# Per-iteration history: training accuracy and cumulative training time
accuracies = []
times = []

# Initialize the MLP classifier. Training is driven manually through partial_fit,
# so the loop below (not the constructor's max_iter) decides how long it trains.
# The fixed random_state makes the run reproducible.
clf = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation,
                    solver=solver, learning_rate_init=learning_rate, max_iter=1,
                    random_state=42)
# The first partial_fit call has to be given the full set of class labels.
# It already performs one training update, which is not part of the timings below.
clf.partial_fit(X_train, y_train, classes=np.unique(y_train))

# Training and evaluation loop.
# Only the partial_fit call is timed, so that scoring and printing do not
# inflate the reported training time.
elapsed_time = 0.0
for i in range(max_iter):
    step_start = time.perf_counter()
    clf.partial_fit(X_train, y_train)
    elapsed_time += time.perf_counter() - step_start

    train_accuracy = clf.score(X_train, y_train)
    accuracies.append(train_accuracy)
    times.append(elapsed_time)

    # Print accuracy and time each time they are updated
    print(f"Iteration {i + 1} - Training Accuracy: {train_accuracy:.3f}, Elapsed Time: {elapsed_time:.3f} seconds")

    # Stop condition: the training accuracy reached the desired threshold
    if train_accuracy >= 0.99:
        print("Training accuracy reached 0.99. Stopping training.")
        break

# Get final training time and accuracy
training_time = times[-1]
final_accuracy = accuracies[-1]
# Accuracy on the held-out split; training accuracy alone cannot reveal overfitting.
test_accuracy = clf.score(X_test, y_test)

# Print final results
print(f"\nFinal Training Time: {training_time:.3f} seconds")
print(f"Final Training Accuracy: {final_accuracy:.3f}")
print(f"Final Test Accuracy: {test_accuracy:.3f}")

# Plot accuracy over training time
plt.figure(figsize=(8, 6))
plt.plot(times, accuracies)
plt.xlabel('Training Time (seconds)')
plt.ylabel('Accuracy')
plt.title(f'Accuracy Over Training Time for MLPClassifier\n{hidden_layer_sizes}, {activation}, {solver}, lr={learning_rate}, max_iter={max_iter}')
plt.grid(True)
plt.show()


## Experiment: Varying Max Iterations


In [None]:
# Hyperparameters for this experiment: a larger iteration budget.
# NOTE(review): the activation here is 'identity', not the baseline 'relu', so this
# run is comparable with the identity-activation experiment (max_iter = 1000)
# rather than with the relu baseline - confirm that this is intended.
hidden_layer_sizes = (200, 100, 50)
activation = 'identity'
solver = 'adam'
learning_rate = 0.001
max_iter = 5000

# Per-iteration history: training accuracy and cumulative training time
accuracies = []
times = []

# Initialize the MLP classifier. Training is driven manually through partial_fit,
# so the loop below (not the constructor's max_iter) decides how long it trains.
# The fixed random_state makes the run reproducible.
clf = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation,
                    solver=solver, learning_rate_init=learning_rate, max_iter=1,
                    random_state=42)
# The first partial_fit call has to be given the full set of class labels.
# It already performs one training update, which is not part of the timings below.
clf.partial_fit(X_train, y_train, classes=np.unique(y_train))

# Training and evaluation loop.
# Only the partial_fit call is timed, so that scoring and printing do not
# inflate the reported training time.
elapsed_time = 0.0
for i in range(max_iter):
    step_start = time.perf_counter()
    clf.partial_fit(X_train, y_train)
    elapsed_time += time.perf_counter() - step_start

    train_accuracy = clf.score(X_train, y_train)
    accuracies.append(train_accuracy)
    times.append(elapsed_time)

    # Print accuracy and time each time they are updated
    print(f"Iteration {i + 1} - Training Accuracy: {train_accuracy:.3f}, Elapsed Time: {elapsed_time:.3f} seconds")

    # Stop condition: the training accuracy reached the desired threshold
    if train_accuracy >= 0.99:
        print("Training accuracy reached 0.99. Stopping training.")
        break

# Get final training time and accuracy
training_time = times[-1]
final_accuracy = accuracies[-1]
# Accuracy on the held-out split; training accuracy alone cannot reveal overfitting.
test_accuracy = clf.score(X_test, y_test)

# Print final results
print(f"\nFinal Training Time: {training_time:.3f} seconds")
print(f"Final Training Accuracy: {final_accuracy:.3f}")
print(f"Final Test Accuracy: {test_accuracy:.3f}")

# Plot accuracy over training time
plt.figure(figsize=(8, 6))
plt.plot(times, accuracies)
plt.xlabel('Training Time (seconds)')
plt.ylabel('Accuracy')
plt.title(f'Accuracy Over Training Time for MLPClassifier\n{hidden_layer_sizes}, {activation}, {solver}, lr={learning_rate}, max_iter={max_iter}')
plt.grid(True)
plt.show()

# Task B: Apply PCA and perform classification on reduced datasets:



In [None]:
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Task B reuses the X and y prepared at the top of the notebook.

# Hyperparameters for MLPClassifier
hidden_layer_sizes = (200, 100, 50)
activation = 'relu'
solver = 'adam'
learning_rate = 0.001
max_iter = 1000

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Numbers of principal components to try. PCA cannot produce more components than
# min(n_samples, n_features) of the data it is fitted on, so cap the range there.
max_components = min(X_train.shape)
n_components_range = [k for k in range(10, X.shape[1], 40) if k <= max_components]  # Adjust the range as needed

# Fit PCA once, on the training data only, keeping every component. Components are
# ordered by explained variance, so the first k columns of this projection are the
# k-component reduction; slicing avoids refitting PCA for every k. The exact
# ('full') solver keeps the result deterministic.
pca = PCA(svd_solver='full')
X_train_pca_all = pca.fit_transform(X_train)
X_test_pca_all = pca.transform(X_test)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# Test accuracy for every number of components
pca_accuracies = []

for n_components in n_components_range:
    # Keep only the leading n_components principal components
    X_train_pca = X_train_pca_all[:, :n_components]
    X_test_pca = X_test_pca_all[:, :n_components]

    # Same network for every reduction; the learning rate defined above is passed
    # explicitly and the seed is fixed, so the input dimensionality is the only
    # thing that varies between runs.
    clf = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation, solver=solver,
                        learning_rate_init=learning_rate, max_iter=max_iter, random_state=42)

    # Train the classifier on the PCA-transformed training data
    clf.fit(X_train_pca, y_train)

    # Evaluate the classifier on the PCA-transformed testing data
    y_pred = clf.predict(X_test_pca)
    accuracy = accuracy_score(y_test, y_pred)

    # Store the accuracy for the current n_components
    pca_accuracies.append(accuracy)
    print(f"Number of components: {n_components}, "
          f"Explained variance: {cumulative_variance[n_components - 1]:.3f}, Accuracy: {accuracy:.3f}")

# Plot the accuracies
plt.plot(n_components_range, pca_accuracies, marker='o')
plt.xlabel('Number of Components')
plt.ylabel('Accuracy')
plt.title('Accuracy vs Number of Components')
plt.grid(True)
plt.show()
