# Alternate Algorithms to test against MLP NN

## Importing Packages

In [1]:
#!pip install torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import string
import numpy as np
import torch
import time
from tqdm import tqdm

## Loading in pre-tagged Hamming distance data calculated by brute force
Note, only 2.5k datapoints are being loaded into this Jupyter file due to the excessive run times of each of the following models.

In [2]:
N = 5000
MAX_WORD_LENGTH = 8

ALPHABET = string.ascii_lowercase + '_'

def one_hot_encode(strings, dists, device):
    strings = strings.tolist()
    dists = dists.tolist()
    indices_np = np.array([[ord(c) - ord('a') for c in s] for s in strings], dtype=np.int64)
    x_np = np.eye(len(ALPHABET), dtype=np.float32)[indices_np]
    x = torch.from_numpy(x_np).to(device)
    y = torch.as_tensor(dists, device=device)
    x = x.numpy()
    y = y.numpy()
    return x.reshape(x.shape[0],x.shape[1]*x.shape[2]), y

device = "cpu"

fileName = 'mixed_data' + str(MAX_WORD_LENGTH) + '.csv'
data = pd.read_csv(fileName, nrows=N, header=None, usecols=[0, 1], names=['strings', 'dists'])
data['dists'] = data['dists'].astype(int)

## Creating a single Random Forest with manually selected hyperparameters
Set hyperparameters to the following values:
   * n_estimators = 100
   * max_depth = 20
   * random_state = 42

In [3]:
start_time = time.time()

train_data, test_data = train_test_split(data, test_size=0.2)
train_strings, train_dists = train_data['strings'], train_data['dists']
test_strings, test_dists = test_data['strings'], test_data['dists']
train_data_x, train_data_y = one_hot_encode(train_strings, train_dists, device)
test_data_x, test_data_y = one_hot_encode(test_strings, test_dists, device)

# train random forest on train data
model = RandomForestRegressor(n_estimators = 100, max_depth = 20, random_state = 42)
#model.fit(train_data_x, train_data_y)

# creating a progress bar
pbar = tqdm(range(100), desc='Training Model', unit='iterations')

for i in pbar:
    model.fit(train_data_x, train_data_y)
    if i % 10 == 0:
        remaining_time = (time.time() - start_time) / (i + 1) * (100 - i - 1)
        pbar.set_postfix({'Estimated remaining time ': f'{int(remaining_time // 60)}m {int(remaining_time % 60)}s'})
        pbar.refresh()

end_time = time.time()

# evaluate model on test data
pred_y = model.predict(test_data_x)

# initializing a new array to add the rounded Hamming distances
rounded_pred_y = np.array([])

for prediction in pred_y:
    
    # round the predicted values to the nearest Hamming dist.
    rounded_pred_y = np.append(rounded_pred_y, int(np.round(prediction)))
    
mse = mean_squared_error(test_data_y, rounded_pred_y)
print(f'Mean squared error: {mse:.2f}')

correct_predictions = 0
total_predictions = len(test_data_y)

for i in range(total_predictions):
    if test_data_y[i] == rounded_pred_y[i]:
        correct_predictions += 1

classification_accuracy = correct_predictions / total_predictions
print(f'Classification accuracy: {classification_accuracy:.2f}')


# print runtime
print(f'Runtime: {end_time - start_time:.2f} seconds')

Training Model: 100%|██████████| 100/100 [05:01<00:00,  3.02s/iterations, Estimated remaining time =0m 27s]

Mean squared error: 1.07
Classification accuracy: 0.39
Runtime: 301.99 seconds





## Implementing grid search over multiple hyperparameters with a random forest

In [4]:
# define the parameter grid to search over
param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [20, 30, 40, 50, 60],
    'max_features': ['sqrt', 'log2']
}

start_time = time.time()

# create the random forest model
rf = RandomForestRegressor(random_state=42)

# create the grid search object
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

# fit the grid search object to the training data
grid_search.fit(train_data_x, train_data_y)

end_time = time.time()

# print the best hyperparameters and corresponding mean squared error
print("Best hyperparameters: ", grid_search.best_params_)
print("Mean squared error on training data: ", abs(grid_search.best_score_))

# evaluate the best model on the test data
best_model = grid_search.best_estimator_
pred_y = best_model.predict(test_data_x)

# initializing a new array to add the rounded Hamming distances
rounded_pred_y = np.array([])

for prediction in pred_y:
    
    # round the predicted values to the nearest Hamming dist.
    rounded_pred_y = np.append(rounded_pred_y, int(np.round(prediction)))
    
mse = mean_squared_error(test_data_y, rounded_pred_y)
print(f'Mean squared error: {mse:.2f}')

# print runtime
print(f'Runtime: {end_time - start_time:.2f} seconds')

Best hyperparameters:  {'max_depth': 60, 'max_features': 'sqrt', 'n_estimators': 300}
Mean squared error on training data:  0.9180986978517597
Mean squared error: 1.00
Runtime: 144.80 seconds


## Creating an ensemble of 3 random forests
Originally, we had wanted to do grid search individually for each forest to test against the MLP NN; however, that was very computationally expensive and the run time was absurd. Instead of passing the parameters from the previous grid search as the parameters of each random forest in the ensemble directly, we chose to adjust the hyperparameters randomly by adjusting them by a positive or negative np.random.randint()

In [12]:
ensemble_size = 3

# define the parameters for the random forest models
params = {
    #'n_estimators': 300,
    'n_estimators' : grid_search.best_params_['n_estimators'],
    #'max_depth': 50,
    'max_depth' : grid_search.best_params_['max_depth'],
    #'max_features': 'sqrt',
    'max_features': grid_search.best_params_['max_features'],
    'random_state': 42
}

start_time = time.time()

# create an ensemble of 3 random forest models
models = []
for i in range(ensemble_size):
    rf = RandomForestRegressor(**params)
    # each rf gets initialized with the same set of parameters but a different seed
    rf.set_params(random_state=params['random_state'] + i)
    
    if i % 2 == 0:
        rf.set_params(n_estimators = params['n_estimators'] + np.random.randint(10, high = 100))
        rf.set_params(max_depth = params['max_depth'] - np.random.randint(1, high = 10))
        models.append(rf)
    else:
        rf.set_params(n_estimators = params['n_estimators'] - np.random.randint(10, high = 100))
        rf.set_params(max_depth = params['max_depth'] + np.random.randint(1, high = 10))
        models.append(rf)
    
# fit the models to the training data
for model in models:
    # track progress of the training using tqdm
    pbar = tqdm(range(100), desc='Fitting Model', unit='iterations')
    for i in pbar:
        model.fit(train_data_x, train_data_y)
        if i % 10 == 0:
            remaining_time = (time.time() - start_time) / (i + 1) * (100 - i - 1)
            pbar.set_postfix({'Estimated remaining time ': f'{int(remaining_time // 60)}m {int(remaining_time % 60)}s'})
            pbar.refresh()  # refresh the tqdm bar and show the updated postfix on a separate line

end_time = time.time()

# take the mean of the predictions from all models as the final prediction and round to nearest Hamming dist.
#pred_y = sum(model.predict(test_data_x) for model in models) / len(models)
#pred_y = round(pred_y)

all_preds = []

for model in models:
    pred_y = model.predict(test_data_x)
    all_preds.append(pred_y)
    
avg_pred_y = np.vstack(all_preds).mean(axis = 0)
rounded_pred_y = np.round(avg_pred_y).astype(int)

mse = mean_squared_error(test_data_y, rounded_pred_y)
print(f'Mean squared error on test data: {mse:.2f}')

correct_predictions = 0
total_predictions = len(test_data_y)

for i in range(total_predictions):
    if test_data_y[i] == rounded_pred_y[i]:
        correct_predictions += 1

classification_accuracy = correct_predictions / total_predictions
print(f'Classification accuracy: {classification_accuracy:.2f}')

# print runtime
print(f'Runtime: {end_time - start_time:.2f} seconds')

Fitting Model: 100%|██████████| 100/100 [04:25<00:00,  2.66s/iterations, Estimated remaining time =0m 23s]
Fitting Model: 100%|██████████| 100/100 [02:33<00:00,  1.54s/iterations, Estimated remaining time =0m 40s] 
Fitting Model: 100%|██████████| 100/100 [04:08<00:00,  2.49s/iterations, Estimated remaining time =1m 3s]  


Mean squared error on test data: 0.97
Classification accuracy: 0.36
Runtime: 668.06 seconds
