# Importing packages and running basic Random Forest

In [2]:
#!pip install torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import string
import numpy as np
import torch
import time
from tqdm import tqdm

N = 25000
MAX_WORD_LENGTH = 8

ALPHABET = string.ascii_lowercase + '_'

def one_hot_encode(strings, dists, device):
    strings = strings.tolist()
    dists = dists.tolist()
    indices_np = np.array([[ord(c) - ord('a') for c in s] for s in strings], dtype=np.int64)
    x_np = np.eye(len(ALPHABET), dtype=np.float32)[indices_np]
    x = torch.from_numpy(x_np).to(device)
    y = torch.as_tensor(dists, device=device)
    x = x.numpy()
    y = y.numpy()
    return x.reshape(x.shape[0],x.shape[1]*x.shape[2]), y

device = "cpu"

fileName = 'mixed_data' + str(MAX_WORD_LENGTH) + '.csv'
data = pd.read_csv(fileName, nrows=N, header=None, usecols=[0, 1], names=['strings', 'dists'])
data['dists'] = data['dists'].astype(int)

start_time = time.time()

train_data, test_data = train_test_split(data, test_size=0.2)
train_strings, train_dists = train_data['strings'], train_data['dists']
test_strings, test_dists = test_data['strings'], test_data['dists']
train_data_x, train_data_y = one_hot_encode(train_strings, train_dists, device)
test_data_x, test_data_y = one_hot_encode(test_strings, test_dists, device)

# train random forest on train data
model = RandomForestRegressor(n_estimators=100, max_depth=20, random_state=42)
#model.fit(train_data_x, train_data_y)

# creating a progress bar
pbar = tqdm(range(100), desc='Training Model', unit='iterations')

for i in pbar:
    model.fit(train_data_x, train_data_y)
    if i % 10 == 0:
        remaining_time = (time.time() - start_time) / (i + 1) * (100 - i - 1)
        pbar.set_postfix({'Estimated remaining time ': f'{int(remaining_time // 60)}m {int(remaining_time % 60)}s'})
        pbar.refresh()

end_time = time.time()

# evaluate model on test data
pred_y = model.predict(test_data_x)

# initializing a new array to add the rounded Hamming distances
rounded_pred_y = np.array([])

for prediction in pred_y:
    
    # round the predicted values to the nearest Hamming dist.
    rounded_pred_y = np.append(rounded_pred_y, int(np.round(prediction)))
    
mse = mean_squared_error(test_data_y, rounded_pred_y)
print(f'Mean squared error: {mse:.2f}')

correct_predictions = 0
total_predictions = len(test_data_y)

for i in range(total_predictions):
    if test_data_y[i] == rounded_pred_y[i]:
        correct_predictions += 1

classification_accuracy = correct_predictions / total_predictions
print(f'Classification accuracy: {classification_accuracy:.2f}')

#add grid search to tune hyperparameters
#maybe do ensemble with different random forests
#do k means to see if that is accurate

# print runtime
print(f'Runtime: {end_time - start_time:.2f} seconds')

Training Model: 100%|██████████| 100/100 [26:19<00:00, 15.79s/iterations, Estimated remaining time =2m 22s]

Mean squared error: 0.97
Classification accuracy: 0.42
Runtime: 1579.47 seconds





# Code below for grid search with Random Forest

In [3]:
import time
from sklearn.model_selection import GridSearchCV


# define the parameter grid to search over
param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [20, 30, 40, 50, 60],
    'max_features': ['sqrt', 'log2']
}

start_time = time.time()

# create the random forest model
rf = RandomForestRegressor(random_state=42)

# create the grid search object
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

# fit the grid search object to the training data
grid_search.fit(train_data_x, train_data_y)

end_time = time.time()

# print the best hyperparameters and corresponding mean squared error
print("Best hyperparameters: ", grid_search.best_params_)
print("Mean squared error on training data: ", abs(grid_search.best_score_))

# evaluate the best model on the test data
best_model = grid_search.best_estimator_
pred_y = best_model.predict(test_data_x)

# initializing a new array to add the rounded Hamming distances
rounded_pred_y = np.array([])

for prediction in pred_y:
    
    # round the predicted values to the nearest Hamming dist.
    rounded_pred_y = np.append(rounded_pred_y, int(np.round(prediction)))
    
mse = mean_squared_error(test_data_y, rounded_pred_y)
print(f'Mean squared error: {mse:.2f}')

# print runtime
print(f'Runtime: {end_time - start_time:.2f} seconds')

KeyboardInterrupt: 

In [None]:
grid_search.best_params_['max_depth']

# DONT USE 
Ensemble using mean of the output to predict the Hamming distance

In [None]:
import time
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from tqdm import tqdm

ensemble_size = 3

# define the parameter grid to search over
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30],
    'max_features': ['sqrt', 'log2']
}

start_time = time.time()

# create an ensemble of 3 random forest models
models = []
for i in range(ensemble_size):
    rf = RandomForestRegressor(random_state = 42 + i)
    # each rf gets initialized with a different seed
    
    models.append(GridSearchCV(estimator = rf, param_grid = param_grid, cv = 5, scoring = 'neg_mean_squared_error', n_jobs = -1))
    # should we add n_jobs = -1 to let them run simultaneously on all available cpus?
    
# fit the grid search objects to the training data
for model in models:
    # track progress of the training using tqdm
    for _ in tqdm(range(100), desc = 'Fitting Model'):
        model.fit(train_data_x, train_data_y)

end_time = time.time()

# take the mean of the predictions from all models as the final prediction and round to nearest Hamming dist.
pred_y = sum(model.best_estimator_.predict(test_data_x) for model in models) / len(models)
pred_y = round(pred_y)

mse = mean_squared_error(test_data_y, pred_y)
print(f'Mean squared error on test data: {mse:.2f}')

# print runtime
print(f'Runtime: {end_time - start_time:.2f} seconds')

# DONT USE
Code for adding time to complete

In [None]:
import time
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from tqdm import tqdm

ensemble_size = 3

# define the parameter grid to search over
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30],
    'max_features': ['sqrt', 'log2']
}

start_time = time.time()

# create an ensemble of 3 random forest models
models = []
for i in range(ensemble_size):
    rf = RandomForestRegressor(random_state = 42 + i)
    # each rf gets initialized with a different seed
    
    models.append(GridSearchCV(estimator = rf, param_grid = param_grid, cv = 5, scoring = 'neg_mean_squared_error', n_jobs = -1))
    # should we add n_jobs = -1 to let them run simultaneously on all available cpus?
    
# fit the grid search objects to the training data
for model in models:
    # track progress of the training using tqdm
    pbar = tqdm(range(100), desc = 'Fitting Model', unit='iterations')
    for i in pbar:
        model.fit(train_data_x, train_data_y)
        if i % 10 == 0:
            remaining_time = (time.time() - start_time) / (i + 1) * (100 - i - 1)
            pbar.set_postfix({'Estimated remaining time': f'{int(remaining_time // 60)}m {int(remaining_time % 60)}s'})

end_time = time.time()

# take the mean of the predictions from all models as the final prediction and round to nearest Hamming dist.
pred_y = sum(model.best_estimator_.predict(test_data_x) for model in models) / len(models)
pred_y = round(pred_y)

mse = mean_squared_error(test_data_y, pred_y)
print(f'Mean squared error on test data: {mse:.2f}')

# print runtime
print(f'Runtime: {end_time - start_time:.2f} seconds')

# Removing the grid search, runtime w grid search was over 5 hours

In [None]:
import time
from sklearn.ensemble import RandomForestRegressor
from tqdm import tqdm

ensemble_size = 5

# define the parameters for the random forest models
params = {
    'n_estimators': 300,
    #'n_estimators' : grid_search.best_params_['n_estimators'],
    'max_depth': 50,
    #'max_depth' : grid_search.best_params_['max_depth'],
    'max_features': 'sqrt',
    #'max_features': grid_search.best_params_['max_features'],
    'random_state': 42
}

start_time = time.time()

# create an ensemble of 3 random forest models
models = []
for i in range(ensemble_size):
    rf = RandomForestRegressor(**params)
    # each rf gets initialized with the same set of parameters but a different seed
    rf.set_params(random_state=params['random_state'] + i)
    models.append(rf)
    
# fit the models to the training data
for model in models:
    # track progress of the training using tqdm
    pbar = tqdm(range(100), desc='Fitting Model', unit='iterations')
    for i in pbar:
        model.fit(train_data_x, train_data_y)
        if i % 10 == 0:
            remaining_time = (time.time() - start_time) / (i + 1) * (100 - i - 1)
            pbar.set_postfix({'Estimated remaining time ': f'{int(remaining_time // 60)}m {int(remaining_time % 60)}s'})
            pbar.refresh()  # refresh the tqdm bar and show the updated postfix on a separate line

end_time = time.time()

# take the mean of the predictions from all models as the final prediction and round to nearest Hamming dist.
#pred_y = sum(model.predict(test_data_x) for model in models) / len(models)
#pred_y = round(pred_y)

all_preds = []

for model in models:
    pred_y = model.predict(test_data_x)
    all_preds.append(pred_y)
    
avg_pred_y = np.vstack(all_preds).mean(axis = 0)
rounded_pred_y = np.round(avg_pred_y).astype(int)

mse = mean_squared_error(test_data_y, rounded_pred_y)
print(f'Mean squared error on test data: {mse:.2f}')

correct_predictions = 0
total_predictions = len(test_data_y)

for i in range(total_predictions):
    if test_data_y[i] == rounded_pred_y[i]:
        correct_predictions += 1

classification_accuracy = correct_predictions / total_predictions
print(f'Classification accuracy: {classification_accuracy:.2f}')

# print runtime
print(f'Runtime: {end_time - start_time:.2f} seconds')


Fitting Model:  63%|██████▎   | 63/100 [11:47<07:36, 12.33s/iterations, Estimated remaining time =7m 16s] 

In [None]:
print(avg_pred_y)
print(rounded_pred_y)
print(test_data_y)

# Code for creating KMeans k = 8 for the different possible Hamming distances of a string length 8

In [None]:
import time
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from tqdm import tqdm

# concatenate train and test data to perform clustering on entire dataset
data_x = np.concatenate((train_data_x, test_data_x))
data_y = np.concatenate((train_data_y, test_data_y))

start_time = time.time()

# perform k-means clustering with k=8
kmeans = KMeans(n_clusters=8, random_state=42)
kmeans.fit(data_x)

end_time = time.time()

# predict cluster assignments for train and test data
train_clusters = kmeans.predict(train_data_x)
test_clusters = kmeans.predict(test_data_x)

# print cluster assignments for first 10 samples of train data
print(train_clusters[:10])

# compute adjusted Rand index for train and test data
train_ari = adjusted_rand_score(train_data_y, train_clusters)
test_ari = adjusted_rand_score(test_data_y, test_clusters)

print(f'Train ARI: {train_ari:.2f}')
print(f'Test ARI: {test_ari:.2f}')

# print runtime
print(f'Runtime: {end_time - start_time:.2f} seconds')

In [None]:
train_data_y[:10]

In [None]:
import time
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from tqdm import tqdm

# concatenate train and test data to perform clustering on entire dataset
data_x = np.concatenate((train_data_x, test_data_x))
data_y = np.concatenate((train_data_y, test_data_y))

start_time = time.time()

# perform k-means clustering with k=8
kmeans = KMeans(n_clusters=8, random_state=42)

# wrap KMeans.fit() with tqdm to add progress bar
for i in tqdm(range(100), desc='KMeans.fit()', position=0):
    kmeans.fit(data_x)

end_time = time.time()

# predict cluster assignments for train and test data
train_clusters = kmeans.predict(train_data_x)
test_clusters = kmeans.predict(test_data_x)

# print cluster assignments for first 10 samples of train data
print(train_clusters[:10])

# compute adjusted Rand index for train and test data
train_ari = adjusted_rand_score(train_data_y, train_clusters)
test_ari = adjusted_rand_score(test_data_y, test_clusters)

print(f'Train ARI: {train_ari:.2f}')
print(f'Test ARI: {test_ari:.2f}')

# print runtime
print(f'Runtime: {end_time - start_time:.2f} seconds')


In [None]:
import time
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from tqdm import tqdm

# concatenate train and test data to perform clustering on entire dataset
data_x = np.concatenate((train_data_x, test_data_x))
data_y = np.concatenate((train_data_y, test_data_y))

start_time = time.time()

# perform k-means clustering with k=8
kmeans = KMeans(n_clusters=8, random_state=42)

# set number of iterations and calculate the total time
n_iter = 100
total_time = n_iter * kmeans.n_init * kmeans.max_iter

# wrap KMeans.fit() with tqdm to add progress bar and estimated time
for i in tqdm(range(n_iter), desc='KMeans.fit()', position=0, leave=True):
    kmeans.fit(data_x)
    # calculate remaining time after each iteration
    remaining_time = (total_time - ((i + 1) * kmeans.n_init * kmeans.max_iter)) / (i + 1)
    # display estimated time remaining in minutes
    tqdm.write(f'Estimated time remaining: {remaining_time / 60:.2f} minutes')

end_time = time.time()

# predict cluster assignments for train and test data
train_clusters = kmeans.predict(train_data_x)
test_clusters = kmeans.predict(test_data_x)

# print cluster assignments for first 10 samples of train data
print(train_clusters[:10])

# compute adjusted Rand index for train and test data
train_ari = adjusted_rand_score(train_data_y, train_clusters)
test_ari = adjusted_rand_score(test_data_y, test_clusters)

print(f'Train ARI: {train_ari:.2f}')
print(f'Test ARI: {test_ari:.2f}')

# print runtime
print(f'Runtime: {end_time - start_time:.2f} seconds')