<a href="https://colab.research.google.com/github/FaiadS/dissertation/blob/main/diss.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# EA Setup: install DEAP, the evolutionary-algorithm library that the
# genetic-algorithm cells further down import as `deap`.
pip install deap

Collecting deap
  Downloading deap-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.4/135.4 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: deap
Successfully installed deap-1.4.1


In [None]:
# Setup
from google.colab import drive
drive.mount('/content/drive')
import csv
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

Mounted at /content/drive


In [None]:
# Download Data
# low_memory=False makes pandas infer every column's dtype from the whole file
# instead of chunk by chunk. NOTE(review): the warning echoed under this cell
# points at the read_csv line and is presumably the mixed-dtype warning that
# chunked inference produces - confirm it disappears with this flag.
df = pd.read_csv('/content/drive/MyDrive/Diss/dataAll.csv', low_memory=False)
# df = pd.read_csv('/content/drive/MyDrive/Diss/dataIPF.csv', low_memory=False)
print(df.columns)
print(len(df.columns))

# Columns that are never used by the processing functions or the models.
columns_to_drop = ['BirthYearClass', 'Squat1Kg', 'Squat2Kg', 'Squat3Kg',
                   'Squat4Kg', 'Bench1Kg', 'Bench2Kg', 'Bench3Kg', 'Bench4Kg',
                   'Deadlift1Kg', 'Deadlift2Kg', 'Deadlift3Kg', 'Deadlift4Kg',
                   'Dots', 'Wilks', 'Glossbrenner', 'Goodlift', 'Tested',
                   'Country', 'State', 'ParentFederation', 'MeetCountry',
                   'MeetState', 'MeetTown', 'MeetName', 'TotalKg', 'Division']
df = df.drop(columns=columns_to_drop)
print(df.columns)
print(df.shape)

  df = pd.read_csv('/content/drive/MyDrive/Diss/dataAll.csv')


Index(['Name', 'Sex', 'Event', 'Equipment', 'Age', 'AgeClass',
       'BirthYearClass', 'Division', 'BodyweightKg', 'WeightClassKg',
       'Squat1Kg', 'Squat2Kg', 'Squat3Kg', 'Squat4Kg', 'Best3SquatKg',
       'Bench1Kg', 'Bench2Kg', 'Bench3Kg', 'Bench4Kg', 'Best3BenchKg',
       'Deadlift1Kg', 'Deadlift2Kg', 'Deadlift3Kg', 'Deadlift4Kg',
       'Best3DeadliftKg', 'TotalKg', 'Place', 'Dots', 'Wilks', 'Glossbrenner',
       'Goodlift', 'Tested', 'Country', 'State', 'Federation',
       'ParentFederation', 'Date', 'MeetCountry', 'MeetState', 'MeetTown',
       'MeetName'],
      dtype='object')
41
Index(['Name', 'Sex', 'Event', 'Equipment', 'Age', 'AgeClass', 'BodyweightKg',
       'WeightClassKg', 'Best3SquatKg', 'Best3BenchKg', 'Best3DeadliftKg',
       'Place', 'Federation', 'Date'],
      dtype='object')
(3054071, 14)


In [None]:
# Data Processing Functions
# Encode Sex, Federation and Equipment (One-hot for last two)
def encode_data(data):
  """Encode Sex numerically and one-hot encode Federation and Equipment.

  'M' becomes 0.0 and 'F' becomes 1.0; rows whose Sex is 'Mx' are removed.
  The caller's frame is left untouched and a new frame is returned.

  Args:
    data: DataFrame with 'Sex', 'Federation' and 'Equipment' columns.

  Returns:
    A new DataFrame with a float 'Sex' column and the 'Federation' and
    'Equipment' columns replaced by their dummy (one-hot) columns.

  Raises:
    ValueError: if 'Sex' holds a value that cannot be converted to float
      after the M/F/Mx codes have been applied.
  """
  sex_codes = {'M': 0, 'F': 1, 'Mx': 2}
  # Unknown values pass through unchanged so that astype(float) below still
  # fails loudly on them instead of silently turning them into NaN.
  coded_sex = data['Sex'].map(lambda value: sex_codes.get(value, value))

  # Positional boolean mask: dropping by index label would also remove any
  # unrelated row that happens to share a label with an 'Mx' row.
  keep = (coded_sex != 2).to_numpy()
  encoded = data.loc[keep].assign(
      Sex=coded_sex[keep].astype(float).to_numpy())

  # Column counts before and after one-hot encoding.
  print(len(encoded.columns))
  encoded = pd.get_dummies(encoded, columns=['Federation', 'Equipment'])
  print(len(encoded.columns))
  return encoded

# Remove disqualified rows
def remove_DQ(data):
  """Drop rows whose Place is a non-result code, then drop the Place column.

  Args:
    data: DataFrame with a 'Place' column.

  Returns:
    A new DataFrame without rows placed 'DQ', 'G', 'DD' or 'NS' and without
    the 'Place' column.
  """
  non_result_codes = ['DQ', 'G', 'DD', 'NS']
  has_result = ~data['Place'].isin(non_result_codes)
  placed = data[has_result]
  return placed.drop('Place', axis=1)

# remove data by date
def remove_pre1996(data, start_year=1996, end_year=2024):
  """Parse 'Date' and keep rows with start_year <= year < end_year.

  The caller's frame is not modified; the returned frame carries the parsed
  datetime 'Date' column.

  Args:
    data: DataFrame with a 'Date' column that pd.to_datetime can parse.
    start_year: first calendar year to keep (inclusive). Defaults to 1996.
    end_year: first calendar year to exclude. Defaults to 2024.

  Returns:
    A new DataFrame restricted to the requested years. Rows whose date
    parses to NaT are dropped because their year compares as False.
  """
  dated = data.assign(Date=pd.to_datetime(data['Date']))
  years = dated['Date'].dt.year
  return dated[(years >= start_year) & (years < end_year)]

# remove athletes with only one entry
def remove_one_comp(data):
  """Keep athletes with more than one row, ordered by Name then Date.

  Args:
    data: DataFrame with 'Name' and 'Date' columns.

  Returns:
    A new DataFrame holding only rows whose Name occurs at least twice,
    sorted by ['Name', 'Date'].
  """
  # Rows with a missing Name map to NaN, which compares False and is dropped.
  entries_per_name = data['Name'].map(data['Name'].value_counts())
  repeat_athletes = data[entries_per_name > 1]
  return repeat_athletes.sort_values(by=['Name', 'Date'])

# remove name
def anonymise(data):
  """Return a copy of the frame without the 'Name' column."""
  return data.drop('Name', axis=1)

# calculate time difference between events
def day_difference(data):
  """Add the number of days since the previous row, in Date order.

  Args:
    data: DataFrame with a datetime 'Date' column.

  Returns:
    A new DataFrame sorted by 'Date' with a 'Day Difference' column; the
    earliest row has no predecessor and gets 0.
  """
  chronological = data.sort_values(by='Date')
  gap_in_days = chronological['Date'].diff().dt.days
  return chronological.assign(**{'Day Difference': gap_in_days.fillna(0)})

# shared implementation behind the three per-lift separation functions
def _seperate_lift(data, lift_column, event_code):
  """Build the per-lift frame with previous result and days since it.

  Args:
    data: DataFrame with 'Name', 'Event', a datetime 'Date' and the three
      Best3*Kg columns.
    lift_column: the Best3*Kg column to keep.
    event_code: letter that marks the lift inside 'Event' ('S', 'B' or 'D').

  Returns:
    A new DataFrame sorted by ['Name', 'Date'] without the 'Event' column and
    the other two lift columns, with 'Previous<lift_column>' and
    'Day Difference' added. Rows with a 'Day Difference' of 0 are removed,
    which covers each athlete's first entry and same-day repeats.
  """
  all_lifts = ['Best3SquatKg', 'Best3BenchKg', 'Best3DeadliftKg']
  other_lifts = [column for column in all_lifts if column != lift_column]

  # Keep events that include this lift. The previous pattern "[^X]" was a
  # negated character class: it matched every event containing any letter
  # other than X, so it dropped only the X-only meets and kept meets where
  # the lift was never contested.
  contested = data['Event'].str.contains(event_code, case=True, regex=False,
                                         na=False)
  lift = (data.loc[contested]
              .drop(columns=other_lifts + ['Event'])
              .sort_values(by=['Name', 'Date']))

  # Sorting above guarantees shift/diff look at the chronologically previous
  # entry of the same athlete, whatever order the caller passed in.
  per_athlete = lift.groupby('Name')
  previous_result = per_athlete[lift_column].shift(1)
  days_since_previous = per_athlete['Date'].diff().dt.days.fillna(0)

  lift['Previous' + lift_column] = previous_result
  lift['Day Difference'] = days_since_previous
  return lift[lift['Day Difference'] != 0]

# separate squat data
def seperate_squat(data):
  """Return squat entries with PreviousBest3SquatKg and Day Difference."""
  return _seperate_lift(data, 'Best3SquatKg', 'S')

# separate bench press data
def seperate_bench(data):
  """Return bench entries with PreviousBest3BenchKg and Day Difference."""
  return _seperate_lift(data, 'Best3BenchKg', 'B')

# separate deadlift data
def seperate_deads(data):
  """Return deadlift entries with PreviousBest3DeadliftKg and Day Difference."""
  return _seperate_lift(data, 'Best3DeadliftKg', 'D')

# remove empty rows
def remove_empties(data):
  """Drop rows with missing values in the columns the models rely on.

  'Age' and 'BodyweightKg' are always required. For each Best3*Kg column
  present in the frame, that column and its 'Previous...' counterpart are
  required as well.

  Args:
    data: DataFrame produced by one of the per-lift separation functions.

  Returns:
    A new DataFrame without rows that have NaN in any required column.
  """
  required = ['Age', 'BodyweightKg']
  for lift in ('Best3SquatKg', 'Best3BenchKg', 'Best3DeadliftKg'):
    if lift in data.columns:
      required.extend([lift, 'Previous' + lift])
  return data.dropna(subset=required)

# convert date to float
def date_to_float(data):
  """Return a copy with 'Date' expressed as seconds since 1970-01-01.

  The caller's frame keeps its datetime column, so the function can be called
  again on the same input.

  Args:
    data: DataFrame with a datetime 'Date' column.

  Returns:
    A new DataFrame whose 'Date' column holds float seconds since the epoch.
  """
  epoch_seconds = (data['Date'] - pd.Timestamp("1970-01-01")) / pd.Timedelta('1s')
  return data.assign(Date=epoch_seconds)

# helper functions for handling empty data
def mid_ageClass(string):
  """Return the midpoint of an age-class label such as "24-34".

  Args:
    string: an age-class label, or a non-string value (e.g. NaN).

  Returns:
    The mean of the two integer bounds for a "lo-hi" label; NaN for a string
    that is not in that form; non-string values are returned unchanged.
  """
  if not isinstance(string, str):
    return string
  try:
    lower, upper = (int(bound) for bound in string.split("-"))
  except ValueError:
    # Not exactly two integer bounds (e.g. "80+"): treat the age as unknown
    # rather than aborting the whole column.
    return float("nan")
  # NOTE(review): an open-ended class written with a sentinel upper bound
  # would yield an unrealistic midpoint - confirm how the data encodes it.
  return (lower + upper) / 2

def bw_class(string):
  """Convert a weight-class label to a float bodyweight.

  Args:
    string: a weight-class label, or a non-string value.

  Returns:
    NaN for labels containing "+", the label parsed as float otherwise;
    non-string values are returned unchanged.
  """
  if not isinstance(string, str):
    return string
  return np.nan if "+" in string else float(string)

# handle empty bodyweight data
def empty_bw(data):
  """Fill missing bodyweights from the weight class, then drop the class.

  Missing 'BodyweightKg' values are replaced by the value bw_class derives
  from 'WeightClassKg'. The caller's frame is not modified.

  Args:
    data: DataFrame with 'BodyweightKg' and 'WeightClassKg' columns.

  Returns:
    A new DataFrame with the filled 'BodyweightKg' column and without
    'WeightClassKg'.
  """
  class_weight = data['WeightClassKg'].apply(bw_class)
  filled = data['BodyweightKg'].fillna(class_weight)
  return data.assign(BodyweightKg=filled).drop('WeightClassKg', axis=1)

# handle empty age data
def empty_age(data):
  """Fill missing ages from the age class, then drop the class.

  Missing 'Age' values are replaced by the value mid_ageClass derives from
  'AgeClass'. The caller's frame is not modified.

  Args:
    data: DataFrame with 'Age' and 'AgeClass' columns.

  Returns:
    A new DataFrame with the filled 'Age' column and without 'AgeClass'.
  """
  class_midpoint = data['AgeClass'].apply(mid_ageClass)
  filled = data['Age'].fillna(class_midpoint)
  return data.assign(Age=filled).drop('AgeClass', axis=1)

In [None]:
# Preprocess
# Each step returns the frame the next one consumes. remove_pre1996 parses
# 'Date' before remove_one_comp sorts on it.
df = remove_DQ(df)
df = remove_pre1996(df)
df = remove_one_comp(df)
df = empty_age(df)
df = empty_bw(df)
df = encode_data(df)
# head must be called: printing the bare attribute shows the bound method's
# repr (a dump of the frame) instead of the first five rows.
print(df.head())

11
387
<bound method NDFrame.head of                  Name  Sex Event   Age  BodyweightKg  Best3SquatKg  \
1589029     A Abrutis  0.0   SBD  32.5        123.60         310.0   
1591748     A Abrutis  0.0   SBD  33.5        122.80         370.0   
1583616  A Arun Kumar  0.0     B  41.0         90.00           NaN   
1584109  A Arun Kumar  0.0     B  41.0         90.90           NaN   
1594928      A Ashwin  0.0   SBD  16.5         81.70         180.0   
...               ...  ...   ...   ...           ...           ...   
745265            龙翔宇  0.0     D  22.0         83.30           NaN   
719315           﨑村 厚  0.0     B  68.5         73.15           NaN   
709205           﨑村 厚  0.0     B  69.5         73.60           NaN   
710552           﨑村 厚  0.0     B  70.5         73.10           NaN   
713217           﨑村 厚  0.0     B  71.5         73.65           NaN   

         Best3BenchKg  Best3DeadliftKg       Date  Federation_365Strong  ...  \
1589029         210.0            322.5 200

In [None]:
# S separate squat data
squat_data = seperate_squat(df)
squat_data = remove_empties(squat_data)
# Model features: athlete names removed and dates converted to floats.
feature_data = anonymise(squat_data)
feature_data = date_to_float(feature_data)
# head must be called; the bare attribute prints the bound method's repr.
print(feature_data.head())
print(df.shape)

<bound method NDFrame.head of          Sex   Age  BodyweightKg  Best3SquatKg          Date  \
1591748  0.0  33.5        122.80         370.0  1.049501e+09   
1600429  0.0  16.5         82.55         170.0  1.355098e+09   
1589655  0.0  29.5        108.30         300.0  1.080950e+09   
1591725  0.0  43.5         81.50         212.5  1.049501e+09   
1589634  0.0  44.5         81.80         215.0  1.080950e+09   
...      ...   ...           ...           ...           ...   
719710   0.0  46.0        100.00         232.5  1.442707e+09   
705854   0.0  47.0        103.00         225.0  1.478390e+09   
711218   0.0  22.5        127.00         260.0  1.518221e+09   
708686   0.0  19.5         62.35         130.0  1.534032e+09   
706326   0.0  20.5         73.70         230.0  1.549670e+09   

         Federation_365Strong  Federation_AAP  Federation_AAU  Federation_ABP  \
1591748                 False           False           False           False   
1600429                 False          

In [None]:
# B separate bench data
bench_data = seperate_bench(df)
bench_data = remove_empties(bench_data)
# Model features: athlete names removed and dates converted to floats.
feature_data = anonymise(bench_data)
feature_data = date_to_float(feature_data)
# head must be called; the bare attribute prints the bound method's repr.
print(feature_data.head())
print(df.shape)

<bound method NDFrame.head of          Sex   Age  BodyweightKg  Best3BenchKg          Date  \
1591748  0.0  33.5        122.80         242.5  1.049501e+09   
1600429  0.0  16.5         82.55          95.0  1.355098e+09   
1589655  0.0  29.5        108.30         182.5  1.080950e+09   
1591725  0.0  43.5         81.50         175.0  1.049501e+09   
1589634  0.0  44.5         81.80         172.5  1.080950e+09   
...      ...   ...           ...           ...           ...   
719710   0.0  46.0        100.00         147.5  1.442707e+09   
705854   0.0  47.0        103.00         145.0  1.478390e+09   
711218   0.0  22.5        127.00         182.5  1.518221e+09   
708686   0.0  19.5         62.35          95.0  1.534032e+09   
706326   0.0  20.5         73.70         135.0  1.549670e+09   

         Federation_365Strong  Federation_AAP  Federation_AAU  Federation_ABP  \
1591748                 False           False           False           False   
1600429                 False          

In [None]:
# D separate deadlift data
deadlift_data = seperate_deads(df)
deadlift_data = remove_empties(deadlift_data)
# Model features: athlete names removed and dates converted to floats.
feature_data = anonymise(deadlift_data)
feature_data = date_to_float(feature_data)
# head must be called; the bare attribute prints the bound method's repr.
print(feature_data.head())
print(df.shape)

<bound method NDFrame.head of          Sex   Age  BodyweightKg  Best3DeadliftKg          Date  \
1591748  0.0  33.5        122.80            350.0  1.049501e+09   
1600429  0.0  16.5         82.55            220.0  1.355098e+09   
1589655  0.0  29.5        108.30            305.0  1.080950e+09   
1591725  0.0  43.5         81.50            217.5  1.049501e+09   
1589634  0.0  44.5         81.80            205.0  1.080950e+09   
...      ...   ...           ...              ...           ...   
719710   0.0  46.0        100.00            212.5  1.442707e+09   
705854   0.0  47.0        103.00            215.0  1.478390e+09   
711218   0.0  22.5        127.00            272.5  1.518221e+09   
708686   0.0  19.5         62.35            160.0  1.534032e+09   
706326   0.0  20.5         73.70            260.0  1.549670e+09   

         Federation_365Strong  Federation_AAP  Federation_AAU  Federation_ABP  \
1591748                 False           False           False           False   
160

In [None]:
# S train, val, test
from sklearn.model_selection import train_test_split

target = "Best3SquatKg"
# Rebuild the squat features here instead of relying on whichever
# `feature_data` the last-run cell left behind: after a top-to-bottom run that
# is the deadlift frame, which has no Best3SquatKg column, and
# features.remove(target) raises ValueError.
feature_data = date_to_float(anonymise(squat_data))
features = feature_data.columns.tolist()
features.remove(target)
# 20% held out for testing, then 25% of the remaining 80% for validation,
# i.e. a 60/20/20 train/val/test split.
X_train, X_test, y_train, y_test = train_test_split(feature_data[features],
                                                    feature_data[target],
                                                    test_size=0.2,
                                                    random_state=69)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=69)

In [None]:
# B train, val, test
from sklearn.model_selection import train_test_split

target = "Best3BenchKg"
# Rebuild the bench features here instead of relying on whichever
# `feature_data` the last-run cell left behind: after a top-to-bottom run that
# is the deadlift frame, which has no Best3BenchKg column, and
# features.remove(target) raises ValueError.
feature_data = date_to_float(anonymise(bench_data))
features = feature_data.columns.tolist()
features.remove(target)
# 20% held out for testing, then 25% of the remaining 80% for validation,
# i.e. a 60/20/20 train/val/test split.
X_train, X_test, y_train, y_test = train_test_split(feature_data[features],
                                                    feature_data[target],
                                                    test_size=0.2,
                                                    random_state=69)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=69)

In [None]:
# D train, val, test
from sklearn.model_selection import train_test_split

target = "Best3DeadliftKg"
# Rebuild the deadlift features here so this cell does not depend on which
# per-lift cell last assigned `feature_data`.
feature_data = date_to_float(anonymise(deadlift_data))
features = feature_data.columns.tolist()
features.remove(target)
# 20% held out for testing, then 25% of the remaining 80% for validation,
# i.e. a 60/20/20 train/val/test split.
X_train, X_test, y_train, y_test = train_test_split(feature_data[features],
                                                    feature_data[target],
                                                    test_size=0.2,
                                                    random_state=69)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=69)

In [None]:
# Regression Models
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble  import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error

# Fit every regressor on the training split and report its test-set metrics.
# One loop replaces six copies of the same fit/predict/score stanza; the
# printed labels and metric order are unchanged.
# The tuned estimators keep their module-level names (lasso, ridge, rf, knn),
# and `model`, `y_pred` and the metric variables end up holding the values of
# the last regressor in the list, as they did before.
lasso = Lasso(alpha=0.001, random_state=69)
ridge = Ridge(alpha=10, random_state=69)
rf = RandomForestRegressor(n_estimators=192, min_samples_split=6,
                           max_features='log2', max_depth=None,
                           random_state=69)
knn = KNeighborsRegressor(n_neighbors=4)

regressors = [
    ("LinearRegression", LinearRegression()),
    ("Lasso", lasso),
    ("Ridge", ridge),
    ("RandomForestRegressor", rf),
    ("KNeighborsRegressor", knn),
    ("DecisionTreeRegressor", DecisionTreeRegressor(random_state=69)),
]

for label, model in regressors:
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)

  mse = mean_squared_error(y_test, y_pred)
  r2 = r2_score(y_test, y_pred)
  mae = mean_absolute_error(y_test, y_pred)
  mape = mean_absolute_percentage_error(y_test, y_pred)
  print(f"{label} mape:", mape)
  print(f"{label} mse:", mse)
  print(f"{label} R-squared:", r2)
  print(f"{label} mae:", mae)
  print()

LinearRegression mape: 0.0714607595300791
LinearRegression mse: 414.65346625131673
LinearRegression R-squared: 0.8905164391190733
LinearRegression mae: 12.543905623119784

Lasso mape: 0.07139835972038157
Lasso mse: 414.66231206239087
Lasso R-squared: 0.8905141035039776
Lasso mae: 12.531484969072766



  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


Ridge mape: 0.07143520914775621
Ridge mse: 414.505223039366
Ridge R-squared: 0.8905555807060653
Ridge mae: 12.53925239619302

RandomForestRegressor mape: 0.0698116236679638
RandomForestRegressor mse: 348.1521251054911
RandomForestRegressor R-squared: 0.9080752061970976
RandomForestRegressor mae: 12.236659869972028

KNeighborsRegressor mape: 0.10794203993131182
KNeighborsRegressor mse: 800.2968017909152
KNeighborsRegressor R-squared: 0.7886926053848987
KNeighborsRegressor mae: 18.79707838502819

DecisionTreeRegressor mape: 0.09074070179327817
DecisionTreeRegressor mse: 664.7058274056996
DecisionTreeRegressor R-squared: 0.8244935425703859
DecisionTreeRegressor mae: 16.49154567939503



In [None]:
# NN Enable TPU usage for NN
import tensorflow as tf
print("Tensorflow version " + tf.__version__)

# TPU Detection and Connection
try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
  print(f'Running on a TPU w/{tpu.num_accelerators()["TPU"]} cores')
  tf.config.experimental_connect_to_cluster(tpu)
  tf.tpu.experimental.initialize_tpu_system(tpu)
  strategy = tf.distribute.TPUStrategy(tpu)
except ValueError as err:
  # RuntimeError rather than BaseException: BaseException bypasses ordinary
  # `except Exception` handlers. The message no longer points at a previous
  # cell, because this notebook has no such instructions.
  raise RuntimeError(
      'ERROR: Not connected to a TPU runtime; select a TPU under '
      'Runtime > Change runtime type and re-run this cell.') from err

In [None]:
# Neural Network
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
print("Tensorflow version " + tf.__version__)

try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
  print(f'Running on a TPU w/{tpu.num_accelerators()["TPU"]} cores')
  tf.config.experimental_connect_to_cluster(tpu)
  tf.tpu.experimental.initialize_tpu_system(tpu)
  strategy = tf.distribute.TPUStrategy(tpu)
except ValueError as err:
  # RuntimeError rather than BaseException, which bypasses ordinary
  # `except Exception` handlers.
  raise RuntimeError(
      'ERROR: Not connected to a TPU runtime; select a TPU under '
      'Runtime > Change runtime type and re-run this cell.') from err

# Standardise the inputs using statistics from the training split only.
# NOTE(review): the recorded run scored R-squared ~ 0, i.e. no better than a
# constant prediction; the unscaled features (Date is ~1e9 epoch seconds next
# to 0/1 indicator columns) are the likely cause - confirm with a re-run.
scaler = StandardScaler()
X_train_nn = scaler.fit_transform(X_train).astype('float32')
X_val_nn = scaler.transform(X_val).astype('float32')
X_test_nn = scaler.transform(X_test).astype('float32')
y_train_nn = y_train.to_numpy(dtype='float32')
y_val_nn = y_val.to_numpy(dtype='float32')

with strategy.scope():
  # model
  model = tf.keras.models.Sequential([
      tf.keras.layers.Dense(9, activation='relu', input_shape=X_train_nn.shape[1:]),
      tf.keras.layers.Dense(4, activation='tanh'),
      tf.keras.layers.Dense(1)
  ])

  model.compile(optimizer='adam',
                loss='mae',
                metrics=['mae'])

  train_dataset = tf.data.Dataset.from_tensor_slices((X_train_nn, y_train_nn)).batch(128)
  # Built from the validation split, so it is named accordingly.
  val_dataset = tf.data.Dataset.from_tensor_slices((X_val_nn, y_val_nn)).batch(128)

  model.fit(train_dataset, epochs=25)

  model.evaluate(val_dataset)

# predict returns shape (n, 1); flatten it to match y_test.
y_pred = model.predict(X_test_nn).ravel()
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print("NN mse:", mse)
print("NN R-squared:", r2)
print("NN mae:", mae)
print()

model_results_file = '/content/drive/MyDrive/Diss/results.txt'
with open(model_results_file, 'w') as f:
  # Write the results to the file
  f.write(f"NN MAE: {mae}\n")
  f.write(f"NN MSE: {mse}\n")
  f.write(f"NN R2: {r2}\n")

Tensorflow version 2.15.0
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
NN mse: 4687.224244999773
NN R-squared: -7.76927743584288e-06
NN mae: 55.314664126620464

Results saved to: /content/drive/MyDrive/Diss/results.txt


In [None]:
# Hyperparameter Tuning with grid and random search
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Each search runs 5-fold cross-validation on the training split only and
# prints the best-scoring (r2) parameter combination.

# KNN
param_grid = {'n_neighbors': range(1, 10)}
knn_regressor = KNeighborsRegressor()
grid_search = GridSearchCV(estimator=knn_regressor, param_grid=param_grid,
                           scoring='r2', cv=5)
grid_search.fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_)

# Lasso
param_grid = {'alpha': np.linspace(0.001, 1, 10)}
lasso_reg = Lasso(random_state=69)
grid_search = GridSearchCV(estimator=lasso_reg, param_grid=param_grid,
                           scoring='r2', cv=5)
grid_search.fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_)

# Ridge
param_grid = {'alpha': np.linspace(0.01, 10, 10)}
ridge_reg = Ridge(random_state=69)
grid_search = GridSearchCV(estimator=ridge_reg, param_grid=param_grid,
                           scoring='r2', cv=5)
grid_search.fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_)

# RandomForestRegressor
param_grid = {
    'n_estimators': [100, 200, 500, 750],
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10, 15],
    'max_features': ['sqrt', 'log2'],
}
rf_reg = RandomForestRegressor(random_state=69)
# Randomized search samples combinations from the grid instead of trying all
# of them. random_state pins which ones are sampled, so a re-run reports the
# same "best" parameters.
grid_search = RandomizedSearchCV(estimator=rf_reg,
                                 param_distributions=param_grid,
                                 scoring='r2', cv=5,
                                 random_state=69)
grid_search.fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_)


Best parameters: {'n_neighbors': 4}
Best parameters: {'alpha': 0.001}


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, 

Best parameters: {'alpha': 10.0}


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Best parameters: {'n_estimators': 500, 'min_samples_split': 2, 'max_features': 'sqrt', 'max_depth': None}


In [None]:
# Neural Network Genetic Algorithm
import random
from deap import base, creator, tools, algorithms
from sklearn import neural_network
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Setup DEAP
# The fitness is a validation MAE, which must be minimised, hence the negative
# weight (a positive weight makes selection favour the worst models).
# Dedicated creator names keep this cell independent of other DEAP cells and
# the hasattr guards avoid re-creating the classes when the cell is re-run.
if not hasattr(creator, "NNFitnessMin"):
  creator.create("NNFitnessMin", base.Fitness, weights=(-1.0,))
if not hasattr(creator, "NNIndividual"):
  creator.create("NNIndividual", list, fitness=creator.NNFitnessMin)

# Seed Python's RNG so the search can be repeated.
random.seed(69)

# Register Hyperparameters
# An individual is [learning_rate, hidden_layers gene, neurons_per_layer gene].
toolbox = base.Toolbox()
toolbox.register("learning_rate", random.uniform, 0.001, 0.1)
toolbox.register("hidden_layers", random.randint, 0, 2)
toolbox.register("neurons_per_layer", random.randint, 0, 4)

toolbox.register("individual", tools.initCycle, creator.NNIndividual,
                 (toolbox.learning_rate, toolbox.hidden_layers,
                  toolbox.neurons_per_layer), n=1)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Helper function for ensuring int
def translate_hidden_layers(individual):
  """Turn the two layer genes into an MLPRegressor hidden_layer_sizes tuple.

  Gene 1 + 1 is the number of hidden layers; gene 2 * 16 + 16 is the number
  of neurons in each of them.
  """
  num_layers = individual[1] + 1
  neurons_per_layer = individual[2] * 16 + 16

  hidden_layers = tuple(int(neurons_per_layer) for _ in range(int(num_layers)))
  return hidden_layers

# mutation
def mutate_learning_rate(learning_rate, mu=0, sigma=0.2, indpb=0.1):
  """With probability indpb add Gaussian noise; never return below 0.001."""
  if random.random() < indpb:
    mutated_value = learning_rate + random.gauss(mu, sigma)
    return max(0.001, mutated_value)
  else:
    return max(0.001, learning_rate)

# Repair genes after crossover/mutation
def repair_individual(individual):
  """Clamp the genes back into the ranges they were registered with.

  Gaussian mutation turns the integer genes into floats and can push any gene
  out of range; rounding and clamping keeps every individual a valid
  MLPRegressor configuration.
  """
  individual[0] = max(0.001, individual[0])
  individual[1] = min(2, max(0, int(round(individual[1]))))
  individual[2] = min(4, max(0, int(round(individual[2]))))
  return individual

# Evaluate
def evaluate_model(individual):
  """Fit an MLPRegressor for the individual and return its validation MAE.

  Returns:
    A one-element tuple (mae,), the shape DEAP expects for fitness values.
  """
  model = neural_network.MLPRegressor(
      hidden_layer_sizes=translate_hidden_layers(individual),
      learning_rate_init=max(0.001, individual[0]),
      random_state=69)
  model.fit(X_train, y_train)
  predictions = model.predict(X_val)

  mae = mean_absolute_error(y_val, predictions)
  print(mae)
  return mae,

# Register Functions
toolbox.register("evaluate", evaluate_model)
toolbox.register("select", tools.selTournament, tournsize=3)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutGaussian, mu=0, sigma=0.2, indpb=0.1)

pop = toolbox.population(n=10)
NGEN = 20

# Stopping criteria: a target MAE, or `patience` generations without progress.
target_mae = 8
patience = 10
patience_counter = 0
best_mae = float("inf")
# Tournament selection is not elitist, so the best individual seen so far is
# tracked separately.
hall_of_fame = tools.HallOfFame(1)

for gen in range(NGEN):
  print(gen)
  offspring = algorithms.varAnd(pop, toolbox, cxpb=0.5, mutpb=0.1)

  # All gene changes happen before evaluation so that a stored fitness always
  # describes the genes it is attached to.
  for ind in offspring:
    mutated_rate = mutate_learning_rate(ind[0])
    if mutated_rate != ind[0]:
      ind[0] = mutated_rate
      del ind.fitness.values
    repair_individual(ind)

  # Only individuals without a valid fitness need a (costly) model fit.
  # The results are materialised in a list: toolbox.map is lazy by default and
  # can be consumed only once.
  unevaluated = [ind for ind in offspring if not ind.fitness.valid]
  fits = list(toolbox.map(toolbox.evaluate, unevaluated))
  for fit, ind in zip(fits, unevaluated):
    ind.fitness.values = fit

  hall_of_fame.update(offspring)
  pop = toolbox.select(offspring, k=len(pop))

  current_best_mae = hall_of_fame[0].fitness.values[0]
  if current_best_mae < target_mae:
    print("MAE target reached!")
    break

  # below is stopping criterion when no growth
  if current_best_mae < best_mae:
    best_mae = current_best_mae
    patience_counter = 0
  else:
    patience_counter += 1

  if patience_counter >= patience:
    print("Early stopping: No improvement in", patience, "generations")
    break

best_individual = hall_of_fame[0]
print("Best hyperparameters:", best_individual)

model_results_file = '/content/drive/MyDrive/Diss/results.txt'
# Append: the neural-network cell writes its metrics to the same file, and
# opening it with 'w' here would erase them.
with open(model_results_file, 'a') as f:
  f.write("Best hyperparameters: " + str(best_individual) + "\n")

In [None]:
# RandomForest Genetic Algorithm
import random
from deap import base, creator, tools, algorithms
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Setup
# The fitness is a validation MAE, which must be minimised, hence the negative
# weight (a positive weight makes selection favour the worst models).
# Dedicated creator names keep this cell independent of other DEAP cells and
# the hasattr guards avoid re-creating the classes when the cell is re-run.
if not hasattr(creator, "RFFitnessMin"):
  creator.create("RFFitnessMin", base.Fitness, weights=(-1.0,))
if not hasattr(creator, "RFIndividual"):
  creator.create("RFIndividual", list, fitness=creator.RFFitnessMin)
toolbox = base.Toolbox()

# Seed Python's RNG so the search can be repeated.
random.seed(69)

# Hyperparameter Tuning Ranges
# An individual is [n_estimators, min_samples_split, max_features].
toolbox.register("n_estimators", random.randint, 450, 550)
toolbox.register("min_samples_split", random.randint, 3, 7)
toolbox.register("max_features", lambda: random.choice(["sqrt", "log2"]))

# Smallest value each integer gene may take, by gene position. scikit-learn
# rejects an integer min_samples_split below 2.
INT_GENE_MINIMUMS = {0: 1, 1: 2}

# Evaluation Function
def evaluate_model(individual):
  """Fit a random forest for the individual and return its validation MAE.

  Returns:
    A one-element tuple (mae,), the shape DEAP expects for fitness values.
  """
  model = RandomForestRegressor(n_estimators=individual[0],
                                min_samples_split=individual[1],
                                max_features=individual[2],
                                max_depth=None,
                                random_state=69)
  model.fit(X_train, y_train)
  predictions = model.predict(X_val)
  mae = mean_absolute_error(y_val, predictions)
  return mae,

# Mutation Helper Function
def integer_mutation(value, mu=0, sigma=1):
  """Shift an integer gene by rounded Gaussian noise.

  round() keeps the step symmetric; truncating with int() turned every small
  negative step into -1 and every small positive step into 0, so genes only
  ever drifted downwards.
  """
  adjustment = random.gauss(mu, sigma)
  return int(round(value + adjustment))


# Mutation Function
def my_mutation(individual, toolbox, indpb):
  """Mutate each gene with probability indpb and return (individual,).

  Integer genes get a Gaussian step, clamped to their minimum; string genes
  are re-drawn from the max_features choices. `toolbox` is accepted for the
  existing registration signature and is not used.
  """
  for i in range(len(individual)):
    if random.random() >= indpb:
      continue
    gene = individual[i]
    if isinstance(gene, int):
      mutated = integer_mutation(gene, mu=0, sigma=1)
      individual[i] = max(INT_GENE_MINIMUMS.get(i, 1), mutated)
    elif isinstance(gene, str):
      individual[i] = random.choice(["sqrt", "log2"])
  return individual,

# Seed population
def best_chromosomes(hyperparams, max_depth=None):
  """Build an individual from a dict of known-good hyperparameters.

  max_depth is accepted for backward compatibility but is not stored as a
  gene: evaluate_model always uses max_depth=None, and a fourth gene would
  give this individual a different length from the rest of the population.
  """
  return creator.RFIndividual([hyperparams['n_estimators'],
                               hyperparams['min_samples_split'],
                               hyperparams['max_features']])

# Hyperparameters used to seed one member of the initial population.
best_hyperparams = {
    'n_estimators': 500,
    'min_samples_split': 5,
    'max_features': 'log2',
}

# Register variables
toolbox.register("individual", tools.initCycle, creator.RFIndividual,
                 (toolbox.n_estimators, toolbox.min_samples_split,
                  toolbox.max_features), n=1)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

pop = toolbox.population(n=10)
# insert best params into pop
pop[0] = best_chromosomes(best_hyperparams)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", my_mutation, toolbox=toolbox, indpb=0.2)
toolbox.register("select", tools.selTournament, tournsize=3)
toolbox.register("evaluate", evaluate_model)

# Evolutionary Loop
NGEN = 50
# Best MAE seen so far, recorded once per generation.
fitness_history = []
# stopping criteria variables
target_fitness = 8.0
stagnation_counter = 0
max_stagnation = 15
previous_best_fitness = float('inf')
# Tournament selection is not elitist, so the best individual seen so far is
# tracked separately.
hall_of_fame = tools.HallOfFame(1)

for gen in range(NGEN):
  # Generate Offspring
  offspring = algorithms.varAnd(pop, toolbox, cxpb=0.5, mutpb=0.2)
  # Generate Fitness
  # Only individuals without a valid fitness are fitted: the forest is seeded,
  # so re-fitting an unchanged individual would reproduce the same MAE.
  unevaluated = [ind for ind in offspring if not ind.fitness.valid]
  fits = list(toolbox.map(toolbox.evaluate, unevaluated))
  for fit, ind in zip(fits, unevaluated):
    ind.fitness.values = fit
  hall_of_fame.update(offspring)
  # Update Population
  pop = toolbox.select(offspring, k=len(pop))

  best_individual = hall_of_fame[0]
  current_best_fitness = best_individual.fitness.values[0]
  fitness_history.append(current_best_fitness)

  # Loading Bar
  num_steps = 20
  steps_completed = int((gen + 1) / NGEN * num_steps)
  loading_bar = "[" + "#" * steps_completed + " " * (num_steps - steps_completed) + "]"
  progress_percent = (gen + 1) / NGEN * 100
  print(f"Generation {gen + 1}/{NGEN} {loading_bar} {progress_percent:.1f}%")

  # Target Fitness Check
  if current_best_fitness <= target_fitness:
    print(f"Target fitness of {target_fitness} reached!")
    break

  # Stagnation Handling: count generations without a strictly better MAE.
  if current_best_fitness >= previous_best_fitness:
    stagnation_counter += 1
  else:
    stagnation_counter = 0
    previous_best_fitness = current_best_fitness

  if stagnation_counter >= max_stagnation:
    print("Early stopping due to fitness stagnation")
    break

# Best Model and Final Evaluation
best_individual = hall_of_fame[0]
print("Best hyperparameters:", best_individual)

In [None]:
# Re-select the top-ranked individual from the current GA population `pop`
# and print its hyperparameter genes.
best_individual = tools.selBest(pop, 1)[0]
print("Best hyperparameters:", best_individual)