In [22]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import GridSearchCV 

import sys
from contextlib import redirect_stdout


import matplotlib.pyplot as plt
import joblib

In [23]:
num_samples = 4000

In [24]:
if num_samples > 0:
    num_data = str(num_samples)
else:
    num_data = ''

In [25]:
# Read the data from the files
X_train_raw = pd.read_csv(f'dataset/X_train{num_data}.tsv', sep='\t')
X_test_raw = pd.read_csv(f'dataset/X_test{num_data}.tsv', sep='\t')
y_train = pd.read_csv(f'dataset/y_train{num_data}.tsv', sep='\t')
y_test = pd.read_csv(f'dataset/y_test{num_data}.tsv', sep='\t')

# Save the current standard output
original_stdout = sys.stdout

In [26]:
# Input size
input_size = X_train_raw.shape[1]

In [52]:
# Standardizing data
scaler = StandardScaler()
scaler.fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [27]:
print(y_train.shape)
# Change the shape of the y_train to a 1d array
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()
print(y_train.shape)

(4000, 1)
(4000,)


In [28]:
X_train = X_train_raw
X_test = X_test_raw
print(X_train.shape)
print(X_test.shape)

(4000, 180)
(1001, 180)


In [19]:
model_name = f'RF{num_data}'

with open(f'reports/{model_name}.txt', 'w') as f:
    with redirect_stdout(f):

        model = RandomForestRegressor()
        model.fit(X_train,y_train)

        # Guarda el modelo
        joblib.dump(model, f'models/{model_name}.pkl')

        # Get the mean absolute error on the validation data
        y_pred = model.predict(X_test)
        MAE = metrics.mean_absolute_error(y_test , y_pred)
        MSE = metrics.mean_squared_error(y_test , y_pred)
        r2 = metrics.r2_score(y_test, y_pred)
        # Print the metrics with 4 decimal digits
        print('Model: ', model_name)
        print(f'MAE = {MAE:.4f}')
        print(f'MSE = {MSE:.4f}')
        print(f'R2 = {r2:.4f}')

# Restore the original standard output
sys.stdout = original_stdout

print("END OF THE CELL")

END OF THE CELL


In [30]:
model_name = f'RF{num_data}_GS'

param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [10, 20, 30, 40],
    'max_features': ['auto', 'sqrt', 'log2'],
    'criterion': ['squared_error', 'absolute_error']
}

with open(f'reports/{model_name}.txt', 'w') as f:
    with redirect_stdout(f):
        print('Model: ', model_name)
        modelo_rf = RandomForestRegressor()

        grid_search = GridSearchCV(estimator=modelo_rf, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', verbose=7)
        grid_search.fit(X_train, y_train)

        print(f'Best parameters: {grid_search.best_params_}')
        print(f'Best estimator: {grid_search.best_estimator_}')

        # Save the model
        joblib.dump(grid_search, f'models/{model_name}.pkl')

        # Test the model
        y_pred = grid_search.predict(X_test)

        print('================================================')
        print('================================================')
        MAE = metrics.mean_absolute_error(y_test , y_pred)
        MSE = metrics.mean_squared_error(y_test , y_pred)
        r2 = metrics.r2_score(y_test, y_pred)
        print(f'MAE = {MAE:.4f}')
        print(f'MSE = {MSE:.4f}')
        print(f'R2 = {r2:.4f}')
        print('================================================')
        print('================================================')

# Restore the original standard output
sys.stdout = original_stdout

print("END OF THE CELL")

160 fits failed out of a total of 480.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
160 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\cgonz\tfgEnv\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\cgonz\tfgEnv\Lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "c:\Users\cgonz\tfgEnv\Lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\cgonz\tfgEnv\Lib\site-packages\sklearn\utils\_param_validation.py", line 96, in validate_parameter_constraints
    raise InvalidParameterError(
sk

END OF THE CELL


In [None]:
model_name = f'NN{num_data}'

with open(f'reports/{model_name}.txt', 'w') as f:
    with redirect_stdout(f):

        model = keras.Sequential([
            layers.Dense(128, input_dim=input_size, activation='relu'),
            layers.Dense(64, activation='relu'),
            layers.Dense(32, activation='relu'),
            layers.Dense(1, activation='linear'),
        ])

        # Compile the network :
        model.compile(loss='mse', optimizer='adam', metrics=['mae', 'mse'])

        # Print the model summary
        model.summary()

        # Train the model
        history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test))

        hist = pd.DataFrame(history.history)
        hist['epoch'] = history.epoch
        hist.tail()

        # Save the model
        model.save(f'models/{model_name}.h5')

        # Test the model
        y_pred = model.predict(X_test)
        mse = metrics.mean_squared_error(y_test, y_pred)
        mae = metrics.mean_absolute_error(y_test, y_pred)
        r2 = metrics.r2_score(y_test, y_pred)

        print(f'MSE: {mse}')
        print(f'MAE: {mae}')
        print(f'R2: {r2}')

In [None]:
def plot_history(history):
  hist = pd.DataFrame(history.history)
  hist['epoch'] = history.epoch

  plt.figure()
  plt.xlabel('Epoch')
  plt.ylabel('Mean Abs Error [MAE]')
  plt.plot(hist['epoch'], hist['mae'], label='Train Error')
  plt.plot(hist['epoch'], hist['val_mae'],  label = 'Val Error')
  plt.ylim([0,5])
  plt.legend()

  plt.figure()
  plt.xlabel('Epoch')
  plt.ylabel('Mean Square Error [$MAE^2$]')
  plt.plot(hist['epoch'], hist['mse'], label='Train Error')
  plt.plot(hist['epoch'], hist['val_mse'], label = 'Val Error')
  plt.ylim([0,20])
  plt.legend()
  plt.show()

In [None]:
plot_history(history)

In [None]:
plt.scatter(y_test, y_pred)
plt.xlabel('True Values [MAE]')
plt.ylabel('Predictions [MAE]')
plt.axis('equal')
plt.axis('square')
plt.xlim([0,plt.xlim()[1]])
plt.ylim([0,plt.ylim()[1]])
_ = plt.plot([-100, 100], [-100, 100])