In [None]:
from IPython.display import Image
from IPython.display import display, HTML
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from fboost import outlier_iqr, DataPreparator, FeatureBoosterRegressor
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras

from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score


import warnings
warnings.filterwarnings('ignore')

# Set a seed value
seed_value= 0
# 1. Set `PYTHONHASHSEED` environment variable at a fixed value
import os
os.environ['PYTHONHASHSEED']=str(seed_value)
# 2. Set `python` built-in pseudo-random generator at a fixed value
import random
random.seed(seed_value)
# 3. Set `numpy` pseudo-random generator at a fixed value
np.random.seed(seed_value)

In [49]:
data = pd.read_csv('../data/raw/alldata.csv')
data.columns = ['familia','indice_fam','subposicion_1','subposicion_2','subposicion_3','subposicion_4', 'energia']
mask = (data[['subposicion_1','subposicion_2','subposicion_3','subposicion_4']] == 0).sum(axis=1) > 1
lista_train = data.loc[mask,['subposicion_1','subposicion_2','subposicion_3','subposicion_4','energia']]
lista_test = data.loc[~mask,['subposicion_1','subposicion_2','subposicion_3','subposicion_4','energia']]

In [53]:
# Instantiate the OneHotEncoder
encoder = OneHotEncoder(sparse=False, categories='auto')

X_train = lista_train.drop(['energia'], axis=1)
X_test = lista_test.drop(['energia'], axis=1)

# Fit the encoder and transform the data for both train and test dataframes
encoded_train = encoder.fit_transform(X_train)
encoded_test = encoder.transform(X_test)

# Now, 'encoded_train' and 'encoded_test' are numpy arrays, we can convert them back to dataframes:
X_train_scaled = pd.DataFrame(encoded_train, columns=encoder.get_feature_names_out(X_train.columns))
X_test_scaled = pd.DataFrame(encoded_test, columns=encoder.get_feature_names_out(X_test.columns))


y_test = lista_test['energia']
y_train = lista_train['energia']

In [54]:
data_prep = DataPreparator(outliers_strategy = 'IQR', 
                           outliers_cutoff = 3, 
                           encoding_strategy = 'dummy',
                           drop_duplicate_rows = True)
X_train, y_train = data_prep.fit_transform(X_train, y_train)
X_test = data_prep.transform(X_test, y_test)

In [55]:


# FEATURE ENGINEERING - 1/2 - Polynomials + Rules extraction
fboost = FeatureBoosterRegressor(base_model = RandomForestRegressor(criterion='friedman_mse', 
                                                    max_depth=5,
                                                    max_features=None, 
                                                    max_leaf_nodes=2,
                                                    min_samples_leaf=1, 
                                                    verbose=0, 
                                                    n_estimators = 850,
                                                    warm_start=True,
                                                    random_state = 0),
                                max_rules = 2800, 
                                n_best_rules = 35,
                                original_features_selection= False,
                                selection_strategy = 'severe',  
                                quantile_cutoff = 0.83,
                                alpha = 89,
                                scaler = 'Standard',
                                random_state = 0)

# FIT FEATURE ENGINEERING FOR TRAIN DATA
X_train, rules = fboost.fit_transform(X_train, y_train)

# TRANSFORM FEATURE ENGINEERING FOR TEST DATA
X_test = fboost.transform(X_test)

#LET HAVE A LOOK AT THE NEW FEATURES WE JUST CREATED
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 170 entries, 0 to 1269
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   subposicion_1    170 non-null    int64  
 1   subposicion_2    170 non-null    int64  
 2   subposicion_3    170 non-null    int64  
 3   subposicion_4    170 non-null    int64  
 4   subposicion_12   170 non-null    float64
 5   subposicion_42   170 non-null    float64
 6   RULE_EXTRACT_19  170 non-null    float64
 7   RULE_EXTRACT_8   170 non-null    float64
 8   RULE_EXTRACT_26  170 non-null    float64
 9   RULE_EXTRACT_10  170 non-null    float64
 10  RULE_EXTRACT_20  170 non-null    float64
 11  RULE_EXTRACT_27  170 non-null    float64
dtypes: float64(8), int64(4)
memory usage: 17.3 KB


In [71]:
import keras_tuner as kt
from keras_tuner import RandomSearch
from keras_tuner.engine.hyperparameters import HyperParameters
from sklearn.utils import shuffle
from tensorflow.keras.layers import Input, Flatten, Dense, Layer, BatchNormalization, Multiply
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.linear_model import ElasticNet

X_train_array = np.array(X_train)
y_train_array = np.array(y_train)
X_test_array = np.array(X_test)

#train_data = X_train_array.reshape((X_train_array.shape[0], 12))
X_train_array.shape

(170, 12)

In [78]:
train_data, y_data = shuffle(X_train_array, y_train_array)


input_dim = 12 # Please replace with your input dimension
output_dim = 1 # Please replace with your output dimension

# Define Multiply Network
input_layer_1 = Input(shape=(input_dim,))
input_layer_2 = Input(shape=(input_dim,))

dense_1 = Dense(12000, activation='selu')(input_layer_1)
dense_2 = Dense(12000, activation='selu')(input_layer_2)

multiply_layer_1 = Multiply()([dense_1, dense_2])



output_layer = Dense(output_dim, activation='linear')(multiply_layer_1)

multiply_model = Model(inputs=[input_layer_1, input_layer_2], outputs=output_layer)

multiply_model.compile(optimizer=Adam(), loss='mean_squared_error')

multiply_model.fit([X_train_array, X_train_array], y_train_array, epochs=250, validation_split=0.2)



Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78

<keras.callbacks.History at 0x2272672e560>

In [79]:
#onehot_df_test_encoded, H_test_encoded = prepare_test_data(onehot_df_test, H_test)  # You need to ensure that your test data is prepared in the same way as your training data

# Now you can make predictions on your test set
predictions = multiply_model.predict([X_test_array,X_test_array])




 1/35 [..............................] - ETA: 1s



In [80]:
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print('Mean Squared Error (MSE): ', mse)
print('Root Mean Squared Error (RMSE): ', rmse)
print('Mean Absolute Error (MAE): ', mae)
print('R-squared Score (R^2): ', r2)

Mean Squared Error (MSE):  22.8604938673812
Root Mean Squared Error (RMSE):  4.781264881533045
Mean Absolute Error (MAE):  3.216296166721244
R-squared Score (R^2):  -0.1083833386763291


In [None]:
import matplotlib.pyplot as plt

# plot predicted vs real values
plt.figure(figsize=(10, 5))
plt.plot(y_test, label='Real')
plt.plot(predictions, label='Predicted')
plt.legend()
plt.show()

In [None]:

differences = predictions.flatten() - y_test.flatten() # This will give the difference between your predictions and the actual values

#Create a figure with two subplots: a histogram of the differences and a scatter plot of predicted vs real values
fig, axs = plt.subplots(nrows=2, figsize=(10, 15))

#Plot histogram
axs[0].hist(differences, bins=20, density=True)
axs[0].set_title('Histogram of differences between predicted and actual values')
axs[0].set_xlabel('Differences')
axs[0].set_ylabel('Density')

#Plot scatter
axs[1].scatter(y_test, predictions, alpha=0.5)
axs[1].set_title('Scatter plot of predicted vs actual values')
axs[1].set_xlabel('Actual Values')
axs[1].set_ylabel('Predicted Values')

#Draw a diagonal line on the scatterplot
lims = [np.min([axs[1].get_xlim(), axs[1].get_ylim()]), # min of both axes
np.max([axs[1].get_xlim(), axs[1].get_ylim()])] # max of both axes
axs[1].plot(lims, lims, 'k-', alpha=0.75, zorder=0)
axs[1].set_xlim(lims)
axs[1].set_ylim(lims)

plt.tight_layout()
plt.show()

In [None]:
# Define Multiply Simple Network
input_layer_1_simple = Input(shape=(input_dim,))
input_layer_2_simple = Input(shape=(input_dim,))

multiply_layer_simple = Multiply()([input_layer_1_simple, input_layer_2_simple])

output_layer_simple = Dense(output_dim, activation='linear')(multiply_layer_simple)

multiply_simple_model = Model(inputs=[input_layer_1_simple, input_layer_2_simple], outputs=output_layer_simple)

multiply_simple_model.compile(optimizer=Adam(), loss='mean_squared_error')

multiply_simple_model.fit([train_data, train_data], y_train, epochs=150, validation_split=0.2)

In [None]:
#onehot_df_test_encoded, H_test_encoded = prepare_test_data(onehot_df_test, H_test)  # You need to ensure that your test data is prepared in the same way as your training data
test_data = np.concatenate((onehot_df_test_encoded, H_test_encoded), axis = 1)
# Now you can make predictions on your test set
predictions = multiply_simple_model.predict([test_data,test_data])

In [None]:
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print('Mean Squared Error (MSE): ', mse)
print('Root Mean Squared Error (RMSE): ', rmse)
print('Mean Absolute Error (MAE): ', mae)
print('R-squared Score (R^2): ', r2)

In [None]:
# plot predicted vs real values
plt.figure(figsize=(10, 5))
plt.plot(y_test, label='Real')
plt.plot(predictions, label='Predicted')
plt.legend()
plt.show()

In [None]:

differences = predictions.flatten() - y_test.flatten() # This will give the difference between your predictions and the actual values

#Create a figure with two subplots: a histogram of the differences and a scatter plot of predicted vs real values
fig, axs = plt.subplots(nrows=2, figsize=(10, 15))

#Plot histogram
axs[0].hist(differences, bins=20, density=True)
axs[0].set_title('Histogram of differences between predicted and actual values')
axs[0].set_xlabel('Differences')
axs[0].set_ylabel('Density')

#Plot scatter
axs[1].scatter(y_test, predictions, alpha=0.5)
axs[1].set_title('Scatter plot of predicted vs actual values')
axs[1].set_xlabel('Actual Values')
axs[1].set_ylabel('Predicted Values')

#Draw a diagonal line on the scatterplot
lims = [np.min([axs[1].get_xlim(), axs[1].get_ylim()]), # min of both axes
np.max([axs[1].get_xlim(), axs[1].get_ylim()])] # max of both axes
axs[1].plot(lims, lims, 'k-', alpha=0.75, zorder=0)
axs[1].set_xlim(lims)
axs[1].set_ylim(lims)

plt.tight_layout()
plt.show()