In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Cargar los datos
data = pd.read_csv("../Cluster0ReadyToNN.csv", sep=";")

In [4]:
# Crear un diccionario para almacenar los objetos scaler por grupo
scalers = {}

# Iterar sobre los grupos únicos en Column15
for group in data['Column15'].unique():
    # Filtrar datos por grupo
    group_data = data[data['Column15'] == group]

    # Seleccionar las columnas para normalización (las 13 primeras)
    features = group_data.iloc[:, :13]

    # Normalizar los datos con MinMaxScaler
    scaler = MinMaxScaler()
    normalized_data = scaler.fit_transform(features)

    # Almacenar el scaler en el diccionario
    scalers[group] = scaler

    # Actualizar el DataFrame con los datos normalizados
    data.loc[data['Column15'] == group, 'Column1':'Column13'] = normalized_data

In [5]:
data

Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,Column11,Column12,Column13,Column14,Column15
0,0.317757,0.163551,1.000000,0.957447,1.000000,0.750000,0.979167,0.708333,0.979167,1.000000,0.641026,0.897436,1.000000,2002/3,23
1,0.163551,1.000000,0.210280,1.000000,0.744681,0.979167,0.708333,0.979167,1.000000,0.520833,0.897436,1.000000,0.871795,2002/4,23
2,1.000000,0.210280,0.219626,0.744681,0.978723,0.708333,0.979167,1.000000,0.520833,0.729167,1.000000,0.871795,0.846154,2003/1,23
3,0.210280,0.219626,0.163551,0.978723,0.702128,0.979167,1.000000,0.520833,0.729167,0.812500,0.871795,0.846154,0.717949,2003/2,23
4,0.219626,0.163551,0.214953,0.702128,0.978723,1.000000,0.520833,0.729167,0.812500,0.708333,0.846154,0.717949,0.846154,2003/3,23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170581,0.017857,0.023810,0.047619,0.023810,0.029762,0.029762,0.029762,0.017857,0.041667,0.023810,0.017857,0.023810,0.011905,2014/4,81579
170582,0.023810,0.047619,0.023810,0.029762,0.029762,0.029762,0.017857,0.041667,0.023810,0.017857,0.023810,0.011905,0.023810,2015/1,81579
170583,0.047619,0.023810,0.029762,0.029762,0.029762,0.017857,0.041667,0.023810,0.017857,0.023810,0.011905,0.023810,0.023810,2015/2,81579
170584,0.023810,0.029762,0.029762,0.029762,0.017857,0.041667,0.023810,0.017857,0.023810,0.011905,0.023810,0.023810,0.029762,2015/3,81579


In [6]:
# Ordenar el DataFrame por 'Column 14' de forma ascendente
data = data.sort_values(by='Column14')

# Dividir los datos en entrenamiento (70%) y temporal (30%)
train_temp_data, test_data = train_test_split(data, test_size=0.3, stratify=data['Column15'], random_state=0)
#train_temp_data, test_data = train_test_split(data, test_size=0.3, shuffle=False, random_state=0)

# Dividir el temporal en entrenamiento (70%) y validación (30%)
train_data, validation_data = train_test_split(train_temp_data, test_size=0.3, stratify=train_temp_data['Column15'], random_state=0)
#train_data, validation_data = train_test_split(train_temp_data, test_size=0.3, shuffle=False, random_state=0)

# Separar características (X) y columna objetivo (y)
X_train = train_data.iloc[:, :12]
y_train = train_data['Column13']
X_val = validation_data.iloc[:, :12]
y_val = validation_data['Column13']
X_test = test_data.iloc[:, :12]
y_test = test_data['Column13']


# Reshape de los datos para GRU (número de muestras, número de pasos de tiempo, número de características)
n_samples_train, n_features = X_train.shape
n_samples_val = X_val.shape[0]
n_timesteps = 1
X_train = X_train.values.reshape(n_samples_train, n_timesteps, n_features)
X_val = X_val.values.reshape(n_samples_val, n_timesteps, n_features)
X_test = X_test.values.reshape(X_test.shape[0], n_timesteps, n_features)

print(n_samples_train, n_samples_val, n_features)

83587 35823 12


In [7]:
# Convertir X_test a formato compatible para modelos convencionales
X_test_flat = X_test.reshape(X_test.shape[0], n_features)
X_train_flat = X_train.reshape(X_train.shape[0], n_features)
X_val_flat = X_val.reshape(X_val.shape[0], n_features)

In [8]:
# 3. Modelo de Random Forest (cambiar a 100)
rf_model = RandomForestRegressor(n_estimators=100, random_state=0)
rf_model.fit(X_train_flat, y_train)
y_pred = rf_model.predict(X_test_flat)

In [10]:
# Crear un DataFrame con las predicciones desnormalizadas y los valores reales
resultados = pd.DataFrame({'Valor Real': y_test.values.flatten(), 'Predicciones': y_pred.flatten()})
print(resultados)

# Agregar la columna de predicciones al conjunto de prueba
test_data['Predicted_Column13'] = y_pred.flatten()

# Crear un DataFrame para almacenar los resultados desnormalizados
desnormalized_test_data = test_data.copy()

# Desnormalizar 'Column1' a 'Column13' y 'Predicted_Column13' según la normalización por grupos
for group, scalerY in scalers.items():
    # Filtrar el conjunto de prueba correspondiente al grupo
    group_test_data = test_data[test_data['Column15'] == group]

    # Seleccionar las columnas normalizadas para desnormalizar
    normalized_features = group_test_data[['Column1', 'Column2', 'Column3', 'Column4', 'Column5', 'Column6', 'Column7', 'Column8', 'Column9', 'Column10', 'Column11', 'Column13', 'Predicted_Column13']]

    # Desnormalizar los datos utilizando el objeto scalerY correspondiente
    original_data = scalerY.inverse_transform(normalized_features)

    # Crear un DataFrame temporal para almacenar los datos desnormalizados
    temp_df = pd.DataFrame(original_data, columns=['Column1', 'Column2', 'Column3', 'Column4', 'Column5', 'Column6', 'Column7', 'Column8', 'Column9', 'Column10', 'Column11', 'Column13', 'Predicted_Column13'])

    # Actualizar el DataFrame desnormalizado con los datos desnormalizados
    desnormalized_test_data.loc[desnormalized_test_data['Column15'] == group, ['Column1', 'Column2', 'Column3', 'Column4', 'Column5', 'Column6', 'Column7', 'Column8', 'Column9', 'Column10', 'Column11', 'Column13', 'Predicted_Column13']] = temp_df.values

# Imprimir el conjunto de prueba después de la desnormalización
print(desnormalized_test_data)

# Eliminar todas las columnas excepto las últimas cuatro
resultados = desnormalized_test_data.iloc[:, -4:]

# Imprimir el conjunto de prueba después de la eliminación de columnas
print(resultados)

# Guardar el DataFrame resultados en un archivo CSV
resultados.to_csv('RF_norm.csv', index=False)

       Valor Real  Predicciones
0        0.128378      0.286216
1        0.347826      0.302754
2        0.051724      0.136529
3        0.295400      0.259615
4        0.155556      0.140714
...           ...           ...
51171    0.321429      0.141612
51172    0.011936      0.059358
51173    0.053030      0.242313
51174    0.312155      0.347450
51175    0.600000      0.544825

[51176 rows x 2 columns]
        Column1  Column2  Column3  ...  Column14  Column15  Predicted_Column13
24905      58.0     66.0     62.0  ...    2005/2      8751           64.359990
155320    101.0     39.0     24.0  ...    2006/3     56704           51.743373
8745       30.0     30.0     29.0  ...    2015/2      3059           27.918659
60462     193.0     59.0     48.0  ...    2011/3     20496          135.220982
4241       35.0     29.0     37.0  ...    2009/4      1242           31.996362
...         ...      ...      ...  ...       ...       ...                 ...
132964     37.0     35.0     33.0  ..

In [13]:
# Obtener y_test de la primera columna de resultados
y_test = resultados['Column13']

# Obtener y_pred de la última columna del conjunto de prueba después de la desnormalización
y_pred = desnormalized_test_data['Predicted_Column13']

# Imprimir y_test_norm y y_pred
print("y_test:", y_test)
print("y_pred:", y_pred)

# Imprimir el DataFrame
print(resultados)

# Calcular RMSE con datos desnormalizados
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'RMSE en el conjunto de prueba: {rmse}')

# Calcular MAE con datos desnormalizados
mae = mean_absolute_error(y_test, y_pred)
print(f'MAE en el conjunto de prueba: {mae}')


#Calcular MAPE con datos desnormalizados
print("Mean absolute percentage error (MAPE): %f" % mean_absolute_percentage_error(y_test, y_pred))

y_test: 24905      41.0
155320     59.0
8745       23.0
60462     150.0
4241       34.0
          ...  
132964     65.5
130033     10.0
124375     13.0
50855     145.0
5442       85.0
Name: Column13, Length: 51176, dtype: float64
y_pred: 24905      64.359990
155320     51.743373
8745       27.918659
60462     135.220982
4241       31.996362
             ...    
132964     19.965144
130033     45.755621
124375     37.985305
50855     157.776929
5442       78.930704
Name: Predicted_Column13, Length: 51176, dtype: float64
        Column13 Column14  Column15  Predicted_Column13   Año
24905       41.0   2005/2      8751           64.359990  2005
155320      59.0   2006/3     56704           51.743373  2006
8745        23.0   2015/2      3059           27.918659  2015
60462      150.0   2011/3     20496          135.220982  2011
4241        34.0   2009/4      1242           31.996362  2009
...          ...      ...       ...                 ...   ...
132964      65.5   2006/3     46569      

In [14]:
# Convertir la columna 'Column14' para extraer el año
resultados['Año'] = resultados['Column14'].str.split('/').str[0].astype(int)

# Agrupar por año y calcular la suma de reales y predicciones
suma_anual = resultados.groupby('Año').agg({
    'Column13': 'sum',  # Suma de valores reales
    'Predicted_Column13': 'sum'  # Suma de predicciones
}).reset_index()

# Agregar una columna de MAPE por fila
suma_anual['MAPE_fila'] = (
    (abs(suma_anual['Column13'] - suma_anual['Predicted_Column13']) / suma_anual['Column13']) * 100
)

# Calcular la media de la columna MAPE
media_mape = suma_anual['MAPE_fila'].mean()

# Mostrar el DataFrame actualizado y la media
print(suma_anual)
print(f"Media de MAPE (por fila): {media_mape:.2f}%")

     Año       Column13  Predicted_Column13  MAPE_fila
0   2002  104265.678499       101945.325361   2.225424
1   2003  177491.021691       169555.493758   4.470946
2   2004  164849.403889       167229.529217   1.443818
3   2005  159059.300841       161731.790701   1.680185
4   2006  131381.875214       147401.301614  12.193026
5   2007  131223.654627       141705.449997   7.987733
6   2008  122996.287059       139852.845312  13.704933
7   2009  128977.774184       134388.332356   4.194954
8   2010  117483.995234       129267.617701  10.029981
9   2011  118736.260722       133501.598650  12.435408
10  2012  122697.561296       138659.676133  13.009317
11  2013  109818.900063       128769.987875  17.256672
12  2014  104937.636563       125031.158762  19.148061
13  2015  109243.945117       126577.969801  15.867264
Media de MAPE (por fila): 9.69%
