# **Equipo C - 2024 - 1**

# **MODELO ENSAMBLADO**

# Preprocesamiento de Datos

In [1]:
# Importamos las librerías
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit
from sklearn.feature_selection import mutual_info_regression, SelectKBest, f_regression
from sklearn.svm import SVR
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
import plotly.graph_objects as go

In [2]:
# Limpieza de Datos
def clean_data(data):
    data = data.dropna()
    return data

In [3]:
# Normalización de las Variables
def normalize_data(data):
    scaler = MinMaxScaler()
    data_scaled = scaler.fit_transform(data)
    return pd.DataFrame(data_scaled, columns=data.columns), scaler

In [4]:
# Selección de Variables utilizando diferentes métodos
def select_features(X, y, num_features):
    mutual_info = mutual_info_regression(X, y)
    k_best = SelectKBest(score_func=f_regression, k=num_features).fit(X, y)
    features = X.columns[k_best.get_support(indices=True)]
    return features.tolist()

In [5]:
# Cargar datos
data = pd.read_csv('https://query1.finance.yahoo.com/v7/finance/download/FSM?period1=1597123200&period2=1628659200&interval=1d&events=history&includeAdjustedClose=true')

In [6]:
# Mantener la columna de fechas para las gráficas
dates = data['Date']
data = data.drop(columns=['Date'])

In [7]:
# Limpiar y Normalizar
data = clean_data(data)
data, scaler = normalize_data(data)

In [8]:
# Seleccionar Variables
target_column = 'Close'
num_features = 5  # Número de características a seleccionar
selected_features = select_features(data.drop(columns=[target_column]), data[target_column], num_features)
selected_features.append(target_column)
data = data[selected_features]

In [9]:
# Separar características y objetivo
X = data.drop(columns=[target_column])
y = data[target_column]

In [10]:
# Dividir los datos en conjuntos de entrenamiento y prueba
train_size = int(len(X) * 0.8)
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]
dates_train, dates_test = dates[:train_size], dates[train_size:]

In [11]:
print(f'Características seleccionadas: {selected_features}')

Características seleccionadas: ['Open', 'High', 'Low', 'Adj Close', 'Volume', 'Close']


# Entrenamiento y Validación Corregido

In [12]:
# Optimización del Modelo SVM
def optimize_svm(X_train, y_train):
    param_grid = {'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'kernel': ['rbf']}
    grid = GridSearchCV(SVR(), param_grid, refit=True, verbose=3, cv=TimeSeriesSplit(n_splits=5))
    grid.fit(X_train, y_train)
    return grid.best_estimator_

In [13]:
# Entrenamiento del Modelo LSTM
def train_lstm(X_train, y_train, input_shape):
    model = Sequential()
    model.add(LSTM(units=50, return_sequences=True, input_shape=input_shape))
    model.add(Dropout(0.2))
    model.add(LSTM(units=50))
    model.add(Dropout(0.2))
    model.add(Dense(1))

    model.compile(optimizer='adam', loss='mean_squared_error')
    model.fit(X_train, y_train, epochs=500, batch_size=32, validation_split=0.2)
    return model

In [14]:
# Preparar datos para LSTM
X_train_lstm = X_train.values.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test_lstm = X_test.values.reshape((X_test.shape[0], 1, X_test.shape[1]))

In [15]:
# Optimizar y Entrenar Modelos
svm_model = optimize_svm(X_train, y_train)
lstm_model = train_lstm(X_train_lstm, y_train, (1, X_train.shape[1]))

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.527 total time=   0.0s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.509 total time=   0.0s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.657 total time=   0.0s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.851 total time=   0.0s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.708 total time=   0.0s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.119 total time=   0.0s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.233 total time=   0.0s
[CV 3/5] END .....C=0.1, gamma=0.1, kernel=rbf;, score=-0.567 total time=   0.0s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.622 total time=   0.0s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.445 total time=   0.0s
[CV 1/5] END ....C=0.1, gamma=0.01, kernel=rbf;, score=-0.125 total time=   0.0s
[CV 2/5] END ....C=0.1, gamma=0.01, kernel=rbf;

  super().__init__(**kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 63ms/step - loss: 0.3064 - val_loss: 0.2087
Epoch 2/500
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.2626 - val_loss: 0.1756
Epoch 3/500
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.2232 - val_loss: 0.1414
Epoch 4/500
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.1811 - val_loss: 0.1061
Epoch 5/500
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.1332 - val_loss: 0.0707
Epoch 6/500
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0887 - val_loss: 0.0383
Epoch 7/500
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0500 - val_loss: 0.0143
Epoch 8/500
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0169 - val_loss: 0.0041
Epoch 9/500
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6

In [16]:
# Predicciones
svm_predictions = pd.Series(svm_model.predict(X_test), index=X_test.index)
lstm_predictions = pd.Series(lstm_model.predict(X_test_lstm).flatten(), index=X_test.index)
combined_predictions = pd.Series(np.median([svm_predictions, lstm_predictions], axis=0), index=X_test.index)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 164ms/step


In [17]:
# Métricas de Validación
mape_svm = mean_absolute_percentage_error(y_test, svm_predictions)
mape_lstm = mean_absolute_percentage_error(y_test, lstm_predictions)
mape_combined = mean_absolute_percentage_error(y_test, combined_predictions)

In [18]:
print(f'MAPE SVM: {mape_svm}')
print(f'MAPE LSTM: {mape_lstm}')
print(f'MAPE Combined: {mape_combined}')

MAPE SVM: 25332400566781.773
MAPE LSTM: 14940555608707.0
MAPE Combined: 20136478087744.387


In [19]:
rmse_svm = np.sqrt(mean_squared_error(y_test, svm_predictions))
rmse_lstm = np.sqrt(mean_squared_error(y_test, lstm_predictions))
rmse_combined = np.sqrt(mean_squared_error(y_test, combined_predictions))

In [20]:
print(f'RMSE SVM: {rmse_svm}')
print(f'RMSE LSTM: {rmse_lstm}')
print(f'RMSE Combined: {rmse_combined}')

RMSE SVM: 0.09621961941189958
RMSE LSTM: 0.04265634133302953
RMSE Combined: 0.0686496109395059


In [21]:
# Visualización con Plotly
def plot_forecast(dates_test, y_test, svm_predictions, lstm_predictions, combined_predictions):
    fig = go.Figure()

    fig.add_trace(go.Scatter(x=dates_test, y=y_test, mode='lines', name='Actual Price'))
    fig.add_trace(go.Scatter(x=dates_test, y=svm_predictions, mode='lines', name='SVM Predictions'))
    fig.add_trace(go.Scatter(x=dates_test, y=lstm_predictions, mode='lines', name='LSTM Predictions'))
    fig.add_trace(go.Scatter(x=dates_test, y=combined_predictions, mode='lines', name='Combined Predictions'))

    fig.update_layout(title='Stock Price Prediction',
                      xaxis_title='Date',
                      yaxis_title='Normalized Price',
                      template='plotly_dark')

    fig.show()

In [22]:
plot_forecast(dates_test, y_test, svm_predictions, lstm_predictions, combined_predictions)