In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import seaborn as sns

In [4]:
file_path = "../data/btc_data_features.csv"
df = pd.read_csv(file_path)

Prediccion Tendencia del Precio de BTC con Random Forest

In [5]:
# Eliminamos filas con valores NaN generados por indicadores técnicos
df.dropna(inplace=True)

# Seleccionamos características y variable objetivo
features = ["SMA_20", "SMA_50", "RSI_14", "BB_Upper", "BB_Lower", "volume"]
X = df[features]
y = df["tendencia"]

In [6]:
# Dividimos datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [41]:
#Entrenamiento de modelo

In [42]:
# model = RandomForestClassifier(n_estimators=100, random_state=42)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [43]:
# Definir los hiperparámetros a optimizar
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}

In [44]:
# Realizamos la búsqueda de hiperparámetros
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


In [45]:
# Mejor modelo
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)

In [46]:
# Evaluación
print("Mejores hiperparámetros:", grid_search.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Mejores hiperparámetros: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Accuracy: 0.7828947368421053
              precision    recall  f1-score   support

           0       0.78      0.74      0.76        70
           1       0.79      0.82      0.80        82

    accuracy                           0.78       152
   macro avg       0.78      0.78      0.78       152
weighted avg       0.78      0.78      0.78       152



Modelo con redes neuronales 

In [47]:
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.callbacks import EarlyStopping

Redes LTSM

In [48]:
# Seleccion características y variable objetivo
features = ["open", "high", "low", "close", "SMA_20", "SMA_50", "RSI_14", "BB_Upper", "BB_Lower", "volume"]
X = df[features]
y = df["tendencia"]

# Normalizar los datos
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
# import joblib

# # Guardar el scaler
# joblib.dump(scaler, "scaler.pkl")


['scaler.pkl']

In [49]:

# Reestructuracion de los datos para LSTM (samples, timesteps, features)
time_steps = 20  # Usaremos 20 días de datos pasados para predecir
X_lstm = []
y_lstm = []
y = np.array(y)  # Asegurar que y es un array de NumPy
for i in range(len(X) - time_steps):
    X_lstm.append(X[i : i + time_steps])
    y_lstm.append(y[i + time_steps])
X_lstm, y_lstm = np.array(X_lstm), np.array(y_lstm)



In [50]:
# Dividimos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_lstm, y_lstm, test_size=0.2, random_state=42, stratify=y_lstm)

In [51]:
# Definicion el modelo LSTM
model = tf.keras.Sequential([
    Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True, input_shape=(time_steps, X_train.shape[2]))),
    tf.keras.layers.Dropout(0.3),
    Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.LSTM(32, return_sequences=False),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

  super().__init__(**kwargs)


In [52]:
# Compilacion del modelo
model.compile(optimizer='nadam', loss='binary_crossentropy', metrics=['accuracy'])

In [53]:
# Entrenamos el modelo
model.fit(X_train, y_train, epochs=500, batch_size=64, validation_data=(X_test, y_test), verbose=1)

Epoch 1/500
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 138ms/step - accuracy: 0.5362 - loss: 0.6971 - val_accuracy: 0.5743 - val_loss: 0.6648
Epoch 2/500
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 0.5093 - loss: 0.6933 - val_accuracy: 0.6622 - val_loss: 0.6586
Epoch 3/500
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 52ms/step - accuracy: 0.5700 - loss: 0.6827 - val_accuracy: 0.6081 - val_loss: 0.6596
Epoch 4/500
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 54ms/step - accuracy: 0.5818 - loss: 0.6685 - val_accuracy: 0.6824 - val_loss: 0.6410
Epoch 5/500
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 55ms/step - accuracy: 0.5804 - loss: 0.6700 - val_accuracy: 0.6689 - val_loss: 0.6318
Epoch 6/500
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 61ms/step - accuracy: 0.6307 - loss: 0.6453 - val_accuracy: 0.6419 - val_loss: 0.6305
Epoch 7/500
[1m10/10[0m 

<keras.src.callbacks.history.History at 0x1ef93bff590>

In [54]:
# Evaluacion del modelo
y_pred = (model.predict(X_test) > 0.5).astype(int)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 168ms/step
Accuracy: 0.8108108108108109
              precision    recall  f1-score   support

           0       0.83      0.74      0.78        68
           1       0.80      0.88      0.83        80

    accuracy                           0.81       148
   macro avg       0.81      0.81      0.81       148
weighted avg       0.81      0.81      0.81       148



Entre el modelo random forest y las redes neuronales LSTM nos decantamos por la ultima ya que arroja un accuracy mejor, el cual es 0.81

In [None]:
# Guardamos el modelo entrenado en formato .h5
model.save("../api/btc_lstm_model.h5")

