In [82]:
import json
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
warnings.filterwarnings('ignore')

In [83]:
# Load the TSLA CSV (price columns plus the indicator columns stored alongside them) for inspection.
df = pd.read_csv('../data/TSLA.csv')

In [84]:
# Preview the first rows to check the column layout and the NaNs at the start of the indicator columns.
df.head(15)

Unnamed: 0,Date,Close,High,Low,Open,Volume,r1,r2,r3,r4,...,r10,r11,r13,r14,r15,RSI,Momentum,TrueRange,ATR,ParabolicSAR
0,2020-01-02,28.684,28.713333,28.114,28.299999,142981500,,,,,...,,-0.006594,,,,,,0.599333,,28.684
1,2020-01-03,29.534,30.266666,29.128,29.366667,266677500,0.029203,,,,...,,-0.00816,-0.006594,,,,,1.582666,,29.534
2,2020-01-06,30.102667,30.104,29.333332,29.364668,151995000,0.019072,0.029203,,,...,0.0145,-0.001068,-0.00816,-0.006594,,,,0.770668,,30.102667
3,2020-01-07,31.270666,31.441999,30.224001,30.76,268231500,0.038067,0.019072,0.029203,,...,0.030187,-0.017579,-0.001068,-0.00816,-0.006594,,,1.339333,,31.270666
4,2020-01-08,32.809334,33.232666,31.215334,31.58,467164500,0.048033,0.038067,0.019072,0.029203,...,0.024866,-0.011615,-0.017579,-0.001068,-0.00816,,,2.017332,,32.809334
5,2020-01-09,32.089333,33.253334,31.524668,33.139999,426606000,-0.022189,0.048033,0.038067,0.019072,...,0.021929,-0.049971,-0.011615,-0.017579,-0.001068,,,1.728666,,32.089333
6,2020-01-10,31.876667,32.329334,31.58,32.119331,194392500,-0.006649,-0.022189,0.048033,0.038067,...,0.051009,-0.016934,-0.049971,-0.011615,-0.017579,,,0.749334,,31.876667
7,2020-01-13,34.990665,35.042,32.799999,32.900002,397764000,0.093207,-0.006649,-0.022189,0.048033,...,0.003414,-0.003044,-0.016934,-0.049971,-0.011615,,,3.165333,,34.990665
8,2020-01-14,35.861332,36.493999,34.993332,36.284,434943000,0.024578,0.093207,-0.006649,-0.022189,...,0.006517,-0.036219,-0.003044,-0.016934,-0.049971,,,1.503334,,35.861332
9,2020-01-15,34.566666,35.855999,34.452667,35.317333,260532000,-0.03677,0.024578,0.093207,-0.006649,...,0.063075,-0.024787,-0.036219,-0.003044,-0.016934,,,1.408665,,34.566666


In [85]:
# Count missing values per column.
df.isna().sum()

Date             0
Close            0
High             0
Low              0
Open             0
Volume           0
r1               1
r2               2
r3               3
r4               4
r5               0
r6               1
r7               2
r8               3
r9               1
r10              2
r11              0
r13              1
r14              2
r15              3
RSI             13
Momentum        10
TrueRange        0
ATR             13
ParabolicSAR     0
dtype: int64

In [136]:
def _make_sequences(features, targets, sequence_length):
    """Pair each run of ``sequence_length`` feature rows with the target of the row that follows it."""
    n_sequences = len(features) - sequence_length
    windows = [features[i:i + sequence_length] for i in range(n_sequences)]
    next_targets = [targets[i + sequence_length] for i in range(n_sequences)]
    return np.array(windows), np.array(next_targets)


def prepare_data(stock_data, sequence_length=10, target_column='Close', train_size=325, test_size=125, step=125):
    """
    Build rolling train/test splits, scaled and shaped for both XGBoost and the LSTM.

    The frame is sorted by ``Date`` and cut into windows of ``train_size + test_size``
    rows, each window starting ``step`` rows after the previous one. For every window
    the feature and target scalers are fitted on the training rows only and then
    applied to the test rows.

    Parameters:
    -----------
    stock_data : pandas DataFrame
        Stock data containing a ``Date`` column, the target column and the feature
        columns. The caller's frame is not modified.
    sequence_length : int
        Number of consecutive rows in one LSTM input sequence. Must satisfy
        ``1 <= sequence_length < train_size``.
    target_column : str
        Name of the column to predict.
    train_size : int
        Number of rows in each training split.
    test_size : int
        Number of rows in each test split.
    step : int
        Number of rows the window is shifted between consecutive splits.

    Returns:
    --------
    list of dict
        One entry per window with the keys ``'xgboost'`` and ``'lstm'`` (scaled
        train/test arrays plus the fitted scalers), ``'dates_test'`` and
        ``'actual_test'`` (dates and unscaled target values of the test rows).
        The list is empty when the frame has fewer than ``train_size + test_size`` rows.

    Raises:
    -------
    ValueError
        If ``sequence_length`` is smaller than 1 or not smaller than ``train_size``.
    """
    if sequence_length < 1 or sequence_length >= train_size:
        raise ValueError(
            f"sequence_length must satisfy 1 <= sequence_length < train_size "
            f"(got sequence_length={sequence_length}, train_size={train_size})"
        )

    # `assign` returns a new frame, so the caller's DataFrame keeps its original Date column.
    stock_data = (
        stock_data
        .assign(Date=pd.to_datetime(stock_data['Date']))
        .sort_values('Date')
        .reset_index(drop=True)
    )

    # Every column except the date and the target is used as a feature.
    feature_columns = [col for col in stock_data.columns if col not in ['Date', target_column]]

    window_size = train_size + test_size
    # +1 so that a window ending exactly on the last row is still produced.
    last_start_exclusive = len(stock_data) - window_size + 1

    datasets = []
    for start_idx in range(0, last_start_exclusive, step):
        train_end = start_idx + train_size
        df_train = stock_data.iloc[start_idx:train_end]
        # The test slice is prefixed with the last `sequence_length` training rows so
        # the first LSTM test sequence predicts the first real test row.
        df_test = stock_data.iloc[train_end - sequence_length:train_end + test_size]

        X_train = df_train[feature_columns].values
        X_test = df_test[feature_columns].values
        y_train = df_train[target_column].values
        y_test = df_test[target_column].values

        # Scale to [0, 1]; both scalers are fitted on the training rows only.
        scaler_X = MinMaxScaler(feature_range=(0, 1))
        scaler_y = MinMaxScaler(feature_range=(0, 1))

        X_train_scaled = scaler_X.fit_transform(X_train)
        X_test_scaled = scaler_X.transform(X_test)

        y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1, 1))
        y_test_scaled = scaler_y.transform(y_test.reshape(-1, 1))

        # LSTM inputs: sliding sequences paired with the next row's target.
        X_sequences, y_sequences = _make_sequences(X_train_scaled, y_train_scaled, sequence_length)
        X_test_sequences, y_test_sequences = _make_sequences(X_test_scaled, y_test_scaled, sequence_length)

        datasets.append({
            # XGBoost works row by row; the warm-up prefix is dropped from its test arrays.
            'xgboost': {
                'X_train': X_train_scaled,
                'y_train': y_train_scaled.flatten(),
                'X_test': X_test_scaled[sequence_length:],
                'y_test': y_test_scaled.flatten()[sequence_length:],
                'feature_names': feature_columns,
                'scaler_X': scaler_X,
                'scaler_y': scaler_y
            },
            'lstm': {
                'X_train': X_sequences,
                'y_train': y_sequences,
                'X_test': X_test_sequences,
                'y_test': y_test_sequences,
                'scaler_y': scaler_y,
                'scaler_X': scaler_X,
                'sequence_length': sequence_length
            },
            'dates_test': df_test['Date'].values[sequence_length:],
            'actual_test': y_test[sequence_length:]
        })

    return datasets


In [137]:
def build_lstm_model(prepared_data, units=50, dropout_rate=0.2, learning_rate=0.001, epochs=100, batch_size=32, patience=10):
    """
    Build and train a two-layer stacked LSTM regressor, then predict the test sequences.

    Parameters:
    -----------
    prepared_data : dict
        One entry of the list returned by ``prepare_data``; only its ``'lstm'``
        sub-dictionary (``X_train``, ``y_train``, ``X_test``, ``scaler_y``) is read.
    units : int
        Number of units in each LSTM layer.
    dropout_rate : float
        Dropout rate applied after each LSTM layer.
    learning_rate : float
        Learning rate passed to the Adam optimizer.
    epochs : int
        Maximum number of training epochs.
    batch_size : int
        Mini-batch size.
    patience : int
        Number of epochs without ``val_loss`` improvement before training stops.

    Returns:
    --------
    dict
        ``'model'`` (the trained Keras model), ``'history'`` (the object returned by
        ``fit``), ``'predictions'`` (test predictions mapped back to the original
        target scale, flattened) and ``'predictions_scaled'`` (raw model outputs,
        flattened).
    """
    lstm_data = prepared_data['lstm']
    X_train = lstm_data['X_train']
    y_train = lstm_data['y_train']
    X_test = lstm_data['X_test']
    scaler_y = lstm_data['scaler_y']

    # (timesteps, features) of a single training sequence.
    input_shape = (X_train.shape[1], X_train.shape[2])

    model = Sequential([
        LSTM(units=units, return_sequences=True, input_shape=input_shape),
        Dropout(dropout_rate),
        LSTM(units=units, return_sequences=False),
        Dropout(dropout_rate),
        Dense(units=1),
    ])

    # Use an explicit optimizer instance so the `learning_rate` argument takes effect.
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mean_squared_error')

    # Stop once val_loss stalls and roll back to the best weights seen.
    early_stopping = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)

    history = model.fit(
        X_train, y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=0.1,
        callbacks=[early_stopping],
        verbose=1
    )

    y_pred_scaled = model.predict(X_test)
    # Undo the target scaling so predictions are comparable with the actual prices.
    y_pred = scaler_y.inverse_transform(y_pred_scaled)

    return {
        'model': model,
        'history': history,
        'predictions': y_pred.flatten(),
        'predictions_scaled': y_pred_scaled.flatten()
    }

In [138]:
def build_xgboost_model(prepared_data, max_depth=7, learning_rate=0.1, n_estimators=100, early_stopping_rounds=10):
    """
    Build and train an XGBoost regressor on the scaled data, then predict the test rows.

    Parameters:
    -----------
    prepared_data : dict
        One entry of the list returned by ``prepare_data``; only its ``'xgboost'``
        sub-dictionary (``X_train``, ``y_train``, ``X_test``, ``feature_names``,
        ``scaler_y``) is read.
    max_depth : int
        Maximum tree depth.
    learning_rate : float
        Boosting learning rate.
    n_estimators : int
        Maximum number of boosted trees.
    early_stopping_rounds : int
        Number of boosting rounds without improvement on the validation set
        before training stops. ``None`` disables early stopping.

    Returns:
    --------
    dict
        ``'model'`` (the fitted regressor), ``'predictions'`` (test predictions
        mapped back to the original target scale), ``'predictions_scaled'`` (raw
        model outputs) and ``'feature_importance'`` (feature name -> importance).
    """
    xgb_data = prepared_data['xgboost']
    X_train = xgb_data['X_train']
    y_train = xgb_data['y_train']
    X_test = xgb_data['X_test']
    feature_names = xgb_data['feature_names']
    scaler_y = xgb_data['scaler_y']

    # The rows are in chronological order, so hold out the most recent 10% for
    # validation instead of a shuffled sample; a random split would validate on
    # rows that precede some of the training rows.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.1, shuffle=False
    )

    # early_stopping_rounds goes to the constructor (the place recent XGBoost
    # releases expect it); previously the argument was accepted but never used.
    model = xgb.XGBRegressor(
        objective='reg:squarederror',
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        early_stopping_rounds=early_stopping_rounds,
        random_state=42,
        n_jobs=-1
    )

    model.fit(
        X_fit, y_fit,
        eval_set=[(X_val, y_val)],
        verbose=False
    )

    y_pred_scaled = model.predict(X_test)

    # Undo the target scaling so predictions are comparable with the actual prices.
    y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()
    feature_importance_dict = dict(zip(feature_names, model.feature_importances_))

    return {
        'model': model,
        'predictions': y_pred,
        'predictions_scaled': y_pred_scaled,
        'feature_importance': feature_importance_dict
    }

In [155]:
def _regression_metrics(actual, predicted):
    """Return RMSE, MAE, R² and MSE of `predicted` against `actual` as a dict."""
    mse = mean_squared_error(actual, predicted)
    return {
        'rmse': np.sqrt(mse),
        'mae': mean_absolute_error(actual, predicted),
        'r2': r2_score(actual, predicted),
        'mse': mse
    }


def evaluate_models(actual, lstm_pred, xgb_pred):
    """
    Score the LSTM and XGBoost predictions against the actual values and print a summary.

    Parameters:
    -----------
    actual : array-like
        True target values for the test period.
    lstm_pred : array-like
        LSTM predictions on the same scale and in the same order as ``actual``.
    xgb_pred : array-like
        XGBoost predictions on the same scale and in the same order as ``actual``.

    Returns:
    --------
    tuple (results, all_predicts)
        ``results`` maps ``'lstm'`` and ``'xgboost'`` to a dict with the keys
        ``'rmse'``, ``'mae'``, ``'r2'`` and ``'mse'``. ``all_predicts`` holds the
        three input series as plain lists under ``'actual'``, ``'lstm_predict'``
        and ``'xgboost_predict'``.
    """
    # Accept lists / Series as well as ndarrays; `.tolist()` below needs an array.
    actual = np.asarray(actual)
    lstm_pred = np.asarray(lstm_pred)
    xgb_pred = np.asarray(xgb_pred)

    results = {
        'lstm': _regression_metrics(actual, lstm_pred),
        'xgboost': _regression_metrics(actual, xgb_pred)
    }

    all_predicts = {"actual": actual.tolist(),
                    "lstm_predict": lstm_pred.tolist(),
                    "xgboost_predict": xgb_pred.tolist()}

    # Print the same three headline metrics for each model.
    headings = (
        ('lstm', "Đánh giá mô hình LSTM:"),
        ('xgboost', "\nĐánh giá mô hình XGBoost:"),
    )
    for model_key, heading in headings:
        print(heading)
        print(f"RMSE: {results[model_key]['rmse']:.4f}")
        print(f"MAE: {results[model_key]['mae']:.4f}")
        print(f"R²: {results[model_key]['r2']:.4f}")

    return results, all_predicts

# Xây dựng model

In [160]:
# Rolling-window experiment: for every stock, train both models on each window
# and collect metrics, predictions and test dates for later export.
STOCK_CODES = ['TSLA', 'SSE', 'STAN', 'STJ', 'TSCO']
TRAIN_SIZE = 350   # rows per training split
TEST_SIZE = 125    # rows per test split
STEP = 125         # rows the window advances between splits

sequence_length = 60
results_summary = {}
for stock_code in STOCK_CODES:

    # NOTE(review): `read_data` is not defined in any cell visible in this notebook
    # (the first load uses `pd.read_csv`); confirm it is defined before this cell
    # so that Restart Kernel -> Run All works.
    df = read_data(f'../data/{stock_code}.csv')
    datasets = prepare_data(df, sequence_length=sequence_length, train_size=TRAIN_SIZE, test_size=TEST_SIZE, step=STEP)

    stock_results = []
    for data in datasets:
        lstm_model = build_lstm_model(data, epochs=50, batch_size=16, patience=10)
        xgb_model = build_xgboost_model(data)

        results, all_predicts = evaluate_models(
            data['actual_test'],
            lstm_model['predictions'],
            xgb_model['predictions']
        )

        # `results` is a dict of metric dicts and `all_predicts` a dict of plain
        # lists, so both can be stored as returned.
        stock_results.append({
            'results': results,
            'predictions': all_predicts,
            'dates_test': list(data['dates_test'])
        })

    results_summary[stock_code] = stock_results
   

Epoch 1/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 106ms/step - loss: 0.2458 - val_loss: 0.0073
Epoch 2/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 61ms/step - loss: 0.0221 - val_loss: 0.0049
Epoch 3/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 74ms/step - loss: 0.0140 - val_loss: 0.0059
Epoch 4/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 71ms/step - loss: 0.0093 - val_loss: 0.0141
Epoch 5/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 63ms/step - loss: 0.0079 - val_loss: 0.0070
Epoch 6/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 59ms/step - loss: 0.0070 - val_loss: 0.0035
Epoch 7/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 70ms/step - loss: 0.0061 - val_loss: 0.0032
Epoch 8/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 65ms/step - loss: 0.0074 - val_loss: 0.0030
Epoch 9/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━

In [161]:
# Persist the collected results; default=str stringifies any value the json
# module cannot encode natively.
with open('results_summary.json', 'w') as out_file:
    out_file.write(json.dumps(results_summary, indent=4, default=str))

File JSON chứa thông tin kết quả chạy các mã cổ phiếu, trong đó kết quả của mỗi mã bao gồm:
- Kết quả của mỗi lần huấn luyện của 2 mô hình (XGBoost và LSTM), kèm khoảng thời gian test (125 ngày)
- Các thông số đánh giá: RMSE, MAE, R², MSE
- Giá trị thực tế và giá trị dự đoán của 2 mô hình trong thời gian test