In [None]:
# Mount Google Drive (Colab only) so the paths under /content/drive used by the
# later cells for the input CSVs and the results JSON are reachable.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import json
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

warnings.filterwarnings('ignore')

In [None]:

# Load the TSLA feature table from Drive for a first inspection in the next cells.
df = pd.read_csv('/content/drive/MyDrive/data scientist/FinanceResearch/data/TSLA.csv')

In [None]:
# Preview the first 15 rows to check the column layout.
df.head(15)

Unnamed: 0,Date,Close,lag_1,lag_5,lag_10,r1,r2,r3,r4,r5,...,r11,r12,r13,r14,r15,RSI,Momentum,TrueRange,ATR,ParabolicSAR
0,2010-06-29,1.588667,1.592667,1.592667,1.592667,-0.002515,-0.002515,-0.002515,-0.002515,0.274437,...,0.274437,-0.079956,-0.079956,-0.079956,-0.079956,43.408794,-0.27,0.497334,0.211714,1.592667
1,2010-06-30,1.464,1.588667,1.592667,1.592667,-0.002515,-0.002515,-0.002515,-0.002515,0.165114,...,0.274437,-0.101533,-0.079956,-0.079956,-0.079956,43.408794,-0.27,0.474667,0.211714,1.588667
2,2010-07-01,1.28,1.464,1.592667,1.592667,-0.081723,-0.002515,-0.002515,-0.002515,0.036139,...,0.274437,-0.209734,-0.101533,-0.079956,-0.079956,43.408794,-0.27,0.376667,0.211714,1.464
3,2010-07-02,1.074,1.28,1.592667,1.592667,-0.134312,-0.081723,-0.002515,-0.002515,0.004339,...,0.274437,-0.206436,-0.209734,-0.101533,-0.079956,43.408794,-0.27,0.292667,0.211714,1.28
4,2010-07-06,1.053333,1.074,1.592667,1.592667,-0.17547,-0.134312,-0.081723,-0.002515,0.0,...,0.165114,-0.233825,-0.206436,-0.209734,-0.101533,43.408794,-0.27,0.278,0.211714,1.074
5,2010-07-07,1.164,1.053333,1.588667,1.592667,-0.019431,-0.17547,-0.134312,-0.081723,0.013928,...,0.036139,-0.090565,-0.233825,-0.206436,-0.209734,43.408794,-0.27,0.11,0.211714,1.053333
6,2010-07-08,1.16,1.164,1.464,1.592667,0.099903,-0.019431,-0.17547,-0.134312,0.082042,...,0.004339,-0.035955,-0.090565,-0.233825,-0.206436,43.408794,-0.27,0.13,0.211714,1.164
7,2010-07-09,1.136667,1.16,1.28,1.592667,-0.003442,0.099903,-0.019431,-0.17547,0.018039,...,0.0,-0.060376,-0.035955,-0.090565,-0.233825,43.408794,-0.27,0.09,0.211714,1.16
8,2010-07-12,1.209333,1.136667,1.074,1.592667,-0.02032,-0.003442,0.099903,-0.019431,0.006663,...,0.013928,-0.054377,-0.060376,-0.035955,-0.090565,43.408794,-0.27,0.071334,0.211714,1.136667
9,2010-07-13,1.322667,1.209333,1.053333,1.592667,0.061969,-0.02032,-0.003442,0.099903,0.069415,...,0.082042,-0.028581,-0.054377,-0.060376,-0.035955,43.408794,-0.27,0.116,0.211714,1.209333


In [None]:
# Count missing values per column.
df.isna().sum()

Unnamed: 0,0
Date,0
Close,1
lag_1,0
lag_5,0
lag_10,0
r1,0
r2,0
r3,0
r4,0
r5,0


In [None]:
def _make_sequences(features, targets, sequence_length):
    """Pair every run of `sequence_length` feature rows with the target of the row right after it."""
    n_windows = len(features) - sequence_length
    windows = np.array([features[i:i + sequence_length] for i in range(n_windows)])
    labels = np.array([targets[i + sequence_length] for i in range(n_windows)])
    return windows, labels


def prepare_data(stock_data, sequence_length=10, target_column='Close', train_size=325, test_size=125, step=125):
    """
    Build rolling train/test windows, scaled and shaped for both the LSTM and XGBoost models.

    The frame is sorted by 'Date' and walked in strides of `step` rows. Each window
    takes `train_size` rows for training followed by `test_size` rows for testing.
    Two MinMaxScaler instances (features, target) are fitted on the training rows
    only and then applied to the test rows.

    Parameters:
    -----------
    stock_data : pandas DataFrame
        Pre-processed stock data containing a 'Date' column, `target_column` and
        the feature columns. It is not modified; the function works on a copy.
    sequence_length : int
        Number of consecutive rows in one LSTM input sequence. Must be smaller
        than `train_size`.
    target_column : str
        Name of the column to predict.
    train_size : int
        Number of rows in each training window.
    test_size : int
        Number of rows in each test window.
    step : int
        Number of rows the window start moves forward between iterations.

    Returns:
    --------
    list of dict, one per window, with the keys:
        'xgboost'     : X_train, y_train, X_test, y_test (2-D features, 1-D targets),
                        feature_names, scaler_X, scaler_y
        'lstm'        : X_train, y_train, X_test, y_test (3-D feature sequences, each
                        labelled with the scaled target of the row after the sequence),
                        scaler_X, scaler_y, sequence_length
        'dates_test'  : dates of the `test_size` evaluated rows
        'actual_test' : unscaled target values of those rows
    The list is empty when the data holds fewer than train_size + test_size usable rows.

    Raises:
    -------
    ValueError
        If a size argument is not positive, or sequence_length >= train_size.
    """
    if min(sequence_length, train_size, test_size, step) < 1:
        raise ValueError("sequence_length, train_size, test_size and step must all be >= 1")
    if sequence_length >= train_size:
        # A larger look-back would leave no training sequences and would make the
        # test slice below start at a negative (wrapping) position.
        raise ValueError("sequence_length must be smaller than train_size")

    # Work on a private copy so the caller's frame keeps its original 'Date' column.
    frame = stock_data.copy()
    frame['Date'] = pd.to_datetime(frame['Date'])
    # Rows without a target cannot be used for training or scoring.
    frame = frame.dropna(subset=[target_column]).sort_values('Date').reset_index(drop=True)

    # Columns named 'Unnamed: N' are what pandas produces for a header-less CSV
    # column (here the exported row index); a row counter is not a model feature.
    feature_columns = [
        col for col in frame.columns
        if col not in ('Date', target_column) and not str(col).startswith('Unnamed:')
    ]

    datasets = []
    window_size = train_size + test_size

    # "+ 1" keeps the last window that fits exactly into the available rows.
    for start_idx in range(0, len(frame) - window_size + 1, step):
        train_end = start_idx + train_size
        df_train = frame.iloc[start_idx:train_end]
        # Prepend the last `sequence_length` training rows so the first test
        # target already has a full look-back sequence.
        df_test = frame.iloc[train_end - sequence_length:train_end + test_size]

        X_train = df_train[feature_columns].values
        X_test = df_test[feature_columns].values
        y_train = df_train[target_column].values
        y_test = df_test[target_column].values

        # Fit the scalers on the training rows only, then reuse them for the test rows.
        scaler_X = MinMaxScaler(feature_range=(0, 1))
        scaler_y = MinMaxScaler(feature_range=(0, 1))

        X_train_scaled = scaler_X.fit_transform(X_train)
        X_test_scaled = scaler_X.transform(X_test)

        y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1, 1))
        y_test_scaled = scaler_y.transform(y_test.reshape(-1, 1))

        # LSTM inputs: sliding sequences over the scaled rows.
        X_sequences, y_sequences = _make_sequences(X_train_scaled, y_train_scaled, sequence_length)
        X_test_sequences, y_test_sequences = _make_sequences(X_test_scaled, y_test_scaled, sequence_length)

        datasets.append({
            'xgboost': {
                'X_train': X_train_scaled,
                'y_train': y_train_scaled.flatten(),
                # Drop the look-back rows so both models are scored on the same rows.
                'X_test': X_test_scaled[sequence_length:],
                'y_test': y_test_scaled.flatten()[sequence_length:],
                'feature_names': feature_columns,
                'scaler_X': scaler_X,
                'scaler_y': scaler_y
            },
            'lstm': {
                'X_train': X_sequences,
                'y_train': y_sequences,
                'X_test': X_test_sequences,
                'y_test': y_test_sequences,
                'scaler_y': scaler_y,
                'scaler_X': scaler_X,
                'sequence_length': sequence_length
            },
            'dates_test': df_test['Date'].values[sequence_length:],
            'actual_test': y_test[sequence_length:]
        })

    return datasets


In [None]:
def build_lstm_model(prepared_data, units=50, dropout_rate=0.2, learning_rate=0.001, epochs=100, batch_size=32, patience=10):
    """
    Build, train and apply a two-layer stacked LSTM regressor.

    Parameters:
    -----------
    prepared_data : dict
        One window dictionary; only its 'lstm' entry is read
        ('X_train', 'y_train', 'X_test', 'scaler_y').
    units : int
        Number of units in each LSTM layer.
    dropout_rate : float
        Dropout rate applied after each LSTM layer.
    learning_rate : float
        Learning rate passed to the Adam optimizer.
    epochs : int
        Maximum number of training epochs.
    batch_size : int
        Mini-batch size.
    patience : int
        Epochs without val_loss improvement before early stopping.

    Returns:
    --------
    dict with:
        'model'              : the trained Keras model
        'history'            : the object returned by model.fit
        'predictions'        : 1-D test predictions mapped back through scaler_y
        'predictions_scaled' : 1-D raw (scaled) test predictions
    """
    lstm_data = prepared_data['lstm']
    X_train = lstm_data['X_train']
    y_train = lstm_data['y_train']
    X_test = lstm_data['X_test']
    scaler_y = lstm_data['scaler_y']

    # Per-sample shape taken from axes 1 and 2 of the training array.
    input_shape = (X_train.shape[1], X_train.shape[2])

    model = Sequential()
    model.add(LSTM(units=units, return_sequences=True, input_shape=input_shape))
    model.add(Dropout(dropout_rate))
    model.add(LSTM(units=units, return_sequences=False))
    model.add(Dropout(dropout_rate))
    model.add(Dense(units=1))

    # Instantiate Adam explicitly so the learning_rate argument takes effect;
    # the 'adam' string alias would always use the Keras default rate.
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mean_squared_error')

    # Stop once val_loss stalls for `patience` epochs and roll back to the best weights.
    early_stopping = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)

    history = model.fit(
        X_train, y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=0.1,
        callbacks=[early_stopping],
        verbose=1
    )

    y_pred_scaled = model.predict(X_test)
    # Map predictions back through the target scaler.
    y_pred = scaler_y.inverse_transform(y_pred_scaled)

    return {
        'model': model,
        'history': history,
        'predictions': y_pred.flatten(),
        'predictions_scaled': y_pred_scaled.flatten()
    }

In [None]:
def build_xgboost_model(prepared_data, max_depth=7, learning_rate=0.1, n_estimators=100, early_stopping_rounds=10):
    """
    Train an XGBoost regressor on the scaled data and predict the test rows.

    Parameters:
    -----------
    prepared_data : dict
        One window dictionary; only its 'xgboost' entry is read
        ('X_train', 'y_train', 'X_test', 'feature_names', 'scaler_y').
    max_depth : int
        Maximum tree depth.
    learning_rate : float
        Boosting learning rate.
    n_estimators : int
        Maximum number of boosted trees.
    early_stopping_rounds : int or None
        Boosting rounds without improvement on the validation split before
        training stops; None disables early stopping.

    Returns:
    --------
    dict with:
        'model'              : the fitted XGBRegressor
        'predictions'        : test predictions mapped back through scaler_y
        'predictions_scaled' : raw (scaled) test predictions
        'feature_importance' : {feature name: importance}
    """
    xgb_data = prepared_data['xgboost']
    X_train = xgb_data['X_train']
    y_train = xgb_data['y_train']
    X_test = xgb_data['X_test']
    feature_names = xgb_data['feature_names']
    scaler_y = xgb_data['scaler_y']

    # Hold out the LAST 10% of the rows without shuffling: the rows are in
    # time order, so a random split would validate on days that lie between
    # training days and make the early-stopping signal look better than it is.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.1, shuffle=False
    )

    # NOTE(review): passing early_stopping_rounds to the constructor needs a
    # recent xgboost release (believed 1.6+) - confirm against the pinned version.
    model = xgb.XGBRegressor(
        objective='reg:squarederror',
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        early_stopping_rounds=early_stopping_rounds,
        random_state=42,
        n_jobs=-1
    )

    model.fit(
        X_fit, y_fit,
        eval_set=[(X_val, y_val)],  # monitored for early stopping
        verbose=False
    )

    y_pred_scaled = model.predict(X_test)

    # inverse_transform expects a column vector; flatten back to 1-D afterwards.
    y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()
    feature_importance_dict = dict(zip(feature_names, model.feature_importances_))

    return {
        'model': model,
        'predictions': y_pred,
        'predictions_scaled': y_pred_scaled,
        'feature_importance': feature_importance_dict
    }

In [None]:
def evaluate_models(actual, lstm_pred, xgb_pred):
    """
    Score the LSTM and XGBoost predictions against the actual values and print a report.

    Parameters:
    -----------
    actual, lstm_pred, xgb_pred : array-like exposing .tolist() (e.g. numpy arrays)
        Actual target values and the two models' predictions for the same rows.

    Returns:
    --------
    (results, all_predicts)
        results      : {'lstm': {...}, 'xgboost': {...}}, each holding 'rmse', 'mae', 'r2', 'mse'
        all_predicts : {'actual', 'lstm_predict', 'xgboost_predict'} as plain Python lists
    """
    def _score(predicted):
        # Metric set for one model, in the key order used in the summary.
        return {
            'rmse': np.sqrt(mean_squared_error(actual, predicted)),
            'mae': mean_absolute_error(actual, predicted),
            'r2': r2_score(actual, predicted),
            'mse': mean_squared_error(actual, predicted)
        }

    results = {
        'lstm': _score(lstm_pred),
        'xgboost': _score(xgb_pred)
    }

    all_predicts = {
        "actual": actual.tolist(),
        "lstm_predict": lstm_pred.tolist(),
        "xgboost_predict": xgb_pred.tolist()
    }

    # Print the same three-line report for each model.
    report_sections = (
        ("Đánh giá mô hình LSTM:", 'lstm'),
        ("\nĐánh giá mô hình XGBoost:", 'xgboost'),
    )
    for heading, model_key in report_sections:
        scores = results[model_key]
        print(heading)
        print(f"RMSE: {scores['rmse']:.4f}")
        print(f"MAE: {scores['mae']:.4f}")
        print(f"R²: {scores['r2']:.4f}")

    return results, all_predicts

# Xây dựng model

In [None]:
# Rolling-window experiment: for every stock, train an LSTM and an XGBoost model
# on each train/test window and collect their metrics and predictions.
DATA_DIR = '/content/drive/MyDrive/data scientist/FinanceResearch/data'
STOCK_CODES = ['TSLA', 'SSE', 'STAN', 'STJ', 'TSCO']

sequence_length = 60
results_summary = {}

for stock_code in STOCK_CODES:
    print(f"⏳ PROCESSING!, Processing stock code: {stock_code}")
    # Use a dedicated name so the frame inspected in the exploration cells is not overwritten.
    stock_df = pd.read_csv(f'{DATA_DIR}/{stock_code}.csv')
    datasets = prepare_data(stock_df, sequence_length=sequence_length, train_size=750, test_size=250, step=250)
    if not datasets:
        # Make a stock with no complete train/test window visible instead of silently empty.
        print(f"⚠️ WARNING! No complete train/test window for stock code: {stock_code}")

    stock_results = []
    for data in datasets:
        print("🚀 Running lstm")
        lstm_model = build_lstm_model(data, epochs=50, batch_size=16, patience=10)
        print("🚀 Running xgboost")
        xgb_model = build_xgboost_model(data)

        print("🚀 Evaluating")
        results, all_predicts = evaluate_models(
            data['actual_test'],
            lstm_model['predictions'],
            xgb_model['predictions']
        )

        stock_results.append({
            # Metrics arrive as a dict of dicts; cast the scores to plain floats
            # so the summary does not carry numpy scalar types.
            'results': {
                model_name: {metric: float(value) for metric, value in scores.items()}
                for model_name, scores in results.items()
            },
            # evaluate_models already returns plain Python lists here.
            'predictions': dict(all_predicts),
            'dates_test': list(data['dates_test'])
        })

    results_summary[stock_code] = stock_results
    print(f"✅ DONE! Finished processing stock code: {stock_code}")


Epoch 1/50
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 55ms/step - loss: 0.0379 - val_loss: 0.1733
Epoch 2/50
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 44ms/step - loss: 0.0038 - val_loss: 0.1474
Epoch 3/50
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 44ms/step - loss: 0.0026 - val_loss: 0.1097
Epoch 4/50
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 46ms/step - loss: 0.0017 - val_loss: 0.0765
Epoch 5/50
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 59ms/step - loss: 0.0014 - val_loss: 0.0818
Epoch 6/50
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 63ms/step - loss: 0.0013 - val_loss: 0.0806
Epoch 7/50
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 42ms/step - loss: 0.0011 - val_loss: 0.0728
Epoch 8/50
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 44ms/step - loss: 6.8525e-04 - val_loss: 0.0671
Epoch 9/50
[1m39/39[0m [32m━━━━━━━━━━━━━━



[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 73ms/step
Đánh giá mô hình LSTM:
RMSE: 0.8276
MAE: 0.6730
R²: 0.7947

Đánh giá mô hình XGBoost:
RMSE: 0.4642
MAE: 0.3521
R²: 0.9354
Epoch 1/50
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 53ms/step - loss: 0.1142 - val_loss: 0.0226
Epoch 2/50
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 45ms/step - loss: 0.0148 - val_loss: 0.0065
Epoch 3/50
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 44ms/step - loss: 0.0110 - val_loss: 0.0059
Epoch 4/50
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 70ms/step - loss: 0.0093 - val_loss: 0.0048
Epoch 5/50
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 45ms/step - loss: 0.0093 - val_loss: 0.0134
Epoch 6/50
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 44ms/step - loss: 0.0087 - val_loss: 0.0041
Epoch 7/50
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 44ms/step - loss:



[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 60ms/step
Đánh giá mô hình LSTM:
RMSE: 0.9137
MAE: 0.6444
R²: 0.9095

Đánh giá mô hình XGBoost:
RMSE: 1.6908
MAE: 0.9287
R²: 0.6900
Epoch 1/50
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 80ms/step - loss: 0.0694 - val_loss: 0.0732
Epoch 2/50
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 57ms/step - loss: 0.0072 - val_loss: 0.0183
Epoch 3/50
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 45ms/step - loss: 0.0042 - val_loss: 0.0235
Epoch 4/50
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 45ms/step - loss: 0.0045 - val_loss: 0.0188
Epoch 5/50
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 43ms/step - loss: 0.0032 - val_loss: 0.0195
Epoch 6/50
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 46ms/step - loss: 0.0035 - val_loss: 0.0257
Epoch 7/50
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 71ms/step - loss:

In [None]:
# Write the per-stock results to Drive as indented JSON. default=str turns any
# value the json module cannot encode natively into its string form.
summary_path = '/content/drive/MyDrive/data scientist/FinanceResearch/results_summary.json'
with open(summary_path, 'w') as summary_file:
    summary_file.write(json.dumps(results_summary, indent=4, default=str))

File json chứa thông tin kết quả chạy các mã cổ phiếu, trong đó kết quả của mỗi mã bao gồm:
- Kết quả của từng lần huấn luyện của 2 model (XGBoost và LSTM), kèm khoảng thời gian test (250 ngày)
- Các thông số đánh giá: RMSE, MAE, R², MSE
- Giá trị thực tế và giá trị dự đoán của 2 model trong thời gian test