In [1]:
#  Imports
# ---------------------------------------------------------------------
import os  # For folder creation
import yfinance as yf
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle  # For saving/loading models
import json




# Download historical GOLDBEES ETF price data
# ---------------------------------------------------------------------
def download_gold_prices(start_date: datetime, end_date: datetime) -> pd.DataFrame:
    """Download GOLDBEES.BO price history and save it as a CSV under ``Data/``.

    Args:
        start_date: Passed to ``yf.download`` as ``start``.
        end_date: Passed to ``yf.download`` as ``end``.

    Returns:
        DataFrame with the columns Open, High, Low, Close and Volume, indexed
        as returned by yfinance.

    Raises:
        ValueError: If the download returns no rows, so that an empty CSV is
            never written and later steps do not fail with an unrelated error.
    """
    print("\nStep 1: Downloading gold price data (GOLDBEES.BO)...")
    gold = yf.download('GOLDBEES.BO', start=start_date, end=end_date, progress=False)
    print("Download complete.")

    # Fail here rather than persisting and propagating an empty frame.
    if gold is None or gold.empty:
        raise ValueError(
            f"No price data returned for GOLDBEES.BO between {start_date} and {end_date}"
        )

    # The download may come back with two-level columns; keep only the first
    # level (the price field names selected below).
    if isinstance(gold.columns, pd.MultiIndex):
        gold.columns = gold.columns.get_level_values(0)

    gold = gold[['Open', 'High', 'Low', 'Close', 'Volume']].copy()
    gold.columns.name = None  # Remove "Price" label from column index

    print("Current working directory:", os.getcwd())
    # Ensure the 'Data' directory exists
    os.makedirs("Data", exist_ok=True)

    # Save raw data to CSV
    gold.to_csv("Data/GOLDBEES_ETF_price_data.csv")
    print("Saved raw gold price data to Data/GOLDBEES_ETF_price_data.csv")
    return gold


# Technical Indicator Calculation
# ---------------------------------------------------------------------

# RSI calculation (rolling-mean variant)
# ---------------------------------------------------------------------
def calculate_rsi(prices: pd.Series, period: int = 14) -> pd.Series:
    """Relative Strength Index built from rolling means of gains and losses.

    Args:
        prices: Price series.
        period: Rolling window length used for the average gain and loss.

    Returns:
        Series aligned with ``prices``. Entries are NaN until ``period``
        observations are available, and wherever both the average gain and
        the average loss of the window are zero.
    """
    change = prices.diff()

    # Split the day-to-day change into its up and down parts; everything that
    # does not satisfy the condition (including the leading NaN) becomes 0.
    ups = change.where(change > 0, 0)
    downs = -change.where(change < 0, 0)

    avg_gain = ups.rolling(window=period).mean()
    avg_loss = downs.rolling(window=period).mean()

    relative_strength = avg_gain / avg_loss
    return 100 - (100 / (1 + relative_strength))


# Technical Indicator Calculation
# ---------------------------------------------------------------------
def add_technical_indicators(gold: pd.DataFrame) -> pd.DataFrame:
    """Derive return, trend, volatility and momentum indicators from price data.

    Added columns: Returns, MA_5, MA_20, MA_50, Volatility, RSI, BB_upper,
    BB_lower, BB_width, BB_position, MACD, MACD_Signal, MACD_Hist,
    Momentum_10 and ROC_10. Rows holding NaN or +/-inf in any column are
    removed, which drops the warm-up rows of the rolling windows. The result
    is also written to
    ``Data/GOLDBEES_ETF_price_data_technical_indicators.csv``.

    Args:
        gold: Price frame with at least a ``Close`` column. It is not
            modified; the indicators are added to a copy.

    Returns:
        A new DataFrame restricted to the known columns, in a fixed order.
    """
    print("Adding technical indicators...")

    # Work on a copy so the caller's frame is left untouched (and so column
    # assignment never targets a slice of another frame).
    gold = gold.copy()
    close = gold['Close']

    gold['Returns'] = close.pct_change()
    gold['MA_5'] = close.rolling(window=5).mean()
    gold['MA_20'] = close.rolling(window=20).mean()
    gold['MA_50'] = close.rolling(window=50).mean()
    gold['Volatility'] = gold['Returns'].rolling(window=20).std()
    gold['RSI'] = calculate_rsi(close)

    print("Calculating Bollinger Bands...")
    rolling_std = close.rolling(window=20).std()
    gold['BB_upper'] = gold['MA_20'] + (rolling_std * 2)
    gold['BB_lower'] = gold['MA_20'] - (rolling_std * 2)
    gold['BB_width'] = gold['BB_upper'] - gold['BB_lower']
    # A zero band width yields +/-inf (or NaN) here; such rows are dropped below.
    gold['BB_position'] = (close - gold['BB_lower']) / gold['BB_width']

    # MACD and Signal Line
    exp1 = close.ewm(span=12, adjust=False).mean()
    exp2 = close.ewm(span=26, adjust=False).mean()
    gold['MACD'] = exp1 - exp2
    gold['MACD_Signal'] = gold['MACD'].ewm(span=9, adjust=False).mean()
    gold['MACD_Hist'] = gold['MACD'] - gold['MACD_Signal']

    # Momentum (n-day price diff)
    gold['Momentum_10'] = close - close.shift(10)

    # Rate of Change (ROC)
    gold['ROC_10'] = close.pct_change(periods=10)

    # Drop NaNs and infinite values after all calculations
    gold = gold.replace([np.inf, -np.inf], np.nan).dropna()

    # Desired column order. 'Date' is only kept when it is an actual column;
    # columns not listed here are dropped.
    columns_order = [
        'Date', 'Open', 'High', 'Low', 'Close', 'Volume',
        'Returns', 'MA_5', 'MA_20', 'MA_50', 'Volatility', 'RSI',
        'BB_upper', 'BB_lower', 'BB_width', 'BB_position',
        'MACD', 'MACD_Signal', 'MACD_Hist',
        'Momentum_10', 'ROC_10'
    ]
    existing_cols = [col for col in columns_order if col in gold.columns]
    gold = gold[existing_cols]

    print(f"Added indicators to {len(gold)} rows.")

    # Save full DataFrame with indicators; create the folder first so this
    # function also works when it is called on its own.
    os.makedirs("Data", exist_ok=True)
    gold.to_csv("Data/GOLDBEES_ETF_price_data_technical_indicators.csv")
    print("Saved technical indicators to Data/GOLDBEES_ETF_price_data_technical_indicators.csv")

    return gold





# Add a continuous sentiment score and a sentiment label derived from the price trend
# ---------------------------------------------------------------------
def generate_sentiment_from_trend_with_labels(gold: pd.DataFrame, sentiment_today: float = 0.0, seed: int = 42) -> pd.DataFrame:
    """
    Generate numeric sentiment scores and sentiment labels based on price returns.

    Each score is drawn at random from a range chosen by the day-over-day
    return of 'Close'; rows without a return get 0.0. The last row's score
    is then replaced by ``sentiment_today`` and its label is recomputed so
    that score and label always agree. The result is also written to
    ``Data/GOLDBEES_ETF_price_data_technical_indicators_sentiment.csv``.

    Args:
        gold (pd.DataFrame): DataFrame with 'Close' column. Not modified.
        sentiment_today (float): Score written into the last row.
        seed (int): Random seed for reproducibility (seeds the global
            ``random`` module).

    Returns:
        pd.DataFrame: Copy of the input with 'Sentiment' and 'Sentiment_Label'
        columns. A 'Returns' column is added only when the input has none; an
        existing one is kept as-is.
    """
    import random
    random.seed(seed)

    def _label_for(score: float) -> str:
        # Scores within [-0.1, 0.1] count as neutral.
        if score > 0.1:
            return 'positive'
        if score < -0.1:
            return 'negative'
        return 'neutral'

    gold = gold.copy()

    # Returns used for scoring are always recomputed from 'Close'. They are
    # stored only when the frame has no 'Returns' column yet: overwriting an
    # existing column would put a NaN into its first row, and that row would
    # then be discarded by every later dropna().
    returns = gold['Close'].pct_change()
    if 'Returns' not in gold.columns:
        gold['Returns'] = returns

    sentiment_scores = []
    for ret in returns:
        if pd.isna(ret):
            sentiment = 0.0
        elif ret > 0.01:
            sentiment = round(random.uniform(0.5, 1.0), 2)
        elif ret > 0.0:
            sentiment = round(random.uniform(0.1, 0.5), 2)
        elif ret > -0.01:
            sentiment = round(random.uniform(-0.5, -0.1), 2)
        else:
            sentiment = round(random.uniform(-1.0, -0.5), 2)
        sentiment_scores.append(sentiment)

    gold['Sentiment'] = sentiment_scores
    gold['Sentiment_Label'] = [_label_for(score) for score in sentiment_scores]

    # Inject today's sentiment into the last row and keep its label in sync.
    if not gold.empty:
        gold.iloc[-1, gold.columns.get_loc('Sentiment')] = sentiment_today
        gold.iloc[-1, gold.columns.get_loc('Sentiment_Label')] = _label_for(sentiment_today)
        print(f"Injected today's sentiment ({sentiment_today:+.2f}) into the last row.")

    # Save to CSV
    os.makedirs("Data", exist_ok=True)
    gold.to_csv("Data/GOLDBEES_ETF_price_data_technical_indicators_sentiment.csv")
    print("Sentiment columns added and saved to Data/GOLDBEES_ETF_price_data_technical_indicators_sentiment.csv")

    return gold





In [2]:
# Build the modelling frame: download prices, add indicators, then attach the
# sentiment columns. Each of the three steps also writes a CSV under Data/.
start = datetime(2010, 1, 1)
end = datetime(2026, 1, 1)  # handed to yf.download as `end`
gold = download_gold_prices(start, end)
gold = add_technical_indicators(gold)
gold = generate_sentiment_from_trend_with_labels(gold)  # sentiment_today defaults to 0.0


Step 1: Downloading gold price data (GOLDBEES.BO)...
YF.download() has changed argument auto_adjust default to True
Download complete.
Current working directory: d:\Python\04_Python_DA225o\zDL_Project_Gold_Price_07_LangChain
Saved raw gold price data to Data/GOLDBEES_ETF_price_data.csv
Adding technical indicators...
Calculating Bollinger Bands...
Added indicators to 3743 rows.
Saved technical indicators to Data/GOLDBEES_ETF_price_data_technical_indicators.csv
Injected today's sentiment (+0.00) into the last row.
Sentiment columns added and saved to Data/GOLDBEES_ETF_price_data_technical_indicators_sentiment.csv


In [3]:
# Train ARIMAX with technical indicators and save the model.
# ---------------------------------------------------------------------
import os
import pickle
from datetime import datetime
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX

def train_arimax(df: pd.DataFrame, model_dir="Model/Arimax", arima_order=(1, 1, 1)):
    """Fit a SARIMAX model of ``Close`` on indicator/sentiment columns and pickle it.

    The data is put on a business-day frequency (``asfreq('B')``) and gaps
    are forward-filled before fitting. Older ``arimax_*.pkl`` files in
    ``model_dir`` are removed only after the new model has been written, so
    a failed fit never leaves the directory without a model.

    Args:
        df: Frame containing ``Close`` plus every column in ``exog_cols``.
            ``asfreq`` is called on it, so it needs a datetime-like index.
        model_dir: Directory that receives ``arimax_<YYYY-MM-DD>.pkl``.
        arima_order: ``(p, d, q)`` order handed to ``SARIMAX``.

    Returns:
        tuple: ``(fitted results object, path of the pickle file)``.

    Raises:
        ValueError: If a required column is missing from ``df``.
    """
    # -------------------------------
    # Step 1: Setup
    # -------------------------------
    os.makedirs(model_dir, exist_ok=True)
    today_str = datetime.now().strftime("%Y-%m-%d")
    model_filename = f"arimax_{today_str}.pkl"
    model_path = os.path.join(model_dir, model_filename)

    # -------------------------------
    # Step 2: Define Exogenous Features
    # -------------------------------
    # NOTE(review): these are same-day values; forecasting with the fitted
    # model will presumably need matching exog rows for the forecast horizon.
    exog_cols = [
        'Returns', 'MA_5', 'MA_20', 'MA_50', 'Volatility',
        'RSI', 'BB_upper', 'BB_lower', 'BB_width',
        'BB_position', 'Sentiment',
        'MACD', 'MACD_Signal', 'MACD_Hist',
        'Momentum_10', 'ROC_10'
    ]

    for col in exog_cols:
        if col not in df.columns:
            raise ValueError(f"Missing required column: {col}")

    df = df[['Close'] + exog_cols].dropna()
    # Regular business-day index; days missing from the data are filled with
    # the previous row's values.
    df = df.asfreq('B').ffill()

    y = df['Close']
    exog = df[exog_cols]

    # -------------------------------
    # Step 3: Train new model
    # -------------------------------
    model = SARIMAX(endog=y, exog=exog, order=arima_order,
                    enforce_stationarity=False, enforce_invertibility=False)
    model_fit = model.fit(disp=False, method='powell')

    with open(model_path, "wb") as f:
        pickle.dump(model_fit, f)

    # -------------------------------
    # Step 4: Remove superseded models (only now that the new one is on disk)
    # -------------------------------
    for fname in os.listdir(model_dir):
        if fname == model_filename:
            continue
        if fname.startswith("arimax_") and fname.endswith(".pkl"):
            os.remove(os.path.join(model_dir, fname))

    print(f"Saved new ARIMAX model to: {model_path}")
    return model_fit, model_path


In [4]:
# Fit and pickle the ARIMAX model on the enriched `gold` frame.
# Note: `model` and `path` are re-bound by each later training cell.
model, path = train_arimax(gold, model_dir="Model/Arimax", arima_order=(1, 1, 1))

Saved new ARIMAX model to: Model/Arimax\arimax_2025-06-23.pkl


In [5]:
# Trains an XGBoost model using gold price and technical features, then saves the model.
# ---------------------------------------------------------------------
import os
import pickle
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import RobustScaler
import xgboost as xgb


def train_xgboost_model(gold: pd.DataFrame, model_dir="Model/XGBoost", test_size: float = 0.2, random_state: int = 42):
    """Train an XGBoost regressor for the next row's percentage change in ``Close``.

    Only the first ``1 - test_size`` share of the rows (in index order) is
    used for fitting; the remaining rows are left untouched. Features are
    scaled with a ``RobustScaler`` fitted on that training share. Older
    ``xgboost_*.pkl`` files in ``model_dir`` are removed only after the new
    model has been written, so a failed run never deletes the last good model.

    Args:
        gold: Frame with ``Close`` and every column listed in ``feature_cols``.
        model_dir: Directory that receives ``xgboost_<YYYY-MM-DD>.pkl``.
        test_size: Share of the most recent rows held out from training.
        random_state: Seed passed to ``XGBRegressor``.

    Returns:
        tuple: ``(fitted XGBRegressor, path of the pickle file)``.
    """
    os.makedirs(model_dir, exist_ok=True)
    today_str = datetime.now().strftime("%Y-%m-%d")
    model_filename = f"xgboost_{today_str}.pkl"
    model_path = os.path.join(model_dir, model_filename)

    # Feature engineering
    feature_cols = [
        'Returns', 'MA_5', 'MA_20', 'MA_50', 'Volatility',
        'RSI', 'BB_upper', 'BB_lower', 'BB_width',
        'BB_position', 'Sentiment',
        'MACD', 'MACD_Signal', 'MACD_Hist',
        'Momentum_10', 'ROC_10'
    ]

    gold_clean = gold[['Close'] + feature_cols].copy().dropna()
    gold_clean['Close_pct_change_1'] = gold_clean['Close'].pct_change(1)
    gold_clean['Close_pct_change_2'] = gold_clean['Close'].pct_change(2)
    gold_clean['Close_pct_change_3'] = gold_clean['Close'].pct_change(3)
    gold_clean['Close_rolling_std_5'] = gold_clean['Close'].rolling(5).std()
    gold_clean['Close_rolling_std_10'] = gold_clean['Close'].rolling(10).std()
    gold_clean['Close_vs_MA5'] = (gold_clean['Close'] - gold_clean['MA_5']) / gold_clean['MA_5']
    gold_clean['Close_vs_MA20'] = (gold_clean['Close'] - gold_clean['MA_20']) / gold_clean['MA_20']

    # Final features and target
    feature_cols_extended = feature_cols + [
        'Close_pct_change_1', 'Close_pct_change_2', 'Close_pct_change_3',
        'Close_rolling_std_5', 'Close_rolling_std_10',
        'Close_vs_MA5', 'Close_vs_MA20'
    ]

    # Targets are the following row's values, hence shift(-1).
    gold_clean['Target_pct_change'] = gold_clean['Close'].pct_change().shift(-1)
    gold_clean['Target_price'] = gold_clean['Close'].shift(-1)
    gold_clean = gold_clean.dropna()
    # Discard non-finite targets and moves of 100% or more.
    gold_clean = gold_clean[
        (np.isfinite(gold_clean['Target_pct_change'])) &
        (np.abs(gold_clean['Target_pct_change']) < 1.0)
    ]

    # Prepare data: chronological split, training part only.
    X = gold_clean[feature_cols_extended]
    y_pct = gold_clean['Target_pct_change']

    split_idx = int(len(X) * (1 - test_size))
    X_train, y_train = X.iloc[:split_idx], y_pct.iloc[:split_idx]

    # NOTE(review): the fitted scaler is neither returned nor saved, so code
    # that predicts with this model must rebuild an equivalent scaler itself.
    scaler = RobustScaler()
    X_train_scaled = scaler.fit_transform(X_train)

    # Train XGBoost
    model = xgb.XGBRegressor(
        n_estimators=400,
        max_depth=10,
        learning_rate=0.008,
        min_child_weight=1,
        subsample=0.95,
        colsample_bytree=0.9,
        reg_alpha=0.001,
        reg_lambda=0.01,
        gamma=0,
        random_state=random_state,
        objective='reg:squarederror',
        tree_method='hist'
    )

    print(" Training new XGBoost model...")
    model.fit(X_train_scaled, y_train)

    # Save the model
    with open(model_path, 'wb') as f:
        pickle.dump(model, f)

    # Clean up superseded models only now that the new one is safely on disk.
    for fname in os.listdir(model_dir):
        if fname == model_filename:
            continue
        if fname.startswith("xgboost_") and fname.endswith(".pkl"):
            os.remove(os.path.join(model_dir, fname))

    print(f"Model saved to {model_path}")
    return model, model_path


In [6]:
#from modules.model_train import train_xgboost_model
# Train and pickle the XGBoost model. `model` and `path` are re-bound here,
# replacing the values assigned by the previous training cell.
model, path = train_xgboost_model(gold, model_dir="Model/XGBoost")

 Training new XGBoost model...
Model saved to Model/XGBoost\xgboost_2025-06-23.pkl


In [7]:
# Train a Random Forest model on gold data with engineered features and save it to disk.
# ---------------------------------------------------------------------------
import os
import pickle
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import RobustScaler


def train_random_forest_model(gold: pd.DataFrame, model_dir="Model/RandomForest", test_size: float = 0.2, random_state: int = 42):
    """Train a Random Forest regressor for the next row's percentage change in ``Close``.

    The model is fitted on all usable rows; features are scaled with a
    ``RobustScaler`` fitted on those same rows. Older ``random_forest_*.pkl``
    files in ``model_dir`` are removed only after the new model has been
    written, so a failed run never deletes the last good model.

    Args:
        gold: Frame with ``Close`` and every column listed in ``feature_cols``.
        model_dir: Directory that receives ``random_forest_<YYYY-MM-DD>.pkl``.
        test_size: Accepted for signature compatibility but currently unused;
            no rows are held out.
        random_state: Seed passed to ``RandomForestRegressor``.

    Returns:
        tuple: ``(fitted RandomForestRegressor, path of the pickle file)``.
    """
    os.makedirs(model_dir, exist_ok=True)
    today_str = datetime.now().strftime("%Y-%m-%d")
    model_filename = f"random_forest_{today_str}.pkl"
    model_path = os.path.join(model_dir, model_filename)

    # Feature engineering
    feature_cols = [
        'Returns', 'MA_5', 'MA_20', 'MA_50', 'Volatility',
        'RSI', 'BB_upper', 'BB_lower', 'BB_width',
        'BB_position', 'Sentiment'
    ]

    gold_clean = gold[['Close'] + feature_cols].copy().dropna()
    gold_clean['Close_pct_change_1'] = gold_clean['Close'].pct_change(1)
    gold_clean['Close_pct_change_2'] = gold_clean['Close'].pct_change(2)
    gold_clean['Close_pct_change_3'] = gold_clean['Close'].pct_change(3)
    gold_clean['Close_rolling_std_5'] = gold_clean['Close'].rolling(5).std()
    gold_clean['Close_rolling_std_10'] = gold_clean['Close'].rolling(10).std()
    gold_clean['Close_vs_MA5'] = (gold_clean['Close'] - gold_clean['MA_5']) / gold_clean['MA_5']
    gold_clean['Close_vs_MA20'] = (gold_clean['Close'] - gold_clean['MA_20']) / gold_clean['MA_20']
    gold_clean['Price_momentum_3'] = gold_clean['Close'] / gold_clean['Close'].shift(3) - 1
    gold_clean['Price_momentum_5'] = gold_clean['Close'] / gold_clean['Close'].shift(5) - 1

    feature_cols_extended = feature_cols + [
        'Close_pct_change_1', 'Close_pct_change_2', 'Close_pct_change_3',
        'Close_rolling_std_5', 'Close_rolling_std_10',
        'Close_vs_MA5', 'Close_vs_MA20',
        'Price_momentum_3', 'Price_momentum_5'
    ]

    # Prepare target: drop the feature warm-up rows first, then attach the
    # following row's values (shift(-1)) and drop the final, target-less row.
    gold_clean = gold_clean.dropna()
    gold_clean['Target_pct_change'] = gold_clean['Close'].pct_change().shift(-1)
    gold_clean['Target_price'] = gold_clean['Close'].shift(-1)
    gold_clean = gold_clean.dropna()
    # Discard moves of 100% or more.
    gold_clean = gold_clean[(np.abs(gold_clean['Target_pct_change']) < 1.0)]

    X = gold_clean[feature_cols_extended]
    y = gold_clean['Target_pct_change']

    # Scaling
    # NOTE(review): the fitted scaler is neither returned nor saved, so code
    # that predicts with this model must rebuild an equivalent scaler itself.
    scaler = RobustScaler()
    X_scaled = scaler.fit_transform(X)

    # Train model
    model = RandomForestRegressor(
        n_estimators=200, max_depth=15,
        min_samples_split=5, min_samples_leaf=2,
        max_features='sqrt', bootstrap=True,
        random_state=random_state, n_jobs=-1
    )

    print(" Training new Random Forest model...")
    model.fit(X_scaled, y)

    # Save model
    with open(model_path, "wb") as f:
        pickle.dump(model, f)

    # Clean up superseded models only now that the new one is safely on disk.
    for fname in os.listdir(model_dir):
        if fname == model_filename:
            continue
        if fname.startswith("random_forest_") and fname.endswith(".pkl"):
            os.remove(os.path.join(model_dir, fname))

    print(f" Model saved to {model_path}")
    return model, model_path


In [8]:
#from modules.model_train import train_random_forest_model
# Train and pickle the Random Forest model. `model` and `path` are re-bound
# here, replacing the values assigned by the previous training cell.
model, path = train_random_forest_model(gold, model_dir="Model/RandomForest")

 Training new Random Forest model...
 Model saved to Model/RandomForest\random_forest_2025-06-23.pkl


In [9]:
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime


class GoldPriceDataset(Dataset):
    """Pairs of (input, target) held as float32 tensors for use with a DataLoader."""

    def __init__(self, X, y):
        # Both inputs are copied into new float32 tensors.
        float32 = torch.float32
        self.X = torch.tensor(X, dtype=float32)
        self.y = torch.tensor(y, dtype=float32)

    def __len__(self):
        """Number of samples, i.e. the length of the first dimension of ``X``."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(X[idx], y[idx])`` pair."""
        features = self.X[idx]
        target = self.y[idx]
        return features, target


class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear layer that maps the last time step to one value."""

    def __init__(self, input_size, hidden_size=64, num_layers=2):
        super().__init__()
        # Attribute names `lstm` and `fc` are part of the state_dict keys.
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, x):
        """Run the LSTM over ``x`` and project the final time step's output."""
        sequence_out, _state = self.lstm(x)
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)


def train_lstm_model(gold: pd.DataFrame, model_dir="Model/LSTM", sequence_length=10, epochs=50, batch_size=16, lr=0.001):
    """
    Train and save an LSTM model using gold price data.

    Each training sample is a window of ``sequence_length`` consecutive rows
    of features, paired with the (MinMax-scaled) ``Close`` of the row that
    follows the window. Older ``lstm_*.pt`` files in ``model_dir`` are
    removed only after the new weights have been written.

    Parameters:
        gold (pd.DataFrame): Input gold DataFrame with ``Close`` and the
            columns in ``feature_cols``; ``asfreq('B')`` is applied, so it
            needs a datetime-like index.
        model_dir (str): Directory where model will be saved.
        sequence_length (int): Number of past days used for each training sequence.
        epochs (int): Training epochs.
        batch_size (int): Batch size for training.
        lr (float): Learning rate.

    Returns:
        model (LSTMModel): Trained model.
        model_path (str): Path to saved model (a ``state_dict`` file).

    Raises:
        ValueError: If fewer than ``sequence_length`` usable rows remain.
    """
    os.makedirs(model_dir, exist_ok=True)
    today_str = datetime.now().strftime("%Y-%m-%d")
    model_filename = f"lstm_{today_str}.pt"
    model_path = os.path.join(model_dir, model_filename)

    # Feature selection
    feature_cols = [
        'Returns', 'MA_5', 'MA_20', 'MA_50', 'Volatility',
        'RSI', 'BB_upper', 'BB_lower', 'BB_width',
        'BB_position', 'Sentiment'
    ]

    gold = gold[['Close'] + feature_cols].dropna()
    # Regular business-day index; missing days repeat the previous row.
    gold = gold.asfreq('B').ffill()
    # Target of a row is the next row's Close; the last row has none.
    gold['Target'] = gold['Close'].shift(-1)
    gold = gold.dropna()

    # Scaling: separate scalers so that neither fit overwrites the other.
    # NOTE(review): the scalers are not saved; model outputs are in scaled
    # target units and the caller needs an equivalent scaler to invert them.
    feature_scaler = MinMaxScaler()
    target_scaler = MinMaxScaler()
    X_scaled = feature_scaler.fit_transform(gold[feature_cols])
    y_scaled = target_scaler.fit_transform(gold[['Target']])

    # Sequence creation. The window covering rows i .. i+sequence_length-1 is
    # paired with the Target of its LAST row (already the next day's Close).
    # Indexing the target one row further would shift it a second time and
    # train the model on the close two days ahead.
    n_sequences = len(X_scaled) - sequence_length + 1
    if n_sequences <= 0:
        raise ValueError(
            f"Need at least {sequence_length} usable rows to build a sequence, got {len(X_scaled)}"
        )
    X_seq = np.array([X_scaled[i:i + sequence_length] for i in range(n_sequences)])
    y_seq = np.array(y_scaled[sequence_length - 1:])

    # Model training
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = LSTMModel(input_size=X_seq.shape[2]).to(device)

    train_ds = GoldPriceDataset(X_seq, y_seq)
    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for xb, yb in train_dl:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            # Drop only the trailing size-1 dimension so a batch of one
            # sample keeps its batch dimension.
            output = model(xb).squeeze(-1)
            loss = criterion(output, yb.squeeze(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        if epoch % 10 == 0 or epoch == epochs - 1:
            print(f"Epoch {epoch} - Avg Loss: {total_loss / len(train_dl):.6f}")

    # Save model
    torch.save(model.state_dict(), model_path)

    # Remove previous models only now that the new weights are on disk.
    for fname in os.listdir(model_dir):
        if fname == model_filename:
            continue
        if fname.startswith("lstm_") and fname.endswith(".pt"):
            os.remove(os.path.join(model_dir, fname))

    print(f" LSTM model saved to: {model_path}")
    return model, model_path


In [10]:
#from modules.model_train import train_lstm_model
# Train the LSTM and save its state_dict. `model` and `path` are re-bound
# here, replacing the values assigned by the previous training cell.
model, path = train_lstm_model(gold, model_dir="Model/LSTM", sequence_length=10, epochs=50)

Epoch 0 - Avg Loss: 0.002905
Epoch 10 - Avg Loss: 0.000179
Epoch 20 - Avg Loss: 0.000151
Epoch 30 - Avg Loss: 0.000149
Epoch 40 - Avg Loss: 0.000143
Epoch 49 - Avg Loss: 0.000134
 LSTM model saved to: Model/LSTM\lstm_2025-06-23.pt
