In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import plotly.graph_objs as go

In [3]:
def load_data(path:str) -> pd.DataFrame:
    df = pd.read_csv(path, index_col=0)
    return df

In [4]:
def preprocess_data(df, column='Close', train_size=0.8):
    df['Date'] = pd.to_datetime(df['Date'])
    df.set_index('Date', inplace=True)

    data = df[[column]].values

    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_data = scaler.fit_transform(data)

    train_data_len = int(np.ceil(len(scaled_data) * train_size))
    train_data = scaled_data[0:int(train_data_len), :]
    test_data = scaled_data[train_data_len - 60:, :]

    return train_data, test_data, scaler

In [5]:
def create_dataset(dataset, time_step=60):
    X, y = [], []
    for i in range(len(dataset) - time_step):
        X.append(dataset[i:(i + time_step), 0])
        y.append(dataset[i + time_step, 0])
    return np.array(X), np.array(y)

In [6]:
def build_xgboost_model():
    # Create and return an XGBoost model with specific parameters
    model = xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=1000,
        learning_rate=0.01,
        max_depth=3,
        subsample=0.8,
        colsample_bytree=0.8
    )
    return model

In [7]:
def plot_predictions_with_plotly(train_data, valid_data, predictions):
    train_trace = go.Scatter(x=train_data.index, y=train_data['Close'], mode='lines', name='Train')
    valid_trace = go.Scatter(x=valid_data.index, y=valid_data['Close'], mode='lines', name='Validation')
    prediction_trace = go.Scatter(x=valid_data.index, y=valid_data['Predictions'], mode='lines', name='Predictions')

    layout = go.Layout(
        title='XGBoost Model Predictions',
        xaxis=dict(title='Date'),
        yaxis=dict(title='Close Price USD ($)'),
        hovermode='x unified'
    )

    fig = go.Figure(data=[train_trace, valid_trace, prediction_trace], layout=layout)
    fig.show()

In [10]:
def main(predict_days:int, path:str):
    df = load_data(path)
    train_data, test_data, scaler = preprocess_data(df)

    X_train, y_train = create_dataset(train_data)
    X_test, y_test = create_dataset(test_data)

    model = build_xgboost_model()
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)
    predictions = scaler.inverse_transform(predictions.reshape(-1, 1))

    train = df[:len(train_data)]
    valid = df[len(train_data):]
    valid['Predictions'] = predictions

    plot_predictions_with_plotly(train, valid, predictions)

    last_60_days = test_data[-60:].reshape(1, -1)
    future_predictions = []

    for _ in range(predict_days):
        pred = model.predict(last_60_days)
        last_60_days = np.append(last_60_days[:, 1:], pred).reshape(1, -1)
        future_pred = scaler.inverse_transform(pred.reshape(-1, 1))
        future_predictions.append(future_pred[0][0])
        print(f"Predicted Close: {future_pred[0][0]}")

    return future_predictions

# Run the script
main(predict_days=5, path='../dados/raw/GLD.csv')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Predicted Close: 183.8203582763672
Predicted Close: 183.8203582763672
Predicted Close: 183.8203582763672
Predicted Close: 183.8203582763672
Predicted Close: 183.8203582763672


[183.82036, 183.82036, 183.82036, 183.82036, 183.82036]