In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
import xgboost as xgb

import matplotlib.pyplot as plt

from modules.utils import filter_data_by_year_month, create_features_from_past, create_X_y

In [2]:
X = pd.read_csv('/Users/elouan/Repo Github ElouanBahri/Predicting_crypto_prices/Historical Prices for BTCUSDT')
YEARS = [2019,2021,2022,2023,2024]

Data = filter_data_by_year_month(X, YEARS)

Data1 = create_features_from_past(Data,['close', 'open', 'high', 'low', 'volume'], 4)

X,y = create_X_y(Data1)

In [3]:
# Assuming X and y are your features and target arrays
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)

# Split the remaining 40% into validation (20%) and test (20%)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Print shapes to confirm
print("Training set:", X_train.shape, y_train.shape)
print("Validation set:", X_val.shape, y_val.shape)
print("Testing set:", X_test.shape, y_test.shape)

Training set: (105063, 20) (105063,)
Validation set: (35021, 20) (35021,)
Testing set: (35021, 20) (35021,)


----------

XGBoost Model

In [None]:

# Initialize the XGBRegressor
model = XGBRegressor(
    n_estimators=100,       # Number of trees (boosting rounds)
    max_depth=6,            # Maximum depth of each tree
    learning_rate=0.1,      # Step size shrinkage
    random_state=42         # Seed for reproducibility
)

# Fit the model to training data
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_val)

# Evaluate with MAE
mae = mean_absolute_error(y_val, y_pred)
print(f"Validation MAE: {mae}")


In [None]:
model.save_model('../models/xgboost_model.json')

In [9]:
model = XGBRegressor()
model.load_model('../models/xgboost_model_1.json')
print(model.get_params())

{'objective': 'reg:squarederror', 'base_score': 0.5, 'booster': 'gbtree', 'callbacks': None, 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 1, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'gamma': 0, 'gpu_id': -1, 'grow_policy': 'depthwise', 'importance_type': None, 'interaction_constraints': '', 'learning_rate': 0.1, 'max_bin': 256, 'max_cat_to_onehot': 4, 'max_delta_step': 0, 'max_depth': 6, 'max_leaves': 0, 'min_child_weight': 1, 'missing': nan, 'monotone_constraints': '()', 'n_estimators': 100, 'n_jobs': 0, 'num_parallel_tree': 1, 'predictor': 'auto', 'random_state': 42, 'reg_alpha': 0, 'reg_lambda': 1, 'sampling_method': 'uniform', 'scale_pos_weight': 1, 'subsample': 1, 'tree_method': 'auto', 'validate_parameters': 1, 'verbosity': None}


In [10]:
# Make predictions
y_pred = model.predict(X_test)

# Evaluate with MAE
mae = mean_absolute_error(y_pred, y_test)
print(f"Validation MAE: {mae}")

Validation MAE: 82.72917746478228


-------------

Randomized Grid Search

In [None]:
base_params = {
    'objective': 'reg:squarederror',
    'learning_rate': 0.1,
    'max_depth': 6,
    'n_estimators': 100,
    'min_child_weight': 1,
    'gamma': 0,
    'subsample': 1,
    'colsample_bytree': 1,
    'reg_alpha': 0,
    'reg_lambda': 1,
    'random_state': 42
}


In [None]:


# Base model
xgb_model = XGBRegressor(**base_params)

# Define parameter grid for randomized search
param_dist = {
    'learning_rate': [0.05, 0.1, 0.15],       # Vary learning rate
    'max_depth': [4, 6, 8, 10],               # Adjust depth
    'n_estimators': [50, 100, 200, 300],      # Number of trees
    'min_child_weight': [1, 3, 5],            # Minimum sum of weights
    'gamma': [0, 0.1, 0.3, 0.5],              # Minimum loss reduction
    'subsample': [0.8, 1.0],                  # Fraction of samples per tree
    'colsample_bytree': [0.8, 1.0],           # Fraction of features per tree
    'reg_alpha': [0, 0.1, 0.5, 1],            # L1 regularization
    'reg_lambda': [1, 5, 10],                 # L2 regularization
}

# Randomized search
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=50,                                # Number of parameter settings sampled
    scoring='neg_mean_absolute_error',        # Scoring metric
    cv=3,                                     # 3-fold cross-validation
    verbose=2,                                # Show progress
    random_state=42,                          # For reproducibility
    n_jobs=-1                                 # Use all available cores
)

# Fit the randomized search
random_search.fit(X_train, y_train)

# Display the best parameters and score
print("Best Parameters:", random_search.best_params_)
print("Best MAE:", -random_search.best_score_)


-------

GridResearch

In [5]:


# Initialize the XGBRegressor
xgb_model = XGBRegressor(
    random_state=42,
    tree_method='auto'  # Use 'gpu_hist' for GPU acceleration if GPU is available
)

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 400],           # Number of boosting rounds
    'max_depth': [3, 5, 7],                   # Maximum depth of a tree
    'learning_rate': [0.01, 0.1, 0.3],        # Learning rate (shrinkage)
    'subsample': [0.8, 1.0],                  # Subsample ratio of the training set
    'colsample_bytree': [0.6, 0.8, 1.0],      # Subsample ratio of columns per tree
    'gamma': [0, 0.1, 0.2],                   # Minimum loss reduction required to split
    'min_child_weight': [1, 3, 5],            # Minimum sum of weights of all observations in a child
}

# Perform grid search with GridSearchCV
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',  # Scoring metric: negative MAE
    cv=3,                               # 3-fold cross-validation
    verbose=3,                          # Display progress
    n_jobs=-1                           # Use all CPU cores for parallelism
)

# Fit the grid search
grid_search.fit(X_train, y_train)

# Output best parameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best MAE:", -grid_search.best_score_)


Fitting 3 folds for each of 1458 candidates, totalling 4374 fits


[CV 1/3] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8;, score=-13072.110 total time=  20.1s
[CV 3/3] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8;, score=-12976.329 total time=  20.2s
[CV 2/3] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8;, score=-13015.319 total time=  20.5s
[CV 1/3] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=1.0;, score=-13071.312 total time=  21.8s
[CV 3/3] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=1.0;, score=-12976.360 total time=  21.8s
[CV 2/3] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=1.0;, score=-13014.826 total time=  22.3

KeyboardInterrupt: 

---------