In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
import xgboost as xgb

import matplotlib.pyplot as plt

from modules.utils import filter_data_by_year_month, create_features_from_past, create_X_y

In [8]:
X = pd.read_csv('/Users/elouan/Repo Github ElouanBahri/Predicting_crypto_prices/Historical Prices for BTCUSDT')
YEARS = [2019,2021,2022,2023,2024,2025]

Data = filter_data_by_year_month(X, YEARS)

Data1 = create_features_from_past(Data,['close', 'open', 'high', 'low', 'volume'], 8)

X,y = create_X_y(Data1)

In [9]:
# Assuming X and y are your features and target arrays
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Print shapes to confirm
print("Training set:", X_train.shape, y_train.shape)

print("Testing set:", X_test.shape, y_test.shape)

Training set: (141746, 40) (141746,)
Testing set: (35437, 40) (35437,)


----------

XGBoost Model

In [11]:

# Initialize the XGBRegressor
model = XGBRegressor(
    n_estimators=100,       # Number of trees (boosting rounds)
    max_depth=6,            # Maximum depth of each tree
    learning_rate=0.1,      # Step size shrinkage
    random_state=42         # Seed for reproducibility
)

# Fit the model to training data
model.fit(X_train, y_train)



In [12]:
# Make predictions
y_pred = model.predict(X_test)

# Evaluate with MAE
mae = mean_absolute_error(y_test, y_pred)
print(f"Validation MAE: {mae}")


Validation MAE: 84.42158872804862


In [13]:
model.save_model('../models/xgboost_model_5(8).json')

In [35]:
model = XGBRegressor()
model.load_model('../models/xgboost_model_4.json')
print(model.get_params())

{'objective': 'reg:squarederror', 'base_score': 0.5, 'booster': 'gbtree', 'callbacks': None, 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 1, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'gamma': 0, 'gpu_id': -1, 'grow_policy': 'depthwise', 'importance_type': None, 'interaction_constraints': '', 'learning_rate': 0.1, 'max_bin': 256, 'max_cat_to_onehot': 4, 'max_delta_step': 0, 'max_depth': 6, 'max_leaves': 0, 'min_child_weight': 1, 'missing': nan, 'monotone_constraints': '()', 'n_estimators': 100, 'n_jobs': 0, 'num_parallel_tree': 1, 'predictor': 'auto', 'random_state': 42, 'reg_alpha': 0, 'reg_lambda': 1, 'sampling_method': 'uniform', 'scale_pos_weight': 1, 'subsample': 1, 'tree_method': 'auto', 'validate_parameters': 1, 'verbosity': None}


-------------

Randomized Grid Search

In [32]:
base_params = {
    'objective': 'reg:squarederror',
    'learning_rate': 0.1,          # Reasonable default learning rate
    'max_depth': 6,                # Medium tree depth
    'n_estimators': 100,           # Default number of trees
    'min_child_weight': 1,         # Default for child weight
    'subsample': 0.8,              # Common default for subsampling
    'colsample_bytree': 0.8,       # Subsampling for features
    'reg_alpha': 0,                # No L1 regularization by default
    'reg_lambda': 1,               # Default L2 regularization
    'random_state': 42,            # For reproducibility
    'tree_method': 'auto',         # Choose best tree method
    'n_jobs': -1                   # Use all cores
}

                  # L2 regularization

param_grid = {
    'n_estimators': [50, 100, 200],         # Number of boosting rounds
    'max_depth': [4, 6, 8],                   # Maximum depth of a tree
    'learning_rate': [0.05, 0.1, 0.2],        # Learning rate (shrinkage)
    'subsample': [0.8, 1.0],                  # Subsample ratio of the training set
    'colsample_bytree': [0.8, 1.0],        # Subsample ratio of columns per tree
    'gamma': [0, 0.1, 0.2],                   # Minimum loss reduction required to split
    'min_child_weight': [1, 3, 5],            # Minimum sum of weights of all observations in a child
}




In [34]:


# Base model
xgb_model = XGBRegressor(**base_params)

# Randomized search
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    n_iter=20,                                # Reduced number of samples
    scoring='neg_mean_absolute_error',        # MAE as the scoring metric
    cv=3,                                     # 3-fold cross-validation
    verbose=3,                                # Moderate verbosity
    random_state=42,                          # Reproducibility
    n_jobs=-1                                 # Use all cores
)

# Fit the randomized search
random_search.fit(X_train, y_train)

# Display the best parameters and score
print("Best Parameters:", random_search.best_params_)
print("Best MAE:", -random_search.best_score_)



Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV 3/3] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=200, subsample=0.8;, score=-105.641 total time= 1.6min
[CV 1/3] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=200, subsample=0.8;, score=-106.826 total time= 1.7min
[CV 2/3] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=200, subsample=0.8;, score=-106.480 total time= 1.7min
[CV 1/3] END colsample_bytree=1.0, gamma=0.2, learning_rate=0.05, max_depth=8, min_child_weight=3, n_estimators=100, subsample=1.0;, score=-226.149 total time= 2.1min
[CV 2/3] END colsample_bytree=1.0, gamma=0.2, learning_rate=0.05, max_depth=8, min_child_weight=3, n_estimators=100, subsample=1.0;, score=-226.159 total time= 2.1min
[CV 1/3] END colsample_bytree=1.0, gamma=0.1, learning_rate=0.2, max_depth=4, min_child_weight=3, n_estimators=20

-------

GridResearch

In [29]:


# Initialize the XGBRegressor
xgb_model = XGBRegressor(
    random_state=42,
    tree_method='auto'  # Use 'gpu_hist' for GPU acceleration if GPU is available
)

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 400],           # Number of boosting rounds
    'max_depth': [3, 5, 7],                   # Maximum depth of a tree
    'learning_rate': [0.01, 0.1, 0.3],        # Learning rate (shrinkage)
    'subsample': [0.8, 1.0],                  # Subsample ratio of the training set
    'colsample_bytree': [0.6, 0.8, 1.0],      # Subsample ratio of columns per tree
    'gamma': [0, 0.1, 0.2],                   # Minimum loss reduction required to split
    'min_child_weight': [1, 3, 5],            # Minimum sum of weights of all observations in a child
}

# Perform grid search with GridSearchCV
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',  # Scoring metric: negative MAE
    cv=3,                               # 3-fold cross-validation
    verbose=3,                          # Display progress
    n_jobs=-1                           # Use all CPU cores for parallelism
)

# Fit the grid search
grid_search.fit(X_train, y_train)

# Output best parameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best MAE:", -grid_search.best_score_)


Fitting 3 folds for each of 1458 candidates, totalling 4374 fits


[CV 3/3] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8;, score=-13245.513 total time=  28.7s
[CV 1/3] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8;, score=-13323.571 total time=  29.1s
[CV 2/3] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8;, score=-13315.523 total time=  29.2s
[CV 1/3] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=1.0;, score=-13321.818 total time=  31.4s
[CV 2/3] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=1.0;, score=-13313.120 total time=  31.7s
[CV 3/3] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=1.0;, score=-13244.676 total time=  31.7

KeyboardInterrupt: 

---------