In [1]:
import xgboost as xgb
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [27]:
data = pd.read_csv('final3.csv')

""" data.drop(['sessions'], axis=1, inplace=True)
data.drop(['failed_sessions'], axis=1, inplace=True) """

# Assuming the last column is the target variable
X = data.iloc[:, :-1]  # Features
y = data.iloc[:, -1]   # Target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [29]:
def objective(trial):
    # Define hyperparameters to tune
    params = {
        'objective': 'reg:squarederror',  # For regression tasks
        # 'objective': 'binary:logistic',  # For binary classification
        # 'objective': 'multi:softmax',   # For multi-class classification
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0, 1),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 1),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'random_state': 42,
    }

    # Train the model
    model = xgb.XGBRegressor(**params)  # Use XGBClassifier for classification
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)  # Calculate MSE
    rmse = mse ** 0.5  # Manually calculate RMSE
    return rmse  # Minimize RMSE (for regression)
    # For classification, use accuracy or log loss:
    # y_pred = model.predict(X_test)
    # accuracy = accuracy_score(y_test, y_pred)
    # return 1 - accuracy  # Minimize (1 - accuracy)

In [31]:
study = optuna.create_study(direction='minimize')  # Minimize RMSE (or maximize accuracy)
study.optimize(objective, n_trials=100)  # Number of trials

# Print the best hyperparameters
print("Best hyperparameters:", study.best_params)
print("Best RMSE:", study.best_value)

[I 2025-03-06 18:22:48,385] A new study created in memory with name: no-name-9ea3a5e0-a192-4400-ae3c-df4818879ae3
[I 2025-03-06 18:22:49,331] Trial 0 finished with value: 0.23362947146206975 and parameters: {'max_depth': 3, 'learning_rate': 0.14573527947398127, 'subsample': 0.9810003227054118, 'colsample_bytree': 0.6666220547951194, 'min_child_weight': 5, 'gamma': 0.43080990938053465, 'reg_alpha': 0.6465124901717071, 'reg_lambda': 0.42332534895593743, 'n_estimators': 573}. Best is trial 0 with value: 0.23362947146206975.
[I 2025-03-06 18:22:49,847] Trial 1 finished with value: 0.22601246659847368 and parameters: {'max_depth': 5, 'learning_rate': 0.20454101872586136, 'subsample': 0.8891322200007145, 'colsample_bytree': 0.5788240115799588, 'min_child_weight': 8, 'gamma': 0.4897961971424112, 'reg_alpha': 0.49610855027165535, 'reg_lambda': 0.25203040892165707, 'n_estimators': 551}. Best is trial 1 with value: 0.22601246659847368.
[I 2025-03-06 18:22:50,347] Trial 2 finished with value: 0.2

Best hyperparameters: {'max_depth': 7, 'learning_rate': 0.08491960900367897, 'subsample': 0.6942274903471605, 'colsample_bytree': 0.8922150526995536, 'min_child_weight': 1, 'gamma': 0.11768244436022937, 'reg_alpha': 0.2283606338300546, 'reg_lambda': 0.051281448738075434, 'n_estimators': 345}
Best RMSE: 0.21285589617319983


In [32]:
best_params = study.best_params

# Add fixed parameters (e.g., objective, random_state)
best_params['objective'] = 'reg:squarederror'  # Adjust for your task
best_params['random_state'] = 42

# Train the final model
final_model = xgb.XGBRegressor(**best_params)
final_model.fit(X_train, y_train)

# Evaluate on the test set
y_pred = final_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)  # Calculate MSE
rmse = mse ** 0.5  # Manually calculate RMSE
print(f"Final RMSE: {rmse}")

Final RMSE: 0.21285589617319983


In [26]:
final_model.save_model('final.model')

