In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score
import ray
from ray import train, tune
from ray.tune.search.optuna import OptunaSearch
from ray.tune.search import ConcurrencyLimiter
from ray.tune.schedulers import ASHAScheduler

# Initialize Ray
ray.init()

# Load data from CSV files
train_df = pd.read_csv("train.csv", header=None)
valid_df = pd.read_csv("validation.csv", header=None)
test_df = pd.read_csv("test.csv", header=None)

# Separate features and target variables for training, validation, and testing datasets
x_train, y_train = train_df.iloc[:, 1:], train_df.iloc[:, 0]
x_valid, y_valid = valid_df.iloc[:, 1:], valid_df.iloc[:, 0]
x_test, y_test = test_df.iloc[:, 1:], test_df.iloc[:, 0]

# Combine training and validation data for final training
X_combined = pd.concat([x_train, x_valid])
y_combined = pd.concat([y_train, y_valid])

# Put large datasets in the Ray object store to manage memory usage
x_train_ref = ray.put(x_train)
y_train_ref = ray.put(y_train)
x_valid_ref = ray.put(x_valid)
y_valid_ref = ray.put(y_valid)


# Define the objective function for Ray Tune
def objective(config):
    # Define the parameters for the XGBoost model
    param = {
        "verbosity": 0,
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "booster": "gbtree",
        "max_depth": 3,  # Fixed max_depth to 3
        "learning_rate": config["learning_rate"],
        "n_estimators": int(config["n_estimators"]),
        "min_child_weight": int(config["min_child_weight"]),
        "gamma": config["gamma"],
        "subsample": config["subsample"],
        "colsample_bytree": config["colsample_bytree"],
    }

    model = xgb.XGBClassifier(**param)

    # Retrieve datasets from the Ray object store
    x_train = ray.get(x_train_ref)
    y_train = ray.get(y_train_ref)
    x_valid = ray.get(x_valid_ref)
    y_valid = ray.get(y_valid_ref)

    # Train the model and measure the training time
    model.fit(x_train, y_train)
    preds = model.predict(x_valid)
    accuracy = accuracy_score(y_valid, preds)

    # Report accuracy back to Ray Tune
    train.report({"mean_accuracy": accuracy})


# Define the search space for hyperparameter tuning
search_space = {
    "learning_rate": tune.uniform(0.01, 0.3),
    "n_estimators": tune.randint(100, 1000),
    "min_child_weight": tune.randint(1, 10),
    "gamma": tune.uniform(1e-8, 1.0),
    "subsample": tune.uniform(0.5, 1.0),
    "colsample_bytree": tune.uniform(0.5, 1.0),
}

# Define the Optuna search algorithm for hyperparameter tuning
algo = OptunaSearch()

# Limit the number of concurrent trials to avoid excessive resource usage
algo = ConcurrencyLimiter(algo, max_concurrent=94)

# Define the ASHAScheduler for early stopping of underperforming trials
scheduler = ASHAScheduler(max_t=1000, grace_period=1, reduction_factor=2)


# Function to create directory names for each trial
def trial_dirname_creator(trial):
    return f"trial_{trial.trial_id}"


# Define the Tuner for Ray Tune
tuner = tune.Tuner(
    objective,
    tune_config=tune.TuneConfig(
        metric="mean_accuracy",
        mode="max",
        search_alg=algo,
        scheduler=scheduler,
        num_samples=1000,  # Number of trials
        trial_dirname_creator=trial_dirname_creator,
    ),
    param_space=search_space,
)

# Run the hyperparameter tuning
results = tuner.fit()

# Extract the best result from the tuning process
best_result = results.get_best_result(metric="mean_accuracy", mode="max")
best_config = best_result.config
best_checkpoint = best_result.checkpoint

# Get a dataframe of all results
df = results.get_dataframe()

0,1
Current time:,2024-07-25 09:19:45
Running for:,00:33:31.91
Memory:,8.0/184.9 GiB

Trial name,status,loc,colsample_bytree,gamma,learning_rate,min_child_weight,n_estimators,subsample,acc,iter,total time (s)
objective_d0d9249f,TERMINATED,172.17.0.2:5826,0.655013,0.785799,0.203522,7,896,0.997593,0.833967,1,4.65028
objective_e4700b20,TERMINATED,172.17.0.2:5936,0.61817,0.687488,0.157856,9,636,0.592993,0.83479,1,5.72164
objective_4a5ea7fe,TERMINATED,172.17.0.2:8281,0.543959,0.898814,0.0865273,9,505,0.541924,0.832107,1,4.73788
objective_ee84bab1,TERMINATED,172.17.0.2:8466,0.669861,0.7892,0.16336,7,142,0.614009,0.880591,1,1.39612
objective_51339496,TERMINATED,172.17.0.2:8553,0.963036,0.912217,0.0881904,9,108,0.57357,0.864737,1,1.11483
objective_8d539c9d,TERMINATED,172.17.0.2:8642,0.50699,0.630193,0.0668971,2,723,0.655598,0.832749,1,6.49528
objective_460ef6cb,TERMINATED,172.17.0.2:8736,0.951994,0.667301,0.0944987,5,780,0.529233,0.892988,1,7.33249
objective_dd32d121,TERMINATED,172.17.0.2:8821,0.725452,0.981157,0.0169847,2,851,0.891317,0.873874,1,6.6393
objective_affadd61,TERMINATED,172.17.0.2:8906,0.913992,0.661615,0.196938,8,982,0.89767,0.898372,1,7.21996
objective_a9d39fe8,TERMINATED,172.17.0.2:8993,0.832097,0.0415669,0.103564,9,819,0.512545,0.893894,1,7.43288


2024-07-25 09:19:45,513	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/root/ray_results/objective_2024-07-25_08-46-13' in 0.8622s.
2024-07-25 09:19:45,705	INFO tune.py:1041 -- Total run time: 2012.12 seconds (2011.04 seconds for the tuning loop).


In [2]:
df.to_csv("results.csv")

In [3]:
best_config

{'learning_rate': 0.293064528332077,
 'n_estimators': 991,
 'min_child_weight': 7,
 'gamma': 0.3438511882472097,
 'subsample': 0.9654982494169886,
 'colsample_bytree': 0.7673390545515997}

In [4]:
import json
with open('best_config.json', 'w') as json_file:
    json.dump(best_config, json_file, indent=4)

In [5]:
param = {
    "verbosity": 0,
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "booster": "gbtree",
    "max_depth": 3,
    "learning_rate": best_config["learning_rate"],
    "n_estimators": int(best_config["n_estimators"]),
    "min_child_weight": int(best_config["min_child_weight"]),
    "gamma": best_config["gamma"],
    "subsample": best_config["subsample"],
    "colsample_bytree": best_config["colsample_bytree"],
}

# Train the model
model = xgb.XGBClassifier(**param)
model.fit(x_train, y_train)

In [6]:
preds = model.predict(x_valid)
accuracy = accuracy_score(y_valid, preds)
print(accuracy)

0.9012528605060832


In [7]:
model.save_model('xgboost_model.json')

In [8]:
loaded_model = xgb.XGBClassifier()
loaded_model.load_model('xgboost_model.json')

In [17]:
from PIL import Image
import numpy as np

image_name = 'image-9.jpg'

# Open the image
image = Image.open(image_name)

# Convert the image to RGB (just in case it's in a different mode)
image_rgb = image.convert('RGB')

# Convert the image to a NumPy array
image_array = np.array(image_rgb)

# Reshape the image array to have shape (height * width, 3)
height, width, _ = image_array.shape
pixels = image_array.reshape(-1, 3)

# Classify each pixel
# Predict the class for each pixel
predictions = loaded_model.predict(pixels)

# Recreate the image
# Create a new image array where pixels classified as 1 are set to red
output_array = image_array.copy()
output_array = output_array.reshape(-1, 3)
output_array[predictions == 1] = [255, 0, 0]  # Red color

# Reshape the output array back to the original image shape
output_array = output_array.reshape(height, width, 3)

# Convert the array to an image
output_image = Image.fromarray(output_array.astype("uint8"))

# Save or display the output image
output_image.save(f"output-{image_name}.png")
output_image.show()


display-im6.q16: unable to open X server `' @ error/display.c/DisplayImageCommand/412.
