In [None]:
import gc
import random
import time
# import multiprocessing
from itertools import repeat, product

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from tqdm import tqdm
# Make sure the garbage collector is active, then pin Python's RNG so that
# anything drawing from `random` is repeatable across runs.
gc.enable()
random.seed(42)
%matplotlib inline

# Load the data

In [None]:
# Load the training subset of the 20 Newsgroups corpus, shuffled with a fixed seed.
twenty_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)

In [None]:
# Number of documents in the training subset.
len(twenty_train.data)

In [None]:
# Names of the target categories.
twenty_train.target_names

# modeling

### sklearn with Ray backend

In [None]:
import ray
import joblib
from ray.util.joblib import register_ray

In [None]:
# Features are the raw documents; targets are the labels shipped with the dataset.
X, y = twenty_train.data, twenty_train.target

In [None]:
# Hold out 20% of the documents for scoring; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [None]:
%time
register_ray()

In [None]:
# Benchmark: fit the CountVectorizer + XGBoost pipeline `num_exp` times inside
# the Ray joblib backend, recording the fit duration and macro-F1 of every run.
num_exp = 50
# since these two were the fastest when using multi-processing
t = "exact"
n = -1

all_exps = []

exp_time = []   # fit duration of each repetition, in seconds
exp_score = []  # macro-F1 on the held-out split for each repetition
experiment_dict = {"method": f"ray_backend n_job={n}, tree_method={t}"}

print(f"n_job={n}, tree_method={t} x {num_exp} times")

for _ in tqdm(range(num_exp), total=num_exp):
    # Build a fresh model and pipeline each repetition so runs share no fitted state.
    xgb_model = xgb.XGBClassifier(
        n_jobs=n,
        tree_method=t,
        n_estimators=100,
        random_state=42,
    )
    text_clf = Pipeline([
        ('vect', CountVectorizer(lowercase=False, ngram_range=(1,2))),
        ('clf', xgb_model)
    ])

    # NOTE(review): Pipeline, CountVectorizer and XGBClassifier do not appear to
    # dispatch their work through joblib, so this backend context may not change
    # how the fit is executed — confirm before attributing timings to Ray.
    with joblib.parallel_backend('ray'):
        # perf_counter is monotonic and high-resolution, so the measurement is
        # not affected by system clock adjustments the way time.time() is.
        start = time.perf_counter()
        text_clf.fit(X_train, y_train)
        elapsed = time.perf_counter() - start
    exp_time.append(elapsed)

    y_pred = text_clf.predict(X_test)
    score = f1_score(y_test, y_pred, average='macro')
    exp_score.append(score)

experiment_dict["time_result"] = exp_time
# The key is named "average_score" but holds the list of per-run scores; the
# name is kept because the saved result files use it.
experiment_dict["average_score"] = exp_score

all_exps.append(experiment_dict)

# Ray AI Runtime (AIR)

In [5]:
import ray
from ray.air.config import ScalingConfig
from ray.train.xgboost import XGBoostTrainer
from ray.data.preprocessors import BatchMapper, Chain, CountVectorizer

In [6]:
# Stop any Ray instance that is still connected so the next cell can start a fresh one.
ray.shutdown()

In [7]:
# Start a local Ray instance that is told it has 8 CPUs available.
ray.init(num_cpus=8)

2023-02-07 16:06:53,328	INFO worker.py:1529 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


0,1
Python version:,3.9.16
Ray version:,2.2.0
Dashboard:,http://127.0.0.1:8265


In [17]:
# Build a pandas DataFrame with one row per document: the raw text and its target label.
df = pd.DataFrame({"text": twenty_train.data, "target": twenty_train.target})

In [18]:
# Wrap the pandas frame as a Ray Dataset so it can be fed to the Ray trainer.
ds = ray.data.from_pandas(df)

In [19]:
# Split the data into train and validation sets (20% held out, fixed seed).
train_dataset, valid_dataset = ds.train_test_split(test_size=0.2, seed=42)

In [20]:
# Maps each special character to the spelled-out token that replaces it.
_COLUMN_CHAR_TOKENS = str.maketrans({
    ":": "COLON",
    ",": "COMMA",
    "<": "LT",
    ">": "GT",
    "[": "LBRACKET",
    "]": "RBRACKET",
})


def fix_col_names(batch: pd.DataFrame) -> pd.DataFrame:
    """Spell out special characters in the column names of ``batch``.

    Every ``:``, ``,``, ``<``, ``>``, ``[`` and ``]`` in a column name is
    replaced by an upper-case token (``COLON``, ``COMMA``, ``LT``, ``GT``,
    ``LBRACKET``, ``RBRACKET``). Presumably this keeps the generated token
    columns acceptable as XGBoost feature names — TODO confirm.

    Args:
        batch: Frame whose column names are strings.

    Returns:
        The same ``batch`` object; its columns are renamed in place.
    """
    batch.columns = [name.translate(_COLUMN_CHAR_TOKENS) for name in batch.columns]
    return batch


# Two-step preprocessing chain: vectorize the "text" column with Ray's
# CountVectorizer, then pass each pandas batch through fix_col_names to rename
# the resulting columns.
preprocessor = Chain(   
    CountVectorizer(columns=["text"]), 
    BatchMapper(fix_col_names, batch_format="pandas")
)

In [21]:
# XGBoost specific params
params = {
    "tree_method": "approx",
    "objective": "multi:softmax",
    "eval_metric": ["merror"],
    # one class per distinct value in the target column
    "num_class": df["target"].nunique(),
}

In [22]:
# Configure the Ray XGBoost trainer: 2 CPU-only workers, 100 boosting rounds,
# "target" as the label column, and the preprocessing chain applied to the datasets.
trainer = XGBoostTrainer(
    scaling_config=ScalingConfig(num_workers=2, use_gpu=False),
    label_column="target",
    params=params,
    datasets={"train": train_dataset, "valid": valid_dataset},
    preprocessor=preprocessor,
    num_boost_round=100,
)

In [23]:
# Run the training job; the returned result object carries the run's metrics.
result = trainer.fit()

0,1
Current time:,2023-02-07 16:09:42
Running for:,00:00:31.48
Memory:,14.4/16.0 GiB

Trial name,# failures,error file
XGBoostTrainer_6e735_00000,1,/Users/Amiros/ray_results/XGBoostTrainer_2023-02-07_16-09-11/XGBoostTrainer_6e735_00000_0_2023-02-07_16-09-11/error.txt

Trial name,status,loc
XGBoostTrainer_6e735_00000,ERROR,127.0.0.1:41186


2023-02-07 16:09:42,598	ERROR trial_runner.py:1088 -- Trial XGBoostTrainer_6e735_00000: Error processing event.
ray.exceptions.RayTaskError(IndexError): [36mray::_Inner.train()[39m (pid=41186, ip=127.0.0.1, repr=XGBoostTrainer)
  File "/opt/anaconda3/envs/dask/lib/python3.9/site-packages/ray/tune/trainable/trainable.py", line 367, in train
    raise skipped from exception_cause(skipped)
  File "/opt/anaconda3/envs/dask/lib/python3.9/site-packages/ray/tune/trainable/function_trainable.py", line 335, in entrypoint
    return self._trainable_func(
  File "/opt/anaconda3/envs/dask/lib/python3.9/site-packages/ray/train/base_trainer.py", line 480, in _trainable_func
    super()._trainable_func(self._merged_config, reporter, checkpoint_dir)
  File "/opt/anaconda3/envs/dask/lib/python3.9/site-packages/ray/tune/trainable/function_trainable.py", line 652, in _trainable_func
    output = fn()
  File "/opt/anaconda3/envs/dask/lib/python3.9/site-packages/ray/train/base_trainer.py", line 389, in t

Trial name,date,experiment_id,hostname,node_ip,pid,timestamp,trial_id
XGBoostTrainer_6e735_00000,2023-02-07_16-09-13,6a286d1bc8c44ba0953e4d6d81037cf8,amirs-MacBook-Pro.local,127.0.0.1,41186,1675811353,6e735_00000


2023-02-07 16:09:42,720	ERROR tune.py:758 -- Trials did not complete: [XGBoostTrainer_6e735_00000]
2023-02-07 16:09:42,721	INFO tune.py:762 -- Total run time: 31.61 seconds (31.48 seconds for the tuning loop).


RayTaskError(IndexError): [36mray::_Inner.train()[39m (pid=41186, ip=127.0.0.1, repr=XGBoostTrainer)
  File "/opt/anaconda3/envs/dask/lib/python3.9/site-packages/ray/tune/trainable/trainable.py", line 367, in train
    raise skipped from exception_cause(skipped)
  File "/opt/anaconda3/envs/dask/lib/python3.9/site-packages/ray/tune/trainable/function_trainable.py", line 335, in entrypoint
    return self._trainable_func(
  File "/opt/anaconda3/envs/dask/lib/python3.9/site-packages/ray/train/base_trainer.py", line 480, in _trainable_func
    super()._trainable_func(self._merged_config, reporter, checkpoint_dir)
  File "/opt/anaconda3/envs/dask/lib/python3.9/site-packages/ray/tune/trainable/function_trainable.py", line 652, in _trainable_func
    output = fn()
  File "/opt/anaconda3/envs/dask/lib/python3.9/site-packages/ray/train/base_trainer.py", line 389, in train_func
    trainer.preprocess_datasets()
  File "/opt/anaconda3/envs/dask/lib/python3.9/site-packages/ray/train/gbdt_trainer.py", line 187, in preprocess_datasets
    super().preprocess_datasets()
  File "/opt/anaconda3/envs/dask/lib/python3.9/site-packages/ray/train/base_trainer.py", line 299, in preprocess_datasets
    self.preprocessor.fit(train_dataset)
  File "/opt/anaconda3/envs/dask/lib/python3.9/site-packages/ray/data/preprocessor.py", line 105, in fit
    return self._fit(dataset)
  File "/opt/anaconda3/envs/dask/lib/python3.9/site-packages/ray/data/preprocessors/chain.py", line 73, in _fit
    ds = preprocessor.fit_transform(ds)
  File "/opt/anaconda3/envs/dask/lib/python3.9/site-packages/ray/data/preprocessor.py", line 120, in fit_transform
    self.fit(dataset)
  File "/opt/anaconda3/envs/dask/lib/python3.9/site-packages/ray/data/preprocessor.py", line 105, in fit
    return self._fit(dataset)
  File "/opt/anaconda3/envs/dask/lib/python3.9/site-packages/ray/data/preprocessors/vectorizer.py", line 233, in _fit
    total_counts[i].update(col_value_counts)
IndexError: list index out of range

In [16]:
# Total run time reported in the result metrics (presumably seconds, per the key name).
# NOTE(review): this cell's execution count (16) is lower than that of the fit
# cell above (23), so the value shown comes from an earlier run — re-run in order.
result.metrics['time_total_s']

74.53868389129639

In [None]:
# Benchmark: run the Ray AIR XGBoost trainer `num_exp` times and record the
# total run time that each fit reports.
num_exp = 2

all_exps = []

exp_time = []   # `time_total_s` metric of each repetition
exp_score = []
experiment_dict = {}

# Label the experiment with the trainer's worker count and tree method. The
# previous label interpolated `num_exp` (the repetition count) as `n_job`.
num_workers = trainer.scaling_config.num_workers
experiment_dict["method"] = f"ray_AIR n_job={num_workers}, tree_method='{params['tree_method']}'"

for _ in range(num_exp):
    result = trainer.fit()
    exp_time.append(result.metrics['time_total_s'])

experiment_dict["time_result"] = exp_time

# Register the experiment the same way the joblib benchmark does, so the
# results DataFrame built from `all_exps` includes this run.
all_exps.append(experiment_dict)


In [None]:
# Inspect the recorded method label and timings.
experiment_dict

### XGBoost_ray

https://xgboost.readthedocs.io/en/stable/tutorials/ray.html

In [None]:
from xgboost_ray import RayDMatrix, RayParams, train

In [None]:
# The name `CountVectorizer` is rebound to Ray's preprocessor by the import in
# the AIR section, and that class does not accept these arguments. Import the
# scikit-learn class under an explicit alias so this cell always gets the
# intended implementation.
from sklearn.feature_extraction.text import CountVectorizer as SklearnCountVectorizer

vect = SklearnCountVectorizer(lowercase=False, ngram_range=(1,2))

In [None]:
# Fit the vectorizer on the text column and convert the result to a dense matrix.
# NOTE(review): a dense (n_documents x vocabulary) matrix of unigram+bigram
# counts may be far larger than available memory — confirm the vocabulary size
# or cap it before densifying.
# NOTE(review): this rebinds `X`, which earlier held the raw documents.
X = vect.fit_transform(df['text']).todense()

In [None]:
# Build the distributed training matrix from the features (converted to a plain
# ndarray) and the target labels.
train_set = RayDMatrix(np.squeeze(np.asarray(X)), df['target'].to_numpy())

In [None]:
# Train a multi-class XGBoost model with xgboost_ray on 8 single-CPU actors,
# evaluating on the training set itself, then save the booster and report the
# last recorded training error.
evals_result = {}
bst = train(
    {
        "objective": "multi:softmax",
        "eval_metric": ["merror"],
        # multi:softmax requires the number of classes to be given explicitly
        "num_class": df["target"].nunique(),
    },
    train_set,
    evals_result=evals_result,
    evals=[(train_set, "train")],
    verbose_eval=False,
    ray_params=RayParams(num_actors=8, cpus_per_actor=1))

bst.save_model("model.xgb")
# Results are keyed by metric name; the configured metric is "merror", so
# reading "error" would raise a KeyError.
print("Final training error: {:.4f}".format(
    evals_result["train"]["merror"][-1]))

# Visualize result

In [None]:
## if you ran the notebook
# df_result = pd.DataFrame(all_exps).set_index("method")
# df_result = df_result.reset_index()

# if you are loading the result dataframe from file
import ast

# The timing lists are stored in the CSV as their string repr; turn each one
# back into a Python list.
df_result = pd.read_csv('../output/df_result_joblib_ray_29012023.csv')
df_result['time_result'] = df_result['time_result'].map(ast.literal_eval)

In [None]:
# Look at the scores stored for the row labelled 0.
# NOTE(review): only `time_result` is parsed after loading, so this value is
# presumably still the raw string read from the CSV — confirm.
df_result['average_score'][0]

In [None]:
# Mean fit time of each experiment, computed from its list of timings.
df_result['time_result_avg'] = df_result['time_result'].map(np.mean)

In [None]:
# Order the experiments from fastest to slowest average time.
df_result = df_result.sort_values(by="time_result_avg")

In [None]:
# Keep only the first two rows (the two fastest once sorted by average time).
df_result = df_result.head(2)

In [None]:
# Box plot of the per-run fit times of the two selected experiments, saved to disk.
num_exp = 50

# Use explicit figure/axes handles instead of the pyplot state machine, so
# `fig` stays a Figure instead of being rebound to the Axes returned by boxplot.
fig, ax = plt.subplots(figsize=(10, 10))

# One column per experiment, one row per repetition.
timings = df_result["time_result"].head(2).apply(pd.Series).T
timings.boxplot(rot=45, ax=ax)

ax.set_title(f'XGB performance benchmark for {num_exp} iterations (20 Newsgroup data)')
ax.set_ylabel("elapsed time (sec)")
fig.tight_layout()
fig.savefig(f'../img/performance_{num_exp}_joblib_ray.png')


In [None]:
# Bar chart of the mean fit time per experiment, ordered from fastest to slowest.
# Explicit figure/axes handles keep `fig` a Figure instead of rebinding it to
# the Axes returned by the plot call.
fig, ax = plt.subplots(figsize=(10, 10))

# One column per experiment, one row per repetition.
df_result_t = df_result["time_result"].head(2).apply(pd.Series).T
# Sorting the column means directly gives the same bars as reordering the
# columns by their mean and averaging afterwards.
df_result_t.mean().sort_values().plot(kind='bar', rot=45, ax=ax)

ax.set_title(f'XGB average time for {num_exp} (20 Newsgroup data)')
ax.set_ylabel("elapsed time (sec)")
fig.tight_layout()
# fig.savefig(f'../img/average_time_{num_exp}.png')



In [None]:
# ray https://www.anyscale.com/blog/three-ways-to-speed-up-xgboost-model-training