In [None]:
import gc
import random
import time
# import multiprocessing
from itertools import repeat, product

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from tqdm import tqdm

# Seed Python's built-in RNG so any use of `random` is repeatable.
random.seed(42)
# Make sure automatic garbage collection is switched on.
gc.enable()
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Load the data

In [None]:
# Fetch the training split of the 20 Newsgroups corpus, shuffled with a fixed seed.
twenty_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)

In [None]:
# Number of documents in the training split.
len(twenty_train.data)

In [5]:
# Names of the newsgroup categories.
twenty_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

# modeling

### sklearn with Ray backend

In [None]:
import ray
import joblib
from ray.util.joblib import register_ray

In [None]:
# Raw documents and their labels.
X, y = twenty_train.data, twenty_train.target

In [None]:
# Hold out 20% of the documents for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [None]:
%time
register_ray()

In [None]:
# Benchmark: fit the same text-classification pipeline `num_exp` times under the
# joblib Ray backend, recording the fit time and held-out macro F1 of every run.
num_exp = 50
# since these two were the fastest when using multi-processing
t = "exact"
n = -1

all_exps = []

exp_time = []   # seconds spent inside fit(), one entry per repetition
exp_score = []  # macro F1 on the held-out split, one entry per repetition
experiment_dict = {"method": f"ray_backend n_job={n}, tree_method={t}"}

print(f"n_job={n}, tree_method={t} x {num_exp} times")

for run_idx in tqdm(range(num_exp)):
    # Build a fresh model and pipeline every round so each fit starts from scratch.
    xgb_model = xgb.XGBClassifier(
        n_jobs=n,
        tree_method=t,
        n_estimators=100,
        random_state=42,
    )
    text_clf = Pipeline([
        ('vect', CountVectorizer(lowercase=False, ngram_range=(1, 2))),
        ('clf', xgb_model),
    ])

    # Only fit() is timed. perf_counter() is a monotonic, high-resolution clock,
    # so the interval cannot be distorted by system clock adjustments.
    # NOTE(review): XGBoost does its own threading via n_jobs; confirm that the
    # joblib Ray backend actually influences this fit.
    with joblib.parallel_backend('ray'):
        start = time.perf_counter()
        text_clf.fit(X_train, y_train)
        end = time.perf_counter()
    exp_time.append(end - start)

    y_pred = text_clf.predict(X_test)
    score = f1_score(y_test, y_pred, average='macro')
    exp_score.append(score)

experiment_dict["time_result"] = exp_time
# Despite the key name, this stores the per-repetition scores, not their mean.
experiment_dict["average_score"] = exp_score

all_exps.append(experiment_dict)

# Ray AI Runtime (AIR)

In [6]:
import ray
from ray.air.config import ScalingConfig
from ray.train.xgboost import XGBoostTrainer
# import ray.data.preprocessors 

In [7]:
# Start a local Ray instance limited to 8 CPUs.
# ray.init() raises if Ray is already running (for example when this cell is
# re-run, or if an earlier cell already started Ray), so shut any existing
# instance down first to guarantee the 8-CPU configuration is applied.
if ray.is_initialized():
    ray.shutdown()
ray.init(num_cpus=8)

2023-02-05 12:31:34,595	INFO worker.py:1529 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


0,1
Python version:,3.9.16
Ray version:,2.2.0
Dashboard:,http://127.0.0.1:8265


In [8]:
# Create a small pandas DataFrame from the first 100 documents and their labels
df = pd.DataFrame({"text": twenty_train.data[:100], "target": twenty_train.target[:100]})

In [None]:
# Case-sensitive unigram + bigram counts, fitted on the 100-document sample.
vect = CountVectorizer(lowercase=False, ngram_range=(1,2))

# NOTE: this rebinds `X`, which earlier held the raw documents.
X = vect.fit_transform(df['text'])

In [None]:
# Dense view of the count matrix, for inspection only.
X.todense()

In [None]:
# One row per document, one column per vocabulary term.
count_vect_df = pd.DataFrame(X.todense(), columns=vect.get_feature_names_out())

In [None]:
# Append the label column. concat(axis=1) aligns rows on the index; both frames
# here carry the default 0..99 index, so rows line up.
count_vect_df = pd.concat([count_vect_df, df['target']], axis=1)

In [9]:
# Wrap the raw (text, target) pandas frame in a Ray Dataset; vectorization
# happens later through `transform`.
ds = ray.data.from_pandas(df)

In [10]:
def transform(df):
    """Turn a batch of raw documents into a bag-of-words feature frame.

    Args:
        df: pandas DataFrame with a ``text`` column (documents) and a
            ``target`` column (labels).

    Returns:
        pandas DataFrame with one count column per unigram/bigram found in
        this batch, followed by the ``target`` column, indexed 0..n-1.

    Note:
        The vectorizer is fitted on the batch it receives, so the set of
        feature columns depends on the batch content. Batches of a dataset
        that spans more than one batch will not share the same columns.
    """
    vect = CountVectorizer(lowercase=False, ngram_range=(1, 2))

    counts = vect.fit_transform(df['text'])
    count_vect_df = pd.DataFrame(counts.toarray(), columns=vect.get_feature_names_out())

    # concat(axis=1) aligns on index. The count frame is indexed 0..n-1, so the
    # labels must be too; otherwise a batch with any other index would produce
    # NaN-padded, misaligned rows.
    target = df['target'].reset_index(drop=True)

    return pd.concat([count_vect_df, target], axis=1)

In [11]:
# Peek at the first record of the dataset.
ds.show(1)

{'text': "From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n", 'target': 7}


In [12]:
# Vectorize the text batch by batch with `transform`.
# NOTE: this rebinds `ds`, so re-running the cell would apply `transform` to
# already-transformed data.
ds = ds.map_batches(transform)

Map_Batches: 100%|████████████████████████████████| 1/1 [00:01<00:00,  1.86s/it]


In [13]:
# Split data into train and validation (20% held out, fixed seed).
train_dataset, valid_dataset = ds.train_test_split(test_size=0.2, seed=42)

In [14]:
# Display the training dataset summary.
train_dataset

VBox(children=(HTML(value='<h2>Dataset</h2>'), Tab(children=(HTML(value='<div class="scrollableTable jp-Render…

In [15]:
# XGBoost specific params
params = {
    "tree_method": "approx",
    "objective": "multi:softmax",
    "eval_metric": ["merror"],
    # Size the output by the full label set rather than by the distinct labels
    # that happen to occur in the 100-row sample: XGBoost needs every label id
    # to be < num_class, and a small sample may miss some classes.
    "num_class": len(twenty_train.target_names),
}


# preprocessor = ray.data.preprocessors.CountVectorizer(columns=["text"])

# Distributed XGBoost training on two CPU workers, 100 boosting rounds,
# evaluated on both the train and validation datasets.
trainer = XGBoostTrainer(
    scaling_config=ScalingConfig(num_workers=2, use_gpu=False),
    label_column="target",
    params=params,
    datasets={"train": train_dataset, "valid": valid_dataset},
#     preprocessor=preprocessor,
    num_boost_round=100,
)

In [None]:
# Run the training job and print the metrics reported with the result.
result = trainer.fit()
print(result.metrics)

0,1
Current time:,2023-02-05 12:33:40
Running for:,00:01:42.56
Memory:,13.7/16.0 GiB

Trial name,status,loc,iter,total time (s),train-merror,valid-merror
XGBoostTrainer_c1088_00000,RUNNING,127.0.0.1:16575,56,100.472,0,0.75


[2m[36m(_RemoteRayXGBoostActor pid=16588)[0m [12:32:11] task [xgboost.ray]:140563025426176 got new rank 1
[2m[36m(_RemoteRayXGBoostActor pid=16589)[0m [12:32:11] task [xgboost.ray]:140233796220336 got new rank 0


Trial name,date,done,episodes_total,experiment_id,hostname,iterations_since_restore,node_ip,pid,time_since_restore,time_this_iter_s,time_total_s,timestamp,timesteps_since_restore,timesteps_total,train-merror,training_iteration,trial_id,valid-merror,warmup_time
XGBoostTrainer_c1088_00000,2023-02-05_12-33-40,False,,80ab218ae21342caa921f491944d8386,amirs-MacBook-Pro.local,56,127.0.0.1,16575,100.472,2.02267,100.472,1675625620,0,,0,56,c1088_00000,0.75,0.0390201


[2m[36m(XGBoostTrainer pid=16575)[0m 2023-02-05 12:32:42,462	INFO main.py:1167 -- Training in progress (31 seconds since last restart).
[2m[36m(XGBoostTrainer pid=16575)[0m 2023-02-05 12:33:12,851	INFO main.py:1167 -- Training in progress (61 seconds since last restart).


# XGB with Dask

https://xgboost.readthedocs.io/en/stable/tutorials/dask.html

https://examples.dask.org/machine-learning/text-vectorization.html

https://examples.dask.org/machine-learning/xgboost.html

In [None]:
import dask_xgboost

import dask.dataframe as dd
from dask.distributed import Client
import dask_ml.feature_extraction.text
from dask_ml.model_selection import train_test_split



In [None]:
# Local Dask cluster: 8 single-threaded workers with a 4GB memory limit.
client = Client(n_workers=8, threads_per_worker=1, memory_limit='4GB')
# Display the client summary.
client

In [None]:
# The same 100-document sample as a Dask DataFrame split into 25 partitions.
# NOTE: this rebinds `df`, which previously held the pandas frame.
df = dd.from_pandas(pd.DataFrame({"text": twenty_train.data[:100],
                                  "target": twenty_train.target[:100]}),
                    npartitions=25)

In [None]:
# Hashing-based text vectorizer from dask-ml with default settings.
# NOTE: this rebinds `vect`, which previously held the CountVectorizer.
vect = dask_ml.feature_extraction.text.HashingVectorizer()

In [None]:
# Labels as a Dask array; lengths=True resolves the chunk lengths up front.
y = df['target'].to_dask_array(lengths=True)

In [None]:
# Vectorize the text column with the hashing vectorizer (rebinds `X`).
X = vect.fit_transform(df['text'])

In [None]:
# Resolve the chunk sizes of features and labels before splitting.
X.compute_chunk_sizes()
y.compute_chunk_sizes()

In [None]:
# Hold out 20% for testing. The fixed seed makes the split repeatable and
# matches the seed used for the scikit-learn split earlier in the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [None]:
# Multi-class softmax objective over the 20 newsgroup classes.
params = {'objective': 'multi:softmax',
          'num_class':20}

# Train 100 boosting rounds on the Dask cluster.
bst = dask_xgboost.train(client, params, X_train, y_train, num_boost_round=100)

In [None]:
# Predict on the held-out split; persist() starts the computation and keeps
# the result as a Dask collection.
y_hat = dask_xgboost.predict(client, bst, X_test).persist()
y_hat

In [None]:
# Materialize the predictions locally for inspection.
y_hat.compute()

In [None]:
# `import dask.dataframe as dd` binds only `dd`, so the top-level `dask` name
# has to be imported explicitly before dask.compute can be used.
import dask

# Materialize labels and predictions together so they can be scored locally.
y_test, y_hat = dask.compute(y_test, y_hat)

In [None]:
# Macro-averaged F1 on the held-out split.
score = f1_score(y_test, y_hat, average='macro')

# Visualize result

In [None]:
## if you ran the notebook
# df_result = pd.DataFrame(all_exps).set_index("method")
# df_result = df_result.reset_index()

# if you are loading the result dataframe from file
import ast


def _parse_list_cell(cell):
    """Turn the string repr of a list back into a list; pass other values through."""
    return ast.literal_eval(cell) if isinstance(cell, str) else cell


df_result = pd.read_csv('../output/df_result_joblib_ray_29012023.csv')
# List-valued columns come back from the CSV as strings. Parse both of them so
# the timings and the scores are real lists, not just `time_result`.
for list_col in ('time_result', 'average_score'):
    df_result[list_col] = df_result[list_col].apply(_parse_list_cell)

In [None]:
# Inspect the scores stored for the row labelled 0.
df_result['average_score'][0]

In [None]:
# Mean fit time per experiment row, computed from its list of timings.
df_result['time_result_avg'] = [np.mean(timings) for timings in df_result['time_result']]

In [None]:
# Order experiments by mean fit time, fastest first.
df_result = df_result.sort_values(by="time_result_avg")

In [None]:
# Keep only the two fastest experiments.
df_result = df_result.head(2)

In [None]:
num_exp = 50

# Explicit figure/axes: the title, labels, box plot and saved file all refer to
# the same objects, and `fig` stays a Figure instead of being rebound to an Axes.
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_title(f'XGB performance benchmark for {num_exp} iterations (20 Newsgroup data)')
ax.set_ylabel("elapsed time (sec)")
# Expand each experiment's list of timings into its own column (one box each).
timings_wide = df_result["time_result"].head(2).apply(pd.Series).T
timings_wide.boxplot(rot=45, ax=ax)
fig.tight_layout()
fig.savefig(f'../img/performance_{num_exp}_joblib_ray.png')


In [None]:
# Bar chart of the mean fit time per experiment, fastest first.
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_title(f'XGB average time for {num_exp} (20 Newsgroup data)')
ax.set_ylabel("elapsed time (sec)")

# One column of timings per experiment.
df_result_t = df_result["time_result"].head(2).apply(pd.Series).T
# Sorting the column means directly gives the same bars as reordering the
# columns by their mean and averaging again.
df_result_t.mean().sort_values().plot(kind='bar', rot=45, ax=ax)

fig.tight_layout()
# fig.savefig(f'../img/average_time_{num_exp}.png')



In [None]:
# ray https://www.anyscale.com/blog/three-ways-to-speed-up-xgboost-model-training