In [1]:
import time
import random
import pandas as pd
import numpy as np
from tqdm import tqdm
import multiprocessing
from itertools import repeat, product
import matplotlib.pyplot as plt


import xgboost as xgb
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer


import joblib
from ray.util.joblib import register_ray


# Seed Python's built-in RNG. NumPy's global RNG is not seeded here; the
# scikit-learn / XGBoost calls below receive their own random_state instead.
random.seed(42)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Load the data

In [2]:
# Load the training split of 20 Newsgroups, shuffled with a fixed seed.
twenty_train = fetch_20newsgroups(
    subset='train',
    shuffle=True,
    random_state=42,
)

In [3]:
# Number of documents in the training split.
len(twenty_train.data)

11314

In [4]:
# List the newsgroup category names.
twenty_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [5]:
# `data` becomes the feature set and `target` the labels.
X = twenty_train.data
y = twenty_train.target

In [6]:
# Hold out 20% of the samples for scoring; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Modeling

### sklearn with Ray backend

In [None]:
import ray

# Stop any Ray runtime still attached to this process so the benchmark starts
# from a clean state. The previous `ray.shut` was a bare attribute access on a
# name Ray does not define, so it raised AttributeError instead.
ray.shutdown()

In [None]:
%time
register_ray()

In [None]:
# Benchmark: fit the CountVectorizer + XGBoost pipeline `num_exp` times inside
# the joblib "ray" backend context, recording fit time and macro-F1 per run.
num_exp = 50  # number of timed repetitions
# since these two were the fastest when using multi-processing
t = "exact"  # forwarded to XGBClassifier as tree_method
n = -1  # forwarded to XGBClassifier as n_jobs

all_exps = []

exp_time = []  # fit duration of every repetition, in seconds
exp_score = []  # macro-F1 on the held-out split for every repetition
experiment_dict = {"method": f"ray_backend n_job={n}, tree_method={t}"}

print(f"n_job={n}, tree_method={t} x {num_exp} times")

for _ in tqdm(range(num_exp)):
    # Build a fresh, unfitted pipeline for every repetition.
    xgb_model = xgb.XGBClassifier(
        n_jobs=n,
        tree_method=t,
        n_estimators=100,
        random_state=42,
    )
    text_clf = Pipeline([
        ('vect', CountVectorizer(lowercase=False, ngram_range=(1, 2))),
        ('clf', xgb_model),
    ])

    # NOTE(review): the backend context only affects code that parallelises
    # through joblib; confirm that this pipeline actually dispatches work via
    # joblib, otherwise the timings measure plain XGBoost threading.
    with joblib.parallel_backend('ray'):
        # perf_counter() is monotonic and high resolution, so the measured
        # interval cannot be skewed by system clock adjustments.
        start = time.perf_counter()
        text_clf.fit(X_train, y_train)
        end = time.perf_counter()
    exp_time.append(end - start)

    # Scoring happens outside the timed region.
    y_pred = text_clf.predict(X_test)
    score = f1_score(y_test, y_pred, average='macro')
    exp_score.append(score)

experiment_dict["time_result"] = exp_time
# NOTE(review): despite the key name this stores the list of per-run scores,
# not their mean; the key is kept as-is so the saved CSV schema is unchanged.
experiment_dict["average_score"] = exp_score

all_exps.append(experiment_dict)

# Visualize results

In [None]:
# One row per experiment, keyed by its "method" label.
df_result = (
    pd.DataFrame(all_exps)
    .set_index("method")
)

In [None]:
# Move "method" back from the index into a regular column.
# Not idempotent: re-running this cell alone would push the integer index
# into a new "index" column.
df_result = df_result.reset_index()

In [None]:
# Disabled: merge the results of an earlier run into this one.
# NOTE(review): `literal_eval` is not imported in this notebook; re-enabling
# this cell would also need `from ast import literal_eval`.
# df_old = pd.read_csv('../output/df_result.csv')

# df_old['time_result'] = df_old['time_result'].apply(literal_eval)

# df_result = pd.concat([df_old, df_result])

In [None]:
# Mean fit time of each experiment, computed from its list of timings.
df_result['time_result_avg'] = [
    np.mean(timings) for timings in df_result['time_result']
]

In [None]:
# Fastest experiment (lowest mean fit time) first.
df_result = df_result.sort_values(
    "time_result_avg",
    ascending=True,
)

In [None]:
# Persist the results table; the path is relative to the kernel's working directory.
df_result.to_csv('../output/df_result_joblib_ray_29012023.csv')

In [None]:
# Show the first two rows, i.e. the lowest mean fit times after the sort above.
df_result.head(2)

In [None]:
# NOTE(review): hard-coded ratio of two numbers, presumably mean fit times
# copied by hand from an earlier run; confirm the source and compute it from
# df_result instead.
218.70/233.83

In [None]:
# Box plot of the per-iteration fit times for the first two rows of df_result.
fig = plt.figure(figsize=(10, 10))
plt.title(f'XGB performance benchmark for {num_exp} iteration (20 Newsgroup data)')
plt.ylabel("elapsed time (sec)")

# Index by "method" so each box is labelled with the configuration it timed
# rather than a bare row number, then expand each timing list into a column.
times_by_method = (
    df_result.set_index("method")["time_result"]
    .head(2)
    .apply(pd.Series)
    .T
)
# boxplot() returns the Axes it drew on; `fig` stays bound to the Figure.
ax = times_by_method.boxplot(rot=45)
plt.tight_layout()
ax.figure.savefig(f'../img/performance_{num_exp}_joblib_ray.png')


In [None]:
# Bar chart of the mean fit time for the first two rows of df_result.
fig = plt.figure(figsize=(10, 10))
plt.title(f'XGB average time for {num_exp} (20 Newsgroup data)')
plt.ylabel("elapsed time (sec)")

# Index by "method" so the bars are labelled with the configuration they
# timed rather than a bare row number; one column of timings per method.
df_result_t = df_result.set_index("method")["time_result"].head(2).apply(pd.Series).T
# Sorting the per-method means directly gives the same bar order as
# reindexing the columns by their sorted means and averaging again.
ax = df_result_t.mean().sort_values().plot(kind='bar', rot=45)

plt.tight_layout()
# NOTE(review): unlike the CSV and box-plot outputs, this file name carries no
# "_joblib_ray" suffix; confirm it is not overwriting another benchmark's figure.
ax.figure.savefig(f'../img/average_time_{num_exp}.png')



In [None]:
# Reference (Ray): https://www.anyscale.com/blog/three-ways-to-speed-up-xgboost-model-training