In [1]:
# import time
import gc
import random
import pandas as pd
# import numpy as np
# from tqdm import tqdm
import matplotlib.pyplot as plt


# import xgboost as xgb
# from sklearn.metrics import f1_score
# from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups
# from sklearn.model_selection import train_test_split
# from sklearn.feature_extraction.text import CountVectorizer

# Seed Python's built-in `random` module so anything drawing from it is repeatable.
# NOTE(review): this only seeds the stdlib generator — confirm whether the
# other libraries used below need their own seeds.
random.seed(42)
# Explicitly switch on automatic garbage collection.
gc.enable()
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Modeling

### Dask_XGB

https://xgboost.readthedocs.io/en/stable/tutorials/dask.html

https://examples.dask.org/machine-learning/text-vectorization.html

https://examples.dask.org/machine-learning/xgboost.html

In [None]:
import dask_xgboost

import dask.dataframe as dd
from dask.distributed import Client
import dask_ml.feature_extraction.text
from dask_ml.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for evaluation. A fixed random_state keeps the
# split identical across kernel restarts so results can be compared.
# NOTE(review): X and y are not defined by any cell above this one; they are
# only produced by load_data() further down, which must be run first.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# XGBoost training parameters: softmax multi-class objective with 20 classes.
# NOTE(review): 20 presumably matches the number of newsgroup categories — confirm.
params = dict(objective='multi:softmax', num_class=20)

# NOTE(review): `client` is not created in any visible cell above; a
# dask.distributed Client has to exist before this cell is run.
bst = dask_xgboost.train(
    client,
    params,
    X_train,
    y_train,
    num_boost_round=100,
)

In [None]:
# `best_iteration` is an attribute of the trained booster, not a method:
# calling it fails because the stored value is not callable.
bst.best_iteration

In [None]:
# Build the predictions for the held-out split, then persist them so the
# following cells reuse the same result.
y_hat = dask_xgboost.predict(client, bst, X_test)
y_hat = y_hat.persist()
y_hat

In [None]:
# Materialise the predictions and show them as the cell output; the result
# is not assigned, so `y_hat` itself is left unchanged.
y_hat.compute()

In [None]:
# No earlier cell binds the bare name `dask` (only `dask.dataframe as dd` and
# `from dask.distributed import Client` are imported above), so import it
# here to avoid a NameError on a fresh top-to-bottom run.
import dask

# Materialise the true labels and the predictions in a single call so both
# are concrete in-memory results for scoring.
y_test, y_hat = dask.compute(y_test, y_hat)

In [None]:
# The top-of-notebook import of f1_score is commented out, so bring it into
# scope here; without it this cell raises NameError.
from sklearn.metrics import f1_score

# Macro-averaged F1 of the predictions against the held-out labels.
score = f1_score(y_test, y_hat, average='macro')
# Display the metric as the cell output instead of silently storing it.
score

### XGBoost with the scikit-learn wrapper on Dask

In [2]:
import xgboost as xgb
import dask.array as da
import dask.distributed
import dask.dataframe as dd
import dask_ml.feature_extraction.text

from distributed import LocalCluster, Client

In [3]:
def load_data(subset="train", npartitions=25, random_state=42):
    """Load 20 Newsgroups and vectorize its text with Dask.

    Parameters
    ----------
    subset : str, default "train"
        Forwarded to ``fetch_20newsgroups`` as ``subset``.
    npartitions : int, default 25
        Number of partitions of the intermediate Dask DataFrame.
    random_state : int, default 42
        Forwarded to ``fetch_20newsgroups`` (which is called with
        ``shuffle=True``).

    Returns
    -------
    X : dask array
        Result of ``HashingVectorizer.fit_transform`` on the ``text`` column,
        with its chunk sizes computed.
    y : dask array
        The ``target`` column, converted with known chunk lengths.
    """
    newsgroups = fetch_20newsgroups(subset=subset, shuffle=True, random_state=random_state)

    frame = pd.DataFrame({"text": newsgroups.data, "target": newsgroups.target})
    df = dd.from_pandas(frame, npartitions=npartitions)

    # Hashed 1- and 2-gram features, case preserved.
    # dask_ml.feature_extraction.text.CountVectorizer() was the alternative tried here.
    vect = dask_ml.feature_extraction.text.HashingVectorizer(lowercase=False, ngram_range=(1, 2))

    # lengths=True resolves the partition lengths up front, so y already has
    # known chunk sizes and needs no separate compute_chunk_sizes() pass.
    y = df["target"].to_dask_array(lengths=True)

    X = vect.fit_transform(df["text"])
    # Resolve X's chunk sizes in place before handing it to the caller.
    X.compute_chunk_sizes()

    return X, y


def main(X, y, client: Client) -> None:
    """Fit a Dask XGBoost classifier on ``(X, y)`` and request class probabilities.

    The estimator is attached to ``client`` before fitting, and the training
    data is also passed as the only evaluation set. Neither the fitted model
    nor the probabilities are returned.
    """
    classifier = xgb.dask.DaskXGBClassifier(tree_method="hist", n_estimators=100)
    # Attach the estimator to the supplied Dask client.
    classifier.client = client

    training_pair = (X, y)
    classifier.fit(X, y, eval_set=[training_pair])

    # The probabilities are requested but not used or returned.
    classifier.predict_proba(X)

In [None]:
if __name__ == "__main__":
    # Build the feature matrix and labels before the cluster is started.
    X, y = load_data()

    # Local cluster of 8 single-threaded workers; both the cluster and the
    # client are used as context managers around the training call.
    with LocalCluster(n_workers=8, threads_per_worker=1) as cluster, Client(cluster) as client:
        main(X, y, client)

INFO:distributed.scheduler:Receive client connection: Client-worker-d87e7c70-ac2f-11ed-9dec-3267941770bc
INFO:distributed.core:Starting established connection to tcp://127.0.0.1:53639
INFO:distributed.scheduler:Receive client connection: Client-worker-d87f1a5e-ac2f-11ed-9df0-3267941770bc
INFO:distributed.core:Starting established connection to tcp://127.0.0.1:53640
INFO:distributed.scheduler:Receive client connection: Client-worker-d87f7080-ac2f-11ed-9deb-3267941770bc
INFO:distributed.core:Starting established connection to tcp://127.0.0.1:53642
INFO:distributed.scheduler:Receive client connection: Client-worker-d87fa51e-ac2f-11ed-9ded-3267941770bc
INFO:distributed.core:Starting established connection to tcp://127.0.0.1:53644
INFO:distributed.scheduler:Receive client connection: Client-worker-d87f0f0a-ac2f-11ed-9def-3267941770bc
INFO:distributed.core:Starting established connection to tcp://127.0.0.1:53641
INFO:distributed.scheduler:Receive client connection: Client-worker-d87f3d22-ac2

# Visualize results

In [None]:
import ast

# Option A — the benchmark was run in this session and `all_exps` is in memory:
# df_result = pd.DataFrame(all_exps).set_index("method")
# df_result = df_result.reset_index()

# Option B — load a previously saved result table from disk.
df_result = pd.read_csv('../output/df_result_joblib_ray_29012023.csv')

# The timing column comes back from the CSV as text; parse each cell back
# into the Python literal it represents.
time_result_parsed = df_result['time_result'].apply(ast.literal_eval)
df_result['time_result'] = time_result_parsed

In [None]:
# Show the average score stored on the row labelled 0.
df_result.loc[0, 'average_score']

In [None]:
# The top-of-notebook numpy import is commented out, so import it here;
# without it this cell raises NameError on `np`.
import numpy as np

# Mean of each row's list of timings.
df_result['time_result_avg'] = df_result['time_result'].apply(lambda x: np.mean(x))

In [None]:
# Order the result rows by their average timing.
# NOTE: this rebinds `df_result`, so later cells see the sorted frame.
df_result = df_result.sort_values("time_result_avg")

In [None]:
# Keep only the first two rows of the current frame.
# NOTE: this rebinds `df_result`; re-running the cell keeps trimming the same name.
df_result = df_result.iloc[:2]

In [None]:
num_exp = 50

# Open a square figure; the label and title go on its current axes.
fig = plt.figure(figsize=(10, 10))
plt.ylabel("elapsed time (sec)")
plt.title(f'XGB performance benchmark for {num_exp} iterations (20 Newsgroup data)')

# Expand each row's list of timings into a column (first two result rows
# only), giving one column per result row for the box plot.
timings_wide = df_result["time_result"].head(2).apply(lambda times: pd.Series(times)).T

# From here on `fig` holds what boxplot() returns, which is why the save
# below goes through `fig.figure`.
fig = timings_wide.boxplot(rot=45)
plt.tight_layout()
fig.figure.savefig(f'../img/performance_{num_exp}_joblib_ray.png')


In [None]:
# Relies on `num_exp` from the box-plot cell above.
fig = plt.figure(figsize=(10, 10))
plt.ylabel("elapsed time (sec)")
plt.title(f'XGB average time for {num_exp} (20 Newsgroup data)')

# One column per result row (first two rows only), one row per timing.
df_result_t = df_result["time_result"].head(2).apply(lambda times: pd.Series(times)).T

# Order the columns by their mean timing, then plot those means as bars.
columns_by_mean = df_result_t.mean().sort_values().index
fig = df_result_t.reindex(columns=columns_by_mean).mean().plot(kind='bar', rot=45)

plt.tight_layout()
# fig.figure.savefig(f'../img/average_time_{num_exp}.png')



In [None]:
# Ray reference: https://www.anyscale.com/blog/three-ways-to-speed-up-xgboost-model-training