In [1]:
import multiprocessing
import random
import time
from itertools import repeat, product
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.datasets import fetch_20newsgroups, load_breast_cancer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from tqdm import tqdm



random.seed(42)
%matplotlib inline

# Load the data

In [9]:
# Fetch the training subset of the 20 Newsgroups corpus, shuffled with a
# fixed random_state.
twenty_train = fetch_20newsgroups(
    subset="train",
    random_state=42,
    shuffle=True,
)

In [10]:
# X holds the dataset's `data`, y its `target` array.
X = twenty_train.data
y = twenty_train.target

In [33]:
# Peek at the label array. The previous cell content (`pd.Data`) was a
# truncated expression that raises AttributeError on a fresh Run All.
y

array([7, 4, 4, ..., 3, 1, 8])

In [23]:
# Number of newsgroup classes; the same expression feeds `num_class` in the
# XGBoost params further down.
len(twenty_train.target_names)

20

In [11]:
# Bag-of-words counts over unigrams and bigrams, without lower-casing.
vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    lowercase=False,
)

In [12]:
# Learn the vocabulary on the whole corpus and encode it in one pass.
# NOTE(review): the vocabulary is fitted before the train/test split, so
# held-out documents contribute to it. Probably fine for a speed benchmark,
# but confirm this is intended before reporting accuracy.
X_vectors = vectorizer.fit_transform(X)

In [17]:
# Hold out 20% of the rows for evaluation, using a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X_vectors,
    y,
    random_state=42,
    test_size=0.2,
)

# modeling

### AIR https://docs.ray.io/en/master/ray-air/getting-started.html

In [20]:
from ray.air.config import ScalingConfig
from ray.train.xgboost import XGBoostTrainer

https://docs.ray.io/en/master/ray-air/examples/xgboost_example.html

In [None]:
# XGBoost settings for the Ray AIR trainer: multi-class softmax objective,
# evaluated with the multi-class error metric, one class per newsgroup.
params = dict(
    tree_method="exact",
    objective="multi:softmax",
    eval_metric=["merror"],
    num_class=len(twenty_train.target_names),
)

In [None]:
# XGBoostTrainer needs the name of the target column in the dataset (`label_column`).

In [32]:
# Configure and launch a Ray AIR XGBoostTrainer on 8 CPU workers.
# NOTE(review): `train_set` / `test_set` are only assigned in a later cell
# (as RayDMatrix objects), so this cell fails on a fresh Restart & Run All.
# NOTE(review): presumably XGBoostTrainer expects Ray Datasets that contain a
# "target" column rather than RayDMatrix objects — confirm against the Ray docs.
# NOTE(review): the recorded TypeError about a missing `label_column` looks
# stale, since the call below does pass it — re-run to confirm.
num_workers=8
use_gpu=False

trainer = XGBoostTrainer(
     scaling_config=ScalingConfig(num_workers=num_workers, 
                                  use_gpu=use_gpu),
     label_column="target",  # name of the label column inside the datasets
     params=params,
     datasets={"train": train_set, "valid": test_set},
    )
result = trainer.fit()

TypeError: __init__() missing 1 required keyword-only argument: 'label_column'

### xgboost_ray

In [34]:
from xgboost_ray import RayDMatrix, RayParams, train


In [35]:
# Wrap both splits for xgboost_ray.
# NOTE(review): the ValueError recorded below the training cell reports that a
# scipy csr_matrix is not a supported data source, so these inputs must be
# converted to a supported type (pandas / numpy / file path) before training.
train_set, test_set = RayDMatrix(X_train, y_train), RayDMatrix(X_test, y_test)

In [38]:
# XGBoost settings for the xgboost_ray run; identical to the earlier params
# except for the "approx" tree method.
params = dict(
    tree_method="approx",
    objective="multi:softmax",
    eval_metric=["merror"],
    num_class=len(twenty_train.target_names),
)

In [40]:
# The failure below is caused by the input being a scipy sparse matrix, which RayDMatrix does not accept.

In [39]:
# Distributed training with two single-CPU Ray actors. Metrics computed on the
# training set are collected into `evals_result`.
evals_result = {}
bst = train(
    params,
    train_set,
    ray_params=RayParams(cpus_per_actor=1, num_actors=2),
    evals=[(train_set, "train")],
    evals_result=evals_result,
    verbose_eval=False,
)

ValueError: Unknown data source type: <class 'scipy.sparse._csr.csr_matrix'> with FileType: None.
FIX THIS by passing a supported data type. Supported data types include pandas.DataFrame, pandas.Series, np.ndarray, and CSV/Parquet file paths. If you specify a file, path, consider passing the `filetype` argument to specify the type of the source. Use the `RayFileType` enum for that. If using Modin, Dask, or Petastorm, make sure the library is installed.

# Visualize result

In [None]:
# Tabulate the collected experiment records and key the table by "method".
# NOTE(review): `all_exps` is not defined anywhere in the visible notebook.
df_result = (
    pd.DataFrame(data=all_exps)
    .set_index(keys="method")
)

In [None]:
# Display the benchmark table as loaded, indexed by method.
df_result

In [None]:
# Persist the benchmark table next to the notebook.
df_result.to_csv(path_or_buf="./df_result.csv")

In [None]:
# Collapse each method's collection of timings into its mean.
df_result["time_result_avg"] = (
    df_result["time_result"]
    .apply(lambda runs: np.mean(runs))
)

In [None]:
# Order methods from fastest to slowest average time.
df_result = df_result.sort_values("time_result_avg", ascending=True)

In [None]:
# Display the table after sorting by average elapsed time.
df_result

In [None]:
# Box plot of the raw timings: each entry of "time_result" is expanded into a
# row, then transposed so that every method becomes one column / one box.
# Changes from the previous version of this cell:
#   * the output folder is created first (savefig raises if ./img is missing);
#   * figure and axes get separate names instead of rebinding `fig` to the
#     Axes object returned by boxplot().
Path("./img").mkdir(parents=True, exist_ok=True)

fig, ax = plt.subplots(figsize=(10, 10))
timings_by_method = df_result["time_result"].apply(lambda runs: pd.Series(runs)).T
timings_by_method.boxplot(rot=45, ax=ax)
ax.set_title(f'XGB performance benchmark for {num_exp} iteration (20 Newsgroup data)')
ax.set_ylabel("elapsed time (sec)")
fig.tight_layout()
fig.savefig(f'./img/performance_{num_exp}_n_estimator.png')


In [None]:
# Bar chart of the mean elapsed time per method, fastest first.
# Changes from the previous version of this cell:
#   * the output folder is created first (savefig raises if ./img is missing);
#   * figure and axes get separate names instead of rebinding `fig` to the
#     Axes object returned by plot();
#   * the per-method means are sorted directly, which yields the same bars in
#     the same order as re-indexing the columns by sorted mean and averaging
#     a second time.
Path("./img").mkdir(parents=True, exist_ok=True)

fig, ax = plt.subplots(figsize=(10, 10))

# Rows are individual runs, columns are methods.
df_result_t = df_result["time_result"].apply(lambda runs: pd.Series(runs)).T
df_result_t.mean().sort_values().plot(kind='bar', rot=45, ax=ax)

ax.set_title(f'XGB average time for {num_exp} (20 Newsgroup data)')
ax.set_ylabel("elapsed time (sec)")

fig.tight_layout()
fig.savefig(f'./img/average_time_{num_exp}.png')



In [None]:
# Display the final benchmark table (sorted, with the average-time column).
df_result

In [None]:
# Scratch arithmetic: ratio of two hard-coded numbers whose origin is not shown
# in this notebook.
# NOTE(review): presumably a speed ratio between two timings — name the
# operands or remove the cell.
234/1051.5

In [None]:
# Reference (Ray): https://www.anyscale.com/blog/three-ways-to-speed-up-xgboost-model-training