In [1]:
import time
import random
import pandas as pd
import numpy as np
from tqdm import tqdm
import multiprocessing
from itertools import repeat, product
import matplotlib.pyplot as plt


import xgboost as xgb
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer



# Seed Python's built-in RNG; the split and the estimators below pass their own random_state.
random.seed(42)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Load the data

In [2]:
# Fetch the training subset of the 20 Newsgroups corpus, shuffled with a fixed seed.
twenty_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)

In [3]:
# Number of documents in the training subset.
len(twenty_train.data)

11314

In [4]:
# Names of the target classes (one per newsgroup).
twenty_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

# Modeling

In [5]:
# Raw documents and their target labels.
X, y = twenty_train.data, twenty_train.target

In [6]:
# Bag-of-words counts over case-sensitive unigrams and bigrams.
vectorizer = CountVectorizer(lowercase=False, ngram_range=(1, 2))

# Vectorize straight from the raw corpus instead of from `X`: `X` is rebound
# to the count matrix here, so reading it as input would make a second run of
# this cell feed a sparse matrix (not text) to the vectorizer and fail.
# NOTE(review): the vocabulary is fitted on all documents before the
# train/test split, so held-out scores may be slightly optimistic.
X = vectorizer.fit_transform(twenty_train.data)

In [7]:
# Hold out 20% of the vectorized documents for scoring; the seed fixes the split.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

In [8]:
# Thread counts to benchmark: half of the reported CPUs, -1 (conventionally
# "use every core") and a single thread. The "half" entry is clamped to at
# least 1 so that a single-core machine does not produce n_jobs=0.
n_jobs = [
    max(1, multiprocessing.cpu_count() // 2),
    -1,
    1,
]

# Tree construction algorithms to compare, see
# https://xgboost.readthedocs.io/en/latest/treemethod.html#approximated-solutions
tree_method = ["exact", "hist"]

# Every (n_jobs, tree_method) combination. dict.fromkeys drops repeated pairs
# while keeping their order: on a 2-core machine "half the CPUs" equals 1 and
# would otherwise duplicate the single-thread runs (and their result labels).
param_space = list(dict.fromkeys(product(n_jobs, tree_method)))

In [9]:
# Show the benchmark grid as (n_jobs, tree_method) pairs.
param_space

[(4, 'exact'),
 (4, 'hist'),
 (-1, 'exact'),
 (-1, 'hist'),
 (1, 'exact'),
 (1, 'hist')]

In [None]:
# Number of times every configuration is trained and scored.
num_exp = 1
# One result record per configuration; read back when the results table is built.
all_exps = []

for n, t in param_space:
    exp_time = []   # fit duration of each repetition, in seconds
    exp_score = []  # macro-averaged F1 on the held-out split, per repetition

    print(f"n_job={n}, tree_method={t} x {num_exp} times")

    for _ in tqdm(range(num_exp)):
        xgb_model = xgb.XGBClassifier(
            n_jobs=n,
            tree_method=t,
            n_estimators=100,
            random_state=42,
        )

        # The features are vectorized up front, so the pipeline holds only
        # the classifier and the timing below covers model fitting alone.
        text_clf = Pipeline([
            ('clf', xgb_model)
        ])

        # perf_counter is monotonic, so the measured interval cannot be
        # distorted by system clock adjustments the way time.time() can.
        start = time.perf_counter()
        text_clf.fit(X_train, y_train)
        exp_time.append(time.perf_counter() - start)

        y_pred = text_clf.predict(X_test)
        exp_score.append(f1_score(y_test, y_pred, average='macro'))

    all_exps.append({
        "method": f"n_job={n}, tree_method={t}",
        "time_result": exp_time,
        # Raw per-repetition scores, kept alongside their mean.
        "score_result": exp_score,
        # The key promises an average, so store the mean rather than the list.
        "average_score": float(np.mean(exp_score)),
    })

### sklearn with Ray backend

In [10]:
# joblib supplies the parallel_backend context manager used by the Ray benchmark cell.
import joblib
# NOTE(review): requires the optional `ray` package; the recorded output of this
# cell is a ModuleNotFoundError, so it was last executed without ray installed.
from ray.util.joblib import register_ray
# Make "ray" available as a joblib backend name.
register_ray()

ModuleNotFoundError: No module named 'ray'

In [None]:
# Estimator counts to benchmark under the Ray joblib backend. The name was
# never defined before being iterated; 100 is the value that used to be
# hard-coded in the model, so the default run is unchanged.
n_estimators = [100]

ray_exps = []
for n_est in n_estimators:
    # n_jobs is left at the library default for these runs.
    xgb_model = xgb.XGBClassifier(
        tree_method='exact',
        n_estimators=n_est,  # actually vary the model with the loop value
        random_state=123,
    )

    # Features are already vectorized, so the pipeline holds only the classifier.
    text_clf = Pipeline([
        ('clf', xgb_model)
    ])

    # Fresh timing list per configuration so runs are not mixed together.
    exp_time = []
    for _ in tqdm(range(num_exp)):
        # NOTE(review): the joblib backend only affects work that is dispatched
        # through joblib; confirm that XGBClassifier.fit actually does so,
        # otherwise this measures the same thing as the plain runs.
        with joblib.parallel_backend('ray'):
            start = time.perf_counter()
            text_clf.fit(X_train, y_train)
            exp_time.append(time.perf_counter() - start)

    # One record per configuration (a single shared dict would keep only the
    # label of the last configuration).
    ray_exps.append({
        "method": f"ray backend, n_est={n_est}",
        "time_result": exp_time,
    })

# Replace results from an earlier run of this cell instead of appending again,
# so "method" labels stay unique in the results table.
all_exps = [e for e in all_exps if not e["method"].startswith("ray backend")] + ray_exps

### Ray trainer

In [None]:
import ray
# ScalingConfig is used by the trainer configuration further down.
from ray.air.config import ScalingConfig
from ray.train.xgboost import XGBoostTrainer
# Left disabled: importing it under this name would shadow scikit-learn's
# CountVectorizer imported at the top of the notebook.
# from ray.data.preprocessors import CountVectorizer

In [None]:
# NOTE(review): `df` is not created anywhere in the visible notebook — presumably it is
# the labeled frame stored at ./data/df_labeled.parquet; confirm and load it explicitly.
# Keep only rows that have a value in 'preds'. Rebinding `df` means the unfiltered
# frame is no longer reachable under this name.
df = df.dropna(subset=['preds'])
df.shape

In [None]:
# NOTE(review): this rebinds X, y and the train/test variables that the benchmark
# cells earlier in the notebook read — re-running those cells after this one would
# silently use this data instead.
X, y = df['text'], df['preds']
# Hold out 20% of the rows with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123)

In [None]:
# Columns of the labeled parquet file that are removed right after loading.
unused_columns = [
    'context_uid',
    'text_embedding',
    'text_truncated',
    'url',
    'probs',
    'preds_str',
]

# Load the file as a Ray dataset and drop those columns in one expression.
df_ray = ray.data.read_parquet("./data/df_labeled.parquet").drop_columns(cols=unused_columns)

In [None]:
# Split the Ray dataset into training and validation parts (test_size=0.3) with a fixed seed.
train_dataset, valid_dataset = df_ray.train_test_split(test_size=0.3, seed=123)


In [None]:
# NOTE(review): this replaces the train_dataset produced by df_ray.train_test_split with a
# dataset built from X alone. X was last bound to df['text'], so the 'preds' label column
# would be missing. Looks like a leftover experiment — confirm before training.
train_dataset = ray.data.from_pandas(X)

In [None]:
# Imported here so the cell does not depend on names that earlier cells fail to
# provide. The alias avoids shadowing scikit-learn's CountVectorizer.
from ray.air.config import ScalingConfig
from ray.data.preprocessors import CountVectorizer as RayCountVectorizer
from ray.train.xgboost import XGBoostTrainer

# Number of Ray workers that share the training job.
num_workers = 2

# XGBoost specific params
params = {
    "tree_method": "exact",
    "objective": "multi:softprob",
    "eval_metric": "merror",
    # NOTE(review): multi-class objectives normally also need "num_class"
    # (the number of distinct 'preds' labels) — confirm and add it.
}

# The trainer is given a Ray Data preprocessor. Previously a preprocessor was
# created here but never used, and the fitted scikit-learn `vectorizer` was
# passed to the trainer instead.
# Assumes the text column is named 'text', as in df['text'] — TODO confirm
# against the columns that remain in df_ray.
preprocessor = RayCountVectorizer(columns=["text"])

# NOTE(review): the trainer is only configured here; nothing in the notebook
# calls trainer.fit().
trainer = XGBoostTrainer(
    scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=False),
    label_column="preds",
    params=params,
    datasets={"train": train_dataset, "valid": valid_dataset},
    preprocessor=preprocessor,
    num_boost_round=100,
)

# Visualize result

In [None]:
# One row per benchmarked configuration, indexed by its "method" label.
df_result = pd.DataFrame(all_exps).set_index("method")

In [None]:
# Display the collected benchmark results.
df_result

In [None]:
# Write the results table to df_result.csv in the working directory.
df_result.to_csv('./df_result.csv')

In [None]:
from pathlib import Path

# Box plot of the fit times, one box per benchmarked configuration.
fig, ax = plt.subplots(figsize=(10, 10))

# Expand each configuration's list of timings: after the transpose every
# column is one configuration and every row one repetition.
timings = df_result["time_result"].apply(pd.Series).T
timings.boxplot(rot=45, ax=ax)

# NOTE(review): the title mentions "contract classification data" while the
# benchmark cells fit the 20 Newsgroups split — confirm which is intended.
ax.set_title(f'XGB performance benchmark for {num_exp} iteration (contract classification data)')
ax.set_ylabel("elapsed time (sec)")
fig.tight_layout()

# savefig does not create missing directories, so make sure ./img exists.
img_dir = Path('./img')
img_dir.mkdir(parents=True, exist_ok=True)
fig.savefig(img_dir / f'performance_{num_exp}_n_estimator.png')


In [None]:
# Bar chart of the mean fit time per configuration, fastest first.
fig, ax = plt.subplots(figsize=(10, 10))

# Columns are configurations, rows are repetitions.
df_result_t = df_result["time_result"].apply(pd.Series).T
# Sorting the per-configuration means directly gives the same ordering as
# reindexing the columns by their mean and averaging again.
df_result_t.mean().sort_values().plot(kind='bar', rot=45, ax=ax)

ax.set_title(f'XGB average time for {num_exp} (contract classification data)')
ax.set_ylabel("elapsed time (sec)")
fig.tight_layout()
fig.savefig(f'./average_time_{num_exp}.png')



In [None]:
# Reference (Ray): https://www.anyscale.com/blog/three-ways-to-speed-up-xgboost-model-training

