In [1]:
import multiprocessing
import os
import random
import time
from itertools import repeat, product

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from tqdm import tqdm


# Seed both the stdlib and the NumPy global generators so that any code
# drawing from either one is repeatable after a kernel restart.
random.seed(123)
np.random.seed(123)
%matplotlib inline

# Load the data

In [2]:
# Read the dataset from the local parquet file, then display its
# (rows, columns) shape as the cell output.
labeled_path = './data/df_labeled.parquet'
df = pd.read_parquet(labeled_path)
df.shape

(20978, 8)

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0_level_0,context_uid,text,text_embedding,text_truncated,url,preds,probs,preds_str
__DATAPOINT_UID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
doc::0,0,\n\n EXCLUSIVE TECHNICAL CONSULTI...,"[-0.2118404507637024, -0.26993510127067566, 0....",\n\n EXCLUSIVE TECHNICAL CONSULTI...,https://link.snorkel-ai.com/DkB7JA,2,"[8.081812848104164e-05, 0.0008421743987128139,...",services
doc::1,1,\n\n ITOWNET ELECTRONIC ...,"[-0.09042458236217499, -0.10253632813692093, 0...",\n\n ITOWNET ELECTRONIC ...,https://link.snorkel-ai.com/55ftNU,2,"[0.00037732310011051595, 0.0028201169334352016...",services
doc::10,10,"\nEMPLOYMENT AGREEMENT\nThis Agreement (the ""A...","[0.20291098952293396, -0.4462553560733795, 0.4...","\nEMPLOYMENT AGREEMENT\nThis Agreement (the ""A...",https://link.snorkel-ai.com/CRcQnu,0,"[0.9999843835830688, 2.716194103413727e-06, 1....",employment
doc::100,100,\n\n STOCK PURCHASE...,"[0.010273497551679611, -0.09152742475271225, 0...",\n\n STOCK PURCHASE...,,3,"[9.5067844085861e-06, 1.5301578969229013e-05, ...",stock
doc::1000,1000,\n\n STATEMENT OF TERMS AND CO...,"[0.408515989780426, -0.34444373846054077, 0.17...",\n\n STATEMENT OF TERMS AND CO...,https://link.snorkel-ai.com/5B66Lq,0,"[0.9997205138206482, 1.5678218915127218e-05, 0...",employment


In [4]:
# Disabled experiment: assign a synthetic random label (1-5) to every row.
# df['label'] = [ random.randint(1,5)  for k in df.index]
# df['label'].value_counts()

In [5]:
# Keep only the rows that have a value in 'preds', then show the shape again
# so it can be compared with the shape printed right after loading.
has_pred = df['preds'].notna()
df = df.loc[has_pred]
df.shape

(20978, 8)

In [6]:
# Work on the first 100 rows only: the raw 'text' column is the feature and
# the 'preds' column is the target.
sample = df.head(100)
X = sample['text']
y = sample['preds']

# Fixed-seed 80/20 train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [8]:
from xgboost_ray import RayDMatrix, RayParams, train


In [10]:
# Wrap each split in a RayDMatrix.
# NOTE(review): X_train / X_test are the raw 'text' column at this point and
# neither train_set nor test_set is referenced by the later cells visible
# here — confirm whether this cell is still needed.
train_set, test_set = (
    RayDMatrix(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

# Modeling

In [None]:
# Thread counts to benchmark: single thread, half of the available cores,
# and -1 (passed straight through to XGBClassifier's n_jobs).
# `cpu_count() // 2` is 0 on a single-core host and equals 1 on 2-3 cores, so
# clamp it to at least 1 and drop duplicates (order-preserving). Otherwise the
# same "method" label would be benchmarked twice and the result index would
# not be unique.
half_cores = max(1, multiprocessing.cpu_count() // 2)
n_jobs = list(dict.fromkeys([1, half_cores, -1]))

tree_method = ["exact", "hist"] # https://xgboost.readthedocs.io/en/latest/treemethod.html#approximated-solutions

# Every (n_jobs, tree_method) combination, in n_jobs-major order.
param_space = list(product(n_jobs, tree_method))

In [None]:
num_exp = 100
all_exps = []

# Benchmark every (n_jobs, tree_method) combination: fit the text pipeline
# `num_exp` times, recording the elapsed time of each fit and the macro-F1 on
# the held-out split. One dict per combination is appended to `all_exps` with
# the keys "method", "time_result" (list of seconds) and "average_score".
for n, t in param_space:
    method = f"n_job={n}, tree_method={t}"
    print(f"{method} x {num_exp} times")

    exp_time = []
    exp_score = []

    for _ in tqdm(range(num_exp)):
        # A fresh, unfitted model per repetition so no fitted state carries
        # over between timed runs.
        xgb_model = xgb.XGBClassifier(n_jobs=n,
                                      # Use the loop's tree method; this was
                                      # hard-coded to 'exact', so the 'hist'
                                      # rows were really measuring 'exact'.
                                      tree_method=t,
                                      n_estimators=100,
                                      random_state=123)

        text_clf = Pipeline([
            ('vect', CountVectorizer(lowercase=False, ngram_range=(1, 2))),
            ('clf', xgb_model)
        ])

        # perf_counter is the clock intended for measuring short durations;
        # time.time() can jump if the system clock is adjusted.
        # Only fit() is timed. Note this covers the whole pipeline, i.e. the
        # CountVectorizer fit/transform as well as the XGBoost training.
        start = time.perf_counter()
        text_clf.fit(X_train, y_train)
        exp_time.append(time.perf_counter() - start)

        y_pred = text_clf.predict(X_test)
        exp_score.append(f1_score(y_test, y_pred, average='macro'))

    all_exps.append({
        "method": method,
        "time_result": exp_time,
        "average_score": np.mean(exp_score),
    })

# Visualize results

In [None]:
# One row per benchmarked configuration, indexed by its "method" label.
results_frame = pd.DataFrame(all_exps)
df_result = results_frame.set_index("method")

In [None]:
# Display the results table: per-method list of fit times and mean macro-F1.
df_result

In [None]:
# Box plot of the individual fit times, one box per (n_jobs, tree_method)
# setting. Uses an explicit Figure/Axes pair instead of rebinding `fig` to the
# Axes returned by the pandas plotting call.
fig, ax = plt.subplots(figsize=(10, 10))

# Expand each method's list of timings into its own column
# (rows = repetition number, columns = method).
times_by_method = df_result["time_result"].apply(lambda x: pd.Series(x)).T
times_by_method.boxplot(rot=45, ax=ax)

ax.set_title(f'XGB performance benchmark for {num_exp} iteration (contract classification data)')
ax.set_ylabel("elapsed time (sec)")
fig.tight_layout()

# savefig does not create missing directories, so make sure the target
# folder exists before writing the image.
os.makedirs('./img', exist_ok=True)
fig.savefig('./img/performance.png')


In [None]:
# Bar chart of the mean fit time per (n_jobs, tree_method) setting, ordered
# from fastest to slowest. Uses an explicit Figure/Axes pair instead of
# rebinding `fig` to the Axes returned by the pandas plotting call.
fig, ax = plt.subplots(figsize=(10, 10))

# Expand each method's list of timings into its own column
# (rows = repetition number, columns = method).
df_result_t = df_result["time_result"].apply(lambda x: pd.Series(x)).T

# Reorder the columns by ascending mean time, then average each column.
columns_by_mean = df_result_t.mean().sort_values().index
mean_times = df_result_t.reindex(columns_by_mean, axis=1).mean()
mean_times.plot(kind='bar', rot=45, ax=ax)

ax.set_title(f'XGB average time for {num_exp} (contract classification data)')
ax.set_ylabel("elapsed time (sec)")

fig.tight_layout()
fig.savefig('./average_time.png')



In [None]:
# Further reading on speeding up XGBoost:
# - Sparse-matrix input: https://stats.stackexchange.com/questions/229111/improving-the-speed-of-xgboost-cv
# - Ray: https://www.anyscale.com/blog/three-ways-to-speed-up-xgboost-model-training

