# XGBoost vs LightGBM

In this notebook we collect the results from all the experiments and reports the comparative difference between XGBoost and LightGBM

In [1]:
import matplotlib.pyplot as plt
import nbformat
import json
from toolz import pipe, juxt
import pandas as pd
import seaborn
from toolz import curry

from bokeh.io import show, output_notebook
from bokeh.charts import Bar
from bokeh.models.renderers import GlyphRenderer
from bokeh.models.glyphs import Rect
from toolz import curry
from bokeh.io import export_svgs
from IPython.display import SVG, display

%matplotlib inline 

The bokeh.charts API has moved to a separate 'bkcharts' package.

This compatibility shim will remain until Bokeh 1.0 is released.
After that, if you want to use this API you will have to install
the bkcharts package explicitly.

  warn(message)


In [2]:
output_notebook()

We are going to read the results from the following notebooks

In [3]:
notebooks = {
    'Airline':'01_airline.ipynb',
    'BCI': '02_BCI.ipynb',
    'Football': '03_football.ipynb',
    'Amazon': '04_PlanetKaggle.ipynb',
    'Fraud': '05_FraudDetection.ipynb',
    'Airline_GPU': '06_airline_GPU.ipynb',
    'HIGGS_GPU': '07_HIGGS_GPU.ipynb'
}

In [4]:
def read_notebook(notebook_name):
    with open(notebook_name) as f:
        return nbformat.read(f, as_version=4)

In [5]:
def results_cell_from(nb):
    for cell in nb.cells:
        if cell['cell_type']=='code' and cell['source'].startswith('# Results'):
            return cell

In [6]:
def extract_text(cell):
    return cell['outputs'][0]['text']

In [7]:
@curry
def remove_line_with(match_str, json_string):
    return '\n'.join(filter(lambda x: match_str not in x, json_string.split('\n')))

In [8]:
def process_nb(notebook_name):
    return pipe(notebook_name,
                read_notebook,
                results_cell_from,
                extract_text,
                remove_line_with('total RAM usage'),
                json.loads)

Here we collect the results from all the exeperiment notebooks. The method simply searches the notebooks for a cell that starts with # Results. It then reads that cells output in as JSON.

In [9]:
results = {nb_key:process_nb(nb_name) for nb_key, nb_name in notebooks.items()}

In [10]:
results

{'Airline': {'lgbm': {'performance': {'AUC': 0.8085689434502803,
    'Accuracy': 0.7353896817753379,
    'F1': 0.6940196231335755,
    'Precision': 0.7695546060118544,
    'Recall': 0.6319874302870966},
   'test_time': 14.591926063993014,
   'train_time': 366.37647559295874},
  'xgb': {'performance': {'AUC': 0.7864092771976574,
    'Accuracy': 0.630096137083639,
    'F1': 0.6981365687005567,
    'Precision': 0.5699044513344648,
    'Recall': 0.9008287536758128},
   'test_time': 12.680251265061088,
   'train_time': 2180.798558813054},
  'xgb_hist': {'performance': {'AUC': 0.8076206419213058,
    'Accuracy': 0.6736735917278169,
    'F1': 0.7170332167054881,
    'Precision': 0.6094592088790932,
    'Recall': 0.8707220218423469},
   'test_time': 14.63406790792942,
   'train_time': 578.727180639049}},
 'Airline_GPU': {'lgbm': {'performance': {'AUC': 0.8337768089671014,
    'Accuracy': 0.759485,
    'F1': 0.7287511489294516,
    'Precision': 0.7825276109281147,
    'Recall': 0.68189061131454

We wish to compare LightGBM and XGBoost both in terms of performance as well as how long they took to train.

In [11]:
def average_performance_diff(dataset):
    lgbm_series = pd.Series(dataset['lgbm']['performance'])
    return 100*((lgbm_series-pd.Series(dataset['xgb']['performance']))/lgbm_series).mean()

In [12]:
def train_time_ratio(dataset):
    return dataset['xgb']['train_time']/dataset['lgbm']['train_time']

def test_time_ratio(dataset):
    return dataset['xgb']['test_time']/dataset['lgbm']['test_time']

In [13]:
metrics = juxt(average_performance_diff, train_time_ratio, test_time_ratio)
res_per_dataset = {dataset_key:metrics(dataset) for dataset_key, dataset in results.items()}

In [14]:
results_df = pd.DataFrame(res_per_dataset, index=['Perf. Difference(%)', 
                                                  'Train Time Ratio', 
                                                  'Test Time Ratio']).T

In [15]:
results_df

Unnamed: 0,Perf. Difference(%),Train Time Ratio,Test Time Ratio
Airline,-0.025994,5.952343,0.868991
Airline_GPU,0.930449,3.04532,0.402712
Amazon,15.701026,3.721858,0.778578
BCI,6.250871,2.503829,0.711396
Football,-0.589916,3.909239,0.98306
Fraud,-1.076624,6.045945,1.060074
HIGGS_GPU,-0.115563,3.72319,0.228053


In [16]:
results_gpu = results_df.loc[['Airline_GPU','HIGGS_GPU']]
results_cpu = results_df[~results_df.index.isin(['Airline_GPU','HIGGS_GPU'])]

Plot of train time ratio for CPU experiments.

In [17]:
data = {
    'Train Time Ratio': results_cpu['Train Time Ratio'].values,
    'label': results_cpu.index.values
}

In [18]:
bar = Bar(data, values='Train Time Ratio', agg='mean', label=['label'], 
          plot_width=600, plot_height=400, bar_width=0.7, color='#5975a4')
bar.axis[0].axis_label=''
bar.axis[1].axis_label='Train Time Ratio (XGBoost/LightGBM)'
bar.axis[1].axis_label_text_font_size='12pt'
bar.y_range = Range1d(0, 8)
bar.toolbar_location='above'
bar.legend[0].visible=False
show(bar)

NameError: name 'Range1d' is not defined

In [None]:
bar.output_backend = "svg"
export_svgs(bar, filename="xgb_vs_lgbm_train_time.svg")
display(SVG('xgb_vs_lgbm_train_time.svg'))

Plot of train time ratio for GPU experiments.

In [None]:
data = {
    'Train Time Ratio': results_gpu['Train Time Ratio'].values,
    'label': results_gpu.index.values
}

In [None]:
bar = Bar(data, values='Train Time Ratio', agg='mean', label=['label'],
          plot_width=300, plot_height=400, bar_width=0.5, color='#ffcc99')
bar.axis[0].axis_label=''
bar.y_range = Range1d(0, 8)
bar.axis[1].axis_label='Train Time Ratio (XGBoost/LightGBM)'
bar.axis[1].axis_label_text_font_size='12pt'
bar.toolbar_location='above'
bar.legend[0].visible=False
show(bar)

In [None]:
bar.output_backend = "svg"
export_svgs(bar, filename="xgb_vs_lgbm_train_time_gpu.svg")
display(SVG('xgb_vs_lgbm_train_time_gpu.svg'))

In [None]:
data = {
    'Perf. Difference(%)': results_df['Perf. Difference(%)'].values,
    'label': results_df.index.values
}

In [None]:
bar = Bar(data, values='Perf. Difference(%)', agg='mean', label=['label'], 
          plot_width=600, plot_height=400, bar_width=0.7, color='#5975a4')
bar.axis[0].axis_label=''
bar.axis[1].axis_label='Perf. Difference(%)'
bar.toolbar_location='above'
bar.legend[0].visible=False
show(bar)

In [None]:
bar.output_backend = "svg"
export_svgs(bar, filename="xgb_vs_lgbm_performance.svg")
display(SVG('xgb_vs_lgbm_performance.svg'))

From the table as well as the plots below we can see that overall the difference in performance is quite small. LightGBM though is 2 to over 10 times quicker than XGBoost.