In [1]:
import re
# xml.etree.cElementTree was deprecated in Python 3.3 and removed in 3.9;
# xml.etree.ElementTree picks up the C accelerator automatically when present.
import xml.etree.ElementTree as ET

# Signed decimal number with optional fraction and exponent,
# e.g. "-1.5", ".25", "3e-2". Contains capturing groups of its own.
regex_float_pattern = r'[-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?'

In [2]:
# Thanks to Huilin: https://gist.github.com/hqucms/56844f4d1e04757704f6afcdaa6f65a8

In [3]:
def build_tree(xgtree, base_xml_element, var_indices):
    r"""Append one text-dumped XGBoost tree to ``base_xml_element`` as nested <Node> elements.

    Every non-blank line of the dump must be either a split line such as
    ``\t\t3:[var_topcand_mass<138.19] yes=7,no=8,missing=7`` or a leaf line
    such as ``\t\t7:leaf=0.25``. The number of leading tabs is written as the
    node's ``depth``; the ``yes`` child is tagged ``pos="l"`` and the ``no``
    child ``pos="r"`` (the root is ``pos="s"``). The trailing ``missing=``
    field of a split line is not read.

    Parameters
    ----------
    xgtree : str
        Text dump of a single tree, lines separated by ``'\n'``.
    base_xml_element : xml.etree.ElementTree.Element
        Element that receives the root <Node>.
    var_indices : mapping
        Maps the variable name used in a split to the value written as ``IVar``.

    Raises
    ------
    ValueError
        If a non-blank line matches neither the split nor the leaf format.
    KeyError
        If a split uses a variable missing from ``var_indices``, or a node id
        was not announced by an earlier split line.
    """
    # Compile once instead of re-formatting/re-parsing the patterns on every line.
    leaf_re = re.compile(
        r'(?P<indent>\t*)(?P<node>\d+):leaf=(?P<res>{0})$'.format(regex_float_pattern))
    split_re = re.compile(
        r'(?P<indent>\t*)(?P<node>[0-9]+):\[(?P<var>.+)<(?P<cut>{0})\]\syes=(?P<yes>\d+),no=(?P<no>\d+)'.format(
            regex_float_pattern))

    # Node id -> XML element of its parent, and node id -> side relative to that parent.
    parent_element_dict = {'0': base_xml_element}
    pos_dict = {'0': 's'}

    for line in xgtree.split('\n'):
        if not line.strip():
            continue

        is_leaf = ':leaf=' in line
        result = (leaf_re if is_leaf else split_re).match(line)
        if result is None:
            # Previously the line was printed and the code then failed with an
            # AttributeError on `None.group`; fail with an explicit message instead.
            raise ValueError("Cannot parse XGBoost tree dump line: {0!r}".format(line))

        depth = result.group('indent').count('\t')
        inode = result.group('node')

        if is_leaf:
            ET.SubElement(parent_element_dict[inode], "Node", pos=str(pos_dict[inode]),
                          depth=str(depth), NCoef="0", IVar="-1", Cut="0.0e+00", cType="1",
                          res=str(result.group('res')), rms="0.0e+00", purity="0.0e+00",
                          nType="-99")
            continue

        var = result.group('var')
        cut = result.group('cut')
        lnode = result.group('yes')
        rnode = result.group('no')
        # Record the children's sides before they are encountered further down the dump.
        pos_dict[lnode] = 'l'
        pos_dict[rnode] = 'r'
        node_elementTree = ET.SubElement(
            parent_element_dict[inode], "Node", pos=str(pos_dict[inode]),
            depth=str(depth), NCoef="0", IVar=str(var_indices[var]), Cut=str(cut),
            cType="1", res="0.0e+00", rms="0.0e+00", purity="0.0e+00", nType="0")
        # Both children hang off the element just created.
        parent_element_dict[lnode] = node_elementTree
        parent_element_dict[rnode] = node_elementTree

In [4]:
from __future__ import division
import xgboost as xgb 
from xgboost import plot_importance
from xgboost import plot_tree
from xgboost import XGBClassifier

In [5]:
def convert_model(model, input_variables, output_xml):
    """Write dumped XGBoost trees as a TMVA-style ``MethodSetup`` XML document.

    Parameters
    ----------
    model : sequence of str
        One text dump per tree; each is handed to ``build_tree``.
    input_variables : sequence
        Entries whose element 0 is the variable name and element 1 the value
        written to the ``Type`` attribute. The position in the sequence
        becomes the variable's ``VarIndex``.
    output_xml : path or file object
        Destination passed to ``ElementTree.write``.
    """
    n_trees = len(model)
    var_indices = {}

    # <MethodSetup>
    root = ET.Element("MethodSetup", Method="BDT::BDT")

    # <Variables>: one <Variable> per input, remembering name -> index for the trees.
    variables_el = ET.SubElement(root, "Variables", NVar=str(len(input_variables)))
    for position, entry in enumerate(input_variables):
        var_name = entry[0]
        var_kind = entry[1]
        var_indices[var_name] = position
        ET.SubElement(
            variables_el, "Variable",
            VarIndex=str(position), Type=var_kind,
            Expression=var_name, Label=var_name, Title=var_name, Unit="",
            Internal=var_name, Min="0.0e+00", Max="0.0e+00",
        )

    # <GeneralInfo>
    info_el = ET.SubElement(root, "GeneralInfo")
    ET.SubElement(info_el, "Info", name="Creator", value="xgboost2TMVA")
    ET.SubElement(info_el, "Info", name="AnalysisType", value="Classification")

    # <Options>: each option carries its value as element text.
    options_el = ET.SubElement(root, "Options")
    option_rows = (
        ("NodePurityLimit", "No", "5.00e-01"),
        ("BoostType", "Yes", "Grad"),
    )
    for option_name, modified_flag, option_text in option_rows:
        option_el = ET.SubElement(options_el, "Option", name=option_name, modified=modified_flag)
        option_el.text = option_text

    # <Weights>: one <BinaryTree> per dumped tree.
    weights_el = ET.SubElement(root, "Weights", NTrees=str(n_trees), AnalysisType="1")
    for tree_index in range(n_trees):
        tree_el = ET.SubElement(
            weights_el, "BinaryTree",
            type="DecisionTree", boostWeight="1.0e+00", itree=str(tree_index),
        )
        build_tree(model[tree_index], tree_el, var_indices)

    ET.ElementTree(root).write(output_xml)

In [6]:
# Input: saved XGBoost model. Output: XML file written by convert_model.
XGBmodel = './SvB_Training/XGB_Model/C2V_bdt.json'
TMVAmodel = './XGB2TMVA_outModel/C2V_bdt.xml'

# Load the classifier, then dump its booster; convert_model indexes the
# result per tree and build_tree splits each entry on newlines.
xgbc = XGBClassifier()
xgbc.load_model(XGBmodel)
booster = xgbc.get_booster()
model = booster.get_dump()

In [7]:
# CAT
# Each entry is (variable name, type code); list order determines the
# VarIndex that convert_model assigns to the variable.
train_var = [
    ('VHH_H2H1_pt_ratio', 'F'),
    ('VHH_HH_m', 'F'),
    ('selLeptons_pt_0', 'F'),
    ('dilep_dPhi', 'F'),
    ('dilep_dEta', 'F'),
    ('ptl1OVERptl0', 'F'),
    ('ptl0OVERV_mass', 'F'),
    ('VHH_V_H2_dPhi', 'F'),
    ('VHH_HH_dR', 'F'),
    ('VHH_H1_pT', 'F'),
    ('V_pt', 'F'),
    ('VHH_H1_BJet_dR', 'F'),
    ('VHH_H2_BJet_dR', 'F'),
]

# SvB
# train_var = [('dilep_dPhi','F'),('dilep_dEta','F'), ('ptl1OVERptl0','F'), ('ptl0OVERV_mass','F'),\
#              ('VHH_Vreco4j_HT','F'), ('VHH_HH_dR','F'), ('VHH_V_HH_pT_Ratio','F'),('V_mass','F'),\
#              ('VHH_H1_pT','F'), ('VHH_HH_pT','F'), ('V_pt','F'), \
#              ('VHH_H1_m','F'),('VHH_HH_m','F'), \
#              ('VHH_H1_e','F'),('VHH_HH_e','F'), \
#              ('VHH_V_H1_dPhi','F'), ('VHH_V_HH_dPhi','F'), ('VHH_HH_deta','F'), \
#              ('No3_btag_pt','F'), ('No4_btag_pt','F') \
#             ]

# RwT
#DY
# train_var = [('V_mass','F'),('VHH_H1_m','F'), ('VHH_Vreco4j_HT','F'), ('VHH_HH_m','F'),\
#              ('VHH_HH_pT','F'), ('No3_btag_pt','F')\
#             ]

# TT/TTB
# train_var = [('V_mass','F'),('VHH_H1_m','F'), ('VHH_Vreco4j_HT','F'), ('VHH_HH_m','F'),\
#              ('VHH_HH_pT','F'), ('No3_btag_pt','F'), ('VHH_H1_pT','F'),('VHH_V_HH_pT_Ratio','F')\
#             ]



In [8]:
# Convert the dumped trees in `model`, using the variable order in `train_var`,
# and write the resulting XML to the path held in `TMVAmodel`.
convert_model(model,train_var,output_xml=TMVAmodel)