In [None]:
# Add the relevant scripts from LArMachineLearningData
# NOTE: nothing in this notebook lowers the process priority; start the kernel under `nice` if it should run with lots of cores on low priority
import os

# Add path for LArMachineLearningData
import sys
pandoraMVADir = os.getcwd()
sys.path.append(os.path.join(pandoraMVADir, 'scripts'))

from PandoraBDT import *

# Import relevant SKLearn stuff
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import validation_curve
from sklearn.model_selection import learning_curve
from sklearn import metrics

# Set global params
# Fraction handed to Sample() when splitting the examples into train and test sets
# (presumably the share assigned to one of the two samples -- confirm against PandoraBDT.Sample)
testTrainFraction = 0.5
# Forwarded as n_jobs to learning_curve/validation_curve; -1 asks scikit-learn to use all cores
nCores = -1

In [None]:
def get_dune_pfo_characterisation_metadata(bdt_sub_type):
    """Describe the DUNE PFO characterisation BDT.

    Args:
        bdt_sub_type: 'Charge' or 'NoCharge' (case-insensitive). Any value other
            than 'charge' -- including None, which is what get_bdt_metadata
            passes when no sub type is given -- selects the feature list
            without the two charge variables.

    Returns:
        Tuple (name, features, params): the BDT name, the ordered list of
        feature names and a dictionary of plotting/training parameters.
    """
    name = "PFOCharBDT"

    features = ['Length',
                'Straight Line Diff Mean',
                'Max Fit Gap Length',
                'Sliding Linear Fit RMS',
                'Vertex Distance',
                'PCA Secondary-Primary EigenValue Ratio',
                'PCA Tertiary-Primary EigenValue Ratio',
                'Hierarchy N Daughters',
                'Hierarchy N Daughter Hits 3D',
                'Hierarchy Daughter Parent Hit Ratio',
                'Opening Angle Diff']

    # Tolerate a missing sub type instead of failing on None.lower()
    if (bdt_sub_type or "").lower() == "charge":
        features.extend(['Charge 1', 'Charge 2'])

    # Label names, colours and default training settings for this BDT
    params = {
        'labelNames': ['True Shower','True Track'],
        'signalDefs': [0, 1],
        'signalCols': ['r', 'b'],
        'nBins': 100,
        'PlotStep': 1.0,
        'OptimalBinCut': 50,
        'OptimalScoreCut': 0.5,
        'nTrees': 100,
        'TreeDepth': 3,
        'logY': False,
        # The vertex selection metadata spells this key 'figsize'; both spellings
        # are provided so consumers of either one keep working.
        'figSize': (8, 6),
        'figsize': (8, 6),
        'titlesize': 18,
        'labelsize': 14
    }
    return (name, features, params)

def get_dune_vertex_selection_metadata(bdt_sub_type, mode):
    """Describe the DUNE far detector vertex selection BDT.

    Args:
        bdt_sub_type: 'Vertex' or 'Region' (case-insensitive); 'Vertex' adds an
            rPhi feature for each candidate.
        mode: 'beam' or 'atmos' (case-insensitive); 'beam' adds a beam
            deweighting feature for each candidate.

    Returns:
        Tuple (name, features, params): the BDT name, the ordered list of
        feature names and a dictionary of plotting/training parameters.
    """
    name = f"DUNEFD_VertexSelection{bdt_sub_type.title()}"

    with_beam_deweighting = mode.lower() == "beam"
    with_rphi = bdt_sub_type.lower() == "vertex"

    def candidate_features(candidate):
        # Feature names belonging to one of the two compared vertex candidates
        block = [f'Beam Deweighting {candidate}'] if with_beam_deweighting else []
        block.extend([f'Energy Kick {candidate}',
                      f'Global Asymmetry {candidate}',
                      f'Local Asymmetry {candidate}',
                      f'Shower Asymmetry {candidate}',
                      f'dEdx Asymmetry {candidate}',
                      f'Vertex Energy {candidate}'])
        if with_rphi:
            block.append(f'rPhi {candidate}')
        return block

    # Event-level features come first ...
    features = ['Showeryness',
                'Energy',
                'Area',
                'Longitudinality',
                'N Hits',
                'N Clusters',
                'N Candidates']

    # ... followed by the per-candidate features ...
    for candidate in (1, 2):
        features.extend(candidate_features(candidate))

    # ... and finally the features shared by the candidate pair
    features.extend(['Separation', 'Axis Hits'])

    params = {
        'labelNames': ['Background','Vertex'],
        'signalDefs': [0, 1],
        'signalCols': ['r', 'b'],
        'nBins': 100,
        'PlotStep': 1.0,
        'OptimalBinCut': 50,
        'OptimalScoreCut': 0.5,
        'nTrees': 100,
        'TreeDepth': 1,
        'logY': False,
        'figsize': (8, 6),
        'titlesize': 18,
        'labelsize': 14
    }

    return (name, features, params)
    

def get_bdt_metadata(experiment, bdt_type, bdt_sub_type=None, mode=None):
    """Look up the metadata of a BDT.

    Args:
        experiment: experiment name (case-insensitive); only 'dune' is known.
        bdt_type: 'PfoCharacterisation' or 'VertexSelection' (case-insensitive).
        bdt_sub_type: forwarded to the BDT specific metadata function.
        mode: forwarded to the vertex selection metadata function.

    Returns:
        Tuple (name, features, params). The empty triple ("", [], {}) is
        returned for every unknown experiment / BDT type combination, so the
        result can always be unpacked into three values.
    """
    if experiment.lower() == "dune":
        requested = bdt_type.lower()
        if requested == "pfocharacterisation":
            return get_dune_pfo_characterisation_metadata(bdt_sub_type)
        if requested == "vertexselection":
            return get_dune_vertex_selection_metadata(bdt_sub_type, mode)

    # Unknown experiment, or a known experiment with an unknown BDT type
    # (the latter used to fall through and return None)
    return ("", [], {})

In [None]:
# Analysis specific settings
# NOTE: despite its name, bdt_type is handed to get_bdt_metadata as the 'mode' argument
bdt_type = "atmos"
bdt_sub_type = "Region"
trainingFile = os.path.join(os.getcwd(), f'training_files/VertexSelection{bdt_sub_type}.txt')

BDTName, featureNames, params = get_bdt_metadata("DUNE", "VertexSelection", bdt_sub_type, bdt_type)

# Reference BDT, built from the default tree depth and tree count in the metadata
baseBDT = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=params['TreeDepth']),
    algorithm='SAMME',
    random_state=42,
    n_estimators=params['nTrees'],
)

# Cross-validation splitter for the curve scans: 5 stratified shuffles with 20% held out,
# seeded for reproducibility
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

In [None]:
# Read the training file, separate features from labels and randomise the examples
data, nFeatures, nExamples = LoadData(trainingFile, ',')
featuresOrg, labelsOrg = SplitTrainingSet(data, nFeatures)
features, labels = Randomize(featuresOrg, labelsOrg, True)

# Train / test samples
xTrain, yTrain, xTest, yTest = Sample(features, labels, testTrainFraction)

# Examples whose true label is 1 (signal) and 0 (background)
signalFeatures, backgroundFeatures = features[labels==1], features[labels==0]

# Print the sizes so the feature array can be compared by eye with the feature name list
print(len(featureNames))
print(np.shape(features))
print(''.join(['Total: ', str(len(features)),
               ', signal: ', str(len(signalFeatures)),
               ' and background: ', str(len(backgroundFeatures))]))

In [None]:
# Build the Pandas dataframe: one column per feature plus a 'Labels' column
# Collect the columns in a dictionary first
allDict = dict((featureNames[i], features[:, i]) for i in range(nFeatures))
allDict['Labels'] = labels

# A single dataframe holds signal and background; they are told apart via 'Labels'
df = pd.DataFrame(data=allDict)

In [None]:
# Make plots drawing the variables for signal/background
# DrawVariablesDF is not defined in this notebook (presumably it arrives via the
# PandoraBDT star import); it is given the full dataframe, including the 'Labels'
# column, together with the plotting parameters from the BDT metadata.
DrawVariablesDF(df, params)

In [None]:
# Make correlation matrices, one per class
# signalDefs[0] / labelNames[0] describe the background class and signalDefs[1] /
# labelNames[1] the signal class, so name the dataframes accordingly (they used to be swapped).
dfBck = df[df['Labels']==params['signalDefs'][0]].drop('Labels', axis=1)
dfSig = df[df['Labels']==params['signalDefs'][1]].drop('Labels', axis=1)

CorrelationDF(dfBck, params['labelNames'][0] + ' Correlation Matrix', params)
CorrelationDF(dfSig, params['labelNames'][1] + ' Correlation Matrix', params)

In [None]:
# Compare two variables against each other, split by label
xMetric = 'Showeryness'
yMetric = 'Energy Kick 1'

# Restrict both axes to the central 2%-98% quantile range to suppress outliers
xRange = (np.quantile(df[xMetric], 0.02), np.quantile(df[xMetric], 0.98))
yRange = (np.quantile(df[yMetric], 0.02), np.quantile(df[yMetric], 0.98))

sns.set(font_scale = 1)
sns.jointplot(data=df, x=xMetric, y=yMetric, hue='Labels', xlim=xRange, ylim=yRange)

In [None]:
# For plotting all combos, not very useful when we have too many variables
#sns.pairplot(df, hue='Labels')

# Optionally drop features from the training set

In [None]:
# Names of the features to remove from the training set
dropFeatureNames = ['Energy', 'N Hits', 'N Candidates']

# Always start from the full feature list of the metadata lookup (same arguments as in the
# configuration cell). Deriving the indices from the already reduced featureNames made this
# cell fail when re-run: np.delete returns an ndarray, which has no .index(), and the
# indices would no longer refer to the columns of the training file.
allFeatureNames = list(get_bdt_metadata("DUNE", "VertexSelection", bdt_sub_type, bdt_type)[1])
drop_indices = [ allFeatureNames.index(val) for val in dropFeatureNames ]
featureNames = np.delete(allFeatureNames, drop_indices, 0)

In [None]:
# Read the training file again, this time leaving out the dropped feature columns
data, nFeatures, nExamples = LoadData(trainingFile, ',')
featuresOrg, labelsOrg = SplitTrainingSet(data, nFeatures, drop_indices)
# The feature count now follows from the reduced feature array
nFeatures = featuresOrg.shape[1]
features, labels = Randomize(featuresOrg, labelsOrg, True)

# Train / test samples
xTrain, yTrain, xTest, yTest = Sample(features, labels, testTrainFraction)

# Examples whose true label is 1 (signal) and 0 (background)
signalFeatures, backgroundFeatures = features[labels==1], features[labels==0]

# Print the sizes so the feature array can be compared by eye with the feature name list
print(len(featureNames))
print(np.shape(features))
print(''.join(['Total: ', str(len(features)),
               ', signal: ', str(len(signalFeatures)),
               ' and background: ', str(len(backgroundFeatures))]))

# Grid search BDT hyperparameters

In [None]:
# Train one BDT per (tree depth, number of trees) combination and plot its score distributions
depthArray = [1, 2, 3]
estimatorsArray = [50, 100, 200, 400]
bdtArray = np.empty((len(depthArray), len(estimatorsArray)), dtype='object')

for i, depth in enumerate(depthArray):
    # The same tree template is reused for every tree count at this depth
    baseTree = DecisionTreeClassifier(max_depth=depth)
    for j, estimators in enumerate(estimatorsArray):
        bdt = AdaBoostClassifier(baseTree, algorithm='SAMME', random_state=42,
                                 n_estimators=estimators)
        bdtArray[i, j] = bdt
        bdt.fit(xTrain, yTrain)
        PlotBdtKSScores(bdt, xTest, yTest, xTrain, yTrain, 'Vertex Region', params)

# Metrics

In [None]:
# Pick two BDTs from the grid by (depth index, tree count index):
# the first grid entry as reference and the entry at [1, 3] as the preferred one
baseBDT = bdtArray[0, 0]
bestBDT = bdtArray[1, 3]

In [None]:
# Plot ROC curves of the preferred ("Best") and reference ("Base") BDT on the test sample
fig, ax = plt.subplots(figsize=(8,6))

# metrics.plot_roc_curve was removed in scikit-learn 1.2; use RocCurveDisplay.from_estimator
# where it exists (scikit-learn >= 1.0) and fall back to the old helper otherwise.
plotRoc = getattr(metrics.RocCurveDisplay, 'from_estimator', None) or metrics.plot_roc_curve
plotRoc(bestBDT, xTest, yTest, ax=ax, name="Best")
plotRoc(baseBDT, xTest, yTest, ax=ax, name="Base")

ax.set_title("ROC Curves")
ax.invert_xaxis()
ax.legend()
ax.grid()
fig.savefig('ROC.pdf')

In [None]:
# Plot the confusion matrix of the preferred BDT, normalised over the true labels
fig, ax = plt.subplots(figsize=(8,6))

# metrics.plot_confusion_matrix was removed in scikit-learn 1.2; use
# ConfusionMatrixDisplay.from_estimator where it exists (scikit-learn >= 1.0)
# and fall back to the old helper otherwise.
plotConfusion = (getattr(metrics.ConfusionMatrixDisplay, 'from_estimator', None)
                 or metrics.plot_confusion_matrix)
plotConfusion(bestBDT, xTest, yTest, display_labels=params['labelNames'],
              ax=ax, normalize='true')
ax.invert_xaxis()
ax.set_title("Confusion matrix (True Normalised)")

# Save before showing: once plt.show() has run, plt.savefig() writes an empty figure
fig.savefig('Confusion.pdf')
plt.show()

In [None]:
# Detailed performance numbers on the test sample for the reference and the preferred BDT
bdtPredicted = baseBDT.predict(xTest)
gridPredicted = bestBDT.predict(xTest)

print("Background (0): ", params['labelNames'][0])
print("Signal (1): ", params['labelNames'][1])
for heading, predicted in (("BDT:\n", bdtPredicted), ("Grid:\n", gridPredicted)):
    print(heading, metrics.classification_report(yTest, predicted))

In [None]:
# The learning/validation curve scans below take baseBDT as their estimator,
# so rebind it to the BDT picked from the grid
baseBDT = bestBDT

# Further hyperparameter optimisation

In [None]:
# Scan the performance as a function of the training sample size
train_sizes_array = np.linspace(0.5, 1, 6)

# The first entry (0.5) is skipped, so the scan covers the fractions 0.6 ... 1.0
train_sizes, train_scores, test_scores = learning_curve(
    baseBDT, features, labels,
    train_sizes=train_sizes_array[1:],
    cv=cv, n_jobs=nCores, verbose=9)

# Mean and spread over the cross-validation splits
mean_train_scores, std_train_scores = np.mean(train_scores, axis=1), np.std(train_scores, axis=1)
mean_test_scores, std_test_scores = np.mean(test_scores, axis=1), np.std(test_scores, axis=1)

In [None]:
# Plot the score against the number of training examples
fig, ax = plt.subplots(figsize=(8,6))
ax.set_title("Training Progression")
ax.set_xlabel("Number of Training Examples")
ax.set_ylabel("Score")

# Mean score as a line with a +/- one standard deviation band, train in blue and test in red
for curveLabel, curveColour, curveMean, curveStd in (
        ('Train Score', 'b', mean_train_scores, std_train_scores),
        ('Test Score', 'r', mean_test_scores, std_test_scores)):
    ax.plot(train_sizes, curveMean, label=curveLabel, color=curveColour)
    ax.fill_between(train_sizes, curveMean - curveStd, curveMean + curveStd,
                    alpha=0.1, color=curveColour)
#ax.plot(train_sizes, std_test_scores, label='Test Score Std.', color='k')

ax.grid()
ax.legend()
fig.savefig('TrainingSize.pdf')

In [None]:
# Scan the cost-complexity pruning parameter (ccp_alpha) of the boosted trees
cppalplhaArray = np.linspace(0, 0.001, 6)

# AdaBoost exposes its tree as 'estimator' from scikit-learn 1.2 onwards and as
# 'base_estimator' before that; pick whichever this installation provides so the
# nested parameter name resolves in both cases.
treeParam = 'estimator' if 'estimator' in baseBDT.get_params(deep=False) else 'base_estimator'

train_scores, test_scores = validation_curve(
    baseBDT, features, labels, param_name=treeParam + '__ccp_alpha',
    param_range=cppalplhaArray, n_jobs=nCores, verbose=9, cv=cv)

# Mean and spread over the cross-validation splits
mean_train_scores = np.mean(train_scores, axis=1)
mean_test_scores = np.mean(test_scores, axis=1)

std_train_scores = np.std(train_scores, axis=1)
std_test_scores = np.std(test_scores, axis=1)

print ("Means: "+str(mean_test_scores)+" and std. "
       +str(std_test_scores))

In [None]:
# Plot the ccp_alpha scan
# Create a dedicated figure, as the other plotting cells do, so the curves are never
# drawn onto whichever figure happens to be current; also label the axes.
fig, ax = plt.subplots(figsize=(8,6))
ax.set_xlabel("ccp_alpha")
ax.set_ylabel("Score")

ax.plot(cppalplhaArray, mean_train_scores, label='Train Score', color='b')
ax.fill_between(cppalplhaArray, mean_train_scores - std_train_scores,
                mean_train_scores + std_train_scores, alpha=0.1,
                color="b")
ax.plot(cppalplhaArray, mean_test_scores, label='Test Score', color='r')
ax.fill_between(cppalplhaArray, mean_test_scores - std_test_scores,
                mean_test_scores + std_test_scores, alpha=0.1,
                color="r")
ax.grid()
#ax.set_xscale('log')
ax.legend()

In [None]:
# Scan the AdaBoost learning rate
learningRateArray = np.linspace(0.1,1.5, 8)

train_scores, test_scores = validation_curve(
    baseBDT, features, labels,
    param_name='learning_rate', param_range=learningRateArray,
    cv=cv, n_jobs=nCores, verbose=9)

# Mean and spread over the cross-validation splits
mean_train_scores, std_train_scores = np.mean(train_scores, axis=1), np.std(train_scores, axis=1)
mean_test_scores, std_test_scores = np.mean(test_scores, axis=1), np.std(test_scores, axis=1)

print("Means: "+str(mean_test_scores)+" and std. "+str(std_test_scores))

In [None]:
# Plot the learning rate scan
fig, ax = plt.subplots(figsize=(8,6))

# Mean score as a line with a +/- one standard deviation band, train in blue and test in red
for curveLabel, curveColour, curveMean, curveStd in (
        ('Train Score', 'b', mean_train_scores, std_train_scores),
        ('Test Score', 'r', mean_test_scores, std_test_scores)):
    ax.plot(learningRateArray, curveMean, label=curveLabel, color=curveColour)
    ax.fill_between(learningRateArray, curveMean - curveStd, curveMean + curveStd,
                    alpha=0.1, color=curveColour)

ax.grid()
#ax.set_xscale('log')
ax.legend()
fig.savefig('LearningRate.pdf')

In [None]:
# Train the final BDT: same tree depth and tree count as the preferred grid BDT,
# with the learning rate set to 1.3
finalDepth = bestBDT.estimators_[0].max_depth
baseTree = DecisionTreeClassifier(max_depth=finalDepth)
baseBDT = AdaBoostClassifier(
    baseTree,
    algorithm='SAMME',
    random_state=42,
    n_estimators=bestBDT.n_estimators,
    learning_rate=1.3,
)
baseBDT.fit(xTrain, yTrain)

In [None]:
# Rank the features by the importance assigned by the final BDT, print and plot them
importanceDF = pd.DataFrame({'Features': featureNames, 'Importance Score':baseBDT.feature_importances_})
rankedImportanceDF = importanceDF.sort_values(by=['Importance Score'])
print(rankedImportanceDF)
ax = rankedImportanceDF.plot(kind='barh', x='Features', y='Importance Score')

In [None]:
# Print all tunable params
# The names listed here (including nested '<component>__<param>' ones) are the values
# accepted as param_name by validation_curve in the scans above.
baseBDT.get_params().keys()

In [None]:
# Re-import PandoraBDT so that edits made to the script since the kernel started are
# picked up. The order matters: the module is reloaded first and the star import is
# repeated afterwards so the notebook-level names point at the reloaded definitions.
import PandoraBDT
from importlib import reload

reload (PandoraBDT)
from PandoraBDT import *

# Print the shapes of the test and train samples handed to the plot below
print (np.shape(xTest))
print (np.shape(yTest))
print (np.shape(xTrain))
print (np.shape(yTrain))

# Score distributions of the final BDT for the test and train samples
PlotBdtKSScores(baseBDT, xTest, yTest, xTrain, yTrain, 'Vertex Region', params)

In [None]:
# Plot ROC curves of the final BDT ("Best") and the first grid BDT ("Base") on the test sample
fig, ax = plt.subplots(figsize=(8,6))

# metrics.plot_roc_curve was removed in scikit-learn 1.2; use RocCurveDisplay.from_estimator
# where it exists (scikit-learn >= 1.0) and fall back to the old helper otherwise.
plotRoc = getattr(metrics.RocCurveDisplay, 'from_estimator', None) or metrics.plot_roc_curve
plotRoc(baseBDT, xTest, yTest, ax=ax, name="Best")
plotRoc(bdtArray[0][0], xTest, yTest, ax=ax, name="Base")

ax.set_title("ROC Curves")
ax.invert_xaxis()
ax.legend()
ax.grid()
fig.savefig('ROCFinal.pdf')

In [None]:
# Plot the confusion matrix of the final BDT, normalised over the true labels
fig, ax = plt.subplots(figsize=(8,6))

# metrics.plot_confusion_matrix was removed in scikit-learn 1.2; use
# ConfusionMatrixDisplay.from_estimator where it exists (scikit-learn >= 1.0)
# and fall back to the old helper otherwise.
plotConfusion = (getattr(metrics.ConfusionMatrixDisplay, 'from_estimator', None)
                 or metrics.plot_confusion_matrix)
plotConfusion(baseBDT, xTest, yTest, display_labels=params['labelNames'],
              ax=ax, normalize='true')
ax.invert_xaxis()
ax.set_title("Confusion matrix (True Normalised)")

# Save before showing: once plt.show() has run, plt.savefig() writes an empty figure
fig.savefig('ConfusionFinal.pdf')
plt.show()

In [None]:
# Export the final BDT as XML and as a pickle, both named after the BDT
xmlFileName, pklFileName = BDTName+".xml", BDTName+".pkl"
WriteXmlFile(xmlFileName, baseBDT, BDTName)
SerializeToPkl(pklFileName, baseBDT)