# Large Scale Optimal Classification Tree

This notebook demonstrates a **Python** implementation of the Optimal Classification Trees (OCTs) proposed by Dimitris Bertsimas and Jack Dunn, together with a new algorithm, Large-Scale OCTs, developed by Enhao Liu.


# Library Settings

In [5]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline
# 'seaborn-whitegrid' was renamed to 'seaborn-v0_8-whitegrid' in matplotlib 3.6
# and the old alias was removed in 3.8. Pick whichever name this installation
# provides, and fall back to the default style instead of failing the whole
# imports cell when neither is available.
_whitegrid_style = next(
    (style_name
     for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if style_name in plt.style.available),
    'default',
)
plt.style.use(_whitegrid_style)

from lsopt.tree import OptimalTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import tree
from sklearn import datasets
import graphviz

# Optimal Classification Tree

## Example: Iris Dataset

In [9]:
# The CSV location is given relative to the current working directory.
# (`__file__` is not defined inside a notebook, so the path is not derived
# from the file location.)
data_path = './tests/data/Iris.csv'

# Read the CSV and discard its "Id" column in one expression.
iris_data = pd.read_csv(data_path).drop(columns=["Id"])

# Reproducible 20-row random sample of the frame (fixed random_state).
iris_data_sub = iris_data.sample(n=20, random_state=1)

In [10]:
# Alternative kept for reference: build the arrays from the 20-row sample
# `iris_data_sub` instead of the full frame.
# iris_sub_X = iris_data_sub.iloc[:, 0:4].to_numpy()
# iris_sub_Y_label = iris_data_sub["Species"].to_numpy()

# Feature matrix: the first four columns of the full frame as a NumPy array.
iris_all_X = iris_data.iloc[:, :4].to_numpy()

# Target vector: the "Species" column as a NumPy array.
iris_all_Y_label = iris_data.loc[:, "Species"].to_numpy()

In [12]:
# Options passed through to the MIP solver via `solver_options`.
gurobi_options = {
    'mip_cuts': 'auto',
    'mip_gap_tol': 0.8,
    'mip_focus': 'balance',
}

# Configure the optimal classification tree.
# NOTE(review): the captured solver log reports "TimeLimit to 60.0" for
# time_limit=1, so the unit is presumably minutes -- confirm against the
# lsopt documentation.
opt_tree = OptimalTreeClassifier(
    max_depth=3,
    min_samples_leaf=1,
    alpha=0.01,
    criterion="gini",
    solver="gurobi",
    time_limit=1,
    verbose=True,
    solver_options=gurobi_options,
)


In [13]:
# Fit on X and y
# The tree is trained on the full feature matrix and label vector.
opt_tree.fit(X=iris_all_X, y=iris_all_Y_label)

# Make prediction
# Predictions are made on the same rows the tree was fitted on, so anything
# computed from them is an in-sample (training) result.
Y_pred = opt_tree.predict(X=iris_all_X)
# Presumably per-class probabilities, following the scikit-learn naming
# convention -- confirm against lsopt. Not used further in this cell.
Y_pred_prob = opt_tree.predict_proba(X=iris_all_X)

# Confusion matrix of true labels against the tree's predictions.
print("Confusion Matrix :")
conf_matrix = confusion_matrix(y_true=iris_all_Y_label, y_pred=Y_pred)
print(conf_matrix)

# Text report produced by scikit-learn's classification_report.
report = classification_report(y_true=iris_all_Y_label, y_pred=Y_pred)
print(report)

# Labels for the drawing: the first four column names of the frame are used as
# feature names, and the distinct label values as class names.
feature_names = iris_data.columns.values[:4]
class_names = np.unique(iris_all_Y_label)

# Options for scikit-learn's Graphviz exporter, gathered in one place.
export_options = dict(
    out_file=None,  # no output file: the DOT source is returned as a string
    feature_names=feature_names,
    class_names=class_names,
    label='all',
    impurity=True,
    node_ids=True,
    filled=True,
    rounded=True,
    leaves_parallel=True,
    special_characters=False,
)
dot_data = tree.export_graphviz(opt_tree, **export_options)

# Wrap the DOT source and request PNG output.
graph = graphviz.Source(dot_data, format='png')

# Writes tests/optimal_tree.png; the returned path is the cell's displayed value.
# NOTE(review): view=True also opens the rendered file with the system's
# default application, which may not work on a headless or remote kernel --
# confirm this is wanted here.
graph.render(filename='optimal_tree', directory='tests', view=True)

Using license file /Users/enhaoliu/gurobi.lic
Academic license - for non-commercial use only
Read LP format model from file /var/folders/5y/drctn5r14mzfjfjnld2ybpc80000gn/T/tmpmsocgrqm.pyomo.lp
Reading time = 0.03 seconds
x1315: 5067 rows, 1315 columns, 28910 nonzeros
Changed value of parameter TimeLimit to 60.0
   Prev: inf  Min: 0.0  Max: inf  Default: inf
Parameter Cuts unchanged
   Value: -1  Min: -1  Max: 3  Default: -1
Changed value of parameter MIPGap to 0.8
   Prev: 0.0001  Min: 0.0  Max: inf  Default: 0.0001
Parameter MIPFocus unchanged
   Value: 0  Min: 0  Max: 3  Default: 0
Gurobi Optimizer version 9.0.1 build v9.0.1rc0 (mac64)
Optimize a model with 5067 rows, 1315 columns and 28910 nonzeros
Model fingerprint: 0xa96d3d8d
Variable types: 48 continuous, 1267 integer (1267 binary)
Coefficient statistics:
  Matrix range     [2e-02, 2e+02]
  Objective range  [1e-02, 1e-02]
  Bounds range     [1e+00, 1e+00]
  RHS range        [1e+00, 2e+02]
Presolve removed 1 rows and 1 columns
Pr

'tests/optimal_tree.png'