In [2]:
# Imports
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn import *
import dill as pkl
import pandas as pd
from sklearn.metrics import mean_squared_error as loss_fn
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import pdl
from xgboost.sklearn import XGBRegressor
from xgboost.sklearn import XGBClassifier
import copy
from scipy.spatial import KDTree
from sklearn.metrics import zero_one_loss
import os
import csv
import matplotlib.pyplot as plt
import importlib
import tqdm
import warnings
import time
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
from PIL import Image, ImageDraw, ImageFont
import folktables
warnings.filterwarnings("ignore")

In [5]:
# Feature tables for the three splits, read with pandas' default CSV settings.
x_train, x_val, x_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/training_data.csv',
        'data/validation_data.csv',
        'data/test_data.csv',
    )
)

# Matching label files, parsed as comma-delimited floats.
y_train, y_val, y_test = (
    np.genfromtxt(csv_path, delimiter=',', dtype=float)
    for csv_path in (
        'data/training_labels.csv',
        'data/validation_labels.csv',
        'data/test_labels.csv',
    )
)

FileNotFoundError: [Errno 2] No such file or directory: 'data/training_data.csv'

In [None]:
# Local Model Base Regressor
# A depth-1 regression tree is fitted on the training split and handed to the
# pointer decision list as its starting model.

from sklearn.tree import DecisionTreeRegressor

# fit() returns the fitted estimator itself, so construction and fitting can be chained.
clf = DecisionTreeRegressor(random_state=42, max_depth=1).fit(x_train, y_train)

team_pdl = pdl.PointerDecisionList(
    clf,
    x_train,
    y_train,
    x_val,
    y_val,
    min_group_size=1,
    alpha=100000,
)

In [None]:
# Global Model Base Regressor
# A gradient-boosted ensemble (500 trees of depth 4) is fitted on the training split
# and wrapped in a pointer decision list with the same settings as the local model.

from sklearn.ensemble import GradientBoostingRegressor

global_clf = GradientBoostingRegressor(
    n_estimators=500,
    max_depth=4,
    random_state=42,
)
global_clf.fit(x_train, y_train)

pdl_settings = {"alpha": 100000, "min_group_size": 1}
global_pdl = pdl.PointerDecisionList(global_clf, x_train, y_train, x_val, y_val, **pdl_settings)

### Figures 2, 6

In [None]:
# Set to the folder that holds the saved team models: of the form "alpha_xyz" when the
# models were produced by comp_alpha_rerun.py, or "teams" when downloaded from the dataset.
# While this is empty the plotting part of this cell is skipped.
pdl_folder = ""

# Team models are read from f"{pdl_folder}/teams/<id>/PDL" for ids 0..N_TEAMS-1; the
# global model is stored under the next key of errors_dict.
N_TEAMS = 46
GLOBAL_ID = N_TEAMS
# How many models (lowest final test error first) appear in the "(Top 10)" figures.
# NOTE(review): kept at 11 to match the original slice [0:11]; presumably ten teams plus
# the global model -- confirm, since the figure titles say "Top 10".
N_TOP_MODELS = 11
PLOT_DIR = "paper_plots"
# (title word, file-name tag) for positions 0, 1, 2 of every errors_dict entry.
SPLIT_LABELS = (("Training", "train"), ("Validation", "val"), ("Test", "test"))


def track_all_splits(model, splits):
    """Call ``model.track`` on every ``(features, labels)`` pair in ``splits``.

    Returns a list holding one ``track`` result per pair, in the order given.
    """
    return [model.track(features, labels) for features, labels in splits]


def best_model_ids(final_errors, n_best):
    """Return the indices of the ``n_best`` smallest entries of ``final_errors``.

    The indices come back ordered from smallest to largest error (``np.argsort`` order).
    """
    return np.argsort(final_errors)[:n_best]


def plot_error_curves(errors_by_model, model_ids, split_index, title, path,
                      palette, legend_anchor, global_id, tight=False):
    """Draw one loss-versus-update-round line per model and save the figure as a PDF.

    Parameters
    ----------
    errors_by_model : mapping of model id -> [train, val, test] error histories.
    model_ids : iterable of the ids to draw.
    split_index : 0, 1 or 2, selecting which history of each entry is plotted.
    title : axes title.
    path : output file passed to ``savefig``.
    palette : sequence of colours indexed by model id (not used for ``global_id``).
    legend_anchor : ``bbox_to_anchor`` value for the legend.
    global_id : id that is drawn in black and labelled "Global Model".
    tight : when true, save with ``bbox_inches='tight'``.

    Returns
    -------
    (fig, ax) of the created figure.
    """
    fig, ax = plt.subplots(figsize=(10, 10))
    for model_id in model_ids:
        # Square root of the tracked errors (presumably MSE -> RMSE; confirm against pdl).
        curve = np.sqrt(errors_by_model[model_id][split_index])
        is_global = model_id == global_id
        # Each curve is plotted against its own number of rounds. The previous code
        # padded every split with the *training* history length, which mismatches x and
        # y as soon as a validation/test history has a different length.
        sns.lineplot(
            x=np.arange(len(curve)),
            y=curve,
            ax=ax,
            color="black" if is_global else palette[model_id],
            label="Global Model" if is_global else f"Team {model_id}",
        )
    ax.legend(bbox_to_anchor=legend_anchor, fontsize=8)
    ax.set_ylabel("Loss")
    ax.set_xlabel("Update Round")
    ax.set_title(title)
    fig.savefig(path, format="PDF", bbox_inches="tight" if tight else None)
    return fig, ax


if pdl_folder:
    splits = [(x_train, y_train), (x_val, y_val), (x_test, y_test)]

    # errors_dict[id] == [train_errors, val_errors, test_errors] as returned by track().
    errors_dict = {}
    for i in range(N_TEAMS):
        team_pdl = pdl.load_model(f"{pdl_folder}/teams/{i}/PDL", x_train, y_train, x_val, y_val)
        errors_dict[i] = track_all_splits(team_pdl, splits)

    global_pdl = pdl.load_model(f"{pdl_folder}/global_pdl/PDL", x_train, y_train, x_val, y_val)
    errors_dict[GLOBAL_ID] = track_all_splits(global_pdl, splits)

    cp = sns.color_palette(n_colors=N_TEAMS + 1)
    # savefig does not create missing directories.
    os.makedirs(PLOT_DIR, exist_ok=True)

    # Figures with every team plus the global model.
    all_model_ids = range(N_TEAMS + 1)
    for split_index, (split_name, file_tag) in enumerate(SPLIT_LABELS):
        plot_error_curves(
            errors_dict,
            all_model_ids,
            split_index,
            title=f"Team {split_name} Loss over Course of Updates",
            path=f"{PLOT_DIR}/team_{file_tag}_errors_time.pdf",
            palette=cp,
            legend_anchor=(1.02, 1.01),
            global_id=GLOBAL_ID,
            tight=True,
        )

    # Rank all models by the last entry of their test history and keep the best ones.
    final_test_errors = [errors_dict[i][2][-1] for i in all_model_ids]
    top_ten = best_model_ids(final_test_errors, N_TOP_MODELS)

    for split_index, (split_name, file_tag) in enumerate(SPLIT_LABELS):
        plot_error_curves(
            errors_dict,
            top_ten,
            split_index,
            title=f"Team {split_name} Loss over Course of Updates (Top 10)",
            path=f"{PLOT_DIR}/team_{file_tag}_errors_time_top_ten.pdf",
            palette=cp,
            legend_anchor=(1, 1),
            global_id=GLOBAL_ID,
        )

### Figure 4

In [4]:
def count_column_mentions(group_columns, columns):
    """Count, for every name in ``columns``, how many entries of ``group_columns`` contain it.

    ``group_columns`` is any iterable of values (here the "GrCo" column of the log);
    entries that are not strings (e.g. NaN from empty CSV cells) are skipped.
    Matching is a plain substring test, exactly as before.
    NOTE(review): a column whose name is a substring of another column's name is
    therefore counted for both -- confirm this is acceptable for the feature names used.

    Returns a dict mapping each column name to its integer count.
    """
    texts = [value for value in group_columns if isinstance(value, str)]
    return {column: sum(column in text for text in texts) for column in columns}


log_df = pd.read_csv("FINAL_PAPER.csv", index_col="Unnamed: 0")

# Requires the data-loading cell to have been run first (x_train supplies the feature names).
columns = x_train.columns

# teams_column_usage[team] maps every feature name to the number of that team's log rows
# whose "GrCo" text mentions it, plus a "count" entry used as the denominator below.
teams_column_usage = {}
for team in range(46):
    data = log_df[log_df["TID"] == float(team)]
    teams_column_usage[team] = count_column_mentions(data["GrCo"], columns)
    teams_column_usage[team]["count"] = (data["GrCo"] != "automatic").sum()

# "Global" aggregates over every row of the log. The values are iterated directly: the
# previous label-based log_df["GrCo"][index] lookup inside a bare `except` silently
# dropped or mis-read rows whenever the CSV index was not exactly 0..n-1.
teams_column_usage["Global"] = count_column_mentions(log_df["GrCo"], columns)
teams_column_usage["Global"]["count"] = (log_df["GrCo"] != "automatic").sum()

# One row per team (then "Global"); each value is mentions / non-"automatic" row count.
x_labels = list(range(46)) + ["Global"]
teams = [
    [teams_column_usage[team][column] / teams_column_usage[team]["count"] for column in columns]
    for team in x_labels
]

fig = plt.figure(figsize = (10,10))
sns.heatmap(pd.DataFrame(np.array(teams), columns=columns).T,xticklabels=x_labels, yticklabels=columns, cmap=sns.color_palette("light:b", as_cmap=True))
# savefig does not create missing directories.
os.makedirs("paper_plots", exist_ok=True)
plt.savefig("paper_plots/feature_usage_heatmap.pdf", format="PDF")

NameError: name 'x_train' is not defined

### Figure 7

In [None]:
# Read each team's saved test predictions from the old_teams run and stack them into
# a single array, one entry per team id.
# NOTE(review): only ids 0..44 are read here, while other cells loop over 46 teams -- confirm.
team_predictions_array = np.array([
    np.genfromtxt(f'old_teams/teams/{i}/test_predictions.csv', delimiter=',', dtype=float)
    for i in range(45)
])

In [None]:
if pdl_folder:
    # Reload the team test predictions from the selected run folder, one entry per team id.
    # NOTE(review): only ids 0..44 are read, while other cells loop over 46 teams -- confirm.
    team_predictions_array = np.array([
        np.genfromtxt(f'{pdl_folder}/teams/{i}/test_predictions.csv', delimiter=',', dtype=float)
        for i in range(45)
    ])

    global_predictions = np.genfromtxt(
        f'{pdl_folder}/global_pdl/test_predictions.csv',
        delimiter=',',
        dtype=float,
    )

    # NOTE(review): alpha is not read anywhere else in this cell -- confirm it is still needed.
    alpha = -100000

    fig = plt.figure(figsize=(10, 10))
    plt.yscale('log')

    # Every 50th position, stopping before the final element (same slice as before).
    # NOTE(review): the -1 stop can drop the largest difference -- confirm that is intended.
    every_50th = slice(0, -1, 50)

    for team in [33, 21, 3, 31, 39]:
        # Absolute gap between the global and the team prediction for each test instance;
        # gaps below 1e-4 are set to exactly zero.
        delta = np.abs(global_predictions - team_predictions_array[team])
        delta[delta < .0001] = 0

        # Ascending gaps, plotted against their rank.
        y_ordered = np.sort(delta)
        x = np.arange(len(delta))

        ax = sns.scatterplot(
            x=x[every_50th],
            y=y_ordered[every_50th],
            edgecolors=None,
            linewidth=0,
            s=10,
            label=f"Global - Team {team}",
        )

    ax.set_ylabel("Prediction Difference (Log Scale)")
    ax.set_xlabel("Instances")
    ax.set_title("Test Prediction Differences of Global Model with Top 5 Teams", fontsize=15)
    plt.savefig("tmp.pdf", format="pdf")