In [1]:
from result_records import DataFrameLoader

DENSE_SMALL = '/mnt/ssd-1/transformer-memorization/memorization_results_dense_small.csv'
DENSE_MEDIUM = '/mnt/ssd-1/transformer-memorization/memorization_results_dense_medium.csv'
DENSE_LARGE = '/mnt/ssd-1/transformer-memorization/memorization_results_dense_large.csv'

data_labels = [
    'dense_small (150M)',
    'dense_medium (350M)',
    'dense_large (750M)'
]

In [2]:
datasets = [DataFrameLoader(i) for i in [DENSE_SMALL,DENSE_MEDIUM,DENSE_LARGE]]

# Loading Data 

In [3]:
import tensorflow as tf
import numpy as np
from tqdm import tqdm

data = {}
for i,(ds,label) in enumerate(zip(datasets,data_labels)):
    
    loss = []
    indicies = []
    for i,(idx,res) in tqdm(enumerate(ds)):
        if(not (np.isnan(res) or np.isinf(res))):
            loss.append(res)
            indicies.append(idx)
    data[label] = (np.array(indicies),np.array(loss))
    
        


73244it [00:00, 181379.22it/s]
36618it [00:00, 190930.80it/s]
36608it [00:00, 190731.18it/s]


# Memorization Metric plots
> Plotting average values of memorization metric over a bucketed range of values

In [4]:
# Run these if you don't see UI for the plots and refresh the page
#!pip install --upgrade jupyter_core jupyter_client
#!jupyter nbextension enable --py widgetsnbextension

In [5]:
from IPython.display import display
import matplotlib.pyplot as plt
import ipywidgets as widgets
%matplotlib inline

import numpy as np

class Plotter:
    def __init__(self,title,xlabel,ylabel,data,size=25,default_slider_value=None):
        self.title = title
        self.xlabel = xlabel
        self.ylabel = ylabel
        self.default_slider_value = default_slider_value
        self.data = data
        
        self.size = 25
        self.params = {'legend.fontsize': 'large',
          'figure.figsize': (15,5),
          'axes.labelsize': size,
          'axes.titlesize': size,
          'xtick.labelsize': size*0.75,
          'ytick.labelsize': size*0.75,
          'axes.titlepad': 25,
          'font.family':'sans-serif',
          'font.weight':'bold',
         }
    
    def plot_data(self,data,num_buckets,label):
        
        if(len(data) == 2):
            x,y = data
        elif(len(data) == 1):
            y, = data
            x = [i for i in range(len(data))]
        else:
            raise ValueError('Only two axis are supported') 
            
        assert len(x) == len(y)
        bucket_size = len(x)//num_buckets
        
        buckets = []
        index = []
        err = []
        for i in range(0,len(x),bucket_size):
            buckets.append(y[i:i+bucket_size].mean())
            index.append(x[min(i+bucket_size-1,len(x)-1)])
            err.append(y[i:i+bucket_size].std()/np.sqrt(bucket_size))
        
        buckets = buckets[:num_buckets]
        index = index[:num_buckets]
        err = err[:num_buckets]
        plt.errorbar(index,buckets,yerr = err,label=label,capsize=5)
    
    def plot(self,num_buckets):
        
        for (key,value) in self.data.items():
            self.plot_data(value,num_buckets,key)
        
            plt.rcParams.update(self.params)
            plt.title(self.title)
            plt.xlabel(self.xlabel)
            plt.ylabel(self.ylabel)
            plt.legend()
            plt.show()
    
    def clicked(self,b):
        self.out.clear_output()
        scale = self.slider.value
        with self.out:
            self.plot(scale)
    def run(self):
        self.out = widgets.Output()
        button = widgets.Button(description="Plot Value")
        slider_max = 50
        
        if(self.default_slider_value is not None):
            default_slider_value = self.default_slider_value
        else:
            default_slider_value = np.random.choice([i for i in range(1,slider_max)])
        self.slider = widgets.IntSlider(min=1, max=slider_max, 
                                   value=default_slider_value,
                                   description="Bins",
                                   layout=widgets.Layout(width='50%'))

        box_layout = widgets.Layout(
            display='flex',
            flex_flow='column',
            align_items='center',
            width='80%'
        )

        box = widgets.VBox(
            [
                self.out,
                self.slider,
                button
            ],
            layout=box_layout
        )

        

        with self.out:    
            self.plot(default_slider_value)
        button.on_click(self.clicked)
        display(box)
    
plotter = Plotter(title="Memorization Metric",
                  xlabel='Index',ylabel='NLL Loss',
                  data=data)
plotter.run() #Refresh if you donot see UI

VBox(children=(Output(), IntSlider(value=46, description='Bins', layout=Layout(width='50%'), max=50, min=1), B…

# Correlation

array([3.20000e+01, 3.30000e+01, 3.40000e+01, ..., 2.92865e+05,
       2.92866e+05, 2.92867e+05])

In [9]:
from scipy import signal

correlation = signal.correlate(indicies, data, mode="full")
plotter = Plotter(xlabel='indicies',ylabel='correlation',
                  title='Correlation',x=indicies,y=correlation,default_slider_value=11)
plotter.run()

VBox(children=(Output(), IntSlider(value=11, description='Bins', layout=Layout(width='50%'), max=16, min=1), B…

# Statistics

In [10]:
import matplotlib.pyplot as plt

SAMPLE_VALUE = len(data)*25//100
from sklearn.metrics import r2_score
r2 = r2_score(indicies,data)
print(f"R2 Score between indicies and data: {r2:.5f}")
avg_start = data[:SAMPLE_VALUE].mean()
avg_end = data[SAMPLE_VALUE:].mean()

var_start = data[:SAMPLE_VALUE].var()
var_end = data[SAMPLE_VALUE:].var()
print(f"Average NLL Loss changed from {avg_start:.5f} to {avg_end:.5f}")
print(f"Varience of  NLL Loss changed from {var_start:.5f} to {var_end:.5f}")
print("Trend of very slight improvement continues")

R2 Score between indicies and data: -3.00108
Average NLL Loss changed from -8.98532 to -8.95018
Varience of  NLL Loss changed from 14.02080 to 13.66236
Trend of very slight improvement continues
