# Generate ppt from the evaluated data

# Functions for dark mode

In [18]:
# Set the tqdm text color to white.

from IPython.display import HTML, display

def set_css_in_cell_output():
    """Display a <style> element that recolours widget text to light grey.

    The markup is pushed to the current cell's output through IPython's
    ``display``; the function returns nothing.
    """
    widget_css = '''
        <style>
            .jupyter-widgets {color: #d5d5d5 !important;}
            .widget-label {color: #d5d5d5 !important;}
        </style>
    '''
    display(HTML(widget_css))


# Hook the stylesheet into IPython's 'pre_run_cell' event so that it is
# emitted again for each cell that gets executed.
get_ipython().events.register('pre_run_cell', set_css_in_cell_output)

import warnings
# Ignore every warning category for the rest of the session.
# NOTE(review): this also hides warnings raised by pandas / python-pptx in the
# cells below, so real problems in the data handling may go unnoticed.
warnings.filterwarnings('ignore')

In [19]:
# Checkpoint identifiers that have plots/statistics, keyed by model name.
# Every model has seven evenly spaced checkpoints, so each entry is written
# as (model name, first checkpoint, spacing between checkpoints).
# Insertion order matters: it determines the order of the generated slides.
checkpoints = {
    model_name: list(range(first_step, first_step + 7 * spacing, spacing))
    for model_name, first_step, spacing in (
        ('13B', 23000, 20000),
        ('13B_deduped', 23000, 20000),
        ('6.7B', 23000, 20000),
        ('6.7B_deduped', 23000, 20000),
        ('2.7B', 23000, 20000),
        ('1.3B', 11500, 10000),
        ('1.3B_deduped', 11500, 10000),
        ('800M', 23000, 20000),
        ('800M_deduped', 23000, 20000),
        ('350M', 11500, 10000),
        ('350M_deduped', 11500, 10000),
        ('125M', 11500, 10000),
        ('125M_deduped', 11500, 10000),
    )
}

# Title Slide

In [20]:
from pptx import Presentation

# Create the deck that every later cell appends slides to.
prs = Presentation()
# Layout index 5 is kept in `title_layout` because the section and plot slides
# created further down reuse it.
# NOTE(review): in python-pptx's default template index 5 is "Title Only" — confirm.
title_layout = prs.slide_layouts[5]
slide = prs.slides.add_slide(title_layout)
title = slide.shapes.title

# Opening slide: the headline claim of the presentation.
title.text = "Training Order does not Influence Memorization!"

# Motivating Idea

In [21]:
from pptx.util import Inches, Pt

# "Motivating Idea" slide: a title plus a body placeholder filled with two
# paragraphs of 25 pt text.
title_content_layout = prs.slide_layouts[1]
slide = prs.slides.add_slide(title_content_layout)
slide.shapes.title.text = "Motivating Idea"
frame = slide.shapes.placeholders[1].text_frame

motivation_paragraphs = (
    (
        "Someone in EleutherAI mentioned that their mental model was that "
        "as new data was encountered by a transformer it would glom that data onto an internal "
        "representation and then processes the whole to build a more efficient "
        "representation. Datapoints that are memorized are ones that don’t fit nicely into "
        "the final learned representation."
    ),
    (
        "It occurred to Stella that this mental model predicts that memorized datapoints are "
        "more likely to be encountered later in training, as they’ve had less time to be "
        "integrated into the representation."
    ),
)

for position, paragraph_text in enumerate(motivation_paragraphs):
    # The first entry goes into the paragraph the text frame already has;
    # later entries get a freshly appended paragraph.
    p = frame.add_paragraph() if position else frame.paragraphs[0]
    p.font.size = Pt(25)
    p.text = paragraph_text


# Methodology

In [22]:
# "Methodology" slide: a title plus three body paragraphs of 25 pt text that
# define a sequence, the context/continuation split, and memorization accuracy.
slide = prs.slides.add_slide(title_content_layout)
slide.shapes.title.text = "Methodology"
frame = slide.shapes.placeholders[1].text_frame

methodology_paragraphs = (
    (
        "Let us define a Sequence as a continuous list of 2048 tokens. "
        "The training data can then be described as a set of sequences."
    ),
    (
        "We take first 64 tokens from every sequence. Let's define first 32 of them as Context tokens "
        "and the following 32 as true continuation. "
        "We prompt the model with context tokens and greedily generate another 32 tokens. "
        "Let's call them as predicted tokens."
    ),
    (
        # Possessive "its" (the original slide text read "it's").
        "Memorization Accuracy is defined as the accuracy between predicted tokens "
        "and its true continuation. "
    ),
)

for position, paragraph_text in enumerate(methodology_paragraphs):
    # The first entry goes into the paragraph the text frame already has;
    # later entries get a freshly appended paragraph.
    p = frame.add_paragraph() if position else frame.paragraphs[0]
    p.font.size = Pt(25)
    p.text = paragraph_text

# Linear Regression

In [23]:
def add_title_slide(title):
    """Append a slide built from ``title_layout`` whose heading is ``title``.

    Args:
        title: Text written into the new slide's title shape.
    """
    prs.slides.add_slide(title_layout).shapes.title.text = title


# Section divider ahead of the per-model regression slides.
add_title_slide("Linear Regression Plots")

In [24]:
def add_reg_to_slide(model, percentile):
    """Append a slide showing the saved linear-regression plot for one model.

    Args:
        model: Model name; used in the slide title and the plot file name.
        percentile: Percentile value used in the plot file name. Any value
            other than 0 is also appended to the title.
    """
    reg_slide = prs.slides.add_slide(title_layout)

    suffix = f" @ top {percentile} most memorized data" if percentile != 0 else ""
    reg_slide.shapes.title.text = f"{model} Line of best fit" + suffix

    # Picture is placed at (0", 1.7") and sized 10" x 5.5".
    reg_slide.shapes.add_picture(
        f'./plots/{model}-{percentile}_linear_regression.png',
        Inches(0), Inches(1.7), Inches(10), Inches(5.5),
    )



In [25]:
import pandas as pd


def add_reg_stats_to_slide(model, percentile):
    """Append a table slide with the linear-regression statistics of one model.

    Reads ``./results/linear_regression.csv``, keeps the rows whose ``model``
    and ``most memorized percentile`` columns match the arguments, and writes
    one table row per matching CSV row below a header row. If no row matches,
    the slide contains only the header row.

    Args:
        model: Model name to select; also used in the slide title and as the
            prefix of each row label (``<model>-<checkpoint>``).
        percentile: Percentile value to select. Any value other than 0 is
            also appended to the slide title.
    """
    # Get the data
    all_model_stats = pd.read_csv('./results/linear_regression.csv')
    selected = (
        (all_model_stats['model'] == model)
        & (all_model_stats['most memorized percentile'] == percentile)
    )
    # Work on an explicit copy so that adding the 'model-name' column does not
    # write into a slice of ``all_model_stats`` (chained assignment).
    curr_model_stats = all_model_stats[selected].copy()
    curr_model_stats['model-name'] = (
        curr_model_stats['model'] + '-' + curr_model_stats['checkpoint'].astype('str')
    )
    curr_model_stats = curr_model_stats[[
        'model-name',
        'slope',
        'variation',
        '% change',
    ]]

    # Create a table slide
    slide = prs.slides.add_slide(title_layout)
    title = f"{model} Line of best fit"
    if percentile != 0:
        title += f" @ top {percentile} most memorized data"
    slide.shapes.title.text = title

    # One header row plus one row per selected checkpoint, instead of assuming
    # that exactly seven rows were found.
    n_data_rows = len(curr_model_stats)
    table = slide.shapes.add_table(
        n_data_rows + 1, 4,
        Inches(0.5), Inches(1.7), Inches(9), Inches(5.5),
    ).table

    # Headers
    for col, header in enumerate(('Model', 'Slope', 'Variation', '% Change')):
        table.cell(0, col).text = header

    # Adding data (numeric columns are rendered in scientific notation)
    for row, (_, data) in enumerate(curr_model_stats.iterrows(), start=1):
        table.cell(row, 0).text = data['model-name']
        table.cell(row, 1).text = f'{data["slope"]:.2e}'
        table.cell(row, 2).text = f'{data["variation"]:.2e}'
        table.cell(row, 3).text = f'{data["% change"]:.2e}'

In [26]:
# For each model and each percentile setting: the regression plot slide,
# immediately followed by the slide with its statistics table.
for model in checkpoints:
    for percentile in (0, 90, 99):
        add_reg_to_slide(model, percentile)
        add_reg_stats_to_slide(model, percentile)

In [27]:
# Section divider slide placed before the bucketed-memorization plot slides.
add_title_slide("Bucketed Memorization Plots")

In [28]:
def add_bucketed_memorization_to_slide(model, percentile):
    """Append a slide showing the saved bucketed-memorization plot for one model.

    Args:
        model: Model name; used in the slide title and the plot file name.
        percentile: Percentile value used in the plot file name. Any value
            other than 0 is also appended to the title.
    """
    bucket_slide = prs.slides.add_slide(title_layout)

    suffix = f" @ top {percentile} most memorized data" if percentile != 0 else ""
    bucket_slide.shapes.title.text = f"{model} Bucketed Plot" + suffix

    # Picture is placed at (0", 1.7") and sized 10" x 5.5".
    bucket_slide.shapes.add_picture(
        f'./plots/{model}-{percentile}_bucketed_memorization.png',
        Inches(0), Inches(1.7), Inches(10), Inches(5.5),
    )

In [29]:
# One bucketed-memorization slide per model and percentile setting.
for model in checkpoints:
    for percentile in (0, 90, 99):
        add_bucketed_memorization_to_slide(model, percentile)

# Scatter KDE plots

In [30]:
def add_kde_plots(model):
    """Append one Scatter-KDE plot slide per checkpoint listed for ``model``.

    Args:
        model: Key into ``checkpoints``; combined with each checkpoint as
            ``<model>-<checkpoint>`` for the slide title and plot file name.
    """
    # Every picture uses the same position (0", 1.7") and size (10" x 5.5").
    picture_box = (Inches(0), Inches(1.7), Inches(10), Inches(5.5))

    for step in checkpoints[model]:
        tag = f'{model}-{step}'
        kde_slide = prs.slides.add_slide(title_layout)
        kde_slide.shapes.title.text = f"{tag} Scatter-KDE Plot"
        kde_slide.shapes.add_picture(f'./plots/{tag}_kde_plot.png', *picture_box)
        

In [31]:
# Scatter-KDE slides for every model, in the order the models are listed.
for model in checkpoints:
    add_kde_plots(model)

In [32]:
def add_corr_plots(model):
    """Append one normalized-correlation plot slide per checkpoint of ``model``.

    The last checkpoint listed for the model is skipped.
    NOTE(review): presumably no correlation plot exists for the final
    checkpoint — confirm against the plotting code.

    Args:
        model: Key into ``checkpoints``; combined with each checkpoint as
            ``<model>-<checkpoint>`` for the slide title and plot file name.
    """
    # Every picture uses the same position (0", 1.7") and size (10" x 5.5").
    picture_box = (Inches(0), Inches(1.7), Inches(10), Inches(5.5))

    all_but_last = checkpoints[model][:-1]
    for step in all_but_last:
        tag = f'{model}-{step}'
        corr_slide = prs.slides.add_slide(title_layout)
        corr_slide.shapes.title.text = f"{tag} Normalized Correlation Plot"
        corr_slide.shapes.add_picture(
            f'./plots/{tag}_normalized_correlation.png', *picture_box
        )

In [33]:
# Normalized-correlation slides for every model, in the order the models are listed.
for model in checkpoints:
    add_corr_plots(model)

In [35]:
# Write the assembled deck to a .pptx file; the path is relative, so it lands
# in the current working directory.
prs.save('Training_order_does_not_influence_memorization.pptx')