In [1]:
import pandas as pd
import os
from pathlib import Path
from tqdm import tqdm
import numpy as np
import plotly.graph_objects as go

import plotly.io as pio
import plotly.offline as pyo
import plotly.express as px

from scipy import stats


%matplotlib inline

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
cd drive/My Drive/Perplexity Eval

/content/drive/My Drive/Perplexity Eval


In [4]:
df = pd.read_csv('hellaswag_results.csv')

In [5]:
df.head()

Unnamed: 0,acc,acc_norm,log_likelihood,num_words,num_bytes,word_perplexity,byte_perplexity,lang,model,template
0,0.0,0.0,-605.538757,46,743,521198.4,2.259157,Marathi,8b,chat
1,0.0,1.0,-541.022888,38,678,1524908.0,2.221025,Marathi,8b,chat
2,0.0,0.0,-528.661438,37,656,1604218.0,2.23868,Marathi,8b,chat
3,0.0,1.0,-349.893524,24,282,2145529.0,3.458231,Marathi,8b,chat
4,1.0,0.0,-518.971436,43,629,174399.4,2.282049,Marathi,8b,chat


In [6]:
df['log_word_ppl'] = df.word_perplexity.apply(lambda x : np.log(x))

trained_lang = ['Arabic','Chinese','Czech','Dutch','English',
 'French','German','Greek','Hebrew','Hindi','Indonesian',
 'Italian','Japanese','Korean','Persian','Polish','Portuguese',
 'Romanian','Russian','Spanish','Turkish','Ukrainian','Vietnamese']

df['trained'] = df.lang.apply(lambda x : 'trained' if x in trained_lang else 'untrained')

In [7]:
df.trained.value_counts()

Unnamed: 0_level_0,count
trained,Unnamed: 1_level_1
untrained,580608
trained,523212


In [8]:
len(df)

1103820

In [9]:
title_map = {'word_perplexity':'Word Perplexity',
            'log_word_ppl':'Log Word Perplexity',
            'byte_perplexity':'Byte Perplexity',
            'trained':'Trained',
            'acc_norm':'Accuracy (Norm)',
            'num_words':'Num Words',
             'num_bytes':'Num Bytes',
             'bytes_per_word':'Bytes Per Word',
            'lang':'Language'}

def get_axis_title(num_var):
    return title_map[num_var]

def get_title(group_vars, num_var, y_axis_var, color_var=None):

    if color_var:
        return f"{title_map[num_var]} by {title_map[y_axis_var]} and {title_map[color_var]}"
    else:
        return f"{title_map[num_var]} by {title_map[y_axis_var]}"



## Helper Functions

In [29]:
def scaled_boxplot(df, group_vars, num_var, y_axis_var, color_var=None, layout_var = None, order_by = None):

    if len(group_vars) == 2:
        return boxplot_2cat(df, group_vars, num_var, y_axis_var, color_var, layout_var, order_by)
    elif len(group_vars) == 1:
        return boxplot_1cat(df, group_vars, num_var, y_axis_var, layout_var, order_by)
    else:
        return ValueError('No of grouping vars is >2')




In [30]:
def boxplot_2cat(df, group_vars, num_var, y_axis_var, color_var, layout_var = None, order_by = None):

    out = df.groupby(group_vars)[num_var].describe().reset_index()
    out['IQR'] = out.apply(lambda x : x['75%'] - x['25%'],axis = 1 )
    out['upperfence'] = out.apply(lambda x : x['75%'] + 1.5* x['IQR'],axis = 1 )
    out['lowerfence'] = out.apply(lambda x : x['75%'] - 1.5* x['IQR'],axis = 1 )

    if order_by:
        out = out.sort_values(order_by)

    fig = go.Figure()

    # Add box traces for each 'acc' category
    for color_value in out[color_var].unique():
        plot_df = out[out[color_var] == color_value]

        fig.add_trace(go.Box(
            # x=df_filtered['byte_perplexity'],
            y=plot_df[y_axis_var],
            name=f'acc_norm={color_value}',
            orientation='h',
            marker_color='red' if color_value == 0 else 'green',  # Adjust colors as needed,
            boxpoints = False,
            median = plot_df['50%'],
            q1 = plot_df['25%'],
            q3 = plot_df['75%'],
            upperfence=plot_df['upperfence'],
            lowerfence=plot_df['lowerfence'],

        ))

    if layout_var:
        fig.update_layout(**layout_var)

    return fig

def boxplot_1cat(df, group_vars, num_var, y_axis_var, layout_var = None, order_by = None):

    out = df.groupby(group_vars)[num_var].describe().reset_index()
    out['IQR'] = out.apply(lambda x : x['75%'] - x['25%'],axis = 1 )
    out['upperfence'] = out.apply(lambda x : x['75%'] + 1.5* x['IQR'],axis = 1 )
    out['lowerfence'] = out.apply(lambda x : x['75%'] - 1.5* x['IQR'],axis = 1 )
    if order_by:
        out = out.sort_values(order_by)

    fig = go.Figure()

    # Add box traces for each 'acc' category
    plot_df = out

    fig.add_trace(go.Box(
        # x=df_filtered['byte_perplexity'],
        y=plot_df[y_axis_var],
        # name=f'acc={color_value}',
        orientation='h',
        # marker_color='blue' if color_value == 0 else 'red',  # Adjust colors as needed,
        boxpoints = False,
        median = plot_df['50%'],
        q1 = plot_df['25%'],
        q3 = plot_df['75%'],
        upperfence=plot_df['upperfence'],
        lowerfence=plot_df['lowerfence'],

    ))

    if layout_var:
        fig.update_layout(**layout_var)

    return fig



def remove_outliers(df, variable, lower_threshold, upper_threshold):
    """
    Remove outliers from a specific variable in a dataframe.

    Parameters:
    df (pd.DataFrame): The input dataframe
    variable (str): The name of the variable to remove outliers from
    lower_threshold (float): The lower percentile threshold (e.g., 1 for 1st percentile)
    upper_threshold (float): The upper percentile threshold (e.g., 99 for 99th percentile)

    Returns:
    pd.DataFrame: A new dataframe with outliers removed
    """
    # Make a copy of the dataframe to avoid modifying the original
    df_clean = df.copy()

    # Calculate the lower and upper bounds
    lower_bound = np.percentile(df_clean[variable], lower_threshold)
    upper_bound = np.percentile(df_clean[variable], upper_threshold)

    # Remove outliers
    df_clean = df_clean[(df_clean[variable] >= lower_bound) &
                        (df_clean[variable] <= upper_bound)]

    return df_clean

def remove_outliers_var(df, lower_threshold, upper_threshold):
    """
    Remove outliers from a specific variable in a dataframe.

    Parameters:
    df (pd.DataFrame): The input dataframe
    variable (str): The name of the variable to remove outliers from
    lower_threshold (float): The lower percentile threshold (e.g., 1 for 1st percentile)
    upper_threshold (float): The upper percentile threshold (e.g., 99 for 99th percentile)

    Returns:
    pd.DataFrame: A new dataframe with outliers removed
    """
    # Make a copy of the dataframe to avoid modifying the original
    df_clean = df.copy()

    # Calculate the lower and upper bounds
    lower_bound = np.percentile(df_clean, lower_threshold)
    upper_bound = np.percentile(df_clean, upper_threshold)

    # Remove outliers
    df_clean = df_clean[(df_clean >= lower_bound) &
                        (df_clean <= upper_bound)]

    return df_clean

In [31]:
def compare_means(df, binary_var, continuous_var, display = True, remove_outliers = None):
    # Split the dataframe
    if remove_outliers:
        df = remove_outliers(df, continuous_var)

    group1 = df[df[binary_var] == 0][continuous_var]
    group2 = df[df[binary_var] == 1][continuous_var]

    # Perform the t-test
    t_tstat, t_pval = stats.ttest_ind(group1, group2)
    mw_tstat, mw_pval = stats.mannwhitneyu(group1, group2, alternative='two-sided')

    # if display:
    #     print(f"T-statistic: {t_statistic}")
    #     print(f"P-value: {p_value}")

    #     # Print means for each group
    #     print(f"Mean for group 0: {group1.mean()}")
    #     print(f"Mean for group 1: {group2.mean()}")



    return True if mw_pval < 0.05 else False, True if t_pval < 0.05 else False, mw_pval, t_pval, group1.mean(), group2.mean(), group1.median(), group2.median(), np.percentile(group1, 25), np.percentile(group2, 25), np.percentile(group1, 75), np.percentile(group2, 75)


## Visualization focusing on Model Performance

In [15]:
result = df.groupby('lang')['acc_norm'].value_counts(normalize=True).unstack()
result = result.sort_values(by=1, ascending=False)

# Create the stacked bar chart
fig = go.Figure()

for acc_value in [0, 1]:
    fig.add_trace(go.Bar(
        x=result.index,
        y=result[acc_value],
        name=f'acc_norm = {acc_value}',
        text=[f'{value:.1%}' for value in result[acc_value]],
        textposition='auto'
    ))

# Update layout
fig.update_layout(
    title='Distribution of Model Performance across languages',
    xaxis_title='Language',
    yaxis_title='Percentage',
    barmode='stack',
    yaxis=dict(tickformat='.0%')
)

# Show the plot
fig.show()

In [16]:
result = df.groupby('trained')['acc_norm'].value_counts(normalize=True).unstack()
result = result.sort_values(by=1, ascending=False)

# Create the stacked bar chart
fig = go.Figure()

for acc_value in [0, 1]:
    fig.add_trace(go.Bar(
        x=result.index,
        y=result[acc_value],
        name=f'acc_norm = {acc_value}',
        text=[f'{value:.1%}' for value in result[acc_value]],
        textposition='auto'
    ))

# Update layout
fig.update_layout(
    title='Distribution of Model Performance across trained/untrained languages',
    xaxis_title='Language',
    yaxis_title='Percentage',
    barmode='stack',
    yaxis=dict(tickformat='.0%')
)

# Show the plot
fig.show()

In [17]:
result = df.groupby('model')['acc_norm'].value_counts(normalize=True).unstack()
result = result.sort_values(by=1, ascending=False)

# Create the stacked bar chart
fig = go.Figure()

for acc_value in [0, 1]:
    fig.add_trace(go.Bar(
        x=result.index,
        y=result[acc_value],
        name=f'acc_norm = {acc_value}',
        text=[f'{value:.1%}' for value in result[acc_value]],
        textposition='auto'
    ))

# Update layout
fig.update_layout(
    title='Distribution of Model Performance across model',
    xaxis_title='Model',
    yaxis_title='Percentage',
    barmode='stack',
    yaxis=dict(tickformat='.0%')
)

# Show the plot
fig.show()

In [18]:
result = df.groupby('template')['acc_norm'].value_counts(normalize=True).unstack()
result = result.sort_values(by=1, ascending=False)

# Create the stacked bar chart
fig = go.Figure()

for acc_value in [0, 1]:
    fig.add_trace(go.Bar(
        x=result.index,
        y=result[acc_value],
        name=f'acc_norm = {acc_value}',
        text=[f'{value:.1%}' for value in result[acc_value]],
        textposition='auto'
    ))

# Update layout
fig.update_layout(
    title='Distribution of Model Performance across Chat Template',
    xaxis_title='Chat Template',
    yaxis_title='Percentage',
    barmode='stack',
    yaxis=dict(tickformat='.0%')
)

# Show the plot
fig.show()

### Model performance against log likelihood

In [19]:
df['likelihood_bucket'] = pd.qcut(df.log_likelihood, q=20, labels = [str(i) for i in range(20)])
acc_mean = df.groupby('likelihood_bucket')['acc_norm'].agg(['mean','count']).reset_index()

# Create the bar plot
fig = go.Figure(data=[go.Bar(
    x=acc_mean['likelihood_bucket'],
    y=acc_mean['mean'],
    text=acc_mean['mean'].round(2),
    textposition='auto',
)])

fig.update_layout(
    title='Average Accuracy (Norm) by Log Likelihood',
    xaxis_title='Log Likelihood Bucket',
    yaxis_title='Average Accuracy (Norm)',
    bargap=0.1
)

fig.show()






### Model performance against num of words

In [20]:
df['wordcount_bucket'] = pd.qcut(df.num_words, q=20, labels = [str(i) for i in range(20)])
acc_mean = df.groupby('wordcount_bucket')['acc_norm'].agg(['mean','count']).reset_index()

# Create the bar plot
fig = go.Figure(data=[go.Bar(
    x=acc_mean['wordcount_bucket'],
    y=acc_mean['mean'],
    text=acc_mean['mean'].round(2),
    textposition='auto',
)])

fig.update_layout(
    title='Average Accuracy (Norm) by Word Count',
    xaxis_title='Word Count Bucket',
    yaxis_title='Average Accuracy (Norm)',
    bargap=0.1
)

fig.show()






In [21]:
sub = df[df.lang=='Spanish'].copy()
sub['wordcount_bucket'] = pd.qcut(sub.num_words, q=20, labels = [str(i) for i in range(20)])
acc_mean = sub.groupby('wordcount_bucket')['acc_norm'].agg(['mean','count']).reset_index()

# Create the bar plot
fig = go.Figure(data=[go.Bar(
    x=acc_mean['wordcount_bucket'],
    y=acc_mean['mean'],
    text=acc_mean['mean'].round(2),
    textposition='auto',
)])

fig.update_layout(
    title='Average Accuracy (Norm) by Word Count (Spanish)',
    xaxis_title='Word Count Bucket',
    yaxis_title='Average Accuracy (Norm)',
    bargap=0.1
)

fig.show()






## Visualization focusing on Num Words/ Bytes

In [22]:
group_vars = ['lang']
num_var = 'num_words'
y_axis_var = 'lang'
color_var = None


scaled_boxplot(df, group_vars=group_vars, num_var = num_var, y_axis_var=y_axis_var,color_var=color_var,
               layout_var={'title':get_title(group_vars, num_var, y_axis_var, color_var),
            'xaxis_title':get_axis_title(num_var),
            'yaxis_title':get_axis_title(y_axis_var),
            'boxmode':'group',
            'height': 800},
               order_by = '50%'
)

In [23]:
group_vars = ['lang']
num_var = 'num_bytes'
y_axis_var = 'lang'
color_var = None


scaled_boxplot(df, group_vars=group_vars, num_var = num_var, y_axis_var=y_axis_var,color_var=color_var,
               layout_var={'title':get_title(group_vars, num_var, y_axis_var, color_var),
            'xaxis_title':get_axis_title(num_var),
            'yaxis_title':get_axis_title(y_axis_var),
            'boxmode':'group',
            'height': 800},
               order_by = '50%'
)

In [24]:
df['bytes_per_word'] = df.apply(lambda x : x.num_bytes/x.num_words, axis = 1)

group_vars = ['lang']
num_var = 'bytes_per_word'
y_axis_var = 'lang'
color_var = None


scaled_boxplot(df, group_vars=group_vars, num_var = num_var, y_axis_var=y_axis_var,color_var=color_var,
               layout_var={'title':get_title(group_vars, num_var, y_axis_var, color_var),
            'xaxis_title':get_axis_title(num_var),
            'yaxis_title':get_axis_title(y_axis_var),
            'boxmode':'group',
            'height': 800},
               order_by = '50%'
)

## Visualization focusing on Byte Perplexity

In [26]:
scaled_boxplot(df, group_vars=['acc_norm'], num_var='byte_perplexity',y_axis_var='acc_norm', layout_var={'title':'Byte Perplexity by Accuracy (Norm)',
            'xaxis_title':'Byte Perplexity',
            'yaxis_title':'Accuracy (Norm)',
            'boxmode':'group',
            'height': 400} )

In [32]:
scaled_boxplot(df, group_vars=['lang','acc_norm'], num_var = 'byte_perplexity', y_axis_var='lang',color_var='acc_norm',
               layout_var={'title':'Byte Perplexity by Language and Accuracy (Norm)',
            'xaxis_title':'Byte Perplexity',
            'yaxis_title':'Language',
            'boxmode':'group',
            'height': 1200},
               order_by = '50%'
)

In [33]:

scaled_boxplot(df[df.trained == 'trained'], group_vars=['lang','acc_norm'], num_var = 'byte_perplexity', y_axis_var='lang',color_var='acc_norm',
               layout_var={'title':'Byte Perplexity by Language and Accuracy (Norm)',
            'xaxis_title':'Byte Perplexity',
            'yaxis_title':'Language',
            'boxmode':'group',
            'height': 800},
               order_by = '50%'
)

## Visualization focusing on Word Perplexity

### Byte Perplexity by Model

In [34]:
scaled_boxplot(df, group_vars=['model'], num_var='word_perplexity',y_axis_var='model', layout_var={'title':'Word Perplexity by Model',
            'xaxis_title':'Word Perplexity',
            'yaxis_title':'Model',
            'boxmode':'group',
            'height': 400} )


# fig = px.box(df, x="byte_perplexity", y="model", title="Distribution of Byte Perplexity by Model")
# fig.update_layout(xaxis_title="Byte Perplexity", yaxis_title="Model")
# fig.show()

In [35]:
scaled_boxplot(df, group_vars=['model'], num_var='log_word_ppl',y_axis_var='model', layout_var={'title':'Log Word Perplexity by Model',
            'xaxis_title':'Log Word Perplexity',
            'yaxis_title':'Model',
            'boxmode':'group',
            'height': 400} )


# fig = px.box(df, x="byte_perplexity", y="model", title="Distribution of Byte Perplexity by Model")
# fig.update_layout(xaxis_title="Byte Perplexity", yaxis_title="Model")
# fig.show()

### Word Perplexity by Chat Template

In [36]:
scaled_boxplot(df, group_vars=['template'], num_var='word_perplexity',y_axis_var='template', layout_var={'title':'Word Perplexity by Chat Template',
            'xaxis_title':'Word Perplexity',
            'yaxis_title':'Chat Template',
            'boxmode':'group',
            'height': 400} )


# fig = px.box(df, x="byte_perplexity", y="template", title="Distribution of Byte Perplexity by Chat Template")
# fig.update_layout(xaxis_title="Byte Perplexity", yaxis_title=" Chat Template")
# fig.show()

### Word Perplexity by Language

In [37]:
scaled_boxplot(df, group_vars=['lang'], num_var='word_perplexity',y_axis_var='lang', layout_var={'title':'Word Perplexity by Language',
            'xaxis_title':'Word Perplexity',
            'yaxis_title':'Language',
            'boxmode':'group',
            'height': 800},
              order_by='50%')


# fig = px.box(df, x="byte_perplexity", y="template", title="Distribution of Byte Perplexity by Chat Template")
# fig.update_layout(xaxis_title="Byte Perplexity", yaxis_title=" Chat Template")
# fig.show()

In [38]:
scaled_boxplot(df, group_vars=['lang'], num_var='log_word_ppl',y_axis_var='lang', layout_var={'title':'Log Word Perplexity by Language',
            'xaxis_title':'Log Word Perplexity',
            'yaxis_title':'Language',
            'boxmode':'group',
            'height': 800},
              order_by='50%')


# fig = px.box(df, x="byte_perplexity", y="template", title="Distribution of Byte Perplexity by Chat Template")
# fig.update_layout(xaxis_title="Byte Perplexity", yaxis_title=" Chat Template")
# fig.show()

In [39]:
scaled_boxplot(df, group_vars=['trained'], num_var='word_perplexity',y_axis_var='trained', layout_var={'title':'Word Perplexity by Trained',
            'xaxis_title':'Word Perplexity',
            'yaxis_title':'Trained',
            'boxmode':'group',
            'height': 800},
              order_by='50%')


# fig = px.box(df, x="byte_perplexity", y="template", title="Distribution of Byte Perplexity by Chat Template")
# fig.update_layout(xaxis_title="Byte Perplexity", yaxis_title=" Chat Template")
# fig.show()

In [40]:
scaled_boxplot(df, group_vars=['trained'], num_var='log_word_ppl',y_axis_var='trained', layout_var={'title':'Log Word Perplexity by Trained',
            'xaxis_title':'Log Word Perplexity',
            'yaxis_title':'Trained',
            'boxmode':'group',
            'height': 400},
              order_by='50%')


# fig = px.box(df, x="byte_perplexity", y="template", title="Distribution of Byte Perplexity by Chat Template")
# fig.update_layout(xaxis_title="Byte Perplexity", yaxis_title=" Chat Template")
# fig.show()

### Word Perplexity by Accuracy

In [41]:
scaled_boxplot(df, group_vars=['acc_norm'], num_var='word_perplexity',y_axis_var='acc_norm', layout_var={'title':'Word Perplexity by Accuracy (Norm)',
            'xaxis_title':'Word Perplexity',
            'yaxis_title':'Accuracy (Norm)',
            'boxmode':'group',
            'height': 400} )


# fig = px.box(df, x="byte_perplexity", y="template", title="Distribution of Byte Perplexity by Chat Template")
# fig.update_layout(xaxis_title="Byte Perplexity", yaxis_title=" Chat Template")
# fig.show()

In [42]:
scaled_boxplot(df, group_vars=['acc_norm'], num_var='log_word_ppl',y_axis_var='acc_norm', layout_var={'title':'Log Word Perplexity by Accuracy (Norm)',
            'xaxis_title':'Word Perplexity',
            'yaxis_title':'Accuracy (Norm)',
            'boxmode':'group',
            'height': 400} )


# fig = px.box(df, x="byte_perplexity", y="template", title="Distribution of Byte Perplexity by Chat Template")
# fig.update_layout(xaxis_title="Byte Perplexity", yaxis_title=" Chat Template")
# fig.show()

### Word Perplexity vs Language vs Acc

In [43]:
df_  = df[df.lang.isin(['English','Hindi','Spanish'])]
px.box(df_, y="lang", x="byte_perplexity", color="acc_norm", orientation = 'h')

Output hidden; open in https://colab.research.google.com to view.

In [44]:
scaled_boxplot(df, group_vars=['lang','acc_norm'], num_var = 'word_perplexity', y_axis_var='lang',color_var='acc_norm',
               layout_var={'title':'Word Perplexity by Language and Accuracy (Norm)',
            'xaxis_title':'Word Perplexity',
            'yaxis_title':'Language',
            'boxmode':'group',
            'height': 1200},
               order_by = '50%'
)

In [45]:
scaled_boxplot(df, group_vars=['lang','acc_norm'], num_var = 'log_word_ppl', y_axis_var='lang',color_var='acc_norm',
               layout_var={'title':'Log Word Perplexity by Language and Accuracy (Norm)',
            'xaxis_title':'Log Word Perplexity',
            'yaxis_title':'Language',
            'boxmode':'group',
            'height': 1200
            },
               order_by = '50%'
)

In [46]:
scaled_boxplot(df[df.trained == 'trained'], group_vars=['lang','acc_norm'], num_var = 'log_word_ppl', y_axis_var='lang',color_var='acc_norm',
               layout_var={'title':'Log Word Perplexity by Language and Accuracy (Norm)',
            'xaxis_title':'Log Word Perplexity',
            'yaxis_title':'Language',
            'boxmode':'group',
            'height': 800
            },
               order_by = '50%'
)

In [48]:
group_vars = ['trained','acc_norm']
num_var = 'word_perplexity'
y_axis_var = 'trained'
color_var = 'acc_norm'


scaled_boxplot(df, group_vars=group_vars, num_var = num_var, y_axis_var=y_axis_var,color_var=color_var,
               layout_var={'title':get_title(group_vars, num_var, y_axis_var, color_var),
            'xaxis_title':get_axis_title(num_var),
            'yaxis_title':get_axis_title(y_axis_var),
            'boxmode':'group',
            'height': 400},
               order_by = '50%'
)

In [49]:
scaled_boxplot(df, group_vars=['trained','acc_norm'], num_var = 'log_word_ppl', y_axis_var='trained',color_var='acc_norm',
               layout_var={'title':'Log Word Perplexity by Trained and Accuracy (Norm)',
            'xaxis_title':'Log Word Perplexity',
            'yaxis_title':'Trained',
            'boxmode':'group',
            'height': 400},
               order_by = '50%'
)

In [50]:
grouped = df.groupby(['model', 'template'])['byte_perplexity'].agg([
    'min', 'q1', 'median', 'q3', 'max'
]).reset_index()

# Step 2: Create the box plot
fig = go.Figure()

for template in grouped['template'].unique():
    template_data = grouped[grouped['template'] == template]

    fig.add_trace(go.Box(
        x=template_data['model'],
        y=template_data['median'],
        name=f'Template {template}',
        lowerfence=template_data['min'],
        q1=template_data['q1'],
        median=template_data['median'],
        q3=template_data['q3'],
        upperfence=template_data['max'],
        marker_color='blue' if template == 0 else 'red'
    ))

# Customize the layout
fig.update_layout(
    title='Byte Perplexity by Model and Template',
    xaxis_title='Model',
    yaxis_title='Byte Perplexity',
    boxmode='group'
)

fig.show()

AttributeError: 'SeriesGroupBy' object has no attribute 'q1'

In [None]:
grouped = df.groupby(['model', 'template'])['byte_perplexity'].agg(['min', 'q1', 'median', 'q3', 'max']).reset_index()

AttributeError: 'SeriesGroupBy' object has no attribute 'q1'

In [None]:
len(df)

1103820

## Visualization focusing on word perplexity and byte perplexity

In [51]:

df['word_by_byte_ppl'] = df.apply(lambda x : x.word_perplexity/x.byte_perplexity, axis = 1)

df['log_word_by_byte_ppl'] = df.apply(lambda x : x.log_word_ppl/x.byte_perplexity, axis = 1)




In [52]:
group_vars = ['lang']
num_var = 'word_by_byte_ppl'
y_axis_var = 'lang'
color_var = None


scaled_boxplot(df, group_vars=group_vars, num_var = num_var, y_axis_var=y_axis_var,color_var=color_var,
               layout_var={'title':'Word / Byte Perplexity by Language',
            'xaxis_title':'Word / Byte Perplexity',
            'yaxis_title':'Language',
            'boxmode':'group',
            'height': 800},
               order_by = '50%'
)

In [53]:
group_vars = ['lang']
num_var = 'log_word_by_byte_ppl'
y_axis_var = 'lang'
color_var = None


scaled_boxplot(df, group_vars=group_vars, num_var = num_var, y_axis_var=y_axis_var,color_var=color_var,
               layout_var={'title':'Log(Word) / Byte Perplexity by Language',
            'xaxis_title':'Log(Word) / Byte Perplexity',
            'yaxis_title':'Language',
            'boxmode':'group',
            'height': 800},
               order_by = '50%'
)

In [54]:
df[df.lang=='Malayalam'].log_word_by_byte_ppl.describe()

Unnamed: 0,log_word_by_byte_ppl
count,35732.0
mean,7.548172
std,0.920257
min,2.32128
25%,6.940786
50%,7.528507
75%,8.13138
max,14.057095


In [55]:
df[df.lang=='Malayalam'].word_by_byte_ppl.describe()

Unnamed: 0,word_by_byte_ppl
count,35732.0
mean,6753116000.0
std,883646600000.0
min,126.7667
25%,74985.45
50%,287963.3
75%,1344848.0
max,130207200000000.0


In [56]:
group_vars = ['lang']
num_var = 'word_perplexity'
y_axis_var = 'lang'
color_var = None


scaled_boxplot(df, group_vars=group_vars, num_var = num_var, y_axis_var=y_axis_var,color_var=color_var,
               layout_var={'title':'Word Perplexity by Language',
            'xaxis_title':'Word Perplexity',
            'yaxis_title':'Language',
            'boxmode':'group',
            'height': 800},
               order_by = '50%'
)

In [57]:
group_vars = ['lang']
num_var = 'log_word_ppl'
y_axis_var = 'lang'
color_var = None


scaled_boxplot(df, group_vars=group_vars, num_var = num_var, y_axis_var=y_axis_var,color_var=color_var,
               layout_var={'title':'Log(Word) Perplexity by Language',
            'xaxis_title':'Log(Word) Perplexity',
            'yaxis_title':'Language',
            'boxmode':'group',
            'height': 800},
               order_by = '50%'
)

In [58]:
group_vars = ['lang']
num_var = 'byte_perplexity'
y_axis_var = 'lang'
color_var = None


scaled_boxplot(df, group_vars=group_vars, num_var = num_var, y_axis_var=y_axis_var,color_var=color_var,
               layout_var={'title':'Log(Word) Perplexity by Language',
            'xaxis_title':'Log(Word) Perplexity',
            'yaxis_title':'Language',
            'boxmode':'group',
            'height': 800},
               order_by = '50%'
)

## Statistical Test

### Without outlier removal

In [60]:
results = []

for lang in df.lang.unique():
    sub = df[df.lang==lang]

    output = [lang]

    for metric in ['word_perplexity','byte_perplexity','log_word_ppl']:

        # is_significant, t_stat, p_value, g1mean, g2mean = compare_means(sub, 'acc_norm', metric, display=False)
        # output += [is_significant, t_stat, p_value, g1mean, g2mean, True if g2mean<g1mean else False]

        mw_significant, t_significant, mw_pval, t_pval, g1mean, g2mean, g1median, g2median, g1perc25, g2perc25, g1perc75, g2perc75 = compare_means(sub, 'acc_norm', metric, display=False)
        output += [mw_significant, t_significant, mw_pval, t_pval, g1mean, g2mean, g1perc25, g2perc25, g1median, g2median, g1perc75, g2perc75, True if g2mean<g1mean else False]

    results.append(output)

results = pd.DataFrame(results)

In [62]:
# results.columns =['lang',
#                   'word_ppl','t_stat','p_val','acc_norm = 0 mean','acc_norm = 1 mean','word_ppl_trend',
#                  'byte_ppl','t_stat','p_val','acc_norm = 0 mean','acc_norm = 1 mean','byte_ppl_trend',
#                  'log_word_ppl','t_stat','p_val','acc_norm = 0 mean','acc_norm = 1 mean','log_word_ppl_trend']

# results

results.columns =['lang',
                  'mw_significant', 't_significant', 'mw_pval','t_pval','acc_norm_0_mean','acc1_mean','acc_norm_0_p25','acc_norm_1_p25','acc_norm_0_p50','acc_norm_1_p50','acc_norm_0_p75','acc_norm_1_p75','word_ppl_trend',
                 'mw_significant', 't_significant', 'mw_pval','t_pval','acc_norm_0_mean','acc1_mean','acc_norm_0_p25','acc_norm_1_p25','acc_norm_0_p50','acc_norm_1_p50','acc_norm_0_p75','acc_norm_1_p75','byte_ppl_trend',
                 'mw_significant', 't_significant', 'mw_pval','t_pval','acc_norm_0_mean','acc1_mean','acc_norm_0_p25','acc_norm_1_p25','acc_norm_0_p50','acc_norm_1_p50','acc_norm_0_p75','acc_norm_1_p75','log_word_ppl_trend']

results['trained'] = results.lang.apply(lambda x : 'trained' if x in trained_lang else 'untrained')

results

Unnamed: 0,lang,mw_significant,t_significant,mw_pval,t_pval,acc_norm_0_mean,acc1_mean,acc_norm_0_p25,acc_norm_1_p25,acc_norm_0_p50,...,acc_norm_0_mean.1,acc1_mean.1,acc_norm_0_p25.1,acc_norm_1_p25.1,acc_norm_0_p50.1,acc_norm_1_p50,acc_norm_0_p75,acc_norm_1_p75,log_word_ppl_trend,trained
0,Marathi,True,False,0.0144097,0.774218,2439208000.0,1594844000.0,64647.044414,58111.762482,260588.3,...,12.704723,12.605048,11.076698,10.970123,12.470697,12.43551,14.096791,14.058555,True,untrained
1,Gujarati,False,False,0.4512651,0.109468,5969494.0,12704640.0,10441.93305,10033.151054,33001.48,...,10.632644,10.663976,9.253585,9.21365,10.404308,10.410899,11.715083,11.804942,False,untrained
2,Russian,True,False,4.633871e-86,0.350252,8723632.0,3192506.0,193.209334,139.486955,599.8462,...,6.944859,6.49972,5.263774,4.937971,6.396673,6.037124,7.955632,7.347873,True,trained
3,Dutch,True,False,9.08617e-71,0.602382,54239.09,105513.6,84.387449,63.503082,233.8894,...,5.829545,5.486985,4.435419,4.151088,5.454848,5.156227,6.73987,6.335448,True,trained
4,Telugu,False,False,0.09806635,0.466014,43710420000.0,3364856000.0,380004.255083,382706.27449,1704890.0,...,14.603173,14.663739,12.847938,12.855023,14.349011,14.373961,16.013367,16.101847,False,untrained
5,Portuguese,True,False,5.913443000000001e-119,0.053798,16623.84,7290.904,65.359394,46.011284,160.4019,...,5.460838,5.038003,4.179901,3.828887,5.077682,4.717496,6.288706,5.749676,True,trained
6,Catalan,True,False,5.2433919999999995e-86,0.070838,156558.1,91927.88,369.433021,270.595121,1026.118,...,7.320395,6.938776,5.911969,5.600624,6.933538,6.575455,8.274736,7.829965,True,untrained
7,Italian,True,False,5.873117999999999e-130,0.550865,43953.17,218038.4,75.031226,51.630527,190.4043,...,5.643119,5.186318,4.317904,3.944113,5.24915,4.871581,6.514199,5.929203,True,trained
8,French,True,False,8.225037999999999e-133,0.921425,9721.699,10378.7,54.305841,37.616104,132.4057,...,5.261312,4.824813,3.994632,3.627432,4.885871,4.505237,6.089929,5.52423,True,trained
9,Bengali,False,False,0.1250713,0.247628,10962160.0,18918780.0,5122.481329,5045.594267,15557.25,...,9.987195,10.045458,8.541394,8.526271,9.652282,9.697668,11.027767,11.110216,False,untrained


In [64]:
results[['word_ppl_trend','byte_ppl_trend','log_word_ppl_trend']].sum()

Unnamed: 0,0
word_ppl_trend,20
byte_ppl_trend,24
log_word_ppl_trend,26


In [67]:
results.groupby(['byte_ppl','byte_ppl_trend'])['lang'].count().reset_index()

KeyError: 'byte_ppl'

In [68]:
results.groupby(['word_ppl','word_ppl_trend'])['lang'].count().reset_index()

KeyError: 'word_ppl'

In [None]:
results.groupby(['log_word_ppl','log_word_ppl_trend'])['lang'].count().reset_index()

Unnamed: 0,log_word_ppl,log_word_ppl_trend,lang
0,False,False,2
1,False,True,5
2,True,False,6
3,True,True,17


### With outlier removal

In [69]:
from functools import partial

remove_outliers_custom = partial(remove_outliers, lower_threshold = 0, upper_threshold = 99)

results = []

for lang in df.lang.unique():
    sub = df[df.lang==lang]

    output = [lang]

    for metric in ['word_perplexity','byte_perplexity','log_word_ppl']:

        mw_significant, t_significant, mw_pval, t_pval, g1mean, g2mean, g1median, g2median, g1perc25, g2perc25, g1perc75, g2perc75 = compare_means(sub, 'acc_norm', metric, display=False, remove_outliers= remove_outliers_custom)

        output += [mw_significant, t_significant, mw_pval, t_pval, g1mean, g2mean, g1perc25, g2perc25, g1median, g2median, g1perc75, g2perc75, True if g2mean<g1mean else False]

    results.append(output)

results = pd.DataFrame(results)





In [70]:
results.columns =['lang',
                  'mw_significant', 't_significant', 'mw_pval','t_pval','acc0_mean','acc1_mean','acc0_p25','acc1_p25','acc0_p50','acc1_p50','acc0_p75','acc1_p75','word_ppl_trend',
                 'mw_significant', 't_significant', 'mw_pval','t_pval','acc0_mean','acc1_mean','acc0_p25','acc1_p25','acc0_p50','acc1_p50','acc0_p75','acc1_p75','byte_ppl_trend',
                 'mw_significant', 't_significant', 'mw_pval','t_pval','acc0_mean','acc1_mean','acc0_p25','acc1_p25','acc0_p50','acc1_p50','acc0_p75','acc1_p75','log_word_ppl_trend']

results['trained'] = results.lang.apply(lambda x : 'trained' if x in trained_lang else 'untrained')

results

Unnamed: 0,lang,mw_significant,t_significant,mw_pval,t_pval,acc0_mean,acc1_mean,acc0_p25,acc1_p25,acc0_p50,...,acc0_mean.1,acc1_mean.1,acc0_p25.1,acc1_p25.1,acc0_p50.1,acc1_p50,acc0_p75,acc1_p75,log_word_ppl_trend,trained
0,Marathi,True,False,0.01278425,0.9116322,7610366.0,7559592.0,63623.041119,57270.674178,253012.4,...,12.61023,12.507057,11.060731,10.955544,12.441194,12.399043,14.029812,13.995856,True,untrained
1,Gujarati,False,False,0.710187,0.07828074,404436.3,445125.8,10293.65521,9907.54267,32308.0,...,10.55713,10.566386,9.239283,9.201052,10.38307,10.388404,11.663314,11.744418,False,untrained
2,Russian,True,True,1.984872e-82,2.08921e-05,32411.53,23981.64,190.227715,138.296703,582.8851,...,6.819743,6.41545,5.248222,4.929401,6.36799,6.017174,7.866485,7.293739,True,trained
3,Dutch,True,True,2.8641740000000002e-67,2.547987e-07,2958.704,2227.696,83.238467,63.03256,228.0994,...,5.727627,5.416575,4.42171,4.143651,5.429781,5.142686,6.67579,6.288144,True,trained
4,Telugu,False,False,0.1002649,0.1736226,53722250.0,59013110.0,374025.362099,375163.223617,1658415.0,...,14.503544,14.562867,12.832079,12.835116,14.321373,14.338821,15.948645,16.039959,False,untrained
5,Portuguese,True,True,9.884734e-114,1.124311e-15,1469.151,1009.226,64.601928,45.713152,156.9205,...,5.36298,4.976439,4.168244,3.822386,5.055739,4.703055,6.213856,5.714749,True,trained
6,Catalan,True,True,1.009075e-83,2.976622e-08,15385.76,11465.77,364.369866,268.518996,999.8971,...,7.225083,6.870077,5.898169,5.592922,6.907652,6.558842,8.200497,7.784271,True,untrained
7,Italian,True,True,8.611451e-126,2.2033669999999998e-20,1916.324,1242.098,74.038031,51.328425,185.5267,...,5.546505,5.121871,4.304579,3.938245,5.223199,4.85551,6.442982,5.891178,True,trained
8,French,True,True,9.625277999999999e-130,5.109723e-19,1174.133,769.4736,53.698972,37.390252,129.4871,...,5.167573,4.759403,3.983394,3.62141,4.863582,4.491667,6.017418,5.49016,True,trained
9,Bengali,False,False,0.2608963,0.3562898,325344.6,344706.0,5065.881376,4951.435739,15268.38,...,9.904859,9.937625,8.530283,8.507433,9.633539,9.663164,10.976002,11.039123,False,untrained


In [71]:
results.to_csv('Stattest_results_outlier_treated.csv', index = False, index_label = False)

### Trained vs Untrained languages (with outlier removal)

In [72]:
from functools import partial

remove_outliers_custom = partial(remove_outliers, lower_threshold = 0, upper_threshold = 99)

results = []

for trained_value in df.trained.unique():
    sub = df[df.trained==trained_value]

    output = [trained_value]

    for metric in ['word_perplexity','byte_perplexity','log_word_ppl']:

        mw_significant, t_significant, mw_pval, t_pval, g1mean, g2mean, g1median, g2median, g1perc25, g2perc25, g1perc75, g2perc75 = compare_means(sub, 'acc_norm', metric, display=False, remove_outliers= remove_outliers_custom)

        output += [mw_significant, t_significant, mw_pval, t_pval, g1mean, g2mean, g1perc25, g2perc25, g1median, g2median, g1perc75, g2perc75, True if g2mean<g1mean else False]

    results.append(output)

results = pd.DataFrame(results)


In [73]:
results.columns =['trained',
                  'mw_significant', 't_significant', 'mw_pval','t_pval','acc0_mean','acc1_mean','acc0_p25','acc1_p25','acc0_p50','acc1_p50','acc0_p75','acc1_p75','word_ppl_trend',
                 'mw_significant', 't_significant', 'mw_pval','t_pval','acc0_mean','acc1_mean','acc0_p25','acc1_p25','acc0_p50','acc1_p50','acc0_p75','acc1_p75','byte_ppl_trend',
                 'mw_significant', 't_significant', 'mw_pval','t_pval','acc0_mean','acc1_mean','acc0_p25','acc1_p25','acc0_p50','acc1_p50','acc0_p75','acc1_p75','log_word_ppl_trend']

# results['trained'] = results.lang.apply(lambda x : 'trained' if x in trained_lang else 'untrained')

results

Unnamed: 0,trained,mw_significant,t_significant,mw_pval,t_pval,acc0_mean,acc1_mean,acc0_p25,acc1_p25,acc0_p50,...,t_pval.1,acc0_mean.1,acc1_mean.1,acc0_p25.1,acc1_p25.1,acc0_p50.1,acc1_p50,acc0_p75,acc1_p75,log_word_ppl_trend
0,untrained,True,True,0.0,3.276525e-54,18577330.0,14123330.0,8254.374942,1974.47293,88041.387413,...,0.0,11.544943,10.514781,9.018499,7.588057,11.385562,10.215064,13.807438,12.942341,True
1,trained,True,True,0.0,8.057908000000001e-179,5731.145,3656.8,87.874349,58.856627,256.135857,...,0.0,5.90761,5.41418,4.475908,4.075104,5.545708,5.056485,6.956375,6.344872,True


In [74]:
results.to_csv('Stattest_results_trained_vs_untrained_outlier_treated.csv', index = False, index_label = False)

## Case Study - Malayalam

In [75]:
sub = df[df.lang=='Malayalam']

### Word Perplexity by Accuracy

In [76]:
scaled_boxplot(sub, group_vars=['acc_norm'], num_var='word_perplexity',y_axis_var='acc_norm', layout_var={'title':'Word Perplexity by Accuracy Norm (Malayalam)',
            'xaxis_title':'Word Perplexity',
            'yaxis_title':'Accuracy Norm',
            'boxmode':'group',
            'height': 400} )

### Log Word Perplexity by Accuracy

In [77]:
scaled_boxplot(sub, group_vars=['acc_norm'], num_var='log_word_ppl',y_axis_var='acc_norm', layout_var={'title':'Log Word Perplexity by Accuracy Norm (Malayalam)',
            'xaxis_title':'Log Word Perplexity',
            'yaxis_title':'Accuracy Norm',
            'boxmode':'group',
            'height': 400} )

In [78]:
scaled_boxplot(sub, group_vars=['acc_norm'], num_var='byte_perplexity',y_axis_var='acc_norm', layout_var={'title':'Byte Perplexity by Accuracy Norm (Malayalam)',
            'xaxis_title':'Byte Perplexity',
            'yaxis_title':'Accuracy (Norm)',
            'boxmode':'group',
            'height': 400} )

In [79]:
fig = go.Figure()

for acc_value in [0, 1]:
    subset = sub[sub['acc_norm'] == acc_value]
    subset = remove_outliers(subset, 'word_perplexity',0,99)

    fig.add_trace(go.Histogram(
        x=subset['word_perplexity'],
        name=f'ACC_norm = {acc_value}',
        opacity=0.7,
        nbinsx=30
    ))

# Update layout
fig.update_layout(
    title='Distribution of Word Perplexity by ACC_norm Value',
    xaxis_title='Word Perplexity',
    yaxis_title='Count',
    barmode='overlay'
)

# Add a legend
fig.update_layout(legend_title='ACC Norm Value')

# Show the plot
fig.show()

In [80]:
fig = go.Figure()

for acc_value in [0, 1]:
    subset = sub[sub['acc_norm'] == acc_value]
    subset = remove_outliers(subset, 'log_word_ppl',0,99)

    fig.add_trace(go.Histogram(
        x=subset['log_word_ppl'],
        name=f'ACC = {acc_value}',
        opacity=0.7,
        nbinsx=30
    ))

# Update layout
fig.update_layout(
    title='Distribution of Log Word Perplexity by ACC Norm Value',
    xaxis_title='Log Word Perplexity',
    yaxis_title='Count',
    barmode='overlay'
)

# Add a legend
fig.update_layout(legend_title='ACC Norm Value')

# Show the plot
fig.show()

In [81]:
scaled_boxplot(sub, group_vars=['acc_norm'], num_var='byte_perplexity',y_axis_var='acc_norm', layout_var={'title':'Byte Perplexity by Accuracy Norm (Malayalam)',
            'xaxis_title':'Byte Perplexity',
            'yaxis_title':'Accuracy (Norm)',
            'boxmode':'group',
            'height': 400} )

### Statistical Test

In [82]:
for metric in ['word_perplexity','byte_perplexity','log_word_ppl']:
    print(f"Metric : {metric}")
    compare_means(sub, 'acc_norm', metric)

Metric : word_perplexity
Metric : byte_perplexity
Metric : log_word_ppl


## Case Study - English



In [83]:
sub = df[df.lang=='English']

### Word Perplexity by Accuracy

In [84]:
scaled_boxplot(sub, group_vars=['acc_norm'], num_var='word_perplexity',y_axis_var='acc_norm', layout_var={'title':'Word Perplexity by Accuracy Norm (English)',
            'xaxis_title':'Word Perplexity',
            'yaxis_title':'Accuracy Norm',
            'boxmode':'group',
            'height': 400} )

### Log Word Perplexity by Accuracy

In [85]:
scaled_boxplot(sub, group_vars=['acc_norm'], num_var='log_word_ppl',y_axis_var='acc_norm', layout_var={'title':'Log Word Perplexity by Accuracy Norm (English)',
            'xaxis_title':'Log Word Perplexity',
            'yaxis_title':'Accuracy Norm',
            'boxmode':'group',
            'height': 400} )

In [86]:
fig = go.Figure()

for acc_value in [0, 1]:
    subset = sub[sub['acc_norm'] == acc_value]
    subset = remove_outliers(subset, 'word_perplexity',0,99)

    fig.add_trace(go.Histogram(
        x=subset['word_perplexity'],
        name=f'ACC_norm = {acc_value}',
        opacity=0.7,
        nbinsx=30
    ))

# Update layout
fig.update_layout(
    title='Distribution of Word Perplexity by ACC_norm Value',
    xaxis_title='Word Perplexity',
    yaxis_title='Count',
    barmode='overlay'
)

# Add a legend
fig.update_layout(legend_title='ACC Norm Value')

# Show the plot
fig.show()

In [87]:
fig = go.Figure()

for acc_value in [0, 1]:
    subset = sub[sub['acc_norm'] == acc_value]
    subset = remove_outliers(subset, 'log_word_ppl',0,99)

    fig.add_trace(go.Histogram(
        x=subset['log_word_ppl'],
        name=f'ACC_norm = {acc_value}',
        opacity=0.7,
        nbinsx=30
    ))

# Update layout
fig.update_layout(
    title='Distribution of Log Word Perplexity by ACC_norm Value',
    xaxis_title='Log Word Perplexity',
    yaxis_title='Count',
    barmode='overlay'
)

# Add a legend
fig.update_layout(legend_title='ACC_norm Value')

# Show the plot
fig.show()

In [88]:
scaled_boxplot(sub, group_vars=['acc_norm'], num_var='byte_perplexity',y_axis_var='acc_norm', layout_var={'title':'Byte Perplexity by Accuracy Norm (English)',
            'xaxis_title':'Byte Perplexity',
            'yaxis_title':'Accuracy Norm',
            'boxmode':'group',
            'height': 400} )

### Statistical Test

In [89]:
for metric in ['word_perplexity','byte_perplexity','log_word_ppl']:
    print(f"Metric : {metric}")
    compare_means(sub, 'acc_norm', metric)

Metric : word_perplexity
Metric : byte_perplexity
Metric : log_word_ppl


In [90]:
fig = go.Figure()
sub = df[df.trained == 'trained']

sub = remove_outliers(sub, 'byte_perplexity',0,99)

for acc_value in [0, 1]:
    subset = sub[sub['acc_norm'] == acc_value]


    fig.add_trace(go.Histogram(
        x=subset['byte_perplexity'],
        name=f'ACC_norm = {acc_value}',
        opacity=0.5,
        nbinsx=30
    ))

# Update layout
fig.update_layout(
    title='Distribution of Word Perplexity by ACC Norm Value',
    xaxis_title='Word Perplexity',
    yaxis_title='Count',
    barmode='overlay'
)

# Add a legend
fig.update_layout(legend_title='ACC Norm Value')

# Show the plot
fig.show()

Output hidden; open in https://colab.research.google.com to view.

In [91]:
sub = df[df.trained == 'untrained']
scaled_boxplot(sub, group_vars=['acc_norm'], num_var='byte_perplexity',y_axis_var='acc_norm', layout_var={'title':'Byte Perplexity by Accuracy Norm (English)',
            'xaxis_title':'Byte Perplexity',
            'yaxis_title':'Accuracy Norm',
            'boxmode':'group',
            'height': 400} )

In [92]:

scaled_boxplot(sub, group_vars=['acc_norm'], num_var='word_perplexity',y_axis_var='acc_norm', layout_var={'title':'Byte Perplexity by Accuracy Norm (English)',
            'xaxis_title':'Byte Perplexity',
            'yaxis_title':'Accuracy Norm',
            'boxmode':'group',
            'height': 400} )

In [93]:
scaled_boxplot(sub, group_vars=['acc_norm'], num_var='log_word_ppl',y_axis_var='acc_norm', layout_var={'title':'Byte Perplexity by Accuracy Norm (English)',
            'xaxis_title':'Byte Perplexity',
            'yaxis_title':'Accuracy Norm',
            'boxmode':'group',
            'height': 400} )

In [94]:
sub.log_word_ppl.describe()

Unnamed: 0,log_word_ppl
count,580608.0
mean,11.28559
std,3.763805
min,2.62111
25%,8.485264
50%,11.048172
75%,13.614498
max,38.568878


In [95]:
sub.groupby('acc_norm').log_word_ppl.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
acc_norm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,367025.0,11.672658,3.675295,2.62111,9.050817,11.435262,13.908135,38.568878
1.0,213583.0,10.620446,3.820717,2.696844,7.609885,10.257005,13.025204,36.672783


In [96]:
sub.groupby('acc_norm').word_perplexity.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
acc_norm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,367025.0,256574400000.0,98388520000000.0,13.750973,8525.498283,92527.607966,1097050.0,5.626664e+16
1.0,213583.0,80148210000.0,20315840000000.0,14.832849,2018.045806,28481.348642,453705.5,8448649000000000.0


In [97]:
sub.groupby('acc_norm').byte_perplexity.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
acc_norm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,367025.0,3.16935,2.168371,1.234479,1.953404,2.434751,3.67998,126.800775
1.0,213583.0,3.251313,2.136351,1.241095,2.054447,2.627679,3.705969,58.777471
