In [3]:
import sys
sys.path.append('..')
from run_fft import FFTProcessor
import numpy as np
import pandas as pd

In [3]:
# Call ggplot in R to plot
%load_ext rpy2.ipython

### Circular processing

In [6]:
def circular(input: list, n: int = None, include_self: bool = True):
    """Return successive circular (left) rotations of ``input``.

    Args:
        input: Sequence to rotate. It must support ``len``, slicing and ``+``
            concatenation of its slices (e.g. a plain ``list``).
        n: Number of rotations to generate, using shifts ``1..n``. Defaults to
            ``len(input) - 1``, i.e. every rotation that differs in position
            from the original. Shifts wrap around modulo ``len(input)``, so
            ``n`` may exceed the sequence length.
        include_self: If True, the unrotated ``input`` object itself (not a
            copy) is placed first in the result.

    Returns:
        A list of rotations, optionally preceded by ``input``.
    """
    size = len(input)
    if n is None:
        n = size - 1
    output = [input] if include_self else []
    for shift in range(1, n + 1):
        # Wrap the shift so requests with n >= len(input) keep rotating rather
        # than yielding repeated unshifted copies; guard the empty sequence to
        # avoid a modulo-by-zero.
        k = shift % size if size else 0
        output.append(input[k:] + input[:k])
    return output

In [3]:
# Sanity check: print the rotations of a short list, first with and then
# without the original sequence included.
input = [1, 2, 3, 4, 5]
for keep_original in (True, False):
    print(circular(input, include_self=keep_original))

[[1, 2, 3, 4, 5], [2, 3, 4, 5, 1], [3, 4, 5, 1, 2], [4, 5, 1, 2, 3], [5, 1, 2, 3, 4]]
[[2, 3, 4, 5, 1], [3, 4, 5, 1, 2], [4, 5, 1, 2, 3], [5, 1, 2, 3, 4]]


In [4]:
def get_circular_full(input_file: str, require_sid=True):
    """Compute the spectrum of every circular rotation of every sequence in a file.

    The file is read with ``FFTProcessor._read_data`` (processor configured with
    method='fft', preprocess='logzs', value='norm'). Each sequence is expanded
    with ``circular``, passed through ``_preprocess`` and then ``_fft_batch``.

    Args:
        input_file: Path of the NLL data file to read.
        require_sid: If True, add a ``sid`` column holding the position of the
            source sequence in the file. This is unrelated to the
            ``require_sid`` flag handed to ``_fft_batch``, which is always True.

    Returns:
        A DataFrame (per-sequence frames concatenated, index not reset) with
        columns ``freq``, ``power``, ``circular_index`` and optionally ``sid``.
    """
    processor = FFTProcessor(method='fft', preprocess='logzs', value='norm', require_sid=False)
    sequences = processor._read_data(data_file=input_file)

    frames = []
    for seq_id, sequence in enumerate(sequences):
        rotations = processor._preprocess(circular(sequence))
        # The ids returned by `_fft_batch` index the rotation within this
        # sequence, not the sequence itself.
        freqs, powers, rotation_ids = processor._fft_batch(rotations, require_sid=True)
        frame = pd.DataFrame({
            'freq': np.concatenate(freqs),
            'power': np.concatenate(powers),
            'circular_index': np.concatenate(rotation_ids),
        })
        if require_sid:
            # The actual sequence id.
            frame['sid'] = seq_id
        frames.append(frame)

    return pd.concat(frames)

In [7]:
# Run get_circular_full for each genre with the selected estimator
# (other cells in this notebook also use 'mistral' as an estimator name).
est_name = 'gpt2xl'


def _load_circle_pair(genre):
    """Load original/sampled circular spectra for one genre, label and combine them.

    Prints the total number of rows, tags the two frames in place with
    ``type`` = 'Human' / 'Sampled', and returns (original, sampled, combined).
    """
    df_orig = get_circular_full(f'../data/gpt-4/{genre}_gpt-4.original.{est_name}.nll.txt')
    df_samp = get_circular_full(f'../data/gpt-4/{genre}_gpt-4.sampled.{est_name}.nll.txt')
    print(df_orig.shape[0] + df_samp.shape[0])
    df_orig['type'] = 'Human'
    df_samp['type'] = 'Sampled'
    return df_orig, df_samp, pd.concat([df_orig, df_samp])


df_circle_writing_orig, df_circle_writing_samp, df_circle_writing = _load_circle_pair('writing')
df_circle_pubmed_orig, df_circle_pubmed_samp, df_circle_pubmed = _load_circle_pair('pubmed')
df_circle_xsum_orig, df_circle_xsum_samp, df_circle_xsum = _load_circle_pair('xsum')


6381006
736614
6533046


In [14]:
# Save circular full data: one CSV (.txt) and one HDF5 (.h5) file per genre.
for genre, df_full in (('pubmed', df_circle_pubmed),
                       ('writing', df_circle_writing),
                       ('xsum', df_circle_xsum)):
    stem = f'../data/gpt-4/{genre}_gpt-4.{est_name}.nlllogzs.fftnorm.circlefull'
    df_full.to_csv(f'{stem}.txt', index=False)
    df_full.to_hdf(f'{stem}.h5', key='df', mode='w')


In [None]:
%%R -i df_circle_norm
# R cell for plotting the circular spectra with ggplot2. Only the setup runs;
# the ggplot/ggsave calls below are currently commented out.
# NOTE(review): `df_circle_norm` is pushed from Python via `-i`, but no visible
# cell defines it — confirm where it is created before running this cell.
require(ggplot2)
require(stringr)

# genre <- "pubmed_QA"
genre <- "writing"

# est_name <- "gpt2xl"
est_name <- "mistral"

# p <- ggplot(df_circle_norm, aes(x=freq, y=power, color=type)) + geom_smooth(method='gam') + 
#     theme_bw() + theme(plot.title = element_text(hjust = 0.5, vjust=-12, size = 12)) +
#     ggtitle(str_interp("${genre}: Human vs. GPT-4 \nNLL logzs, FFT norm, est ${est_name} \n Circular")) +
#     labs(x = bquote(omega[k]), y = bquote(X(omega[k])))
# ggsave(str_interp("gpt4_human_${genre}_${est_name}_nlllogzs_fftnorm_circle.pdf"), plot=p, width=5, height=5)

In [7]:
# For each NLL sequence, use circular shifts to compute n spectra, then calculate their mean

def get_circular_mean(input_file: str, require_sid=True):
    """Average the spectra of all circular rotations of each sequence in a file.

    The file is read with ``FFTProcessor._read_data`` (processor configured with
    method='fft', preprocess='logzs', value='norm'). For every sequence, all
    rotations from ``circular`` are preprocessed and transformed with
    ``_fft_batch``; the resulting powers are averaged over the rotations and
    paired with the frequencies of the first rotation.

    Args:
        input_file: Path of the NLL data file to read.
        require_sid: If True, include a ``sid`` column with the position of the
            source sequence in the file.

    Returns:
        A DataFrame with columns ``freq``, ``power`` and optionally ``sid``.
    """
    processor = FFTProcessor(method='fft', preprocess='logzs', value='norm', require_sid=False)
    sequences = processor._read_data(data_file=input_file)

    freq_parts, power_parts, sid_parts = [], [], []
    for seq_id, sequence in enumerate(sequences):
        rotations = processor._preprocess(circular(sequence))
        freq, power, _ = processor._fft_batch(rotations, verbose=False)
        # Mean over the rotation axis gives one spectrum per source sequence.
        mean_power = np.mean(power, axis=0)
        freq_parts.append(freq[0])
        power_parts.append(mean_power)
        sid_parts.append(np.repeat(seq_id, len(mean_power)))

    columns = {'freq': np.concatenate(freq_parts),
               'power': np.concatenate(power_parts)}
    if require_sid:
        columns['sid'] = np.concatenate(sid_parts)
    return pd.DataFrame.from_dict(columns)

In [8]:
# Get circularmean data for GPT-4: for every genre/estimator pair, compute the
# circular-mean spectra of the original and sampled NLL files and save them as CSV.

genre_list = ['writing', 'pubmed', 'xsum']
est_name_list = ['mistral', 'gpt2xl']

# genre_list = ['pubmed']
# est_name_list = ['llama-13b']

for genre in genre_list:
    for est_name in est_name_list:
        # Input and output files share everything up to the estimator name.
        orig_stem = f'../data/gpt-4/{genre}_gpt-4.original.{est_name}'
        samp_stem = f'../data/gpt-4/{genre}_gpt-4.sampled.{est_name}'
        df_circlemean_orig = get_circular_mean(f'{orig_stem}.nll.txt', require_sid=True)
        df_circlemean_samp = get_circular_mean(f'{samp_stem}.nll.txt', require_sid=True)
        df_circlemean_orig.to_csv(f'{orig_stem}.nlllogzs.fftnorm.circlemean.txt', index=False)
        df_circlemean_samp.to_csv(f'{samp_stem}.nlllogzs.fftnorm.circlemean.txt', index=False)

In [23]:
# Get circularmean data for GPT-3.5-Turbo: same procedure as for the other
# models, reading from and writing to ../data/gpt-3.5.

genre_list = ['writing', 'pubmed', 'xsum']
est_name_list = ['mistral', 'gpt2xl']

for genre in genre_list:
    for est_name in est_name_list:
        # Input and output files share everything up to the estimator name.
        orig_stem = f'../data/gpt-3.5/{genre}_gpt-3.5-turbo.original.{est_name}'
        samp_stem = f'../data/gpt-3.5/{genre}_gpt-3.5-turbo.sampled.{est_name}'
        df_circlemean_orig = get_circular_mean(f'{orig_stem}.nll.txt', require_sid=True)
        df_circlemean_samp = get_circular_mean(f'{samp_stem}.nll.txt', require_sid=True)
        df_circlemean_orig.to_csv(f'{orig_stem}.nlllogzs.fftnorm.circlemean.txt', index=False)
        df_circlemean_samp.to_csv(f'{samp_stem}.nlllogzs.fftnorm.circlemean.txt', index=False)

In [25]:
# Get circularmean data for Davinci: same procedure as for the other models,
# reading from and writing to ../data/davinci.

genre_list = ['writing', 'pubmed', 'xsum']
est_name_list = ['mistral', 'gpt2xl']

for genre in genre_list:
    for est_name in est_name_list:
        # Input and output files share everything up to the estimator name.
        orig_stem = f'../data/davinci/{genre}_davinci.original.{est_name}'
        samp_stem = f'../data/davinci/{genre}_davinci.sampled.{est_name}'
        df_circlemean_orig = get_circular_mean(f'{orig_stem}.nll.txt', require_sid=True)
        df_circlemean_samp = get_circular_mean(f'{samp_stem}.nll.txt', require_sid=True)
        df_circlemean_orig.to_csv(f'{orig_stem}.nlllogzs.fftnorm.circlemean.txt', index=False)
        df_circlemean_samp.to_csv(f'{samp_stem}.nlllogzs.fftnorm.circlemean.txt', index=False)

In [6]:
# Get circularmean data for bigram: inputs are read from each model's bigram
# directory and the results are written to its `fftnorm` subdirectory.
genre_list = ['writing', 'pubmed', 'xsum']
est_name = 'bigram'

gpt4_dir = '../data/gpt-4/bigram'
chatgpt_dir = '../data/gpt-3.5/bigram'
davinci_dir = '../data/davinci/bigram'

for genre in genre_list:
    # gpt-4 (currently disabled)
    # df_circlemean_orig = get_circular_mean(f'{gpt4_dir}/{genre}_gpt-4.original.{est_name}.nll.txt', require_sid=True)
    # df_circlemean_samp = get_circular_mean(f'{gpt4_dir}/{genre}_gpt-4.sampled.{est_name}.nll.txt', require_sid=True)
    # df_circlemean_orig.to_csv(f'{gpt4_dir}/fftnorm/{genre}_gpt-4.original.{est_name}.nlllogzs.fftnorm.circlemean.txt', index=False)
    # df_circlemean_samp.to_csv(f'{gpt4_dir}/fftnorm/{genre}_gpt-4.sampled.{est_name}.nlllogzs.fftnorm.circlemean.txt', index=False)

    # chatgpt/gpt-3.5-turbo first, then davinci
    for model_dir, model_tag in ((chatgpt_dir, 'gpt-3.5-turbo'), (davinci_dir, 'davinci')):
        orig_name = f'{genre}_{model_tag}.original.{est_name}'
        samp_name = f'{genre}_{model_tag}.sampled.{est_name}'
        df_circlemean_orig = get_circular_mean(f'{model_dir}/{orig_name}.nll.txt', require_sid=True)
        df_circlemean_samp = get_circular_mean(f'{model_dir}/{samp_name}.nll.txt', require_sid=True)
        df_circlemean_orig.to_csv(f'{model_dir}/fftnorm/{orig_name}.nlllogzs.fftnorm.circlemean.txt', index=False)
        df_circlemean_samp.to_csv(f'{model_dir}/fftnorm/{samp_name}.nlllogzs.fftnorm.circlemean.txt', index=False)

In [9]:
# Get circularmean data for the length-chopped data (chop = 50, 100, 150),
# using the gpt2xl estimator and the files under ../data/short.

genre_list = ['writing', 'xsum']
chop_k_list = [50, 100, 150]
est_name = 'gpt2xl'
data_dir = '../data/short'

for genre in genre_list:
    for chop_k in chop_k_list:
        # Input and output files share everything up to the chop suffix.
        orig_stem = f'{data_dir}/{genre}_gpt-4.original.{est_name}.chop{chop_k}'
        samp_stem = f'{data_dir}/{genre}_gpt-4.sampled.{est_name}.chop{chop_k}'
        df_circlemean_orig = get_circular_mean(f'{orig_stem}.nll.txt', require_sid=True)
        df_circlemean_samp = get_circular_mean(f'{samp_stem}.nll.txt', require_sid=True)
        df_circlemean_orig.to_csv(f'{orig_stem}.nlllogzs.fftnorm.circlemean.txt', index=False)
        df_circlemean_samp.to_csv(f'{samp_stem}.nlllogzs.fftnorm.circlemean.txt', index=False)

In [None]:
%%R -i df_circlemean_norm -i df_circlemean_real -i df_circlemean_imag
# Plot the circular-mean spectra (GAM-smoothed power vs. frequency, coloured by
# `type`) from `df_circlemean_norm` and save the figure as a PDF.
# NOTE(review): `df_circlemean_real` and `df_circlemean_imag` are imported but
# not used in this cell, and none of the three frames is created in a visible
# cell — confirm where they come from.
require(ggplot2)
require(stringr)  # str_interp() comes from stringr; load it here so the cell does not rely on an earlier R cell

genre <- "pubmed"
# est_name <- "gpt2xl"
est_name <- "mistral"

p <- ggplot(df_circlemean_norm, aes(x=freq, y=power, color=type)) + geom_smooth(method='gam') + 
    theme_bw() + theme(plot.title = element_text(hjust = 0.5, vjust=-12, size = 12)) +
    ggtitle(str_interp("PubMed: Human vs. GPT-4 \nNLL logzs, FFT norm, est ${est_name} \nCircular Mean")) +
    labs(x = bquote(omega[k]), y = bquote(X(omega[k])))
ggsave(str_interp("gpt4_human_${genre}_${est_name}_nlllogzs_fftnorm_circlemean.pdf"), plot=p, width=5, height=5)