In [29]:
import sys
# Extend the module search path with the parent directory before importing
# `run_fft` below (presumably that is where run_fft.py lives — confirm).
sys.path.append('..')
import glob
import os

from run_fft import FFTProcessor
# Single processor instance shared by the notebook; the data-loading helper
# further down calls its `_read_data` and `_create_input_df` methods.
# NOTE(review): the meaning of method/preprocess/value/require_sid is defined
# in run_fft and is not visible from this notebook — confirm there.
fft_processor = FFTProcessor(method='fft', preprocess='none', value='norm', require_sid=True)

In [None]:
import rpy2
%load_ext rpy2.ipython

%R require("data.table")
%R require("ggplot2")
%R require("stringr")
%R require("fpp2") # https://github.com/robjhyndman/fpp2-package, required for adf.test

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


0
1


A simple example to showcase the stationarity test

In [6]:
%%R 

# Example on random data
set.seed(42) # fix the RNG so the printed statistic is the same on every run
x <- rnorm(1000)

# adf.test() emits a warning for this input (see below); silence only this
# call rather than switching the session-wide `warn` option off and on again.
# Note: `adfpval` holds the full test result object, not just the p-value.
adfpval <- suppressWarnings(adf.test(x)) # Augmented Dickey-Fuller test

print(adfpval)

# Expected output:
# Dickey-Fuller = (some value around -10), Lag order = 9, p-value = 0.01
# alternative hypothesis: stationary

# If warnings are not suppressed, you will see
# In adf.test(x) : p-value smaller than printed p-value

# Conclusion: reject the null hypothesis, which means the series is stationary


	Augmented Dickey-Fuller Test

data:  x
Dickey-Fuller = -11.065, Lag order = 9, p-value = 0.01
alternative hypothesis: stationary



Prepare data for stationarity test

In [30]:
def load_df_from_dir(dir_path):
    """Read every ``*.nll`` file directly inside ``dir_path`` into one input frame.

    Args:
        dir_path: Directory that is searched (non-recursively) for files whose
            name ends in ``.nll``.

    Returns:
        The result of ``fft_processor._create_input_df`` applied to the
        concatenated records that ``fft_processor._read_data`` returned for
        each matching file.
    """
    # glob.glob() yields matches in arbitrary, filesystem-dependent order;
    # sorting makes the concatenation order identical on every machine/run.
    data_files = sorted(glob.glob(os.path.join(dir_path, '*.nll')))
    nll = []
    for path in data_files:
        nll.extend(fft_processor._read_data(path))
    return fft_processor._create_input_df(nll)

In [33]:
# Load GPT2 data: one frame per split directory (news / story / wiki)
df_gpt2_news, df_gpt2_story, df_gpt2_wiki = (
    load_df_from_dir(split_dir)
    for split_dir in (
        '../data/gpt2/news_split',
        '../data/gpt2/story_split',
        '../data/gpt2/wiki_split',
    )
)

# Same three splits for GPT2-xl
df_gpt2xl_news, df_gpt2xl_story, df_gpt2xl_wiki = (
    load_df_from_dir(split_dir)
    for split_dir in (
        '../data/gpt2-xl/news_split',
        '../data/gpt2-xl/story_split',
        '../data/gpt2-xl/wiki_split',
    )
)

In [44]:
# Load BLOOM data: one frame per split directory (news / story / wiki)
df_bloom_sm_news, df_bloom_sm_story, df_bloom_sm_wiki = (
    load_df_from_dir(split_dir)
    for split_dir in (
        '../data/bloom-560m/news_split',
        '../data/bloom-560m/story_split',
        '../data/bloom-560m/wiki_split',
    )
)

# Same three splits for the larger BLOOM checkpoint
df_bloom_lg_news, df_bloom_lg_story, df_bloom_lg_wiki = (
    load_df_from_dir(split_dir)
    for split_dir in (
        '../data/bloom-7b/news_split',
        '../data/bloom-7b/story_split',
        '../data/bloom-7b/wiki_split',
    )
)

In [49]:
# Load OPT data: one frame per split directory (news / story / wiki)
df_opt_sm_news, df_opt_sm_story, df_opt_sm_wiki = (
    load_df_from_dir(split_dir)
    for split_dir in (
        '../data/opt-125m/news_split',
        '../data/opt-125m/story_split',
        '../data/opt-125m/wiki_split',
    )
)

# Same three splits for the larger OPT checkpoint
df_opt_lg_news, df_opt_lg_story, df_opt_lg_wiki = (
    load_df_from_dir(split_dir)
    for split_dir in (
        '../data/opt-6.7b/news_split',
        '../data/opt-6.7b/story_split',
        '../data/opt-6.7b/wiki_split',
    )
)

Conduct tests

In [41]:
%%R 

# Carry out adf.test (Augmented Dickey-Fuller) for each series in a data.table.
#
# Args:
#   dt: data.table with a `sid` column identifying the series each row belongs
#       to and a `value` column holding the observations.
# Returns:
#   data.table with one row per tested series and the columns `series_id`,
#   `series_len` (number of observations) and `adfpval` (p-value reported by
#   adf.test). Series with fewer than 10 observations are skipped.
ADF_test_DT <- function(dt) {
  # Suppress warnings while the tests run. on.exit() restores the previous
  # setting even when adf.test() stops with an error, so a failing series
  # cannot leave warnings disabled for the rest of the R session.
  defaultW <- getOption("warn")
  options(warn = -1)
  on.exit(options(warn = defaultW), add = TRUE)

  unique_series_ids <- unique(dt$sid)

  # Collect the per-series rows in a pre-sized list and bind them once at the
  # end; growing the result with rbindlist() on every iteration re-copies the
  # whole table each time. Entries of skipped series stay NULL.
  results <- vector("list", length(unique_series_ids))

  # seq_along() yields an empty loop for zero series (1:length(x) would not)
  for (i in seq_along(unique_series_ids)) {
    s_id <- unique_series_ids[i]
    value <- dt[sid == s_id]$value
    if (length(value) < 10) {next}
    results[[i]] <- data.table(series_id = s_id,
                               series_len = length(value),
                               adfpval = adf.test(value)$p.value # Augmented Dickey-Fuller test
    )
  }

  # The empty prototype goes first so the result always has the three expected
  # columns, even when no series was long enough to be tested. rbindlist()
  # skips the NULL entries left by skipped series.
  dt.empty <- data.table(series_id = numeric(),
                         series_len = numeric(),
                         adfpval = numeric()
  )
  rbindlist(c(list(dt.empty), results))
}

In [42]:
# Sanity check: look up `ADF_test_DT` in R's global environment from the
# Python side. The cell displays the R function object when the %%R cell that
# defines it has been run (presumably the lookup fails otherwise — confirm).
rpy2.robjects.globalenv['ADF_test_DT']

<rpy2.robjects.functions.SignatureTranslatedFunction object at 0x4263db8d0> [3]
R classes: ('function',)

In [47]:
%%R -i df_bloom_sm_news -i df_bloom_sm_wiki -i df_bloom_sm_story -i df_bloom_lg_news -i df_bloom_lg_wiki -i df_bloom_lg_story

# For every model/domain pair: turn the imported data frame into a data.table,
# run the ADF test on each series, then print the proportion of series that
# pass the test (p-value below 0.05).

# --- BLOOM-small ---
dt.bloom.sm.news <- data.table(df_bloom_sm_news)
dt.bloom.sm.news.test <- ADF_test_DT(dt.bloom.sm.news)
prop <- sum(dt.bloom.sm.news.test$adfpval < 0.05, na.rm = TRUE) / nrow(dt.bloom.sm.news.test)
print(str_interp("BLOOM-small news proportion: ${prop}"))

dt.bloom.sm.wiki <- data.table(df_bloom_sm_wiki)
dt.bloom.sm.wiki.test <- ADF_test_DT(dt.bloom.sm.wiki)
prop <- sum(dt.bloom.sm.wiki.test$adfpval < 0.05, na.rm = TRUE) / nrow(dt.bloom.sm.wiki.test)
print(str_interp("BLOOM-small wiki proportion: ${prop}"))

dt.bloom.sm.story <- data.table(df_bloom_sm_story)
dt.bloom.sm.story.test <- ADF_test_DT(dt.bloom.sm.story)
prop <- sum(dt.bloom.sm.story.test$adfpval < 0.05, na.rm = TRUE) / nrow(dt.bloom.sm.story.test)
print(str_interp("BLOOM-small story proportion: ${prop}"))

# --- BLOOM-large ---
dt.bloom.lg.news <- data.table(df_bloom_lg_news)
dt.bloom.lg.news.test <- ADF_test_DT(dt.bloom.lg.news)
prop <- sum(dt.bloom.lg.news.test$adfpval < 0.05, na.rm = TRUE) / nrow(dt.bloom.lg.news.test)
print(str_interp("BLOOM-large news proportion: ${prop}"))

dt.bloom.lg.wiki <- data.table(df_bloom_lg_wiki)
dt.bloom.lg.wiki.test <- ADF_test_DT(dt.bloom.lg.wiki)
prop <- sum(dt.bloom.lg.wiki.test$adfpval < 0.05, na.rm = TRUE) / nrow(dt.bloom.lg.wiki.test)
print(str_interp("BLOOM-large wiki proportion: ${prop}"))

dt.bloom.lg.story <- data.table(df_bloom_lg_story)
dt.bloom.lg.story.test <- ADF_test_DT(dt.bloom.lg.story)
prop <- sum(dt.bloom.lg.story.test$adfpval < 0.05, na.rm = TRUE) / nrow(dt.bloom.lg.story.test)
print(str_interp("BLOOM-large story proportion: ${prop}"))

[1] "BLOOM-small news proportion: 0.6042"
[1] "BLOOM-small wiki proportion: 0.6104"
[1] "BLOOM-small story proportion: 0.430241935483871"
[1] "BLOOM-large news proportion: 0.9302"
[1] "BLOOM-large wiki proportion: 0.9656"
[1] "BLOOM-large story proportion: 0.904408817635271"


In [48]:
%%R -i df_gpt2_news -i df_gpt2_wiki -i df_gpt2_story -i df_gpt2xl_news -i df_gpt2xl_wiki -i df_gpt2xl_story

# For every model/domain pair: turn the imported data frame into a data.table,
# run the ADF test on each series, then print the proportion of series that
# pass the test (p-value below 0.05).

# --- GPT2 ---
dt.gpt2.news <- data.table(df_gpt2_news)
dt.gpt2.news.test <- ADF_test_DT(dt.gpt2.news)
prop <- sum(dt.gpt2.news.test$adfpval < 0.05, na.rm = TRUE) / nrow(dt.gpt2.news.test)
print(str_interp("GPT2 news proportion: ${prop}"))

dt.gpt2.wiki <- data.table(df_gpt2_wiki)
dt.gpt2.wiki.test <- ADF_test_DT(dt.gpt2.wiki)
prop <- sum(dt.gpt2.wiki.test$adfpval < 0.05, na.rm = TRUE) / nrow(dt.gpt2.wiki.test)
print(str_interp("GPT2 wiki proportion: ${prop}"))

dt.gpt2.story <- data.table(df_gpt2_story)
dt.gpt2.story.test <- ADF_test_DT(dt.gpt2.story)
prop <- sum(dt.gpt2.story.test$adfpval < 0.05, na.rm = TRUE) / nrow(dt.gpt2.story.test)
print(str_interp("GPT2 story proportion: ${prop}"))

# --- GPT2-xl ---
dt.gpt2xl.news <- data.table(df_gpt2xl_news)
dt.gpt2xl.news.test <- ADF_test_DT(dt.gpt2xl.news)
prop <- sum(dt.gpt2xl.news.test$adfpval < 0.05, na.rm = TRUE) / nrow(dt.gpt2xl.news.test)
print(str_interp("GPT2-xl news proportion: ${prop}"))

dt.gpt2xl.wiki <- data.table(df_gpt2xl_wiki)
dt.gpt2xl.wiki.test <- ADF_test_DT(dt.gpt2xl.wiki)
prop <- sum(dt.gpt2xl.wiki.test$adfpval < 0.05, na.rm = TRUE) / nrow(dt.gpt2xl.wiki.test)
print(str_interp("GPT2-xl wiki proportion: ${prop}"))

dt.gpt2xl.story <- data.table(df_gpt2xl_story)
dt.gpt2xl.story.test <- ADF_test_DT(dt.gpt2xl.story)
prop <- sum(dt.gpt2xl.story.test$adfpval < 0.05, na.rm = TRUE) / nrow(dt.gpt2xl.story.test)
print(str_interp("GPT2-xl story proportion: ${prop}"))

[1] "GPT2 news proportion: 0.9682"
[1] "GPT2 wiki proportion: 0.9778"
[1] "GPT2 story proportion: 0.9718"
[1] "GPT2-xl news proportion: 0.975"
[1] "GPT2-xl wiki proportion: 0.9814"
[1] "GPT2-xl story proportion: 0.9684"


In [50]:
%%R -i df_opt_sm_news -i df_opt_sm_wiki -i df_opt_sm_story -i df_opt_lg_news -i df_opt_lg_wiki -i df_opt_lg_story

# For every model/domain pair: turn the imported data frame into a data.table,
# run the ADF test on each series, then print the proportion of series that
# pass the test (p-value below 0.05).

# --- OPT-small ---
dt.opt.sm.news <- data.table(df_opt_sm_news)
dt.opt.sm.news.test <- ADF_test_DT(dt.opt.sm.news)
prop <- sum(dt.opt.sm.news.test$adfpval < 0.05, na.rm = TRUE) / nrow(dt.opt.sm.news.test)
print(str_interp("OPT-small news proportion: ${prop}"))

dt.opt.sm.wiki <- data.table(df_opt_sm_wiki)
dt.opt.sm.wiki.test <- ADF_test_DT(dt.opt.sm.wiki)
prop <- sum(dt.opt.sm.wiki.test$adfpval < 0.05, na.rm = TRUE) / nrow(dt.opt.sm.wiki.test)
print(str_interp("OPT-small wiki proportion: ${prop}"))

dt.opt.sm.story <- data.table(df_opt_sm_story)
dt.opt.sm.story.test <- ADF_test_DT(dt.opt.sm.story)
prop <- sum(dt.opt.sm.story.test$adfpval < 0.05, na.rm = TRUE) / nrow(dt.opt.sm.story.test)
print(str_interp("OPT-small story proportion: ${prop}"))

# --- OPT-large ---
dt.opt.lg.news <- data.table(df_opt_lg_news)
dt.opt.lg.news.test <- ADF_test_DT(dt.opt.lg.news)
prop <- sum(dt.opt.lg.news.test$adfpval < 0.05, na.rm = TRUE) / nrow(dt.opt.lg.news.test)
print(str_interp("OPT-large news proportion: ${prop}"))

dt.opt.lg.wiki <- data.table(df_opt_lg_wiki)
dt.opt.lg.wiki.test <- ADF_test_DT(dt.opt.lg.wiki)
prop <- sum(dt.opt.lg.wiki.test$adfpval < 0.05, na.rm = TRUE) / nrow(dt.opt.lg.wiki.test)
print(str_interp("OPT-large wiki proportion: ${prop}"))

dt.opt.lg.story <- data.table(df_opt_lg_story)
dt.opt.lg.story.test <- ADF_test_DT(dt.opt.lg.story)
prop <- sum(dt.opt.lg.story.test$adfpval < 0.05, na.rm = TRUE) / nrow(dt.opt.lg.story.test)
print(str_interp("OPT-large story proportion: ${prop}"))

[1] "OPT-small news proportion: 0.9856"
[1] "OPT-small wiki proportion: 0.9656"
[1] "OPT-small story proportion: 0.846"
[1] "OPT-large news proportion: 0.9794"
[1] "OPT-large wiki proportion: 0.957"
[1] "OPT-large story proportion: 0.7924"
