In [3]:
import pandas as pd
import os
import pickle
from lrs import longestRepeatedSublist
from mrs import find_most_occuring_substring
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from pandarallel import pandarallel
# Set up pandarallel once, before any `parallel_apply` call further down;
# progress_bar=True asks it to display progress while those calls run.
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 20 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [4]:
# Collect the entries of ../data, skipping every name that contains
# 'lrs' or 'mrs' so only the remaining (unprocessed) files are kept.
files = [
    name
    for name in os.listdir('../data')
    if not ('lrs' in name or 'mrs' in name)
]
files

['pythia-70m.pkl',
 'pythia-160m.pkl',
 'pythia-410m.pkl',
 'pythia-1b.pkl',
 'pythia-1.4b.pkl',
 'pythia-2.8b.pkl',
 'pythia-6.9b.pkl',
 'pythia-70m-deduped.pkl',
 'pythia-160m-deduped.pkl',
 'pythia-410m-deduped.pkl',
 'pythia-1b-deduped.pkl',
 'pythia-1.4b-deduped.pkl',
 'pythia-2.8b-deduped.pkl',
 'pythia-6.9b-deduped.pkl']

In [5]:
def get_df_with_lrs(file):
    """Load a pickled frame, add LRS columns, save the result, and return it.

    Args:
        file: Name of a pickle file inside '../data' (for example
            'pythia-70m.pkl'). The unpickled object is expected to offer a
            'tokens' column supporting `parallel_apply`.

    Returns:
        The loaded object with two added columns:
        'lrs'     -- `longestRepeatedSublist` applied to each 'tokens' entry.
        'lrs_len' -- `len` of each 'lrs' entry.

    Side effects:
        Pickles the augmented object to '../data/<file>_with_lrs'.
    """
    # NOTE(security): pickle.load can execute arbitrary code while loading;
    # only point this at files from a trusted source.
    # `with` guarantees the handle is closed even if unpickling fails.
    with open('../data/' + file, 'rb') as source:
        df = pickle.load(source)

    df['lrs'] = df['tokens'].parallel_apply(longestRepeatedSublist)
    df['lrs_len'] = df['lrs'].parallel_apply(len)

    # Save the augmented frame next to its input. The suffix is appended
    # after the original file name (including its extension), unchanged from
    # the previous behaviour so existing outputs keep their names.
    with open('../data/' + file + '_with_lrs', 'wb') as sink:
        pickle.dump(df, sink)
    return df

In [10]:
def preprocess_tokens(toks):
    """Flatten a token array into a single string.

    Each element of `toks` is converted to its string form via
    `toks.astype(str)`, the pieces are concatenated with no separator, and
    every space and comma is then stripped from the result.

    Args:
        toks: Array-like object exposing `astype` (e.g. a numpy array).

    Returns:
        The concatenated string with all ' ' and ',' characters removed.
    """
    joined = ''.join(toks.astype(str))
    for unwanted in (' ', ','):
        joined = joined.replace(unwanted, '')
    return joined

In [24]:
def get_df_with_mrs(file):
    """Load a pickled frame, add MRS columns, save the result, and return it.

    Args:
        file: Name of a pickle file inside '../data' (for example
            'pythia-70m.pkl'). The unpickled object is expected to offer a
            'tokens' column supporting `parallel_apply`.

    Returns:
        The loaded object with three added columns:
        'preprocessed_tokens' -- `preprocess_tokens` applied to each 'tokens'
                                 entry.
        'mrs'                 -- `find_most_occuring_substring` applied to each
                                 'preprocessed_tokens' entry.
        'mrs_len'             -- `len` of each 'mrs' entry.

    Side effects:
        Pickles the augmented object to '../data/<file>_with_mrs'.
    """
    # NOTE(security): pickle.load can execute arbitrary code while loading;
    # only point this at files from a trusted source.
    # `with` guarantees the handle is closed even if unpickling fails.
    with open('../data/' + file, 'rb') as source:
        df = pickle.load(source)

    df['preprocessed_tokens'] = df['tokens'].parallel_apply(preprocess_tokens)
    df['mrs'] = df['preprocessed_tokens'].parallel_apply(find_most_occuring_substring)
    df['mrs_len'] = df['mrs'].parallel_apply(len)

    # Save the augmented frame next to its input. The suffix is appended
    # after the original file name (including its extension), unchanged from
    # the previous behaviour so existing outputs keep their names.
    with open('../data/' + file + '_with_mrs', 'wb') as sink:
        pickle.dump(df, sink)
    return df

In [25]:
def plot_and_save_lrs(df, file):
    """Plot the distribution of df['lrs_len'] and save it as a PNG.

    Args:
        df: Frame providing an 'lrs_len' column.
        file: Label used in the plot title and in the output file name.

    Side effects:
        Creates a new 10x8 figure and writes it to
        '../plots/<file>_lrs_dist.png'. Returns nothing.
    """
    # NOTE(review): sns.distplot is marked deprecated in recent seaborn
    # releases -- confirm the installed version before upgrading.
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.distplot(df['lrs_len'], kde=False, ax=ax)
    ax.set_title('Distribution of LRS Lengths for ' + file)
    ax.set_xlabel('LRS Length')
    ax.set_ylabel('Count')
    fig.savefig('../plots/' + file + '_lrs_dist.png')

In [26]:
def plot_and_save_mrs(df, file):
    """Plot the distribution of df['mrs_len'] and save it as a PNG.

    Args:
        df: Frame providing an 'mrs_len' column.
        file: Label used in the plot title and in the output file name.

    Side effects:
        Creates a new 10x8 figure and writes it to
        '../plots/<file>_mrs_dist.png'. Returns nothing.
    """
    # NOTE(review): sns.distplot is marked deprecated in recent seaborn
    # releases -- confirm the installed version before upgrading.
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.distplot(df['mrs_len'], kde=False, ax=ax)
    ax.set_title('Distribution of MRS Lengths for ' + file)
    ax.set_xlabel('MRS Length')
    ax.set_ylabel('Count')
    fig.savefig('../plots/' + file + '_mrs_dist.png')

In [27]:
# for file in files:
#     df = get_df_with_lrs(file)
#     plot_and_save_lrs(df, file)

In [12]:
# Stitch the saved LRS plots into a single PDF, one image per page.
import glob
from fpdf import FPDF
from PIL import Image

pdf = FPDF()
# Gather the PNGs under ../plots whose path contains 'lrs'.
# glob.glob makes no ordering promise, so the paths are sorted to keep the
# page order reproducible between runs. A dedicated name is used so the
# notebook-level `files` list of data files is not overwritten by plot paths.
lrs_plot_paths = sorted(
    path for path in glob.glob('../plots/*.png') if 'lrs' in path
)
for image in lrs_plot_paths:
    pdf.add_page()
    # Anchor the image at the page origin with a fixed 210 x 297 box
    # (presumably a full A4 page in mm under FPDF defaults -- note this
    # stretches the plot to that box regardless of its own aspect ratio).
    pdf.image(image, 0, 0, 210, 297)
pdf.output("../plots/lrs_dists.pdf", "F")

''

In [None]:
# Stitch the saved MRS plots into a single PDF, one image per page.
import glob
from fpdf import FPDF
from PIL import Image

pdf = FPDF()
# Gather the PNGs under ../plots whose path contains 'mrs'.
# glob.glob makes no ordering promise, so the paths are sorted to keep the
# page order reproducible between runs. A dedicated name is used so the
# notebook-level `files` list is not overwritten by plot paths.
mrs_plot_paths = sorted(
    path for path in glob.glob('../plots/*.png') if 'mrs' in path
)
for image in mrs_plot_paths:
    pdf.add_page()
    # Anchor the image at the page origin with a fixed 210 x 297 box
    # (presumably a full A4 page in mm under FPDF defaults -- note this
    # stretches the plot to that box regardless of its own aspect ratio).
    pdf.image(image, 0, 0, 210, 297)
pdf.output("../plots/mrs_dists.pdf", "F")