In [1]:
### Set up directory
import sys
import os

# The project root is taken to be the parent of the current working directory;
# it is put on sys.path so the local `models` package can be imported below.
parent_dir = os.path.dirname(os.getcwd())
# Guard the append so re-running this cell does not stack duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Make sure the output directory exists. `exist_ok=True` makes the call
# idempotent and avoids the check-then-create race of a separate exists() test.
save_path = os.path.join(parent_dir, 'processed_series')
os.makedirs(save_path, exist_ok=True)
    
# for sub_dir in ["multiPDF_list", "logit_tensor"]:
#     sub_path = os.path.join(save_path, sub_dir)
#     if not os.path.exists(sub_path):
#         os.makedirs(sub_path)

from pathlib import Path
from tqdm import tqdm
import pickle
import torch
from models.llama import get_model_and_tokenizer
from models.ICL import MultiResolutionPDF, recursive_refiner, trim_kv_cache

# File-name prefixes identifying the continuous-valued series families.
continuous_series_names = [
    'brownian_motion',
    'geometric_brownian_motion',
    'noisy_logistic_map',
    'uncorrelated_gaussian',
    'uncorrelated_uniform',
]
# File-name prefixes identifying Markov-chain series; currently empty.
markov_chain_names = list()



    

In [2]:
def calculate_multiPDF(full_series, prec, mode = 'neighbor', refine_depth = 1, llama_size = '13b'):
    """Build one MultiResolutionPDF for every comma-terminated number in a series.

    The whole string is run through the model once. The logits of the ten digit
    tokens are softmaxed and, for each number, loaded into a fresh
    MultiResolutionPDF which is then passed to ``recursive_refiner`` together
    with a trimmed copy of the key/value cache.

    Args:
        full_series: String of numbers, each terminated by ``','``. Any text
            after the last comma is not processed.
        prec: Passed to ``recursive_refiner`` as ``curr = -prec``; presumably the
            number of digits per value -- TODO confirm against models.ICL.
        mode: Forwarded unchanged to ``recursive_refiner``.
        refine_depth: Must be strictly smaller than ``prec``; forwarded to
            ``recursive_refiner`` as ``refine_depth - prec``.
        llama_size: Forwarded to ``get_model_and_tokenizer``.

    Returns:
        list: One MultiResolutionPDF per comma in ``full_series``, in order of
        appearance (empty when the series contains no comma).

    Raises:
        AssertionError: If ``refine_depth >= prec``.
    """
    model, tokenizer = get_model_and_tokenizer(llama_size)
    good_tokens_str = list("0123456789")
    good_tokens = [tokenizer.convert_tokens_to_ids(token) for token in good_tokens_str]
    assert refine_depth < prec, "Refine depth must be less than precision"
    # Both depth values are handed to recursive_refiner as (negative) offsets
    # relative to prec. A separate local keeps the caller's argument intact.
    relative_refine_depth = refine_depth - prec
    curr = -prec
    batch = tokenizer(
        [full_series],
        return_tensors="pt",
        add_special_tokens=True
    )
    torch.cuda.empty_cache()
    with torch.no_grad():
        # NOTE(review): inputs are kept on CPU as in the original code; if the
        # model is placed on a GPU without automatic input dispatch this needs
        # to be `.cuda()` / `.to(<model device>)` instead -- confirm.
        out = model(batch['input_ids'].cpu(), use_cache=True)
    logit_mat = out['logits']
    kv_cache_main = out['past_key_values']
    # Restrict the vocabulary to the ten digit tokens before normalising.
    logit_mat_good = logit_mat[:,:,good_tokens].clone()
    # The first position is dropped; presumably it belongs to the special token
    # added by the tokenizer, so that probs[0, k] lines up with full_series[k].
    probs = torch.nn.functional.softmax(logit_mat_good[:,1:,:], dim=-1)

    # Character positions of the commas, in ascending order. They are used
    # directly as token positions below, which assumes one token per character
    # -- TODO confirm for the tokenizer in use.
    comma_locations = [idx for idx, char in enumerate(full_series) if char == ',']

    PDF_list = []
    start_idx = 0
    for end_idx in tqdm(comma_locations):
        PDF = MultiResolutionPDF()
        # slice out the number that ends at this comma
        num_slice = full_series[start_idx:end_idx]
        prob_slice = probs[0,start_idx:end_idx].cpu().numpy()
        ### Load hierarchical PDF
        PDF.load_from_num_prob(num_slice, prob_slice)

        ### Refine hierarchical PDF
        seq = full_series[:end_idx]
        # cache and full_series are shifted from beginning, not end
        end_idx_neg = end_idx - len(full_series)
        ### kv cache contains seq[0:-1]
        kv_cache = trim_kv_cache(kv_cache_main, end_idx_neg-1)
        recursive_refiner(PDF, seq, curr = curr, main = True, refine_depth = relative_refine_depth, mode = mode,
                        kv_cache = kv_cache, model = model, tokenizer = tokenizer, good_tokens=good_tokens)

        PDF_list.append(PDF)
        # the next number starts right after this comma
        start_idx = end_idx + 1

    # Return only after every number has been processed.
    return PDF_list
    

In [3]:
# `continuous_series_names`, `markov_chain_names` and `parent_dir` come from the
# setup cell at the top of the notebook; they are not redefined here so there is
# a single source of truth for the recognised series families.

# Directory holding the generated series, and the one holding processed results
generated_series_dir = Path(parent_dir) / 'generated_series'
processed_series_dir = Path(parent_dir) / 'processed_series'

# Loaded series, keyed by file name, for continuous series and Markov chains
continuous_series_data = {}
markov_chain_data = {}


def _load_pickle(path):
    """Unpickle ``path`` and close the file handle afterwards.

    NOTE: pickle can execute arbitrary code while loading; only use this on
    files produced by this project.
    """
    with path.open('rb') as handle:
        return pickle.load(handle)


# Visit the files in sorted order so the load order is reproducible
for file in sorted(generated_series_dir.iterdir()):
    # Skip anything that already has a same-named file in processed_series_dir
    if (processed_series_dir / file.name).exists():
        continue
    # The series family is the file stem without its trailing "_<suffix>"
    series_name = file.stem.rsplit('_', 1)[0]
    if series_name in continuous_series_names:
        continuous_series_data[file.name] = _load_pickle(file)
    elif series_name in markov_chain_names:
        markov_chain_data[file.name] = _load_pickle(file)
    else:
        # Fail loudly on files that match neither family
        raise ValueError(f"Unrecognized series name: {series_name}")
        


In [4]:
# Select one of the series loaded above (only files that do not yet have a
# same-named file in processed_series are present in continuous_series_data).
series_name = 'noisy_logistic_map_3.pkl'
series_dict = continuous_series_data[series_name]

# Unpack the settings stored alongside the series
full_series = series_dict['full_series']
rescaled_true_mean_arr = series_dict['rescaled_true_mean_arr']
rescaled_true_sigma_arr = series_dict['rescaled_true_sigma_arr']
prec = series_dict['prec']
refine_depth = series_dict['refine_depth']
llama_size = series_dict['llama_size']
mode = series_dict['mode']

# Use the refine depth stored with the series (it was previously loaded but
# replaced by a literal 1 in the call) and keep the result of the expensive
# model run instead of discarding it.
PDF_list = calculate_multiPDF(full_series, prec, mode = mode, refine_depth = refine_depth, llama_size = llama_size)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.34 GiB (GPU 0; 15.73 GiB total capacity; 6.51 GiB already allocated; 1.25 GiB free; 6.55 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF