In [1]:
import pandas as pd
from transformers import (
    AutoTokenizer, 
    DataCollatorForLanguageModeling
)
import os
import torch

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")
import os
all_data = os.listdir("../../data/processed")

In [2]:
def token_mapper(x):
    try:
        return tokenizer.apply_chat_template(conversation=x, tokenize=False)
    except:
        return ""
def prepare_key(x):
    if "_new" in x:
        return x.replace("new_processed.jsonl", "v1.5_v1.6")
    else:
        return x.replace("new_processed.jsonl", "v0.9_v1.0")

### Prepare All Token Data

In [3]:
all_tokens: dict = {}
all_succesful_tokens: dict = {}


for fn in all_data:
    path = f"../../data/processed/{fn}"
    if os.path.isfile(path):
        key = prepare_key(fn)
        print(key)
        df = pd.read_json(path, lines=True)
        df["text"] = df.chat.map(lambda x: token_mapper(x))
        df["tokens"] = df.text.map(lambda x: len(tokenizer(x)["input_ids"]))
        all_tokens[key] = df.tokens.sum()

        # filter only successful tokens
        df = df[df.Success == 1]
        all_succesful_tokens[key] = df.tokens.sum()


total_tokens =sum([all_tokens[key] for key in all_tokens.keys()])
total__successful_tokens =sum([all_succesful_tokens[key] for key in all_succesful_tokens.keys()])        

privateshared_v1.5_v1.6
taboo_old_processed.jsonl
imagegame_old_processed.jsonl
wordle_withcritic_old_processed.jsonl
wordle_v1.5_v1.6
privateshared_old_processed.jsonl
DFINAL_VTrain_not_split.jsonl
wordle_withclue_old_processed.jsonl
wordle_old_processed.jsonl
referencegame_v1.5_v1.6
imagegame_v1.5_v1.6
wordle_withclue_v1.5_v1.6
referencegame_old_processed.jsonl
taboo_v1.5_v1.6
wordle_withcritic_v1.5_v1.6


### Prepare All Successful Episode Token Data

In [4]:
df = pd.read_json('../../data/processed/DFINAL_VTrain_not_split.jsonl', lines=True)
df["text"] = df.chat.map(lambda x: token_mapper(x))
df["tokens"] = df.text.map(lambda x: len(tokenizer(x)["input_ids"]))
all_tokens_dfinal = df.tokens.sum()


### Prepare All Training-Steps Token Data

In [25]:
def get_tokens_n_steps(steps: int, bs: int = 4):
    cut = steps * bs

    df = pd.read_json('../../data/processed/DFINAL_VTrain_not_split.jsonl', lines=True)
    dfinal = pd.read_csv("../../data/training_data/DFINAL_VTrain.csv")
    dfinal = dfinal.loc[0:cut]

    unique_ids = []
    for i, row in dfinal.iterrows():
        _id = row["model"] + row["game"] + row["experiment"] + row["episode"] + row["benchmark_version"]
        unique_ids.append(_id)

    unique_ids = list(set(unique_ids))
    ids_to_keep = []
    for i, row in df.iterrows():
        _id = row["model"] + row["game"] + row["experiment"] + row["episode"] + row["benchmark_version"]
        if _id in unique_ids:
            if row["game"] == "referencegame" and (row["benchmark_version"] == "v0.9" or row["benchmark_version"] == "v1.0"):
                continue
            else:
                ids_to_keep.append(i)

    df_only_dfinal = df.iloc[ids_to_keep]
    df_only_dfinal["text"] = df_only_dfinal.chat.map(lambda x: token_mapper(x))
    df_only_dfinal["tokens"] = df_only_dfinal.text.map(lambda x: len(tokenizer(x)["input_ids"]))

    return df_only_dfinal.tokens.sum()

In [28]:
all_tokens_0_7K_steps = get_tokens_n_steps(700, 4)
all_tokens_0_6K_steps = get_tokens_n_steps(600, 4)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_only_dfinal["text"] = df_only_dfinal.chat.map(lambda x: token_mapper(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_only_dfinal["tokens"] = df_only_dfinal.text.map(lambda x: len(tokenizer(x)["input_ids"]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_only_dfinal["text"] = df_only_dfi

In [29]:
print(f"All Available Tokens All Episodes           : {total_tokens: > 20}")
print(f"All Available Tokens Sucessful Episodes     : {total__successful_tokens: > 20}")
print(f"All Available Tokens Final Dataset          : {all_tokens_dfinal: > 20}")
print(f"All Available Tokens Training 0.7K steps    : {all_tokens_0_7K_steps: > 20}")
print(f"All Available Tokens Training 0.6K steps    : {all_tokens_0_6K_steps: > 20}")

All Available Tokens All Episodes           :             65306233
All Available Tokens Sucessful Episodes     :             13322635
All Available Tokens Final Dataset          :              1484607
All Available Tokens Training 0.7K steps    :               623904
All Available Tokens Training 0.6K steps    :               561170


In [13]:
dfinal.head()

Unnamed: 0.1,Unnamed: 0,game,benchmark_version,game_id,model,experiment,episode,Aborted,Lose,Success,chat,text
0,0,wordle_withclue,v1.6,2,o1-preview-2024-09-12-t0.0--o1-preview-2024-09...,0_high_frequency_words_clue_no_critic,episode_1,0,0,1,"[{'role': 'user', 'content': 'You are a langua...",<|begin_of_text|><|start_header_id|>system<|en...
1,1,wordle_withclue,v1.6,2,o1-preview-2024-09-12-t0.0--o1-preview-2024-09...,0_high_frequency_words_clue_no_critic,episode_1,0,0,1,"[{'role': 'user', 'content': 'You are a langua...",<|begin_of_text|><|start_header_id|>system<|en...
2,2,wordle_withclue,v1.6,2,o1-preview-2024-09-12-t0.0--o1-preview-2024-09...,0_high_frequency_words_clue_no_critic,episode_1,0,0,1,"[{'role': 'user', 'content': 'You are a langua...",<|begin_of_text|><|start_header_id|>system<|en...
3,3,wordle_withcritic,v1.6,3,o1-preview-2024-09-12-t0.0--o1-preview-2024-09...,0_high_frequency_words_clue_with_critic,episode_2,0,0,1,"[{'role': 'user', 'content': 'You are a langua...",<|begin_of_text|><|start_header_id|>system<|en...
4,4,wordle_withcritic,v1.6,3,o1-preview-2024-09-12-t0.0--o1-preview-2024-09...,0_high_frequency_words_clue_with_critic,episode_2,0,0,1,"[{'role': 'user', 'content': 'You are a langua...",<|begin_of_text|><|start_header_id|>system<|en...
