In [2]:
import json
import argparse
from os import listdir, mkdir
from os.path import isfile, join, exists
import math

from pymo.preprocessing import *
from sklearn.pipeline import Pipeline
from pymo.parsers import BVHParser
from pymo.viz_tools import *

import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer


  from .autonotebook import tqdm as notebook_tqdm


In [28]:
motion_dir = "../data/GENEA_2022/val/bvh/"
transcript_dir = "../data/GENEA_2022/val/tsv/"
transcript_out_dir = "../data/GENEA_2022/val/json/"

In [29]:
recording_files = [f for f in listdir(
    motion_dir) if isfile(join(motion_dir, f)) and f[-3:] == "bvh"]
recording_files.sort()
motion_files = [join(motion_dir, recording_files[i]) for i in range(len(recording_files))]
transcript_files = [join(transcript_dir, recording_files[i].replace(
    'bvh', 'tsv')) for i in range(len(recording_files))]

In [30]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
# tokenizer.pad_token = tokenizer.eos_token

In [31]:
clip_size = 5 # s
words_per_half_second = 15
num_offsets = 10

In [32]:
def sectionize(transcript_files, clip_size, start_offset=0):
  all_transcripts = []
  for file_id in range(len(transcript_files)):
    try:
      transcript = pd.read_csv(transcript_files[file_id], sep='\t', header=None)
    except pd.errors.EmptyDataError:
      print(f"File {transcript_files[file_id]} was empty.")
      all_transcripts.append([])
      continue
    
    # Sectionize the words
    if not exists(transcript_out_dir):
      mkdir(transcript_out_dir)
    
    words_sectioned = []
    current_clip_start = start_offset
    words_counter = 0
    max_tokens_per_section = 0

    # Set the sections empty until there the first word occurance
    first_detected_word_start = transcript.iloc[words_counter, 0]
    while first_detected_word_start > current_clip_start + clip_size:
      words_sectioned += [{"start": current_clip_start,
                          "end": current_clip_start + clip_size, 
                          "indices": [],
                          "section_text": ""}]
      current_clip_start += clip_size

    while words_counter < transcript.shape[0]:
      # Add all words in the section to a list
      words_in_section = []
      word_indices_in_section = []
      current_index = 0
      current_section = 0

      while words_counter < transcript.shape[0] and transcript.iloc[words_counter, 0] < current_clip_start + clip_size:
        word_with_possible_punctuation = transcript.iloc[words_counter, 2]
        if not isinstance(word_with_possible_punctuation, str):
          word_with_possible_punctuation = 'uh'
        words_in_section.append(word_with_possible_punctuation)
        words_tokenized = tokenizer.tokenize(word_with_possible_punctuation)
        num_tokens = len(words_tokenized)
        assert num_tokens == len(tokenizer(word_with_possible_punctuation)['input_ids']) - 2, \
          f"{word_with_possible_punctuation}, {num_tokens}, {words_tokenized}, {tokenizer(word_with_possible_punctuation)}"
        section_within_section = np.floor((transcript.iloc[words_counter, 0] - current_clip_start) * 2).astype('int32')
        if section_within_section > current_section:
          current_section = int(section_within_section)
          current_index = current_section * words_per_half_second
        
        for token in words_tokenized:
          word_indices_in_section.append(current_index)
          current_index += 1
          words_counter += 1
      
      section_text = " ".join(words_in_section)
      assert len(tokenizer.tokenize(section_text)) == len(word_indices_in_section), \
        f"{word_indices_in_section}, {tokenizer.tokenize(section_text)}, {section_text}"
      max_tokens_per_section = max(
          max_tokens_per_section, len(word_indices_in_section))
      words_sectioned += [{"start": current_clip_start,
                          "end": current_clip_start + clip_size, 
                          "indices": word_indices_in_section, 
                          "section_text": section_text}]
      current_clip_start += clip_size
    all_transcripts.append(words_sectioned)
  return all_transcripts


In [33]:
all_transcripts = [sectionize(transcript_files, clip_size, start_offset=i / 2) for i in range(num_offsets)]

In [178]:
word_counts = []
for i in range(len(all_transcripts)):
  offset = all_transcripts[i]
  for j in range(len(offset)):
    transcript = offset[j]
    for k in range(len(transcript)):
      section = transcript[k]
      word_counts.append(len(section["indices"]))
print(f"Max words in a section: {max(word_counts)}")

Max words in a section: 45


In [179]:
for i in range(num_offsets):
  with open(join(transcript_out_dir, f"text_{clip_size}s_offset_{i}_half_s.json"), "w") as f:
    json.dump(all_transcripts[i], f)

In [24]:
def check(transcript_files, clip_size, start_offset=0):
  all_transcripts = []
  max_counts = []
  for file_id in range(len(transcript_files)):
    try:
      transcript = pd.read_csv(transcript_files[file_id], sep='\t', header=None)
    except pd.errors.EmptyDataError:
      print(f"File {transcript_files[file_id]} was empty.")
      all_transcripts.append([])
      continue
    
    words_sectioned = []
    current_clip_start = start_offset
    words_counter = 0

    word_starts = np.floor(np.array(transcript.iloc[:, 0]) * 2).astype('int32')
    word_starts_tokenized = []
    for i in range(len(word_starts)):
      text = transcript.iloc[i, 2]
      if not isinstance(text, str):
        text = "uh"
      word_starts_tokenized += [word_starts[i]] * len(tokenizer.tokenize(text))
    
    word_starts_tokenized_np = np.array(word_starts_tokenized)
    unique, counts = np.unique(word_starts_tokenized_np, return_counts=True)
    max_counts.append(np.max(counts))
  print(max(max_counts), np.argmax(max_counts))
  print(sorted(max_counts))
   
check(transcript_files, clip_size, start_offset=0)


14 277
[3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 10, 14]


In [156]:
len(tokenizer("a")['input_ids'])

3

In [121]:
[tokenizer(i) for i in "happen person when they have like some people have a perfectly life with no nothing wrong.".split(" ")]

[{'input_ids': [101, 4148, 102], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]},
 {'input_ids': [101, 2711, 102], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]},
 {'input_ids': [101, 2043, 102], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]},
 {'input_ids': [101, 2027, 102], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]},
 {'input_ids': [101, 2031, 102], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]},
 {'input_ids': [101, 2066, 102], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]},
 {'input_ids': [101, 2070, 102], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]},
 {'input_ids': [101, 2111, 102], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]},
 {'input_ids': [101, 2031, 102], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]},
 {'input_ids': [101, 1037, 102], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]},
 {'input_ids': [101, 6669, 102], 'token_type_ids': [0, 0, 0], 'attention_mask': 

In [122]:
tokenizer("happen person when they have like some people have a perfectly life with no nothing wrong.")

{'input_ids': [101, 4148, 2711, 2043, 2027, 2031, 2066, 2070, 2111, 2031, 1037, 6669, 2166, 2007, 2053, 2498, 3308, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [117]:
tokenizer("no nothing")

{'input_ids': [3919, 2147], 'attention_mask': [1, 1]}

In [119]:
tokenizer.decode([71, 1324, 268, 1048, 618, 484, 423, 588, 617, 661, 423, 257, 7138, 1204, 351, 645, 2147, 2642, 13])

'happen'

In [120]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
tokenizer.tokenize("I have a new GPU!")

Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 687kB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 6.38kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<00:00, 126kB/s]


['i', 'have', 'a', 'new', 'gp', '##u', '!']