In [2]:
import pandas as pd
import numpy as np
import torch
from onehotencoder import OneHotEncoder
from typing import List
from collections import Counter

In [3]:
def count_sequence_lengths(
    seq_filepath: str,
    token_list: list[str],
    max_len: int = 57,
    allow_unknown: bool = False
) -> dict[int, int]:
    """Build a histogram of token counts over the lines of a sequence file.

    Each line is split greedily from left to right: at every position the
    longest entry of ``token_list`` that matches is consumed.

    Args:
        seq_filepath: Path of a UTF-8 text file holding one sequence per line.
        token_list: Known tokens. Empty strings are ignored, because an empty
            token matches everywhere without consuming any input.
        max_len: Largest token count reported in the histogram.
        allow_unknown: When True, a character that starts no known token is
            counted as a single-character token; when False it is an error.

    Returns:
        A dict with one key for every length in ``1..max_len`` (zero-filled)
        mapping to the number of lines with that many tokens. Lines with zero
        tokens or more than ``max_len`` tokens are not counted.

    Raises:
        ValueError: If ``allow_unknown`` is False and some position of a line
            matches no token.
    """
    # Longest tokens first so that e.g. 'Cl' wins over 'C'. Empty tokens are
    # dropped: str.startswith('', i) is always true and would never advance i,
    # turning the scan below into an endless loop.
    tokens_sorted = sorted((tok for tok in token_list if tok), key=len, reverse=True)
    counts = Counter()

    with open(seq_filepath, 'r', encoding='utf-8') as f:
        for lineno, line in enumerate(f, 1):
            seq = line.rstrip('\n')
            i = 0
            n_tokens = 0  # only the count is needed, not the tokens themselves
            while i < len(seq):
                for tok in tokens_sorted:
                    if seq.startswith(tok, i):
                        i += len(tok)
                        break
                else:
                    # No known token starts at position i.
                    if not allow_unknown:
                        raise ValueError(
                            f"Unknown token at line {lineno}, position {i}: {seq[i:]!r}"
                        )
                    # Fallback: treat the single character as its own token.
                    i += 1
                n_tokens += 1

            if 1 <= n_tokens <= max_len:
                counts[n_tokens] += 1

    return {length: counts.get(length, 0) for length in range(1, max_len + 1)}


import time
from typing import List, Dict, Optional

class TokenTrieNode:
    """A single node of a character trie built over a token vocabulary.

    Attributes are declared through ``__slots__``, so instances carry no
    per-object ``__dict__``.
    """

    __slots__ = ("children", "token_end")

    def __init__(self) -> None:
        # The token string that ends exactly at this node, or None when the
        # path leading here is not itself a complete token.
        self.token_end: Optional[str] = None
        # Outgoing edges: next character -> child node.
        self.children: Dict[str, "TokenTrieNode"] = dict()

def build_token_trie(tokens: List[str]) -> TokenTrieNode:
    """Insert every token into a fresh character trie and return its root.

    The node reached after walking all characters of a token stores that
    token in ``token_end``.
    """
    trie_root = TokenTrieNode()
    for token in tokens:
        cursor = trie_root
        for char in token:
            # Create the edge on first use, then follow it.
            if char not in cursor.children:
                cursor.children[char] = TokenTrieNode()
            cursor = cursor.children[char]
        cursor.token_end = token
    return trie_root

def tokenize_sequence(seq: str, trie: TokenTrieNode, allow_unknown: bool=False) -> List[str]:
    """Split ``seq`` into tokens by greedy longest-match against ``trie``.

    Args:
        seq: The string to tokenize.
        trie: Root node of the token trie.
        allow_unknown: If True, a character that starts no token is emitted
            as a one-character token; if False a ValueError is raised.

    Returns:
        The tokens in the order they occur in ``seq``.

    Raises:
        ValueError: When no token matches at some position and
            ``allow_unknown`` is False.
    """
    result = []
    total = len(seq)
    pos = 0
    while pos < total:
        best: Optional[str] = None
        best_end = pos
        cursor = trie
        # Walk the trie as far as the input allows, remembering the most
        # recent (i.e. longest) complete token passed on the way.
        for k in range(pos, total):
            char = seq[k]
            if char not in cursor.children:
                break
            cursor = cursor.children[char]
            if cursor.token_end:
                best, best_end = cursor.token_end, k + 1

        if best:
            result.append(best)
            pos = best_end
            continue

        if not allow_unknown:
            raise ValueError(f"Unknown token at pos {pos} of {seq!r}")
        # Fallback: pass the unmatched character through as its own token.
        result.append(seq[pos])
        pos += 1
    return result

def filter_sequences_by_token_length(
    input_path: str,
    token_list: List[str],
    target_len: int,
    output_path: str,
    allow_unknown: bool = False
) -> None:
    """Write every sequence with exactly ``target_len`` tokens to a new file.

    The input is read line by line; each non-blank line is stripped of
    surrounding whitespace and tokenized with a trie built from
    ``token_list``. A summary is printed when the pass is finished.

    Args:
        input_path: UTF-8 text file with one sequence per line.
        token_list: Token vocabulary used for tokenization.
        target_len: Token count a sequence must have to be written out.
        output_path: Destination file. It is opened in write mode, so any
            existing content is replaced.
        allow_unknown: Forwarded to ``tokenize_sequence``.

    Returns:
        None. Results are written to ``output_path`` and reported on stdout.
    """
    trie = build_token_trie(token_list)
    processed = 0
    matched = 0
    skipped = 0
    # perf_counter is the clock meant for measuring short durations; unlike
    # time.time() it is not affected by wall-clock adjustments.
    start = time.perf_counter()

    with open(input_path, "r", encoding="utf-8") as fin, \
         open(output_path, "w", encoding="utf-8") as fout:
        for line in fin:
            seq = line.strip()
            if not seq:
                continue
            processed += 1
            try:
                toks = tokenize_sequence(seq, trie, allow_unknown)
            except ValueError:
                # Best-effort: a line that cannot be tokenized is left out,
                # but it is counted so the drop shows up in the report.
                skipped += 1
                continue

            if len(toks) == target_len:
                fout.write(seq + "\n")
                matched += 1

    elapsed = time.perf_counter() - start
    print(
        f"Processed {processed} lines in {elapsed:.2f}s, "
        f"wrote {matched} sequences of token‐length {target_len} to {output_path}"
    )
    if skipped:
        # Only printed when something was dropped, so clean runs keep the
        # same single-line output as before.
        print(f"Skipped {skipped} lines that could not be tokenized")

In [4]:
# Input file for the token-length histogram.
filepath = "data/test.csv"

# Token vocabulary: one- and two-character symbols followed by the bracketed
# [BOS] / [EOS] / [PAD] entries.
token_list = [
    'Br', 'N', ')', 'c', 'o', '6', 's', 'Cl', '=', '2', ']', 'C', 'n', 'O',
    '4', '1', '#', 'S', 'F', '3', '[', '5', 'H', '(', '-',
    '[BOS]', '[EOS]', '[PAD]',
]
valid_tokens = set(token_list)

# Count how many lines have each token length from 1 to 57; an unknown
# symbol raises instead of being passed through.
length_counts = count_sequence_lengths(filepath, token_list, max_len=57, allow_unknown=False)

for length, cnt in length_counts.items():
    print(f"Length {length:2d}: {cnt}")

Length  1: 0
Length  2: 0
Length  3: 0
Length  4: 0
Length  5: 0
Length  6: 0
Length  7: 0
Length  8: 0
Length  9: 0
Length 10: 0
Length 11: 0
Length 12: 0
Length 13: 1
Length 14: 1
Length 15: 2
Length 16: 9
Length 17: 5
Length 18: 21
Length 19: 27
Length 20: 70
Length 21: 156
Length 22: 190
Length 23: 351
Length 24: 555
Length 25: 979
Length 26: 1710
Length 27: 2887
Length 28: 4413
Length 29: 5726
Length 30: 7897
Length 31: 9331
Length 32: 10674
Length 33: 11809
Length 34: 12825
Length 35: 13118
Length 36: 13851
Length 37: 14023
Length 38: 13764
Length 39: 12800
Length 40: 11388
Length 41: 9134
Length 42: 6683
Length 43: 4904
Length 44: 3005
Length 45: 1801
Length 46: 928
Length 47: 539
Length 48: 274
Length 49: 119
Length 50: 57
Length 51: 33
Length 52: 10
Length 53: 2
Length 54: 1
Length 55: 1
Length 56: 0
Length 57: 0


In [6]:
# Inputs that are the same for every target length.
filepath = "data/train.csv"
token_list = [
    'Br', 'N', ')', 'c', 'o', '6', 's', 'Cl', '=', '2', ']', 'C', 'n', 'O',
    '4', '1', '#', 'S', 'F', '3', '[', '5', 'H', '(', '-',
    '[BOS]', '[EOS]', '[PAD]',
]

# One output file per token length 22..48. Each call reads the whole input
# file again.
# NOTE(review): the output names end in `_test` although the input is
# data/train.csv — confirm the naming is intended.
for i in range(22, 49):
    target_length = i
    out_file = f"data/seqs_len{i}_test.txt"
    filter_sequences_by_token_length(
        input_path=filepath,
        token_list=token_list,
        target_len=target_length,
        output_path=out_file,
        allow_unknown=False,
    )

Processed 1584663 lines in 6.25s, wrote 1792 sequences of token‐length 22 to data/seqs_len22_test.txt
Processed 1584663 lines in 6.25s, wrote 3247 sequences of token‐length 23 to data/seqs_len23_test.txt
Processed 1584663 lines in 6.20s, wrote 5108 sequences of token‐length 24 to data/seqs_len24_test.txt
Processed 1584663 lines in 6.35s, wrote 9155 sequences of token‐length 25 to data/seqs_len25_test.txt
Processed 1584663 lines in 6.27s, wrote 15152 sequences of token‐length 26 to data/seqs_len26_test.txt
Processed 1584663 lines in 6.42s, wrote 25381 sequences of token‐length 27 to data/seqs_len27_test.txt
Processed 1584663 lines in 6.23s, wrote 38700 sequences of token‐length 28 to data/seqs_len28_test.txt
Processed 1584663 lines in 6.18s, wrote 53108 sequences of token‐length 29 to data/seqs_len29_test.txt
Processed 1584663 lines in 6.22s, wrote 71316 sequences of token‐length 30 to data/seqs_len30_test.txt
Processed 1584663 lines in 6.15s, wrote 84954 sequences of token‐length 31 to