In [1]:
import transformers

def count_unique_tokens(text_file_path, top_n=30, model_name="bert-base-uncased", tokenizer=None):
    """Count token frequencies in a text file and return the most common ones.

    The file is read line by line (so the whole file is never held in memory)
    and every line is passed to ``tokenizer.tokenize``; the resulting tokens
    are tallied across the whole file.

    Args:
        text_file_path: The path to the text file. The file is decoded as UTF-8.
        top_n: Maximum number of (token, count) pairs to return. Defaults to
            30, matching the previous hard-coded cutoff. Pass ``None`` to get
            every distinct token.
        model_name: Model identifier handed to
            ``transformers.AutoTokenizer.from_pretrained`` when no
            ``tokenizer`` is supplied.
        tokenizer: Optional pre-loaded tokenizer, i.e. any object with a
            ``tokenize(str)`` method returning an iterable of tokens. Supplying
            one avoids reloading the tokenizer on every call.

    Returns:
        A list of ``(token, count)`` tuples sorted in descending order by
        count, containing at most ``top_n`` entries. Tokens with equal counts
        keep the order in which they were first seen.
    """
    # Local import keeps this block self-contained; the file does not import
    # collections at module level.
    from collections import Counter

    # Only load the (comparatively expensive) pretrained tokenizer when the
    # caller did not hand one in.
    if tokenizer is None:
        tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

    token_counts = Counter()

    # An explicit encoding makes the result independent of the platform's
    # locale default.
    with open(text_file_path, "r", encoding="utf-8") as f:
        for line in f:
            token_counts.update(tokenizer.tokenize(line))

    # most_common() sorts by count descending and is stable for ties, which
    # matches sorted(..., key=count, reverse=True)[:top_n].
    return token_counts.most_common(top_n)

# Example usage: tally the tokens of the combined corpus file.
text_file_path = "/content/all_combined_texts.txt"
top_30_tokens = count_unique_tokens(text_file_path)

# Report each (token, count) pair on its own line, most frequent first.
for pair in top_30_tokens:
    token, count = pair
    print(f"{token}: {count}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


*: 12214251
.: 8947097
-: 6203884
:: 4112220
,: 3666144
[: 2778940
]: 2778846
): 2241033
(: 2194265
the: 2135417
and: 1873788
to: 1620980
of: 1572498
was: 1471942
1: 1325148
with: 1123084
/: 1083591
2: 1066626
a: 1061508
in: 1045065
on: 1008914
name: 860545
for: 817392
3: 741510
no: 707852
##s: 707329
5: 701618
mg: 665025
##g: 649896
4: 644311
