In [11]:
import pandas as pd

In [12]:
def preprocessing(text):
    """Normalize raw text to lowercase alphabetic words separated by single spaces.

    Every character for which ``str.isalpha()`` is false (digits, punctuation,
    whitespace, ...) acts as a word separator, so "it's" becomes "it s".
    Returns an empty string when the input contains no alphabetic characters.
    """
    lowered = text.lower()
    # Non-letters become spaces; whitespace is itself non-alphabetic, so the
    # same rule covers the original word boundaries.
    letters_only = "".join(ch if ch.isalpha() else " " for ch in lowered)
    # split()/join collapses runs of spaces and trims both ends.
    return " ".join(letters_only.split())

In [13]:
# Read the corpus as plain text, one sentence per line. A CSV parser is the
# wrong tool here: commas and double quotes inside a sentence would be treated
# as field separators / quoting, and values such as "NA" would be turned into
# NaN. Lines without any non-whitespace content are skipped.
with open("corpus.txt", encoding="utf-8") as corpus_file:
    stripped_lines = (line.strip() for line in corpus_file)
    lines = [line for line in stripped_lines if line]

# Kept as a single-column DataFrame so cells that expect `df` still work.
df = pd.DataFrame({"text": lines})
text = df['text'].tolist()

# Normalize each sentence, then break it into words.
preprocessed_text = [preprocessing(sentence) for sentence in text]
split_text = [sentence.split() for sentence in preprocessed_text]

# Unique words of the corpus. Sorted so the order (and therefore tie-breaking
# when pairs are scored later) does not depend on string-hash randomization.
corpus = sorted({word for sentence in split_text for word in sentence})

# Initial segmentation: the first character is kept bare, every following
# character gets the '##' continuation prefix.
split_corpus = [[word[0]] + ['##' + char for char in word[1:]] for word in corpus]

# Initial vocabulary: every distinct character token seen above, in sorted order.
vocabulary = sorted({char for split in split_corpus for char in split})

# Size parameter consumed by the training cell.
vocabulary_size = 1000

In [14]:
def score(pairs, tokens, pair):
    """Return the merge score of an adjacent token pair.

    The score is the pair's count divided by the product of the counts of its
    two member tokens.

    pairs  -- mapping from (token, token) tuples to counts
    tokens -- mapping from token to count
    pair   -- the (left, right) tuple to score; must be a key of ``pairs``
    """
    left, right = pair
    denominator = tokens[left] * tokens[right]
    return pairs[pair] / denominator

In [15]:
# Grow the vocabulary by repeatedly merging the best-scoring adjacent pair.
# `vocabulary_size` is treated as the target size of the final vocabulary, so
# the loop stops once that many tokens exist (and re-running this cell does
# not keep adding merges). It also stops early when nothing is left to merge.
while len(vocabulary) < vocabulary_size:
    # Recount tokens and adjacent pairs for the current segmentation. Each
    # occurrence is counted exactly once.
    pairs = {}
    tokens = {}
    for split in split_corpus:
        for token in split:
            tokens[token] = tokens.get(token, 0) + 1
        for pair in zip(split, split[1:]):
            pairs[pair] = pairs.get(pair, 0) + 1

    # Every word is already a single token: no pair can be merged.
    if not pairs:
        break

    # On ties, max() keeps the first pair in insertion order.
    max_pair = max(pairs, key=lambda pair: score(pairs, tokens, pair))
    # The right-hand token always carries the '##' prefix (it is never the
    # first token of a word); drop it when gluing the two halves together.
    max_token = max_pair[0] + max_pair[1][2:]
    # Two different pairs can concatenate to the same string; store it once.
    if max_token not in vocabulary:
        vocabulary.append(max_token)

    # Rewrite every word, replacing each left-to-right occurrence of the
    # chosen pair with the merged token.
    updated_split_corpus = []
    for split in split_corpus:
        updated_split = []
        idx = 0
        while idx < len(split):
            if idx + 1 < len(split) and split[idx] == max_pair[0] and split[idx + 1] == max_pair[1]:
                updated_split.append(max_token)
                idx += 2
            else:
                updated_split.append(split[idx])
                idx += 1
        updated_split_corpus.append(updated_split)
    split_corpus = updated_split_corpus

In [16]:
file_path = "vocabulary.txt"

# Persist the vocabulary as UTF-8 text, one token per line.
with open(file_path, "w", encoding="utf-8") as vocab_file:
    vocab_file.writelines(token + "\n" for token in vocabulary)

print(f"Vocabulary written to {file_path}")


Vocabulary written to vocabulary.txt


In [17]:
# Tokenize every word with greedy longest-match-first lookup against the
# vocabulary. Membership is tested against a set built once, instead of
# scanning the vocabulary list for every candidate substring.
vocabulary_lookup = set(vocabulary)

tokenized_text = []
for sentence in split_text:
    tokenized_sentence = []
    for word in sentence:
        start_idx = 0
        while start_idx < len(word):
            # Pieces that do not start the word carry the '##' prefix.
            marker = '##' if start_idx != 0 else ''
            # Try the longest remaining substring first, shrinking from the right.
            for end_idx in range(len(word), start_idx, -1):
                candidate = marker + word[start_idx:end_idx]
                if candidate in vocabulary_lookup:
                    tokenized_sentence.append(candidate)
                    start_idx = end_idx
                    break
            else:
                # No prefix of the remainder is in the vocabulary. The pieces
                # already emitted for this word are kept; the rest is dropped.
                print(f"Error in tokenizing {word}")
                break
    tokenized_text.append(tokenized_sentence)

In [18]:
file_path = "tokenized.txt"

# One tokenized sentence per line, tokens separated by single spaces.
with open(file_path, "w", encoding="utf-8") as out_file:
    out_file.writelines(" ".join(sentence) + "\n" for sentence in tokenized_text)

print(f"Corpus written to {file_path}")


Corpus written to tokenized.txt
