## Dataset Corpus creation

In [1]:
import pandas as pd
import glob
import os

# files = glob.glob(os.path.join("data", '*.csv'))
# print(files)
# Locations of the three Tamil training CSVs. Built with os.path.join (rather
# than hard-coded backslashes) so the notebook also runs outside Windows.
movie_reviews = os.path.join("data", "tamil_movie_reviews_train.csv")
news = os.path.join("data", "tamil_news_train.csv")
thirukural = os.path.join("data", "tamil_thirukkural_train.csv")

# One entry per CSV row. Entries are collected in a list and joined once at the
# end, instead of growing a string with += on every row.
corpus_lines = []

# Movie reviews: one corpus line per review; rows with a missing review are skipped.
movie_reviews_df = pd.read_csv(movie_reviews)
corpus_lines.extend(movie_reviews_df['ReviewInTamil'].dropna().astype(str))

# News: one corpus line per news item; rows with a missing text are skipped.
news_df = pd.read_csv(news)
corpus_lines.extend(news_df['NewsInTamil'].dropna().astype(str))

# Thirukkural: the four text columns of a row are joined with single spaces
# into one corpus line. Missing cells are left out instead of raising a
# TypeError on str + NaN.
thirukural_df = pd.read_csv(thirukural)
kural_columns = ['kural', 'mk', 'mv', 'sp']
corpus_lines.extend(
    " ".join(str(part) for part in row if pd.notna(part))
    for row in thirukural_df[kural_columns].itertuples(index=False, name=None)
)

# Every entry is prefixed with a newline, so the corpus starts with "\n"
# exactly as the row-by-row concatenation did.
corpus = "".join("\n" + line for line in corpus_lines)

In [7]:
# Persist the assembled corpus as UTF-8 text at data/corpus.txt.
corpus_path = os.path.join("data", 'corpus.txt')
with open(corpus_path, mode='w', encoding='utf-8') as corpus_file:
    corpus_file.write(corpus)

In [8]:
# Unbind the in-memory corpus string; its contents were already written to
# data/corpus.txt in the previous cell (presumably done to free memory).
del corpus

# SentencePiece-based BPE tokenizer for Tamil

In [2]:
!pip install -q sentencepiece


[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import sentencepiece as spm

Reference for the trainer options used below: [SentencePiece training options](https://github.com/google/sentencepiece/blob/master/doc/options.md)

In [None]:
# Train a SentencePiece BPE model on data/corpus.txt.
# Training writes files named after model_prefix; a later cell loads
# 'Tamil10k_BPE.model'.
import os

options = dict(
  # input spec
  input="data/corpus.txt",
  input_format="text",
  # output spec
  model_prefix="Tamil10k_BPE", # output filename prefix
  # algorithm spec
  # BPE alg
  model_type="bpe",
  vocab_size=10000,
  # normalization
  normalization_rule_name="identity", # identity = leave the input text un-normalized
  remove_extra_whitespaces=False,
  # NOTE(review): 150136 is also the whitespace-token count printed later in
  # this notebook, not a sentence count -- confirm this is the intended cap.
  input_sentence_size=150136, # max number of training sentences
  max_sentence_length=4192, # max number of bytes per sentence
  seed_sentencepiece_size=1000000,
  shuffle_input_sentence=True,
  # rare word treatment
  character_coverage=0.99995,
  byte_fallback=True,
  # merge rules
  split_digits=True,
  split_by_unicode_script=True,
  split_by_whitespace=True,
  split_by_number=True,
  max_sentencepiece_length=16,
  add_dummy_prefix=True,
  allow_whitespace_only_pieces=True,
  # special tokens
  unk_id=0, # the UNK token MUST exist
  bos_id=1, # the others are optional, set to -1 to turn off
  eos_id=2,
  pad_id=-1,
  # systems
  # os.cpu_count() can return None when the count cannot be determined;
  # fall back to a single thread in that case.
  num_threads=os.cpu_count() or 1, # use ~all system resources
)

spm.SentencePieceTrainer.train(**options)

In [23]:
# Load the trained model and list every piece together with its id.
sp = spm.SentencePieceProcessor()
sp.load('Tamil10k_BPE.model')

piece_count = sp.get_piece_size()
vocab = [[sp.id_to_piece(piece_id), piece_id] for piece_id in range(piece_count)]
len(vocab)

10000

In [24]:
# Re-read the saved corpus, keeping only non-blank lines (whitespace-trimmed).
with open("data/corpus.txt", "r", encoding="utf-8") as corpus_file:
    stripped_lines = (raw_line.strip() for raw_line in corpus_file)
    corpus = [text for text in stripped_lines if text]

# Number of single-space-separated tokens across the whole corpus.
# NOTE(review): despite the name, this counts every token occurrence, not
# distinct words -- confirm whether a unique-word count was intended.
original_vocab_size = len(" ".join(corpus).split(" "))
original_vocab_size

150136

In [25]:
# Ratio of the corpus token count computed above to the number of pieces in
# the trained tokenizer's vocabulary.
compression_ratio = original_vocab_size / len(vocab)
compression_ratio

15.0136

In [20]:
# Encode a sample Tamil sentence into piece ids.
sample_text = "தமிழ் மொழி பற்றி அறிந்து கொள்ளுங்கள்"
ids = sp.encode(sample_text)
print(ids)

[1592, 5773, 1339, 1171, 696, 2111]


In [21]:
# Show the piece string behind each id of the encoded sample.
pieces = [sp.id_to_piece(token_id) for token_id in ids]
print(pieces)

['▁தமிழ்', '▁மொழி', '▁பற்றி', '▁அறிந்து', '▁கொள்ள', 'ுங்கள்']


In [22]:
# Decode the ids back into a string to check the encode/decode round trip.
text = sp.decode(ids)
print(text)

தமிழ் மொழி பற்றி அறிந்து கொள்ளுங்கள்
