This is an experimental notebook that tests several configurations and evaluates how well each one generates sentence embeddings suited to RAG lookup in Sinhala.

#### Imports and initial testing

In [None]:
!pip install transformers torch accelerate bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.44.1


In [None]:
from transformers import AutoModel, AutoTokenizer
import torch

In [None]:
# Read the Hugging Face access token stored under the 'HF_TOKEN' key in Colab's user secrets.
from google.colab import userdata
my_secret_key = userdata.get('HF_TOKEN')

In [None]:
# Authenticate this session with the Hugging Face Hub using my_secret_key.
from huggingface_hub import login
login(token=my_secret_key)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Load the XLM-RoBERTa model and tokenizer
# `tokenizer` and `model` are module-level globals that the embedding helpers
# in the following cells read directly.
model_name = "FacebookAI/xlm-roberta-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)



tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

In [None]:
# Round-trip sanity check: encode a Sinhala sentence to token IDs, then decode
# the IDs back to text to confirm the tokenizer handles the script losslessly.

# Input sentence
sentence = "ආරක්ෂක සෙනෙට් සභිකයා මෙම මැතිවරණයට එක්වනු ඇත?"

# Tokenize the input sentence
# (no return_tensors here, so the result holds plain Python lists)
tokens = tokenizer(sentence)

# Print the tokens
print("Tokens:", tokens)

# Convert token IDs back to the original sentence
# skip_special_tokens=True drops the special tokens the tokenizer added around the text.
reconstructed_sentence = tokenizer.decode(tokens['input_ids'], skip_special_tokens=True)

# Print the reconstructed sentence
print("Reconstructed Sentence:", reconstructed_sentence)

Tokens: {'input_ids': [0, 125784, 45264, 28935, 10820, 7589, 28736, 17775, 7029, 9711, 142540, 722, 149081, 15064, 10089, 32, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
Reconstructed Sentence: ආරක්ෂක සෙනෙට් සභිකයා මෙම මැතිවරණයට එක්වනු ඇත?


In [None]:
def get_embeddings(text):
    """Return the contextual (per-token) hidden states of ``text``.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: Input string to embed.

    Returns:
        The model's ``last_hidden_state`` tensor, laid out as
        ``[batch_size, seq_len, hidden_dim]``.
    """
    encoded = tokenizer(text, return_tensors="pt")

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        return model(**encoded).last_hidden_state

In [None]:
# Embed three sentences of different lengths to see how seq_len varies.
embeddings_1 = get_embeddings("ඔබට කෙසේද?")
embeddings_2 = get_embeddings("ආරක්ෂක සෙනෙට් සභිකයා මෙම මැතිවරණයට එක්වනු ඇත.")
embeddings_3 = get_embeddings("ඔයාගේ නම කුමක් ද.")

# Print only the shape of each result ([batch_size, seq_len, hidden_dim]);
# the embedding values themselves are not printed.
print(embeddings_1.shape)
print(embeddings_2.shape)
print(embeddings_3.shape)

torch.Size([1, 6, 1024])
torch.Size([1, 17, 1024])
torch.Size([1, 7, 1024])


In [None]:
import torch
import torch.nn.functional as F

# Test sentences; the English glosses are approximate translations.
sinhala_sentences = [
    "ඔබට කෙසේද?",  # How are you?
    "ඔයාගේ නම කුමක් ද.",  # What is your name.
    "ඔබේ නම මට මතකයි.",  # I remember your name.
    "මගේ නම ඇල්බට්",  # My name is Albert
    "අහස ඉතා නිල් වේ.",  # The sky is very blue.
    "ආරක්ෂක සෙනෙට් සභිකයා මෙම මැතිවරණයට එක්වනු ඇත.",  # The defense senator will join this election.
    "ඔවුන් පාර්ලිමේන්තුවට ගියහ.",  # They went to the parliament.
    "මෙම ව්‍යාපාරය සාර්ථකයි."  # This business is successful.
]

# Function to get embeddings and calculate distances
def get_embeddings_and_evaluate(sentences):
    """Embed each sentence by mean-pooling its contextual token vectors.

    Despite the name, no evaluation happens here; the function only builds
    the embeddings. Uses the module-level ``tokenizer`` and ``model``.

    Args:
        sentences: Iterable of sentence strings, embedded one at a time.

    Returns:
        A list with one tensor per sentence, in input order. Each tensor is
        ``last_hidden_state`` averaged over the sequence axis (``dim=1``).
    """
    def _mean_pooled(sentence):
        encoded = tokenizer(sentence, return_tensors="pt")
        return model(**encoded).last_hidden_state.mean(dim=1)  # Mean pooling

    with torch.no_grad():
        return [_mean_pooled(sentence) for sentence in sentences]

# Calculate distances between sentences
def euclidean_distance(vec1, vec2):
    """Return the L2 norm of ``vec1 - vec2`` as a Python float.

    The norm is taken over all elements of the difference, so any pair of
    broadcast-compatible tensors is accepted.
    """
    difference = vec1 - vec2
    return difference.norm().item()

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two embedding tensors as a float.

    Both inputs are flattened to 1-D first, so a plain ``[dim]`` vector and a
    single-row ``[1, dim]`` embedding (what ``get_embeddings_and_evaluate``
    produces) are both accepted. The previous form added a leading axis and
    reduced over ``dim=1``; for ``[1, dim]`` inputs that reduced over the
    size-1 axis and ``.item()`` then failed on a multi-element result.

    Args:
        vec1: First embedding tensor.
        vec2: Second embedding tensor with the same number of elements.

    Returns:
        The cosine similarity as a Python float.
    """
    return F.cosine_similarity(vec1.flatten(), vec2.flatten(), dim=0).item()

# Get embeddings
# One mean-pooled tensor per sentence, in the same order as sinhala_sentences.
embeddings = get_embeddings_and_evaluate(sinhala_sentences)

# Evaluate distances
# j starts at i + 1, so every unordered pair is reported exactly once and no
# sentence is compared with itself.
for i in range(len(embeddings)):
    for j in range(i + 1, len(embeddings)):
        euclidean_dist = euclidean_distance(embeddings[i], embeddings[j])
        # cosine_sim = cosine_similarity(embeddings[i], embeddings[j])
        print(f"Distance between '{sinhala_sentences[i]}' and '{sinhala_sentences[j]}':")
        print(f"  Euclidean Distance: {euclidean_dist}")

Distance between 'ඔබට කෙසේද?' and 'ඔයාගේ නම කුමක් ද.':
  Euclidean Distance: 2.294182538986206
Distance between 'ඔබට කෙසේද?' and 'ඔබේ නම මට මතකයි.':
  Euclidean Distance: 2.1887216567993164
Distance between 'ඔබට කෙසේද?' and 'මගේ නම ඇල්බට්':
  Euclidean Distance: 2.191248655319214
Distance between 'ඔබට කෙසේද?' and 'අහස ඉතා නිල් වේ.':
  Euclidean Distance: 2.346013307571411
Distance between 'ඔබට කෙසේද?' and 'ආරක්ෂක සෙනෙට් සභිකයා මෙම මැතිවරණයට එක්වනු ඇත.':
  Euclidean Distance: 2.414931058883667
Distance between 'ඔබට කෙසේද?' and 'ඔවුන් පාර්ලිමේන්තුවට ගියහ.':
  Euclidean Distance: 2.1917483806610107
Distance between 'ඔබට කෙසේද?' and 'මෙම ව්‍යාපාරය සාර්ථකයි.':
  Euclidean Distance: 2.293318510055542
Distance between 'ඔයාගේ නම කුමක් ද.' and 'ඔබේ නම මට මතකයි.':
  Euclidean Distance: 1.597105622291565
Distance between 'ඔයාගේ නම කුමක් ද.' and 'මගේ නම ඇල්බට්':
  Euclidean Distance: 2.3448712825775146
Distance between 'ඔයාගේ නම කුමක් ද.' and 'අහස ඉතා නිල් වේ.':
  Euclidean Distance: 1.70623302459

#### Dynamic word-wise embedding

In [None]:
def get_token_wise_dynamic_embeddings(text, tokenizer):
    """Return one contextual embedding per token of ``text``.

    Args:
        text: Input string.
        tokenizer: Tokenizer used to encode ``text``. The encoder itself is
            the module-level ``model``, not a parameter.

    Returns:
        The model's ``last_hidden_state``: ``[batch_size, seq_len, hidden_dim]``.
    """
    encoded = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        return model(**encoded).last_hidden_state

#### Static word-wise embedding

In [None]:
# Static (context-free) lookup table: the weight matrix of the model's input
# word-embedding layer, detached from autograd. Indexing it with token IDs
# returns the same vector for a given token regardless of its context.
embedding_layer = model.embeddings
token_embedding_layer = embedding_layer.word_embeddings.weight.detach()

def get_token_wise_static_embeddings(text, tokenizer):
    """Look up the static (input-layer) embedding of every token in ``text``.

    No forward pass is run: the token IDs simply index the module-level
    ``token_embedding_layer`` weight matrix.

    Args:
        text: Input string.
        tokenizer: Tokenizer used to turn ``text`` into token IDs.

    Returns:
        A tensor with one embedding row per token ID, shaped like the
        ``input_ids`` tensor with a trailing embedding axis.
    """
    encoded = tokenizer(text, return_tensors="pt")
    return token_embedding_layer[encoded["input_ids"]]

#### Dynamic sentence-wise embedding

In [None]:
def get_sentence_wise_dynamic_embedding(text, tokenizer):
    """Return the contextual hidden state at sequence position 0.

    Position 0 holds the tokenizer's leading special token, used here as a
    single-vector summary of the sentence.

    Args:
        text: Input string.
        tokenizer: Tokenizer used to encode ``text``. The encoder is the
            module-level ``model``.

    Returns:
        A ``[batch_size, hidden_dim]`` tensor.
    """
    encoded = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    # Keep every batch row, but only the first sequence position.
    return hidden_states[:, 0, :]

#### Word-wise embedding testing

In [None]:
# Run the three embedding variants on one sentence and compare their outputs.
# NOTE(review): the same word appears at the start, middle and end of the
# sentence, presumably so that static rows for it can be seen to be identical
# while the dynamic (contextual) rows differ; confirm intent.
text = "මෙම ආරක්ෂක සෙනෙට් සභිකයා මෙම මැතිවරණයට එක්වනු ඇත මෙම."
token_wise_dynamic_embeddings = get_token_wise_dynamic_embeddings(text, tokenizer)
token_wise_static_embeddings = get_token_wise_static_embeddings(text, tokenizer)
sentence_wise_dynamic_embedding = get_sentence_wise_dynamic_embedding(text, tokenizer)

# Shapes first: the token-wise variants keep a sequence axis, the sentence-wise one does not.
print(token_wise_dynamic_embeddings.shape)
print(token_wise_static_embeddings.shape)
print(sentence_wise_dynamic_embedding.shape)

# Then the raw values.
print(token_wise_dynamic_embeddings)
print(token_wise_static_embeddings)
print(sentence_wise_dynamic_embedding)

torch.Size([1, 19, 1024])
torch.Size([1, 19, 1024])
torch.Size([1, 1024])
tensor([[[-0.0182, -0.0390,  0.1138,  ..., -0.0640, -0.0653, -0.0171],
         [ 0.0100,  0.0484,  0.1949,  ...,  0.0253, -0.0969,  0.1234],
         [-0.0016,  0.0361, -0.0795,  ...,  0.1508,  0.2554,  0.2770],
         ...,
         [-0.0054, -0.1017,  0.0493,  ...,  0.2182, -0.3217,  0.0412],
         [ 0.0054, -0.0532,  0.0386,  ..., -0.0453,  0.0040,  0.0087],
         [ 0.1303,  0.0337,  0.1697,  ...,  0.0687,  0.0150, -0.0134]]])
tensor([[[-0.0562, -0.0288, -0.0794,  ...,  0.0294,  0.0267, -0.0297],
         [ 0.0484,  0.0211, -0.1639,  ...,  0.2180,  0.1237,  0.2499],
         [ 0.1247,  0.0020, -0.2040,  ...,  0.2489,  0.2612,  0.1337],
         ...,
         [ 0.0484,  0.0211, -0.1639,  ...,  0.2180,  0.1237,  0.2499],
         [ 0.0282,  0.0848, -0.1306,  ...,  0.0290,  0.2118,  0.2537],
         [-0.0147,  0.0129, -0.1052,  ...,  0.1180,  0.1378,  0.2031]]])
tensor([[-0.0182, -0.0390,  0.1138,  ..., 

In [None]:
def sort_sinhala_sentences_cosine(sentences, embedding_function, tokenizer):
    """Rank sentences by cosine similarity to the first sentence.

    Args:
        sentences: Sequence of sentence strings; ``sentences[0]`` is the query.
        embedding_function: Callable ``(sentence, tokenizer) -> tensor``.
            A 3-D result has its first batch item mean-pooled over the token
            axis, a 2-D result contributes its first row, and any other rank
            is used unchanged.
        tokenizer: Passed through to ``embedding_function``.

    Returns:
        A list of ``[sentence, similarity]`` pairs ordered from most to least
        similar, where ``similarity`` is a 0-d tensor. The query itself is
        included. An empty ``sentences`` yields an empty list.
    """
    # torch.stack rejects an empty list, so answer the empty case up front.
    if len(sentences) == 0:
        return []

    embeddings = []
    for sentence in sentences:
        embedding = embedding_function(sentence, tokenizer)

        # Reduce every result to a single 1-D sentence vector.
        if embedding.dim() == 3:
            embedding = embedding[0].mean(dim=0)
        elif embedding.dim() == 2:
            embedding = embedding[0]

        embeddings.append(embedding)

    embeddings = torch.stack(embeddings)  # [num_sentences, embedding_dim]
    print(embeddings.shape)

    # The query row is broadcast against every row of the stacked matrix.
    query = embeddings[0].unsqueeze(0)
    similarities = torch.nn.functional.cosine_similarity(query, embeddings, dim=1)

    # Higher similarity means closer, hence descending order.
    order = torch.argsort(similarities, descending=True).tolist()
    return [[sentences[i], similarities[i]] for i in order]

# Sample Sinhala sentences
sinhala_sentences = [
    "පාපන්දු ක්‍රීඩකයෙක් පන්දුවට පයින් ගසයි",
    "ආහාර ඔබට පෝෂණය ලබා දෙයි",
    "සංස්කෘතිය අපේ සම්පත් එකකි.",
    "ආහාරය කෑම වැදගත් වේ.",
    "පාපන්දු ක්‍රීඩකයෝ පිටියේ සිටිති.",
    "මෙම වර්ෂයේ ලෝක කුසලාන තරඟය ඉතා රසවත් වේ.",
    "මට කෑම වලට ලොකු ඇල්මක් තියෙනවා.",
    "අවසන් තරගය ත්‍රාසජනක සංදර්ශනයක් විය.",
    "මම රසවත් ආහාර වේලක් භුක්ති විඳිනවා, විශේෂයෙන් තරඟයකින් පසුව",
    "චිකන් සහ ස්පයිසි කරි හොඳ සංයෝජනයක්.",
    "ආච්චි යතුරකින් වීදුරු සේප්පුවක් කඩනවා"
]

# Rank the sample sentences against the first one with each embedding variant
# and print every ranking under its own heading.
for heading, embed_fn in (
    ("Sorted Sinhala Sentences by Similarity to First Element (Dynamic):",
     get_token_wise_dynamic_embeddings),
    ("\nSorted Sinhala Sentences by Similarity to First Element (Static):",
     get_token_wise_static_embeddings),
    ("\nSorted Sinhala Sentences by Similarity to First Element (Dynamic-sentence):",
     get_sentence_wise_dynamic_embedding),
):
    sorted_sentences = sort_sinhala_sentences_cosine(sinhala_sentences, embed_fn, tokenizer)
    print(heading)
    # NOTE: `distance` holds a cosine similarity here (higher = closer).
    for sentence, distance in sorted_sentences:
        print(f"{distance}: {sentence}")

torch.Size([11, 1024])
Sorted Sinhala Sentences by Similarity to First Element (Dynamic):
1.000000238418579: පාපන්දු ක්‍රීඩකයෙක් පන්දුවට පයින් ගසයි
0.9978391528129578: ආච්චි යතුරකින් වීදුරු සේප්පුවක් කඩනවා
0.9977038502693176: අවසන් තරගය ත්‍රාසජනක සංදර්ශනයක් විය.
0.9976841807365417: මම රසවත් ආහාර වේලක් භුක්ති විඳිනවා, විශේෂයෙන් තරඟයකින් පසුව
0.9976675510406494: පාපන්දු ක්‍රීඩකයෝ පිටියේ සිටිති.
0.997266411781311: චිකන් සහ ස්පයිසි කරි හොඳ සංයෝජනයක්.
0.9968658685684204: ආහාර ඔබට පෝෂණය ලබා දෙයි
0.9967818856239319: මෙම වර්ෂයේ ලෝක කුසලාන තරඟය ඉතා රසවත් වේ.
0.9966422915458679: මට කෑම වලට ලොකු ඇල්මක් තියෙනවා.
0.9956303834915161: ආහාරය කෑම වැදගත් වේ.
0.99541175365448: සංස්කෘතිය අපේ සම්පත් එකකි.
torch.Size([11, 1024])

Sorted Sinhala Sentences by Similarity to First Element (Static):
0.9999998807907104: පාපන්දු ක්‍රීඩකයෙක් පන්දුවට පයින් ගසයි
0.9568371772766113: පාපන්දු ක්‍රීඩකයෝ පිටියේ සිටිති.
0.9303717017173767: මම රසවත් ආහාර වේලක් භුක්ති විඳිනවා, විශේෂයෙන් තරඟයකින් පසුව
0.9274910092353821: ආච්ච

In [None]:
def sort_sinhala_sentences_euclidean(sentences, embedding_function, tokenizer):
    """Rank sentences by Euclidean distance to the first sentence.

    Args:
        sentences: Sequence of sentence strings; ``sentences[0]`` is the query.
        embedding_function: Callable ``(sentence, tokenizer) -> tensor``.
            A 3-D result has its first batch item mean-pooled over the token
            axis, a 2-D result contributes its first row, and any other rank
            is used unchanged.
        tokenizer: Passed through to ``embedding_function``.

    Returns:
        A list of ``[sentence, distance]`` pairs ordered from nearest to
        farthest, where ``distance`` is a 0-d tensor. The query itself is
        included with distance 0. An empty ``sentences`` yields an empty list.
    """
    # torch.stack rejects an empty list, so answer the empty case up front.
    if len(sentences) == 0:
        return []

    embeddings = []
    for sentence in sentences:
        embedding = embedding_function(sentence, tokenizer)
        # Reduce every result to a single 1-D sentence vector.
        if embedding.dim() == 3:
            embedding = embedding[0].mean(dim=0)
        elif embedding.dim() == 2:
            embedding = embedding[0]
        embeddings.append(embedding)

    embeddings = torch.stack(embeddings)  # [num_sentences, embedding_dim]

    # Row-wise L2 norm of the offset from the query embedding.
    distances = torch.norm(embeddings - embeddings[0], dim=1)

    # Smaller distance means closer, hence ascending order.
    order = torch.argsort(distances).tolist()
    return [[sentences[i], distances[i]] for i in order]

# Sample Sinhala sentences
sinhala_sentences = [
    "පාපන්දු ක්‍රීඩකයෙක් පන්දුවට පයින් ගසයි",
    "ආහාර ඔබට පෝෂණය ලබා දෙයි",
    "සංස්කෘතිය අපේ සම්පත් එකකි.",
    "ආහාරය කෑම වැදගත් වේ.",
    "පාපන්දු ක්‍රීඩකයෝ පිටියේ සිටිති.",
    "මෙම වර්ෂයේ ලෝක කුසලාන තරඟය ඉතා රසවත් වේ.",
    "මට කෑම වලට ලොකු ඇල්මක් තියෙනවා.",
    "අවසන් තරගය ත්‍රාසජනක සංදර්ශනයක් විය.",
    "මම රසවත් ආහාර වේලක් භුක්ති විඳිනවා, විශේෂයෙන් තරඟයකින් පසුව",
    "චිකන් සහ ස්පයිසි කරි හොඳ සංයෝජනයක්.",
    "ආච්චි යතුරකින් වීදුරු සේප්පුවක් කඩනවා"
]

# Rank the sample sentences against the first one with each embedding variant
# (Euclidean distance, nearest first) and print every ranking under its own heading.
for heading, embed_fn in (
    ("Sorted Sinhala Sentences by Similarity to First Element (Dynamic):",
     get_token_wise_dynamic_embeddings),
    ("\nSorted Sinhala Sentences by Similarity to First Element (Static):",
     get_token_wise_static_embeddings),
    ("\nSorted Sinhala Sentences by Similarity to First Element (Dynamic-sentence):",
     get_sentence_wise_dynamic_embedding),
):
    sorted_sentences = sort_sinhala_sentences_euclidean(sinhala_sentences, embed_fn, tokenizer)
    print(heading)
    for sentence, distance in sorted_sentences:
        print(f"{distance}: {sentence}")

Sorted Sinhala Sentences by Similarity to First Element (Dynamic):
0.0: පාපන්දු ක්‍රීඩකයෙක් පන්දුවට පයින් ගසයි
2.011542797088623: ආච්චි යතුරකින් වීදුරු සේප්පුවක් කඩනවා
2.07136607170105: අවසන් තරගය ත්‍රාසජනක සංදර්ශනයක් විය.
2.0744059085845947: මම රසවත් ආහාර වේලක් භුක්ති විඳිනවා, විශේෂයෙන් තරඟයකින් පසුව
2.10471510887146: පාපන්දු ක්‍රීඩකයෝ පිටියේ සිටිති.
2.266664743423462: චිකන් සහ ස්පයිසි කරි හොඳ සංයෝජනයක්.
2.4402577877044678: ආහාර ඔබට පෝෂණය ලබා දෙයි
2.4787099361419678: මෙම වර්ෂයේ ලෝක කුසලාන තරඟය ඉතා රසවත් වේ.
2.529214382171631: මට කෑම වලට ලොකු ඇල්මක් තියෙනවා.
2.8879141807556152: ආහාරය කෑම වැදගත් වේ.
2.9531421661376953: සංස්කෘතිය අපේ සම්පත් එකකි.

Sorted Sinhala Sentences by Similarity to First Element (Static):
0.0: පාපන්දු ක්‍රීඩකයෙක් පන්දුවට පයින් ගසයි
0.9254924058914185: පාපන්දු ක්‍රීඩකයෝ පිටියේ සිටිති.
1.2006688117980957: මම රසවත් ආහාර වේලක් භුක්ති විඳිනවා, විශේෂයෙන් තරඟයකින් පසුව
1.2344388961791992: ආච්චි යතුරකින් වීදුරු සේප්පුවක් කඩනවා
1.258924961090088: චිකන් සහ ස්පයිසි කරි හොඳ ස

In [None]:
# Test static embedding per word
# Vocabulary for the static word-embedding test, grouped by topic so that
# nearest-neighbour rankings can be eyeballed. The dict is used as an ordered
# set (every value is True). English glosses are approximate.
# The original literal listed "මහත්තයා" twice; a dict keeps only one entry per
# key, so the repeated line has been dropped without changing the contents.
sinhala_words = {
    # Relatives
    "අම්මා": True,    # Mother
    "තාත්තා": True,  # Father
    "පුතා": True,     # Son
    "දුව": True,      # Daughter
    "මහත්තයා": True, # Gentleman / husband (colloquial)
    "බිරිඳ": True,    # Wife
    "සහෝදරයා": True, # Brother
    "සහෝදරිය": True, # Sister
    "නෑනා": True,     # Female cousin (approx.; previously glossed as "Uncle")
    "අක්කා": True,    # Older sister
    "අයියා": True,    # Older brother
    # Vehicles
    "රථය": True,        # Car / vehicle
    "බස්": True,         # Bus
    "ලොරිය": True,      # Lorry
    "මෝටර්සයිකලය": True, # Motorcycle
    "බයිසිකලය": True,    # Bicycle
    "නැව": True,          # Ship/Boat
    # Animals
    "වෘකයා": True,      # Wolf
    "වලසා": True,       # Bear (approx.; previously glossed as "Fox")
    "කුකුලා": True,     # Rooster / chicken
    "ගෝනා": True,       # Sambar deer (approx.; previously glossed as "Cow/Bull")
    "ගිරවා": True,      # Parrot
    "කිඹුලා": True,     # Crocodile
    "හාවා": True,       # Rabbit
    "බල්ලා": True,      # Dog
    "අලියා": True,      # Elephant
    "මකුණා": True,      # Bedbug (approx.; previously glossed as "Fly")
}
# for sentence in sinhala_sentences:
#     for word in sentence.split():
#         sinhala_words[word] = True

def get_word_embedding(word):
    """Return a single static embedding vector for ``word``.

    The word is tokenized without special tokens, each sub-word piece is
    looked up in the module-level ``token_embedding_layer``, and the piece
    vectors are averaged so that a word split into several pieces still maps
    to one vector.

    Args:
        word: The word to embed.

    Returns:
        The mean of the word's piece embeddings with the batch axis removed.
    """
    # add_special_tokens=False: only the word's own pieces take part in the mean.
    piece_ids = tokenizer(word, return_tensors="pt", add_special_tokens=False)["input_ids"]
    piece_vectors = token_embedding_layer[piece_ids]

    # Average over the piece axis, then drop the leading batch axis.
    return piece_vectors.mean(dim=1).squeeze(0)

def get_word_embeddings(word_dict, embedding_function):
    """
    Map every word in ``word_dict`` to the result of ``embedding_function(word)``.

    Only the keys of ``word_dict`` are used; insertion order is preserved.
    """
    return {word: embedding_function(word) for word in word_dict}

def sort_words_by_euclidean(target_word, word_embeddings):
    """
    Rank all words by Euclidean distance to ``target_word``, nearest first.

    Returns a list of ``(word, distance)`` tuples where ``distance`` is a 0-d
    tensor; the target itself is included. Raises ``KeyError`` if
    ``target_word`` is not in ``word_embeddings``.
    """
    anchor = word_embeddings[target_word]
    scored = [
        (word, torch.norm(vector - anchor))
        for word, vector in word_embeddings.items()
    ]
    scored.sort(key=lambda pair: pair[1])  # Sort by distance
    return scored

def sort_words_by_cosine_similarity(target_word, word_embeddings):
    """
    Rank all words by cosine similarity to ``target_word``, most similar first.

    Returns a list of ``(word, similarity)`` tuples where ``similarity`` is a
    Python float; the target itself is included. Raises ``KeyError`` if
    ``target_word`` is not in ``word_embeddings``.
    """
    anchor = word_embeddings[target_word].unsqueeze(0)

    def _similarity_to_anchor(vector):
        # Both operands carry a leading batch axis of size 1.
        score = torch.nn.functional.cosine_similarity(anchor, vector.unsqueeze(0))
        return score.item()

    scored = [
        (word, _similarity_to_anchor(vector))
        for word, vector in word_embeddings.items()
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)  # Sort by similarity
    return scored

# Example usage:
# Build one static embedding per word in sinhala_words, then rank the whole
# vocabulary against the word glossed as "Car" in that dictionary.
word_embeddings = get_word_embeddings(sinhala_words, get_word_embedding)

# Sort by Euclidean distance (nearest first; distances are 0-d tensors)
sorted_by_euclidean = sort_words_by_euclidean("රථය", word_embeddings)
print(sorted_by_euclidean)

# Sort by cosine similarity (most similar first; similarities are floats)
sorted_by_cosine_similarity = sort_words_by_cosine_similarity("රථය", word_embeddings)
print(sorted_by_cosine_similarity)


[('රථය', tensor(0.)), ('මෝටර්සයිකලය', tensor(3.4834)), ('බයිසිකලය', tensor(3.6822)), ('වෘකයා', tensor(3.8918)), ('ගිරවා', tensor(3.9570)), ('බල්ලා', tensor(3.9878)), ('අලියා', tensor(4.0031)), ('ලොරිය', tensor(4.0526)), ('මකුණා', tensor(4.0745)), ('කිඹුලා', tensor(4.1068)), ('සහෝදරිය', tensor(4.1380)), ('කුකුලා', tensor(4.1419)), ('නැව', tensor(4.1887)), ('අක්කා', tensor(4.1988)), ('සහෝදරයා', tensor(4.2195)), ('ගෝනා', tensor(4.2482)), ('වලසා', tensor(4.2542)), ('හාවා', tensor(4.3466)), ('නෑනා', tensor(4.3740)), ('බිරිඳ', tensor(4.6151)), ('බස්', tensor(4.6200)), ('දුව', tensor(4.8071)), ('මහත්තයා', tensor(4.8683)), ('පුතා', tensor(4.9390)), ('අම්මා', tensor(4.9434)), ('අයියා', tensor(4.9692)), ('තාත්තා', tensor(5.0859))]
[('රථය', 1.0000001192092896), ('මෝටර්සයිකලය', 0.6564677357673645), ('බයිසිකලය', 0.619088351726532), ('අලියා', 0.5971981287002563), ('සහෝදරිය', 0.5745132565498352), ('බිරිඳ', 0.574262797832489), ('බල්ලා', 0.5729954838752747), ('වෘකයා', 0.5686779022216797), ('ගිරවා', 0.5

#### Load text for chunking

In [None]:
# Mount Google Drive at /content/drive so files stored there can be opened by path.
from google.colab import drive
drive.mount('/content/drive')

def split_text_into_chunks(file_path, word_count_per_chunk):
    """Split a UTF-8 text file into chunks of whitespace-separated words.

    Args:
        file_path: Path of the text file to read.
        word_count_per_chunk: Positive number of words per chunk. A
            fractional value is rounded up to the next whole word.

    Returns:
        A list of strings, each holding ``word_count_per_chunk`` words joined
        by single spaces; only the final chunk may be shorter. A file with no
        words yields an empty list.

    Raises:
        ValueError: If ``word_count_per_chunk`` is not positive. Previously a
            zero or negative size silently produced one-word chunks.
    """
    import math  # local import keeps this notebook cell self-contained

    if word_count_per_chunk <= 0:
        raise ValueError(
            f"word_count_per_chunk must be positive, got {word_count_per_chunk!r}"
        )

    with open(file_path, 'r', encoding='utf-8') as file:
        # split() with no argument collapses any run of whitespace, newlines included.
        words = file.read().split()

    size = math.ceil(word_count_per_chunk)
    return [
        ' '.join(words[start:start + size])
        for start in range(0, len(words), size)
    ]

# Path to your text file
file_path = '/content/drive/MyDrive/LLM_Tasks/ChatBot/Assets/Sri Lanka Constitution-Sinhala.txt'

# Target token count per chunk, converted to a word count per chunk.
# NOTE(review): 120 / 267 looks like an empirically measured words-per-token
# ratio for this text — confirm where it came from. With 128 tokens the
# integer division gives 57 words per chunk.
token_count = 128
word_count_per_chunk = token_count * 120 // 267

# Split the text into chunks
chunks = split_text_into_chunks(file_path, word_count_per_chunk)

# Print the number of chunks and the first few chunks as an example
print(f"Number of chunks: {len(chunks)}")
for i in range(min(5, len(chunks))):
  print(f"Chunk {i+1}: {chunks[i]}")

Mounted at /content/drive
Number of chunks: 1126
Chunk 1: ශ්‍රී ලංකා ප්‍රජාතාන්ත්‍රික සමාජවාදී ජනරජයේ ආණ්ඩුක්‍රම ව්‍යවස්ථාව (2022 ඔක්තෝබර් මස 31 වැනි දින දක්වා සංශෝධිතයි) (විසිඑක්වන සංශෝධනය දක්වා සංශෝධන අන්තර්ගත කරන ලද) ප්‍රතිශෝධිත මුද්‍රණය 2023 පාර්ලිමේන්තු මහ ලේකම් කාර්යාලය මගින් ප්‍රකාශිතයි. ශ්‍රී ලංකා පාර්ලිමේන්තුවේ ව්‍යවස්ථාදායක සේවා දෙපාර්තමේන්තුවේ පනත් කෙටුම්පත් කාර්යාංශය විසින් සංස්කරණය කරන ලද මෙම නිල නොවන ප්‍රතිශෝධිත මුද්‍රණය මගින් ආණ්ඩුක්‍රම ව්‍යවස්ථාවේ විසිඑක්වන සංශෝධනය දක්වා පාර්ලිමේන්තුව විසින්
Chunk 2: වරින් වර සංශෝධනය කරන ලද ශ්‍රී ලංකා ප්‍රජාතාන්ත්‍රික සමාජවාදී ජනරජයේ ආණ්ඩුක්‍රම ව්‍යවස්ථාව නැවත ප්‍රකාශයට පත් කරනු ලැබේ. අදාළ පිටු අග ඇති සටහන් මගින් ආණ්ඩුක්‍රම ව්‍යවස්ථාව සංශෝධනය කළ ඒ ඒ සංශෝධන දැක් වේ. I වන පරිච්ඡේදය ජනතාව, රජය සහ පරමාධිපත්‍යය 1. ශ්‍රී ලංකාව නිදහස්, ස්වෛරී, ස්වාධීන, ප්‍රජාතාන්ත්‍රික සමාජවාදී ජනරජයකි. ශ්‍රී ලංකාව, ශ්‍රී ලංකා ප්‍රජාතාන්ත්‍රික සමාජවාදී ජනරජය යනුවෙන් හඳුන්වනු
Chunk 3: ලැබේ. 2. ලංකා ජනරජය ඒකීය රජයකි. 3. ශ්‍රී ලංකා ජනරජයේ පරමාධිපත්‍යය ජනතාව කෙරෙහ

In [None]:
# Check how many tokens the first chunk actually produces, to validate the
# word-count-to-token-count conversion used when chunking.
sample = chunks[0]
tokenized_chunk = tokenizer(sample, return_tensors='pt', padding=True, truncation=True)

# Token count of the chunk (the tokenizer's special tokens are included), then the raw IDs.
print(tokenized_chunk['input_ids'][0].shape)
print(tokenized_chunk['input_ids'])

# Static embeddings for the same chunk: one row per token ID.
embedded_chunk = get_token_wise_static_embeddings(chunks[0], tokenizer)
print(embedded_chunk.shape)

torch.Size([129])
tensor([[     0,   7988,   5418,  19070,   1637, 133666,  13216,   2148,   1386,
          16223,   2029,  37615,  53816,  40882,   3335,  23155,   4312, 120206,
            942,  29424,  14451, 162005,     15, 151159, 202829, 142705, 168042,
          98364,   1936,  37529,  16287,  49968,  21593,  27436,  11078,  13831,
          22993,   2009,     16,     15,   8717,  11510,  25417,    942,  13236,
         196027,  49968,  21593,  27436,  11078,  13831,   2890, 162251,  17372,
           6085,  36250,     16,   1637,  57948,  27436,  11078,  13831,  22993,
         195852,      6,  57725, 140429, 103018,  18626,  84599, 130850,  52230,
           1637,  28711,  22993,   2009,      5,   7988,   5418,  19070, 211111,
          14451, 185770,  74754,  92625, 134263,   1188, 132278,   3590,  70542,
            722,  35553,  34599,  91174, 161666,   1131,  13238,      6, 212964,
           6085,  36250,   9711, 108318,  71380,   1637,  57948,  27436,  11078,
          

#### Mean pooling

#### TF-IDF pooling

In [None]:
import torch

# Toy example on random data: a weighted average across a stack of tensors,
# where each position of each tensor has its own weight.
# NOTE(review): presumably a prototype of the weighting used for TF-IDF pooling — confirm.

# Example list of tensors
tensor_list = [torch.randn(10, 50) for _ in range(5)]  # List of [tokens, embedding_dim] tensors
weight_list = [torch.rand(10) for _ in range(5)]        # List of [tokens] weight tensors

# Stack the tensors and weights
stacked_tensors = torch.stack(tensor_list)   # Shape: [num_tensors, tokens, embedding_dim]
stacked_weights = torch.stack(weight_list)   # Shape: [num_tensors, tokens]
print(stacked_tensors)
print(stacked_weights)

# Expand weights to match the tensor dimensions for broadcasting
expanded_weights = stacked_weights.unsqueeze(-1)  # Shape: [num_tensors, tokens, 1]

# Element-wise multiply and sum over all tensors
weighted_sum = (stacked_tensors * expanded_weights).sum(dim=0)  # Shape: [tokens, embedding_dim]
print(weighted_sum)

# Sum all weights to normalize
sum_weights = stacked_weights.sum(dim=0, keepdim=True)  # Shape: [1, tokens]
print(sum_weights)
# .T turns [1, tokens] into [tokens, 1] so the division broadcasts across embedding_dim.
normalized_output = weighted_sum / sum_weights.T  # Final shape: [tokens, embedding_dim]
print(normalized_output)

tensor([[[ 0.2153,  0.4921, -0.6265,  ...,  1.3016,  0.9994, -0.1390],
         [ 0.8322,  0.2636, -0.2356,  ..., -1.3547, -0.1448,  0.9996],
         [ 0.3082,  1.4038, -0.6153,  ...,  0.1792,  0.6280, -0.9640],
         ...,
         [-0.9023,  0.2659,  0.7394,  ...,  0.2594, -0.4129, -0.5549],
         [ 0.6458,  0.8583,  0.6202,  ..., -0.0410,  0.1711, -0.9131],
         [-0.2278, -1.2793,  0.2871,  ...,  0.6449, -0.0321,  0.2530]],

        [[-2.1080,  0.7545, -1.7916,  ...,  0.4332, -1.2807,  0.1734],
         [-0.1024, -0.5461, -0.6954,  ..., -0.0844,  0.0452, -0.3752],
         [ 0.3126,  1.1201,  0.0793,  ..., -0.3815,  0.7782, -0.5205],
         ...,
         [-0.1687, -0.8708,  0.8317,  ...,  0.7059, -0.8904,  0.6481],
         [-0.0639, -1.6631, -1.4152,  ...,  1.1963,  1.0938, -0.0082],
         [-1.7301, -1.6752,  1.1769,  ...,  1.0620, -0.9751, -0.6347]],

        [[ 0.0884, -1.6853,  0.6270,  ..., -1.2431, -0.7114,  1.9554],
         [ 1.1417,  0.5487, -1.1804,  ..., -0

In [None]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m53.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0


In [None]:
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [None]:
# Generate TF-IDF dictionary
import pandas as pd
from collections import Counter
from tqdm.notebook import tqdm
import faiss
import math
from concurrent.futures import ThreadPoolExecutor, as_completed

class TFIDFDictionary:
    """TF-IDF weighted pooling of token embeddings with FAISS lookup.

    Each chunk is tokenized and embedded token by token. A token-occurrence
    table (one row per chunk plus a final summary row holding document
    frequencies) provides TF-IDF weights, and every chunk vector is the
    weighted mean of its token embeddings. The chunk vectors are indexed with
    a flat L2 FAISS index for nearest-neighbour lookup.

    Args:
        tokenizer: Callable returning a mapping with an ``'input_ids'`` tensor
            of shape ``[1, token_count]`` when called with
            ``return_tensors='pt'``.
        embedding_function: Callable ``(chunk, tokenizer) -> Tensor`` whose
            last two dimensions are ``[token_count, embedding_dim]``.
    """

    def __init__(self, tokenizer, embedding_function):
        self.tokenizer = tokenizer
        self.embedding_function = embedding_function
        self.chunks = None
        self._df = None                 # Occurrence table, see _generate_dictionary
        self._chunk_embeddings = None   # Tensor of shape [chunk_count, embedding_dim]
        self._index = None              # FAISS index over _chunk_embeddings

    def initialize_chunk_embeddings(self, chunks):
        """Build the occurrence table, chunk embeddings and FAISS index."""
        self.chunks = chunks

        # Tokenize chunks
        tokenized_chunks = self._get_tokenized_chunks(chunks, self.tokenizer)        # List of Tensors of shape [1, token_count]

        # Vectorize chunks
        embedded_chunks = self._get_embedded_chunks(chunks, self.tokenizer)          # List of per-token embedding Tensors

        # Generate occurrence dataframe to calculate tf-idf weights
        self._df = self._generate_dictionary(tokenized_chunks)
        print(self._df)

        # Calculate the weight matrix
        weight_matrix = self._generate_weight_matrix(tokenized_chunks, self._df)    # List of Weights of shape [token_count]
        print(weight_matrix[:3])

        # Get complete chunk vectors
        self._chunk_embeddings = self._get_chunk_embeddings(embedded_chunks, weight_matrix)

        # Setup FAISS for lookup
        self._setup_faiss()

    def print_dictionary(self):
        """Print the token-occurrence table."""
        print(self._df)

    def set_chunks(self, chunks):
        """Replace the stored chunks.

        Raises:
            ValueError: If embeddings are loaded and their count differs from
                ``len(chunks)``.
        """
        # Check for None first: the embeddings may not be loaded yet.
        if self._chunk_embeddings is not None and len(chunks) != self._chunk_embeddings.shape[0]:
            raise ValueError("Chunk count and embedding count does not match")
        self.chunks = chunks

    def save_dictionary(self, path):
        """Write the token-occurrence table to ``path`` as CSV."""
        self._df.to_csv(path)

    def load_dictionary(self, path):
        """Load a token-occurrence table previously written by save_dictionary."""
        frame = pd.read_csv(path, index_col=0)
        # CSV headers come back as strings; restore integer token-id labels so
        # the frame matches one built in memory.
        frame.columns = [
            int(label) if isinstance(label, str) and label.isdigit() else label
            for label in frame.columns
        ]
        self._df = frame

    def save_embeddings(self, path):
        """Serialize the chunk embeddings to ``path`` with ``torch.save``."""
        torch.save(self._chunk_embeddings, path)

    def load_embeddings(self, path):
        """Load chunk embeddings from ``path`` and invalidate the FAISS index.

        Raises:
            ValueError: If chunks are set and their count differs from the
                number of loaded embeddings.
        """
        # NOTE: torch.load unpickles data; only load files from trusted sources.
        embeddings = torch.load(path)
        # Check for None first: chunks may be supplied later via set_chunks.
        if self.chunks is not None and len(self.chunks) != embeddings.shape[0]:
            raise ValueError("Chunk count and embedding count does not match")
        self._chunk_embeddings = embeddings
        # Any existing index describes the previous embeddings; rebuild lazily.
        self._index = None

    def _get_tokenized_chunks(self, chunks, tokenizer):
        """Return the ``input_ids`` tensor of every chunk."""
        return [
            tokenizer(chunk, return_tensors='pt', padding=True, truncation=True)['input_ids']
            for chunk in tqdm(chunks, desc="Tokenizing chunks: ", leave=False)
        ]

    def _get_embedded_chunks(self, chunks, tokenizer):
        """Return the per-token embeddings of every chunk."""
        return [
            self.embedding_function(chunk, tokenizer)
            for chunk in tqdm(chunks, desc="Embedding chunks: ", leave=False)
        ]

    def _get_chunk_embeddings(self, embedded_chunks, weight_matrix):
        """Pool token embeddings into one vector per chunk by weighted mean.

        Args:
            embedded_chunks: Iterable of tensors whose last two dimensions are
                ``[token_count, embedding_dim]``; a leading batch dimension of
                size 1 is accepted.
            weight_matrix: Iterable of weight tensors, one weight per token.

        Returns:
            Tensor of shape ``[chunk_count, embedding_dim]``.

        Raises:
            ValueError: If a chunk has a different number of weights than
                token embeddings.
        """
        pooled = []
        for embedded, weights in zip(embedded_chunks, weight_matrix):
            # Collapse any leading batch dimension so the sum below always runs
            # over tokens; chunks of different lengths then stack cleanly.
            token_vectors = embedded.reshape(-1, embedded.shape[-1])            # Shape [m, dim]
            weights = weights.reshape(-1).to(
                device=token_vectors.device, dtype=token_vectors.dtype
            )                                                                   # Shape [m]
            if weights.shape[0] != token_vectors.shape[0]:
                raise ValueError(
                    f"Got {weights.shape[0]} weights for {token_vectors.shape[0]} token embeddings"
                )

            total_weight = weights.sum()
            if total_weight == 0:
                # No token carries weight; fall back to a plain mean instead
                # of dividing by zero and emitting NaNs.
                pooled.append(token_vectors.mean(dim=0))
                continue

            weighted_sum = (token_vectors * weights.unsqueeze(-1)).sum(dim=0)   # Shape [dim]
            pooled.append(weighted_sum / total_weight)

        return torch.stack(pooled)                                              # Shape [n, dim]

    def _process_chunk(self, chunk):
        """Return ``(Counter of token ids, token count)`` for one tokenized chunk."""
        chunk = chunk[0].tolist()
        token_count = Counter(chunk)
        total_tokens = len(chunk)
        return token_count, total_tokens

    def _generate_dictionary(self, tokenized_chunks):
        """Build the token-occurrence table.

        Row ``i`` holds the token counts of chunk ``i`` and its length in
        ``'total_tokens'``. The final row holds, per token, the number of
        chunks containing it, and the chunk count in ``'total_tokens'``.
        Missing counts are 0.
        """
        rows = []
        document_frequency = Counter()

        for chunk in tqdm(tokenized_chunks, desc="Processing Chunks: ", leave=False):
            token_count, total_tokens = self._process_chunk(chunk)
            row = dict(token_count)
            row['total_tokens'] = total_tokens
            rows.append(row)
            # Each distinct token counts once per chunk.
            document_frequency.update(token_count.keys())

        summary = dict(document_frequency)
        summary['total_tokens'] = len(tokenized_chunks)  # Total chunks count
        rows.append(summary)

        # Build the frame in one go; row-by-row .loc assignment copies the
        # frame on every insert.
        columns = sorted(document_frequency) + ['total_tokens']
        return pd.DataFrame(rows, columns=columns).fillna(0)

    def tf_idf(self, word_freq, total_words, num_docs_with_word, total_docs):
        """Return ``(word_freq / total_words) * log((total_docs + 1) / (num_docs_with_word + 1))``."""
        tf = word_freq / total_words

        idf = math.log((total_docs + 1) / (num_docs_with_word + 1))  # Smoothing added to avoid division by zero

        return tf * idf

    def _resolve_column(self, token):
        """Return the table column label for ``token``, or None if unknown.

        Columns are integer token ids for a freshly built table but may be
        strings if the table came from CSV, so both forms are tried.
        """
        columns = self._df.columns
        if token in columns:
            return token
        as_text = str(token)
        if as_text in columns:
            return as_text
        return None

    def _generate_weight_matrix(self, tokenized_chunks, occurance_dictionary):
        """Return one tensor of per-token TF-IDF weights for every chunk."""
        summary_row = len(occurance_dictionary) - 1
        # len() includes the summary row, i.e. chunk count + 1. Kept as-is: it
        # matches embed_query and keeps every IDF strictly positive.
        total_docs = len(occurance_dictionary)
        columns = occurance_dictionary.columns

        weights = []
        for i, chunk in tqdm(enumerate(tokenized_chunks), desc="Generating Chunk Token Weights: ",
                             total=len(tokenized_chunks)):
            total_words = occurance_dictionary.at[i, 'total_tokens']
            weights_per_chunk = []
            for token in chunk[0].tolist():
                if token not in columns:
                    weights_per_chunk.append(0)
                    continue
                word_freq = occurance_dictionary.at[i, token]
                num_docs_with_word = occurance_dictionary.at[summary_row, token]
                weights_per_chunk.append(
                    self.tf_idf(word_freq, total_words, num_docs_with_word, total_docs)
                )

            weights.append(torch.tensor(weights_per_chunk, dtype=torch.float32))
        return weights

    def embed_query(self, chunk):
        """Embed a query string with the same TF-IDF weighted pooling.

        Tokens absent from the occurrence table get weight 0.

        Returns:
            Tensor of shape ``[1, embedding_dim]``.
        """
        token_ids = self.tokenizer(chunk, return_tensors='pt', padding=True, truncation=True)['input_ids']
        token_ids = token_ids[0].tolist()
        token_count = Counter(token_ids)
        total_tokens = len(token_ids)  # Tokens in the query, not characters
        total_docs = len(self._df)

        weights = []
        for token in token_ids:
            column = self._resolve_column(token)
            if column is None:
                weights.append(0)
                continue
            # The last row of the table stores document frequencies.
            num_docs_with_word = self._df[column].iloc[-1]
            weights.append(
                self.tf_idf(token_count[token], total_tokens, num_docs_with_word, total_docs)
            )

        # Same call convention as _get_embedded_chunks.
        embedded = self.embedding_function(chunk, self.tokenizer)
        return self._get_chunk_embeddings(
            [embedded], [torch.tensor(weights, dtype=torch.float32)]
        )

    @staticmethod
    def _to_faiss_array(tensor):
        """Convert a tensor to the contiguous float32 CPU array FAISS expects."""
        return tensor.detach().cpu().to(torch.float32).contiguous().numpy()

    def _setup_faiss(self):
        """(Re)build the flat L2 index over the current chunk embeddings.

        Raises:
            ValueError: If no chunk embeddings are available.
        """
        if self._chunk_embeddings is None:
            raise ValueError("Chunk embeddings are not initialized")
        vectors = self._to_faiss_array(self._chunk_embeddings)
        self._index = faiss.IndexFlatL2(vectors.shape[1])
        self._index.add(vectors)

    def _faiss_lookup(self, query, k=5):
        """Return ``(indices, distances)`` of the ``k`` nearest chunks to ``query``."""
        if self._index is None:
            self._setup_faiss()

        query_embedding = self._to_faiss_array(self.embed_query(query))
        distances, indices = self._index.search(query_embedding, k)
        return indices, distances

    def lookup(self, query, k=5):
        """Return up to ``k`` ``(chunk, distance)`` pairs nearest to ``query``.

        Raises:
            ValueError: If no chunks have been set.
        """
        if self.chunks is None:
            raise ValueError("Chunks are not set")
        indices, distances = self._faiss_lookup(query, k)
        # FAISS pads with -1 when fewer than k vectors are indexed; skip those
        # so they are not mistaken for the last chunk.
        return [(self.chunks[i], d) for i, d in zip(indices[0], distances[0]) if i >= 0]

    def encode(self, words):
        """Encodes words using the provided tokenizer."""
        return self.tokenizer.encode(words, return_tensors='pt', padding=True, truncation=True)

    def decode(self, tokens):
        """Decodes tokens back to words using the provided tokenizer."""
        return self.tokenizer.decode(tokens, skip_special_tokens=True)

In [None]:
# Trial run of the TF-IDF weighted pooling on the first five chunks only.
# `tokenizer`, `chunks` and `get_token_wise_static_embeddings` are not defined
# in this cell; they must already exist in the notebook session.
tfidf_dict = TFIDFDictionary(tokenizer, get_token_wise_static_embeddings)
tfidf_dict.initialize_chunk_embeddings(chunks[:5])

Tokenizing chunks:   0%|          | 0/5 [00:00<?, ?it/s]

Embedding chunks:   0%|          | 0/5 [00:00<?, ?it/s]

Processing Chunks:   0%|          | 0/5 [00:00<?, ?it/s]

Populating DataFrame:   0%|          | 0/5 [00:00<?, ?it/s]

   0  2    4    5    6    9   12   15   16   74  ...  211111  212964  213668  \
0  1  1  0.0  1.0  3.0  0.0  0.0  2.0  2.0  0.0  ...     1.0     1.0     0.0   
1  1  1  5.0  3.0  0.0  0.0  0.0  0.0  0.0  0.0  ...     0.0     0.0     0.0   
2  1  1  2.0  5.0  0.0  1.0  1.0  1.0  1.0  0.0  ...     0.0     0.0     0.0   
3  1  1  3.0  0.0  0.0  0.0  0.0  2.0  2.0  2.0  ...     1.0     0.0     1.0   
4  1  1  1.0  1.0  0.0  0.0  0.0  1.0  1.0  3.0  ...     0.0     0.0     0.0   
5  5  5  4.0  4.0  1.0  1.0  1.0  4.0  4.0  2.0  ...     2.0     1.0     1.0   

   223855  232260  234213  237195  240692  246551  total_tokens  
0     0.0     0.0     0.0     0.0     0.0     0.0           129  
1     0.0     0.0     0.0     1.0     0.0     0.0           127  
2     0.0     1.0     1.0     0.0     1.0     0.0           118  
3     0.0     0.0     0.0     0.0     0.0     0.0            93  
4     1.0     0.0     0.0     0.0     1.0     1.0            94  
5     1.0     1.0     1.0     1.0     2.0  

Generating Chunk Token Weights: : 0it [00:00, ?it/s]

[tensor([0.0012, 0.0087, 0.0052, 0.0087, 0.0174, 0.0066, 0.0066, 0.0066, 0.0066,
        0.0066, 0.0043, 0.0066, 0.0066, 0.0026, 0.0026, 0.0026, 0.0026, 0.0052,
        0.0048, 0.0052, 0.0036, 0.0052, 0.0052, 0.0097, 0.0097, 0.0097, 0.0097,
        0.0097, 0.0097, 0.0097, 0.0097, 0.0291, 0.0131, 0.0263, 0.0263, 0.0263,
        0.0388, 0.0194, 0.0052, 0.0052, 0.0097, 0.0097, 0.0194, 0.0048, 0.0194,
        0.0131, 0.0291, 0.0131, 0.0263, 0.0263, 0.0263, 0.0043, 0.0097, 0.0097,
        0.0131, 0.0087, 0.0052, 0.0174, 0.0194, 0.0263, 0.0263, 0.0263, 0.0388,
        0.0194, 0.0291, 0.0194, 0.0097, 0.0097, 0.0097, 0.0097, 0.0097, 0.0087,
        0.0174, 0.0043, 0.0388, 0.0194, 0.0026, 0.0087, 0.0052, 0.0087, 0.0066,
        0.0036, 0.0066, 0.0066, 0.0097, 0.0097, 0.0131, 0.0097, 0.0066, 0.0097,
        0.0097, 0.0097, 0.0043, 0.0097, 0.0097, 0.0026, 0.0052, 0.0291, 0.0097,
        0.0131, 0.0087, 0.0097, 0.0097, 0.0097, 0.0174, 0.0194, 0.0263, 0.0263,
        0.0263, 0.0388, 0.0194, 0.0291,

RuntimeError: stack expects each tensor to be equal size, but got [129, 1024] at entry 0 and [127, 1024] at entry 1

#### Hybrid pooling

#### Pipelines

Dynamic-mean pipeline

Static-mean pipeline

Dynamic-tf-idf pipeline

Static-tf-idf pipeline

Dynamic-hybrid pipeline

Static-hybrid pipeline