### Installing the necessary packages

In [1]:
import pandas as pd
from transformers import AutoTokenize

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [3]:
with open(r'C:\Users\Blen\OneDrive\Desktop\10Academy\LLM\data\labeled_telegram_data.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Process lines as needed
data = [line.strip().split('\t') for line in lines]  # Adjust the split based on your delimiter
df = pd.DataFrame(data)

In [4]:
df

Unnamed: 0,0
0,የእናት B-PRODUCT
1,ጡት I-PRODUCT
2,ወተት I-PRODUCT
3,ማጠራቀሚያ I-PRODUCT
4,ከውስጥ O
...,...
168268,0983063957 O
168269,
168270,ለማዘዝ B-PRODUCT
168271,://. O


### Using amseg tokenizer

In [5]:
from amseg.amharicSegmenter import AmharicSegmenter

In [6]:
# Initialize the Amharic segmenter
sent_punct = []
word_punct = []
segmenter = AmharicSegmenter(sent_punct, word_punct)

In [7]:
# Process lines: separate tokens and labels
data = [line.strip().split() for line in lines if line.strip()]  # Split based on spaces
tokens = [item[0] for item in data]  # Extract tokens
labels = [item[1] for item in data]  # Extract labels

In [8]:
# Function to align tokens with their respective labels
def align_tokens_with_labels(tokenizer, tokens, labels):
    aligned_tokens = []
    aligned_labels = []

    # Tokenize each word/token with associated label
    for word, label in zip(tokens, labels):
        tokenized_word = tokenizer.amharic_tokenizer(word)  # Tokenize the word
        aligned_tokens.extend(tokenized_word)  # Add tokens to the list

        # Assign the label to the first subtoken and 'O' to subsequent subtokens
        aligned_labels.extend([label] + ['O'] * (len(tokenized_word) - 1))

    return aligned_tokens, aligned_labels

In [9]:
# Example usage with different variable names
new_tokens, new_labels = align_tokens_with_labels(segmenter, tokens, labels)

In [10]:
# Output the first 20 aligned tokens and labels
for token, label in zip(new_tokens[:20], new_labels[:20]):
    print(f"{token}: {label}")

የእናት: B-PRODUCT
ጡት: I-PRODUCT
ወተት: I-PRODUCT
ማጠራቀሚያ: I-PRODUCT
ከውስጥ: O
ልብስዎ: O
ጋር: O
አብረው: O
የሚለብሱትና: O
ከጡትዎ: O
የሚፈስ: O
ወተትዎን: O
ልብስዎን: O
ሳያበላሽ: O
በማጠራቀም: O
ለልጅዎ: O
ያጠቡታል: O
ልስልስ: O
ካለ: O
ሲልከን: O


In [11]:
# Saving the final tokens and labels to a CSV file
output_df = pd.DataFrame({'Token': new_tokens, 'Label': new_labels})

In [15]:
output_df.head()

Unnamed: 0,Token,Label
0,የእናት,B-PRODUCT
1,ጡት,I-PRODUCT
2,ወተት,I-PRODUCT
3,ማጠራቀሚያ,I-PRODUCT
4,ከውስጥ,O


In [13]:
output_df.to_csv(r'C:\Users\Blen\OneDrive\Desktop\10Academy\LLM\data\final_tokens_labels.csv', index=False)