In [1]:
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import torch
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pretrained XLM-RoBERTa tokenizer and model from a single checkpoint name,
# so both objects are guaranteed to come from the same checkpoint.
XLMR_CHECKPOINT = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(XLMR_CHECKPOINT)
model = AutoModel.from_pretrained(XLMR_CHECKPOINT)

In [3]:
# Load the cleaned messages; "utf-8-sig" tolerates a leading byte-order mark in the file.
df = pd.read_csv("../data/cleaned_message.csv", encoding="utf-8-sig")
# Make sure every message is a string. Missing values load as NaN, and astype(str)
# alone would turn them into the literal text "nan", so blank them out first.
messages = df["Message"].fillna("").astype(str)


In [5]:
# Preview the message Series (pandas abbreviates long Series to the head and tail rows).
messages

0       ...................................\n\nSaachi ...
1       ...................................\n\n3pcs Bo...
2       ...................................\n\n3pcs Bo...
3       ...................................\n\n1 pairs...
4       ...................................\n\n1 pairs...
                              ...                        
1441     :- NIKE  CRAFT \n\n ዋጋ :- 3800\n\n size :- 40...
1442     :- GIRLS  ADIDAS BAD BUNNY \n\n ዋጋ :- 3200\n\...
1443     :- GIRLS  ADIDAS YEEZY BOOST 700 \n\n ዋጋ :- 3...
1444     :- GIRLS  JORDAN FOUR \n\n ዋጋ :- 3500\n\n siz...
1445     :- GIRLS  JORDAN FOUR COLAB WITH LOUIS VUITTO...
Name: Message, Length: 1446, dtype: object

In [None]:
# Regex-based word-level tokenizer: splits on whitespace or underscores and
# cleans punctuation noise from each resulting token.
_TOKEN_SEPARATOR = re.compile(r'[\s_]+')           # runs of whitespace and/or underscores
_LEADING_HASHES = re.compile(r'^#+')               # hashtag markers at the start of a token
_ETHIOPIC_PUNCTUATION = re.compile(r'[፡።፣፤፥፦]+')   # Ethiopic punctuation marks, removed anywhere in a token
# Tokens made up only of these characters are decoration/separators, not words.
# ':' is included so that separator tokens such as ':-' are dropped as well.
# '_' is not needed here: the separator split already removes every underscore.
_PURE_PUNCTUATION = re.compile(r'[.\-–=•·:]+')


def word_tokenize(text):
    """Split a message into cleaned word-level tokens.

    The text is split on runs of whitespace and/or underscores. For each piece,
    leading '#' characters and the Ethiopic punctuation marks ፡ ። ፣ ፤ ፥ ፦ are
    removed; pieces that end up empty or that consist solely of separator
    punctuation (., -, –, =, •, ·, :) are discarded.

    Args:
        text: The message to tokenize. Anything that is not a ``str``
            (e.g. ``None`` or a NaN float) yields an empty list.

    Returns:
        list[str]: The cleaned tokens in their original order.
    """
    if not isinstance(text, str):
        return []

    tokens = []
    # Pieces produced by the split contain no whitespace, so no per-token strip is needed.
    for raw_token in _TOKEN_SEPARATOR.split(text.strip()):
        token = _LEADING_HASHES.sub('', raw_token)
        token = _ETHIOPIC_PUNCTUATION.sub('', token)

        # Keep only non-empty tokens that are more than pure punctuation.
        if token and not _PURE_PUNCTUATION.fullmatch(token):
            tokens.append(token)

    return tokens

# Apply the tokenizer to every message.
# Missing messages are blanked before the string cast: astype(str) alone would turn
# NaN into the text "nan", which would then be emitted as a bogus token.
# An empty string tokenizes to an empty list instead.
df['Word_Tokens'] = df['Message'].fillna('').astype(str).apply(word_tokenize)

# Save to CSV. Note: each token list is written as its Python string representation,
# so it has to be parsed back into a list (e.g. with ast.literal_eval) when reloaded.
df.to_csv("../data/tokenized_messages.csv", index=False, encoding="utf-8-sig")


In [8]:
# Preview the per-message token lists produced by word_tokenize.
df['Word_Tokens']

0       [Saachi, Electric, Kettle, Borosilicate, Glass...
1       [3pcs, Bottle, Stopper, በማንኛውም, ጠርሙስ, ጫፍ, የሚገጠ...
2       [3pcs, Bottle, Stopper, በማንኛውም, ጠርሙስ, ጫፍ, የሚገጠ...
3       [1, pairs, Sneaker, Crease, Protector, ዋጋ፦, 40...
4       [1, pairs, Sneaker, Crease, Protector, ዋጋ፦, 40...
                              ...                        
1441    [NIKE, CRAFT, ዋጋ, 3800, size, 40,41,42,43,44, ...
1442    [GIRLS, ADIDAS, BAD, BUNNY, ዋጋ, 3200, size, 36...
1443    [GIRLS, ADIDAS, YEEZY, BOOST, 700, ዋጋ, 3400, s...
1444    [GIRLS, JORDAN, FOUR, ዋጋ, 3500, size, 36,37,38...
1445    [GIRLS, JORDAN, FOUR, COLAB, WITH, LOUIS, VUIT...
Name: Word_Tokens, Length: 1446, dtype: object