In [3]:
import pandas as pd

In [4]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_name = "xlm-roberta-base"  # You can also use bert-tiny-amharic if available
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=5)  # Adjust num_labels based on your entity types (e.g., products, prices, locations)


  from .autonotebook import tqdm as notebook_tqdm
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
from datasets import load_dataset

# Load the dataset with `trust_remote_code=True`
dataset = load_dataset("conll2003", split="train", trust_remote_code=True)

# Check the first few rows to ensure it's loaded correctly
print(dataset[0])


{'id': '0', 'tokens': ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7], 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0], 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}


In [6]:
from transformers import AutoTokenizer

# Load the XLM-Roberta tokenizer
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")




In [7]:

# Load the entire data from the CSV file
data_file_path = r'C:\Users\Yibabe\Desktop\10academyAIMweek-5\data\cleaned_tokenized_and_labled_data.csv' 
df = pd.read_csv(data_file_path) 


In [4]:
df.head()

Unnamed: 0,Labeled_Message
0,Car B-PRODUCT\nAromatherapy I-PRODUCT\nSolar I...
1,Car B-PRODUCT\nAromatherapy I-PRODUCT\nSolar I...
2,GW O\nHAIR O\nDRYER/Blower O\nየፀጉር O\nማድረቂያ O\...
3,2 B-PRODUCT\nin I-PRODUCT\n1 I-PRODUCT\nPorcel...
4,Plastic B-PRODUCT\nAnd I-PRODUCT\nMetal I-PROD...


In [8]:

# Assuming the relevant text column is named 'Labeled_Message'; adjust this if needed
if 'Labeled_Message' not in df.columns:
    raise ValueError("The 'Labeled_Message' column is not found in the DataFrame.")

# Redundant messages to be removed
redundant_message = [
    "ከፍለው O",
    "ካሉበት O",
    "እንልካለን O",
    "ክፍለሃገር O",
    "ላላችሁ O",
    "ደንበኞቻችን O",
    "የፈለጉትን O",
    "ዕቃ O",
    "በመልዕክት O",
    "እንልክልዎታለን O",
    "ለማዘዝ O",
    "@ordermertteka1 O",
    "@ordermertteka2 O",
    "ለወዳጅዎ O",
    "forward O",
    "በማድረግ O",
    "ይተባበሩን O",
    "0944-22-23-24 O",
    "0904-94-48-48 O",
    "አድራሻችን O",
    "መገናኛ I-LOC",
    "ዘፍመሽ O",
    "ግራንድ O",
    "ሞል O",
    "3ኛ O",
    "ፎቅ O",
    "ከሊፍት O",
    "ሲወርዱ O",
    "ወደ O",
    "ቀኝ O",
    "ታጥፈው O",
    "ቀጥታ O",
    "376 I-LOC",
    "በኪስዎ O",
    "ጥሬ O",
    "ገንዘብ O",
    "ካልያዙ O",
    "በሞባይል O",
    "ማስተላለፍ O",
    "ይችላሉ። O",
    "ይሄንን O",
    "t.me/MerttEka O",
    "ተጭነው O",
    "join O",
    "ያድርጉ፣ O",
    "ቤተሰብ O",
    "ይሁኑ O"
]

# Function to clean redundant messages
def clean_redundant_messages(text):
    # Check if the text is a string
    if isinstance(text, str):
        for message in set(redundant_message):
            text = text.replace(message + "\n", "")
        return text.strip()  # Remove leading/trailing whitespace after replacement
    return None  # Return None for non-string entries

# Clean the redundant messages in the DataFrame
df['cleaned_text'] = df['Labeled_Message'].apply(clean_redundant_messages)

# Function to split tokens and labels
def split_tokens_and_labels(text):
    if isinstance(text, str):  # Ensure the input is a string
        lines = text.strip().split('\n')
        tokens = []
        labels = []
        for line in lines:
            if line.strip():  # Ensure the line is not empty
                token, label = line.rsplit(' ', 1)
                tokens.append(token)
                labels.append(label)
        return tokens, labels
    else:
        return [], []  # Return empty lists for non-string entries

# Apply the token splitting
df['tokens_labels'] = df['cleaned_text'].apply(split_tokens_and_labels)

# Separate tokens and labels into two new columns
df['tokens'] = df['tokens_labels'].apply(lambda x: x[0])
df['labels'] = df['tokens_labels'].apply(lambda x: x[1])

# Save the cleaned DataFrame to a new CSV file
# output_file_path = r'C:\Users\Yibabe\Desktop\10academyAIMweek-5\data\cleaned_data.csv' # Update this to your desired output path
# df.to_csv(output_file_path, index=False)




In [9]:
# Display the resulting DataFrame (optional)
df[['cleaned_text', 'tokens', 'labels']]

Unnamed: 0,cleaned_text,tokens,labels
0,Car B-PRODUCT\nAromatherapy I-PRODUCT\nSolar I...,"[Car, Aromatherapy, Solar, Vortex, ይሁኑ]","[B-PRODUCT, I-PRODUCT, I-PRODUCT, I-PRODUCT, O]"
1,Car B-PRODUCT\nAromatherapy I-PRODUCT\nSolar I...,"[Car, Aromatherapy, Solar, Vortex, የመኪና, መዓዛ, ...","[B-PRODUCT, I-PRODUCT, I-PRODUCT, I-PRODUCT, O..."
2,GW O\nHAIR O\nDRYER/Blower O\nየፀጉር O\nማድረቂያ O\...,"[GW, HAIR, DRYER/Blower, የፀጉር, ማድረቂያ, ፎን, 6000...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,2 B-PRODUCT\nin I-PRODUCT\n1 I-PRODUCT\nPorcel...,"[2, in, 1, Porcelain, Dessert, Bowel, የሰላጣ, እና...","[B-PRODUCT, I-PRODUCT, I-PRODUCT, I-PRODUCT, I..."
4,Plastic B-PRODUCT\nAnd I-PRODUCT\nMetal I-PROD...,"[Plastic, And, Metal, Cubic, Cloth, Cabinet, ዘ...","[B-PRODUCT, I-PRODUCT, I-PRODUCT, I-PRODUCT, I..."
...,...,...,...
4078,2500 B-PRODUCT,[2500],[B-PRODUCT]
4079,ዋጋ B-PRODUCT\n2500 I-PRODUCT\n0983063957 O,"[ዋጋ, 2500, 0983063957]","[B-PRODUCT, I-PRODUCT, O]"
4080,የሙያ B-PRODUCT\nባለቤት I-PRODUCT\nመሆን I-PRODUCT\n...,"[የሙያ, ባለቤት, መሆን, መሠልጠን, ነው።, ቀለም, ቀቢ, ሳያስፈልግዎ,...","[B-PRODUCT, I-PRODUCT, I-PRODUCT, I-PRODUCT, I..."
4081,ቤትና B-PRODUCT\nግቢዎን I-PRODUCT\nእንዲሁም I-PRODUCT...,"[ቤትና, ግቢዎን, እንዲሁም, የብረት, እና, የእንጨት, ቁሳቁስዎን, ቀለ...","[B-PRODUCT, I-PRODUCT, I-PRODUCT, I-PRODUCT, I..."


In [8]:
df['tokens'].head()

0              [Car, Aromatherapy, Solar, Vortex, ይሁኑ]
1    [Car, Aromatherapy, Solar, Vortex, የመኪና, መዓዛ, ...
2    [GW, HAIR, DRYER/Blower, የፀጉር, ማድረቂያ, ፎን, 6000...
3    [2, in, 1, Porcelain, Dessert, Bowel, የሰላጣ, እና...
4    [Plastic, And, Metal, Cubic, Cloth, Cabinet, ዘ...
Name: tokens, dtype: object

In [9]:
df['labels'].head()

0      [B-PRODUCT, I-PRODUCT, I-PRODUCT, I-PRODUCT, O]
1    [B-PRODUCT, I-PRODUCT, I-PRODUCT, I-PRODUCT, O...
2    [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
3    [B-PRODUCT, I-PRODUCT, I-PRODUCT, I-PRODUCT, I...
4    [B-PRODUCT, I-PRODUCT, I-PRODUCT, I-PRODUCT, I...
Name: labels, dtype: object

In [9]:

# Function to split cleaned text into tokens and labels
def split_tokens_labels(row):
    cleaned_text = row['cleaned_text']
    
    # Check if cleaned_text is None or an empty string
    if cleaned_text is None or cleaned_text.strip() == "":
        return ([], []), [], []  # Return empty lists if cleaned_text is not valid
    
    # Split the cleaned text into lines
    lines = cleaned_text.strip().split('\n')
    
    # Initialize lists for tokens and labels
    tokens = []
    labels = []
    
    # Iterate through each line
    for line in lines:
        # Split each line by space
        parts = line.rsplit(' ', 1)  # Split only at the last space
        if len(parts) == 2:  # Ensure there's a token and a label
            token, label = parts
            tokens.append(token)
            labels.append(label)
    
    # Create tokens_labels format as a tuple of lists
    tokens_labels = (tokens, labels)
    
    return tokens_labels, tokens, labels

# Apply the function to the DataFrame
df[['tokens_labels', 'tokens', 'labels']] = df.apply(split_tokens_labels, axis=1, result_type='expand')



In [10]:

# Display the updated DataFrame
df[['cleaned_text', 'tokens_labels', 'tokens', 'labels']].head()

Unnamed: 0,cleaned_text,tokens_labels,tokens,labels
0,Car B-PRODUCT\nAromatherapy I-PRODUCT\nSolar I...,"([Car, Aromatherapy, Solar, Vortex, ይሁኑ], [B-P...","[Car, Aromatherapy, Solar, Vortex, ይሁኑ]","[B-PRODUCT, I-PRODUCT, I-PRODUCT, I-PRODUCT, O]"
1,Car B-PRODUCT\nAromatherapy I-PRODUCT\nSolar I...,"([Car, Aromatherapy, Solar, Vortex, የመኪና, መዓዛ,...","[Car, Aromatherapy, Solar, Vortex, የመኪና, መዓዛ, ...","[B-PRODUCT, I-PRODUCT, I-PRODUCT, I-PRODUCT, O..."
2,GW O\nHAIR O\nDRYER/Blower O\nየፀጉር O\nማድረቂያ O\...,"([GW, HAIR, DRYER/Blower, የፀጉር, ማድረቂያ, ፎን, 600...","[GW, HAIR, DRYER/Blower, የፀጉር, ማድረቂያ, ፎን, 6000...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,2 B-PRODUCT\nin I-PRODUCT\n1 I-PRODUCT\nPorcel...,"([2, in, 1, Porcelain, Dessert, Bowel, የሰላጣ, እ...","[2, in, 1, Porcelain, Dessert, Bowel, የሰላጣ, እና...","[B-PRODUCT, I-PRODUCT, I-PRODUCT, I-PRODUCT, I..."
4,Plastic B-PRODUCT\nAnd I-PRODUCT\nMetal I-PROD...,"([Plastic, And, Metal, Cubic, Cloth, Cabinet, ...","[Plastic, And, Metal, Cubic, Cloth, Cabinet, ዘ...","[B-PRODUCT, I-PRODUCT, I-PRODUCT, I-PRODUCT, I..."


In [11]:
# Create a dataset for training
train_data = [(row['tokens'], row['labels']) for _, row in df.iterrows() if row['tokens'] and row['labels']]


In [12]:
df1 = df

In [12]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word in the original sentence
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [14]:
from transformers import AutoTokenizer

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')




In [15]:
def tokenize_and_align_labels(examples):
    # Tokenize the tokens column
    tokenized_inputs = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)

    # Create empty labels list
    labels = []
    for i, label in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Get the word IDs after tokenization
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # If there is no word ID, append a label for special tokens (CLS, SEP, etc.)
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # Label for the first sub-token of a word
                label_ids.append(label[word_idx])
            else:
                # Label for the subsequent sub-tokens of a word
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    # Add the labels to the tokenized inputs
    tokenized_inputs['labels'] = labels
    return tokenized_inputs


In [16]:
# Apply the tokenization and label alignment
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)


In [38]:
pip install transformers[torch] accelerate


Collecting accelerateNote: you may need to restart the kernel to use updated packages.

  Downloading accelerate-0.34.2-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-0.34.2-py3-none-any.whl (324 kB)
Installing collected packages: accelerate
Successfully installed accelerate-0.34.2


In [17]:
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Assuming df1 is your DataFrame containing the tokenized data
train_df, val_df = train_test_split(df1, test_size=0.1, random_state=42)  # 10% for validation

# Convert DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Check the datasets
print(f"Training dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")


Training dataset size: 3674
Validation dataset size: 409


In [18]:
train_df

Unnamed: 0,Labeled_Message,cleaned_text,tokens_labels,tokens,labels
3867,ታጣፊ B-PRODUCT\nየመስክ I-PRODUCT\nአልጋ። I-PRODUCT\...,ታጣፊ B-PRODUCT\nየመስክ I-PRODUCT\nአልጋ። I-PRODUCT\...,"([ታጣፊ, የመስክ, አልጋ።, ከጠንካራ, ሸራ, እና, ከጠንካራ, አልሙኒየ...","[ታጣፊ, የመስክ, አልጋ።, ከጠንካራ, ሸራ, እና, ከጠንካራ, አልሙኒየም...","[B-PRODUCT, I-PRODUCT, I-PRODUCT, I-PRODUCT, I..."
810,ProLISS B-PRODUCT\nDEEP I-PRODUCT\nOIL I-PRODU...,ProLISS B-PRODUCT\nDEEP I-PRODUCT\nOIL I-PRODU...,"([ProLISS, DEEP, OIL, FRYER, አሳንቡሳ፣, ቺብስ፣, ዓሳ፣...","[ProLISS, DEEP, OIL, FRYER, አሳንቡሳ፣, ቺብስ፣, ዓሳ፣,...","[B-PRODUCT, I-PRODUCT, I-PRODUCT, I-PRODUCT, O..."
2506,mini B-PRODUCT\nwashing I-PRODUCT\nmachine I-P...,mini B-PRODUCT\nwashing I-PRODUCT\nmachine I-P...,"([mini, washing, machine], [B-PRODUCT, I-PRODU...","[mini, washing, machine]","[B-PRODUCT, I-PRODUCT, I-PRODUCT]"
3653,ሶስተኛው B-PRODUCT\nስጦታ I-PRODUCT,ሶስተኛው B-PRODUCT\nስጦታ I-PRODUCT,"([ሶስተኛው, ስጦታ], [B-PRODUCT, I-PRODUCT])","[ሶስተኛው, ስጦታ]","[B-PRODUCT, I-PRODUCT]"
3468,Hand B-PRODUCT\nmixer I-PRODUCT\nwith I-PRODUC...,Hand B-PRODUCT\nmixer I-PRODUCT\nwith I-PRODUC...,"([Hand, mixer, with, 3, cake, molds, 2100, ብር,...","[Hand, mixer, with, 3, cake, molds, 2100, ብር, ...","[B-PRODUCT, I-PRODUCT, I-PRODUCT, I-PRODUCT, I..."
...,...,...,...,...,...
1130,Silcon B-PRODUCT\nShower I-PRODUCT\nBrush I-PR...,Silcon B-PRODUCT\nShower I-PRODUCT\nBrush I-PR...,"([Silcon, Shower, Brush, ፈሳሽ, ሳሙና, ማስቀመጫ, አለው,...","[Silcon, Shower, Brush, ፈሳሽ, ሳሙና, ማስቀመጫ, አለው, ...","[B-PRODUCT, I-PRODUCT, I-PRODUCT, O, O, O, O, ..."
1294,Wass B-PRODUCT\nMitad I-PRODUCT\nዋስ O\nምጣድ O\n...,Wass B-PRODUCT\nMitad I-PRODUCT\nዋስ O\nምጣድ O\n...,"([Wass, Mitad, ዋስ, ምጣድ, 16, inch(41, cm), ስፋት,...","[Wass, Mitad, ዋስ, ምጣድ, 16, inch(41, cm), ስፋት, ...","[B-PRODUCT, I-PRODUCT, O, O, I-PRICE, O, O, O,..."
860,Flat B-PRODUCT\nMop I-PRODUCT\nSet I-PRODUCT\n...,Flat B-PRODUCT\nMop I-PRODUCT\nSet I-PRODUCT\n...,"([Flat, Mop, Set, የወለልና, የመስታወት, መወልወያ, ትልቁ, መ...","[Flat, Mop, Set, የወለልና, የመስታወት, መወልወያ, ትልቁ, መጠ...","[B-PRODUCT, I-PRODUCT, I-PRODUCT, O, O, O, O, ..."
3507,Serving B-PRODUCT\ntray I-PRODUCT\nለዳቦ O\nለእንጀ...,Serving B-PRODUCT\ntray I-PRODUCT\nለዳቦ O\nለእንጀ...,"([Serving, tray, ለዳቦ, ለእንጀራ, ለፈንዲሻ, እና, ሌሎች, ም...","[Serving, tray, ለዳቦ, ለእንጀራ, ለፈንዲሻ, እና, ሌሎች, ምግ...","[B-PRODUCT, I-PRODUCT, O, O, O, O, O, O, O, O,..."


In [19]:
val_dataset

Dataset({
    features: ['Labeled_Message', 'cleaned_text', 'tokens_labels', 'tokens', 'labels', '__index_level_0__'],
    num_rows: 409
})

In [20]:
train_dataset

Dataset({
    features: ['Labeled_Message', 'cleaned_text', 'tokens_labels', 'tokens', 'labels', '__index_level_0__'],
    num_rows: 3674
})

In [21]:
def tokenize_and_align_labels(examples):
    # Tokenize the inputs
    tokenized_inputs = tokenizer(examples['tokens'], padding='max_length', truncation=True, is_split_into_words=True)

    # Initialize labels
    labels = []
    for i, label in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(i)  # Get word ids corresponding to the tokens
        label_ids = [-100] * len(tokenized_inputs['input_ids'])  # Default to -100 (ignore index)

        for word_idx in set(word_ids):
            if word_idx is None: 
                continue  # Skip special tokens
            # Use the label for the first token of the word
            label_ids[word_ids.index(word_idx)] = label[word_idx]

        labels.append(label_ids)

    tokenized_inputs['labels'] = labels
    return tokenized_inputs


In [22]:

# Apply the tokenization and label alignment
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map: 100%|██████████| 14041/14041 [00:16<00:00, 856.57 examples/s]


In [1]:
print(tokenized_dataset.column_names)  # This should show 'input_ids', 'attention_mask', and 'labels'


NameError: name 'tokenized_dataset' is not defined

In [25]:
from transformers import TrainingArguments, Trainer

# Step 7: Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",  # Output directory for model predictions and checkpoints
    evaluation_strategy="epoch",  # Evaluation strategy to adopt during training
    save_strategy="epoch",  # Save strategy to adopt during training
    learning_rate=2e-5,  # Learning rate for optimization
    per_device_train_batch_size=16,  # Batch size for training
    per_device_eval_batch_size=16,  # Batch size for evaluation
    num_train_epochs=3,  # Total number of training epochs
    weight_decay=0.01,  # Strength of weight decay
    save_total_limit=2,  # Limit the total amount of checkpoints
    load_best_model_at_end=True,  # Load the best model when finished training
)


In [26]:

# Step 8: Create Trainer
trainer = Trainer(
    model=model,  # The instantiated 🤗 Transformers model to be trained
    args=training_args,  # Training arguments, defined above
    train_dataset=tokenized_dataset,  # Training dataset
    eval_dataset=tokenized_dataset,  # Evaluation dataset
)



In [27]:

# Step 9: Fine-tune the model
trainer.train()



  0%|          | 0/2634 [00:00<?, ?it/s]