Install necessary libraries



In [None]:
!pip install datasets --quiet
!pip install pandas --quiet
!pip install transformers --quiet

In [None]:
from datasets import load_dataset
import pandas as pd
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

Load the dataset

In [None]:
# Load the "tatsu-lab/alpaca" dataset via the `datasets` library. The result is
# indexed by split name further down (`dataset['train']`).
dataset = load_dataset("tatsu-lab/alpaca")

Convert dataset to DataFrame

In [None]:
# Convert the 'train' split to a pandas DataFrame. `Dataset.to_pandas()` is the
# library's own conversion and avoids building the frame by iterating over the
# split row by row, which is what `pd.DataFrame(dataset['train'])` does.
df = dataset['train'].to_pandas()
# Keep a separate handle on the 'text' column so it remains available once the
# column is dropped from `df`.
df_text = df['text']

Prepare DataFrame for translations


In [None]:
# Remove the 'text' column from the working frame (`df_text` still holds it).
# errors='ignore' keeps this cell re-runnable: without it, a second run raises
# KeyError because 'text' has already been dropped.
df = df.drop(columns=['text'], errors='ignore')

# Empty placeholder columns that the translation loop fills in.
# NOTE(review): the columns are named "Tshivenda_*" while translate_hf defaults
# to target_lang="ur" and the result is saved as 'Urdu.csv' — confirm the
# intended target language.
df['Tshivenda_instruction'] = ''
df['Tshivenda_input'] = ''
df['Tshivenda_output'] = ''

# Preview the column that was set aside before the drop.
df_text.head()

0    Below is an instruction that describes a task....
1    Below is an instruction that describes a task....
2    Below is an instruction that describes a task....
3    Below is an instruction that describes a task....
4    Below is an instruction that describes a task....
Name: text, dtype: object

Load the Hugging Face model and tokenizer

In [None]:
# Checkpoint identifier shared by both `from_pretrained` calls so the tokenizer
# and the model are loaded from the same checkpoint.
model_name = "facebook/m2m100_418M"
# `tokenizer` and `model` are notebook-level globals; translate_hf reads them
# directly rather than taking them as arguments.
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
model = M2M100ForConditionalGeneration.from_pretrained(model_name)

Define translation function using Hugging Face model

In [None]:
def translate_hf(input_text, source_lang="en", target_lang="ur"):
    """Translate ``input_text`` using the notebook's global M2M100 objects.

    Relies on the notebook-level ``tokenizer`` and ``model`` variables.

    Args:
        input_text: Text to translate.
        source_lang: Language code assigned to ``tokenizer.src_lang`` before
            encoding. Defaults to ``"en"``.
        target_lang: Language code whose token id is forced as the first
            generated token. Defaults to ``"ur"``.

    Returns:
        The first decoded sequence produced by ``model.generate`` with special
        tokens stripped, or ``""`` when ``input_text`` is an empty or
        whitespace-only string.
    """
    # Nothing to translate: skip the model call so blank cells stay blank
    # instead of being filled with whatever the model generates for no input.
    if isinstance(input_text, str) and not input_text.strip():
        return ""

    tokenizer.src_lang = source_lang
    encoded_input = tokenizer(input_text, return_tensors="pt")
    generated_tokens = model.generate(
        **encoded_input,
        # get_lang_id is the tokenizer's documented accessor for the
        # target-language token id.
        forced_bos_token_id=tokenizer.get_lang_id(target_lang),
    )
    # batch_decode returns one string per generated sequence; a single input
    # yields a single sequence, so take the first.
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

Calculate total characters in DataFrame

In [None]:
def total_characters_in_dataframe(df):
    """Return the combined length of ``str(value)`` over every cell of ``df``.

    Args:
        df: DataFrame whose cell values are measured.

    Returns:
        The sum of ``len(str(value))`` across all cells; ``0`` if there are none.
    """
    # Flatten the 2-D value array so every cell is visited exactly once.
    return sum(len(str(cell)) for cell in df.values.flatten())

In [None]:
# Rate charged per one million characters; zero because the model is free (OS).
PRICE_PER_MILLION_CHARACTERS = 0

characters = total_characters_in_dataframe(df)
# Same arithmetic as a paid per-character estimate, so the result is the float 0.0.
price = characters / 1000000 * PRICE_PER_MILLION_CHARACTERS
print(
    f"Total characters: {characters}",
    f"Total price ${price}",
    sep="\n",
)

Total characters: 18358818
Total price $0.0


Translate the DataFrame row by row (limited to the first few rows for demonstration)

In [None]:
# For demonstration, only the first DEMO_ROW_LIMIT rows are translated.
# Bounding the range up front fixes an off-by-one in the previous
# `if i >= 5: break` check, which ran after row index 5 had already been
# processed and therefore translated six rows.
DEMO_ROW_LIMIT = 5

# (source column, destination column) pairs handled identically for each row.
TRANSLATION_COLUMNS = (
    ('instruction', 'Tshivenda_instruction'),
    ('input', 'Tshivenda_input'),
    ('output', 'Tshivenda_output'),
)

# `df.at[i, ...]` addresses rows by label; this relies on `df` having the
# default 0..n-1 index.
for i in range(min(len(df), DEMO_ROW_LIMIT)):
    for source_column, target_column in TRANSLATION_COLUMNS:
        source_text = df.at[i, source_column]
        # Empty fields are left as the '' placeholder instead of being translated.
        if source_text != '':
            df.at[i, target_column] = translate_hf(source_text)

Display the DataFrame

In [None]:
# Show the first rows of `df`, including the translation columns written by the
# loop in the previous cell.
df.head()

Unnamed: 0,instruction,input,output,Tshivenda_instruction,Tshivenda_input,Tshivenda_output
0,Give three tips for staying healthy.,,1.Eat a balanced diet and make sure to include...,صحت مند رہنے کے لئے تین تجاویز,,1.ایک متوازن غذا کھائیں اور اس بات کو یقینی بن...
1,What are the three primary colors?,,"The three primary colors are red, blue, and ye...",تین اہم رنگ کیا ہیں؟,,تین بنیادی رنگیں سرخ، نیلے اور زرد ہیں.
2,Describe the structure of an atom.,,"An atom is made up of a nucleus, which contain...",ایک اتم کی ساخت کی وضاحت کریں.,,ایک اتم ایک نوکری سے بنایا جاتا ہے، جس میں پرو...
3,How can we reduce air pollution?,,There are a number of ways to reduce air pollu...,فضائی آلودگی کو کیسے کم کیا جا سکتا ہے؟,,فضائی آلودگی کو کم کرنے کے کئی طریقے ہیں، جیسے...
4,Describe a time when you had to make a difficu...,,I had to make a difficult decision when I was ...,ایک وقت کی وضاحت کریں جب آپ کو ایک مشکل فیصلہ ...,,میں نے ایک تعمیراتی کمپنی میں ایک پروجیکٹ مینی...


Save DataFrame to a CSV file

In [None]:
# Write `df` to 'Urdu.csv' (relative path); index=False leaves the row index
# out of the file.
df.to_csv('Urdu.csv', index=False)