### get the dataset
https://huggingface.co/datasets/KisanVaani/agriculture-qa-english-only

In [34]:
# !pip install datasets
from datasets import load_dataset
from tqdm import tqdm

en_ds = load_dataset("KisanVaani/agriculture-qa-english-only", split="train")
print(en_ds)

Dataset({
    features: ['question', 'answers'],
    num_rows: 22615
})


In [35]:
# test the dataset
sample_row = en_ds[1]
print("Question:", sample_row['question'])
print("Answer:", sample_row['answers'])


Question: What farming practice helps prevent soil erosion?
Answer: Crop Rotation


### load the model

In [36]:
# git clone https://github.com/quickmt/quickmt.git
# uv pip install ./quickmt/
# quickmt-model-download quickmt/quickmt-en-ar ./quickmt-en-ar


In [37]:
from quickmt import Translator

# Auto-detects GPU, set to "cpu" to force CPU inference
t = Translator("./quickmt-en-ar/", device="auto")


In [38]:
# Translate - set beam size to 5 for higher quality (but slower speed)
sample_text = "\n".join([sample_row['question'], sample_row['answers']])
sample_text_ar = t(sample_text, beam_size=7, max_decoding_length=1024)
print(sample_text_ar)


ما هي الممارسات الزراعية التي تساعد على منع تآكل التربة؟
دوران المحاصيل


In [39]:
# make the same input structure from the translated sample_text_ar
question_ar, answers_ar = sample_text_ar.split("\n", 1)
print("Question (AR):", question_ar)
print("Answer (AR):", answers_ar)

Question (AR): ما هي الممارسات الزراعية التي تساعد على منع تآكل التربة؟
Answer (AR): دوران المحاصيل


# batching the dataset in the correct format

In [40]:
# Settings
BATCH_SIZE = 512  # Adjust based on your GPU memory
question_ar_list = []
answers_ar_list = []

# Loop through the dataset in chunks (batches)
for i in tqdm(range(0, len(en_ds), BATCH_SIZE), desc="Translating dataset"):
    # 1. Slice the batch
    batch_indices = slice(i, i + BATCH_SIZE)
    batch = en_ds[batch_indices]
    
    # 2. Prepare inputs: Combine Q and A with Newline
    # We create a list of strings for the model
    src_batch = [
        f"{q}\n{a}" for q, a in zip(batch['question'], batch['answers'])
    ]

    # 3. Translate the whole batch at once
    try:
        translated_batch = t(src_batch, beam_size=7, max_decoding_length=1024, max_batch_size=BATCH_SIZE)
    except Exception as e:
        print(f"Batch translation error at index {i}: {e}")
        # Fill with empty strings to preserve alignment if the whole batch crashes
        translated_batch = ["\n"] * len(src_batch)

    # 4. Process results and handle Split errors safely
    for text in translated_batch:
        if "\n" in text:
            # Success: Split by the first newline found
            q_ar, a_ar = text.split("\n", 1)
            question_ar_list.append(q_ar.strip())
            answers_ar_list.append(a_ar.strip())
        else:
            # Failure: Model deleted the newline separator. 
            # DO NOT CONTINUE. We must append *something* to keep the columns aligned.
            # Strategy: Put the whole text in Question, leave Answer empty/flagged.
            question_ar_list.append(text.strip())
            answers_ar_list.append("[Error: Separator Missing]")

# 5. Add new columns to the existing dataset
# This will now work because len(list) == len(dataset)
en_ds = en_ds.add_column("question_ar", question_ar_list)
en_ds = en_ds.add_column("answers_ar", answers_ar_list)

# quick sanity check
print("Total rows:", len(en_ds))
print(en_ds[0])

Translating dataset: 100%|██████████| 45/45 [05:11<00:00,  6.92s/it]

Total rows: 22615
{'question': 'why is crop rotation important in farming?', 'answers': 'This helps to prevent soil erosion and depletion, and can also help to control pests and diseases', 'question_ar': 'لماذا يعتبر تناوب المحاصيل مهمًا في الزراعة؟', 'answers_ar': 'هذا يساعد على منع تآكل التربة واستنزافها ، ويمكن أن يساعد أيضًا في مكافحة الآفات والأمراض'}





In [41]:
import pandas as pd

# Convert the first 5 rows to a DataFrame to visualize "The Head"
df_head = pd.DataFrame(en_ds[:5])

# Print the DataFrame
df_head

Unnamed: 0,question,answers,question_ar,answers_ar
0,why is crop rotation important in farming?,This helps to prevent soil erosion and depleti...,لماذا يعتبر تناوب المحاصيل مهمًا في الزراعة؟,هذا يساعد على منع تآكل التربة واستنزافها ، ويم...
1,What farming practice helps prevent soil erosion?,Crop Rotation,ما هي الممارسات الزراعية التي تساعد على منع تآ...,دوران المحاصيل
2,what is crop rotation,Crop rotation is the practice of growing a ser...,ما هو دوران المحاصيل,تناوب المحاصيل هو ممارسة زراعة سلسلة من المحاص...
3,what are the different methods of irrigation?,"surface irrigation, drip irrigation, and sprin...",ما هي طرق الري المختلفة؟,الري السطحي، الري بالتنقيط، والري بالرش
4,why is soil health vital?,Soil health is critical to crop growth and pro...,لماذا تعتبر صحة التربة حيوية؟,صحة التربة أمر بالغ الأهمية لنمو المحاصيل والإ...


# search dataset for errors using Pandas.

We need to look for two things:
1.  **Separator Missing:** The specific string we hardcoded.
2.  **Batch Crash:** If `translated_batch` became `["\n"]`, the code `text.split("\n", 1)` would result in **empty strings** for both Question and Answer. So we search for empty translations.


In [45]:
import pandas as pd

# 1. Convert Hugging Face dataset to Pandas DataFrame for searching
df = en_ds.to_pandas()

# 2. Define the filters
# Filter A: Separator Missing (Model ate the newline)
cond_separator_missing = df['answers_ar'] == "[Error: Separator Missing]"

# Filter B: Batch Crash (Empty strings resulting from the ["\n"] fallback)
cond_batch_crash = (df['question_ar'] == "") & (df['answers_ar'] == "")

# Filter C: Extra Newlines (The answer column contains specific newline characters)
cond_has_newline = df['answers_ar'].str.contains("\n", na=False)

# 3. Combine filters (Find rows matching ANY of these conditions)
error_rows = df[cond_separator_missing | cond_batch_crash | cond_has_newline]

# 4. Display findings
print(f"Total Error Rows Found: {len(error_rows)}")

if len(error_rows) > 0:
    print("\n--- Sample of Error Rows ---")
    # Show the English source and the Arabic Result to see what happened
    print(error_rows[['question', 'answers', 'question_ar', 'answers_ar']].head(1))
else:
    print("Great news! No errors found.")

Total Error Rows Found: 1256

--- Sample of Error Rows ---
                                      question  \
82  which other regions in Uganda grow apples?   

                                              answers  \
82  Bugisu, Bukedi and\nSebei sub regions in areas...   

                                         question_ar  \
82  ما هي المناطق الأخرى في أوغندا التي تنمو التفاح؟   

                                           answers_ar  
82  بوغيسو، بوكيدي و\nمناطق سيبي الفرعية في المناط...  


In [46]:
error_rows

Unnamed: 0,question,answers,question_ar,answers_ar
82,which other regions in Uganda grow apples?,"Bugisu, Bukedi and\nSebei sub regions in areas...",ما هي المناطق الأخرى في أوغندا التي تنمو التفاح؟,بوغيسو، بوكيدي و\nمناطق سيبي الفرعية في المناط...
83,what are some of the challenges in growing app...,� Apple seedlings are imported and costly\n� B...,ما هي بعض التحديات في زراعة التفاح؟,شتلات التفاح مستوردة ومكلفة\nالطيور واللصوص تع...
90,when do i harvest cassava plant?,when the plants have attained\ncomplete physio...,متى يمكنني حصاد نبات الكسافا؟,عندما تصل النباتات إلى\nالنضج الفسيولوجي الكامل
167,Is buying organic food better for the environm...,"Ultimately, environmental sustainability comes...",هل شراء الأغذية العضوية أفضل للبيئة من شراء ال...,المنتجة تقليديا؟\nفي نهاية المطاف ، تأتي الاست...
197,Could more people be fed if crop land was used...,land used f,هل يمكن إطعام المزيد من الناس إذا تم استخدام ا...,الاستهلاك بدلا من الماشية أو علف الماشية؟\nالأ...
...,...,...,...,...
21094,what is isolation.,A minimum separation distance required in a se...,ما هي العزلة,الحد الأدنى من مسافة الفصل المطلوبة في حقل ضرب...
21141,why do cassava plants may shed their leaves du...,Water stress: Cassava plants have a deep root ...,لماذا قد تفقد نباتات الكسافا أوراقها خلال موسم...,الإجهاد المائي: تحتوي نباتات الكسافا على نظام ...
21149,Suggest the disadvantages of sandy soil,Poor water-holding capacity: Sandy soils have ...,اقتراح عيوب التربة الرملية,ضعف القدرة على الاحتفاظ بالمياه: تتمتع التربة ...
21171,What are the symptoms of maize streak disease,Stunted growth: Infected plants often show stu...,ما هي أعراض مرض خط الذرة,النمو المتعثر: غالبًا ما تظهر النباتات المصابة...


it's no harm errors

In [None]:
# 1. Save Locally (Optional, but good for backup)
print("Saving dataset locally...")
en_ds.save_to_disk("./translated_dataset_local")

# upload the dataset

In [None]:
# !pip install huggingface_hub
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# set private=False so everyone can see it and cite you!
repo_id = "abdulhamed/agriculture_qa_en_ar_pairs"

print(f"Uploading to {repo_id}...")
en_ds.push_to_hub(repo_id, private=False)

print("✅ Done! Your dataset is live at:")
print(f"https://huggingface.co/datasets/{repo_id}")