In [11]:
from datasets import load_dataset
import pandas as pd

In [2]:
from transformers import MarianMTModel, MarianTokenizer

# Load model English -> Indonesian
model_name = 'Helsinki-NLP/opus-mt-en-id'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)




In [9]:
from tqdm import tqdm
# from tqdm.auto import tqdm  # for notebooks

# Create new `pandas` methods which use `tqdm` progress
# (can use tqdm_gui, optional kwargs, etc.)
tqdm.pandas()

In [None]:
import torch
# setting device on GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

#Additional Info when using cuda
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

Using device: cuda

NVIDIA GeForce RTX 2050
Memory Usage:
Allocated: 0.3 GB
Cached:    0.3 GB


In [13]:
import torch
#Select the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(54796, 512, padding_idx=54795)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(54796, 512, padding_idx=54795)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

In [14]:
# Fungsi translate
def translate(text):
    batch = tokenizer.prepare_seq2seq_batch([text], return_tensors="pt").to(device)
    translated = model.generate(**batch)
    return tokenizer.decode(translated[0], skip_special_tokens=True)

In [15]:
# Contoh data (ganti ini dengan dataset kamu)
df = pd.DataFrame({
    'text': ['Hello, how are you?', 'I love learning new things.', 'This is a test.']
})
# Terapkan ke kolom tertentu
df['text_id'] = df['text'].progress_apply(translate)

print(df)

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

100%|██████████| 3/3 [00:01<00:00,  2.70it/s]

                          text                          text_id
0          Hello, how are you?                 Halo, apa kabar?
1  I love learning new things.  Saya suka belajar hal-hal baru.
2              This is a test.                Ini adalah ujian.





In [16]:
# Menggunakan Dataset Hugging Face
dataset = load_dataset("nazimali/quran-question-answer-context")
dataset.set_format(type='pandas')

df = dataset['train'][:]
df.head()

Unnamed: 0,q_id,question,answer,q_word,q_topic,fine_class,class,ontology_concept,ontology_concept2,source,...,ontology_concept2_en,ontology_concept_en,q_topic_en,q_word_en,question_en,chapter_name_en,verse_list,context,context_data,context_missing_verses
0,1,ما هو الكتاب الوحيد الذي لا يوجد أي ريب أو شك ...,هو كتاب الله (القرآن الكريم) . والدليل : الم{1...,ما,الكتاب,كتاب مقدس,كيان,القرآن,القرآن الكريم,QA,...,The Noble Quran,The Quran,The book,Not,What is the only book that is free from any do...,The Cow,"[1, 2]","Quran Surah 2 Ayah 1:\nAlif, Lam, Mim. (Only A...","[{""index"": 0, ""q_src_id"": 1, ""surah"": 2, ""ayah...",
1,2,هل ثمار الجنة تشبه ثمار الدنيا ؟,نعم . والدليل :وَبَشِّرِ الَّذِين آمَنُواْ وَع...,هل,ثمر الجنة,نعم-لا,تقريري,ثمر الجنة,صفاتها,QA,...,Her Qualities,Fruits of Paradise,Fruit of Paradise,Are,Are the fruits of paradise similar to the frui...,The Cow,[25],"Quran Surah 2 Ayah 25:\nAnd, (O Beloved Prophe...","[{""index"": 1, ""q_src_id"": 2, ""surah"": 2, ""ayah...",
2,3,كم موت وكم حياة للبشر ؟,موتان وحياتان . والدليل :كَيْفَ تَكْفُرُونَ بِ...,كم,موت وحياة,عدد,رقم,الموت,الإنسان وخلقه,QA,...,Man And His Nature,Death,Death and life,How Much,How many deaths and how many lives do humans h...,The Cow,[28],Quran Surah 2 Ayah 28:\nHow Can You Deny Allah...,"[{""index"": 2, ""q_src_id"": 3, ""surah"": 2, ""ayah...",
3,4,ما عدد السموات ؟,سبع سماوات . والدليل :هُوَ الَّذِي خَلَقَ لَكُ...,ما,السموات,عدد,رقم,السموات,حقائق في الكون,QA,...,Facts In The Universe,The heavens,heavens,No,How many heavens are there?,The Cow,[29],Quran Surah 2 Ayah 29:\nHe Is The One Who Crea...,"[{""index"": 3, ""q_src_id"": 4, ""surah"": 2, ""ayah...",
4,5,ماذا تعلم آدم عليه السلام من الله جل جلاله وكا...,علم الأسماء . والدليل : وَعَلَّمَ آدَمَ الأَسْ...,ماذا,الأسماء,مصطلح,كيان,الأسماء,أعلم,QA,...,I Know,Names,Nouns,What,What did Adam (peace be upon him) learn from A...,The Cow,[31],Quran Surah 2 Ayah 31:\nAnd Allah Taught Adam ...,"[{""index"": 4, ""q_src_id"": 5, ""surah"": 2, ""ayah...",


In [17]:
df = df[['question_en','answer_en']]
df

Unnamed: 0,question_en,answer_en
0,What is the only book that is free from any do...,This is the Book of Allah (the Quran). The evi...
1,Are the fruits of paradise similar to the frui...,"Yes, and the evidence: 'And give good news to ..."
2,How many deaths and how many lives do humans h...,"And you were dead, and He gave you life, then ..."
3,How many heavens are there?,He it is Who created for you all that is in th...
4,What did Adam (peace be upon him) learn from A...,"He taught Adam the names of all things, then H..."
...,...,...
1219,"Indeed, Abraham was forbearing, often turning ...","Abraham, peace be upon him, is patient, dislik..."
1220,Why did Lot (peace be upon him) grieve when th...,He feared for them because they were handsome-...
1221,What does Lut (peace be upon him) mean by his ...,"If I had power and supporters among you, or if..."
1222,"And he said to his young men, 'Put their merch...",Yusuf said to his servants: 'Make the price of...


In [18]:
df['question_id'] = df['question_en'].progress_apply(translate)
df

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

100%|██████████| 1224/1224 [15:14<00:00,  1.34it/s]


Unnamed: 0,question_en,answer_en,question_id
0,What is the only book that is free from any do...,This is the Book of Allah (the Quran). The evi...,Apa satu - satunya buku yang bebas dari keragu...
1,Are the fruits of paradise similar to the frui...,"Yes, and the evidence: 'And give good news to ...","(Apakah buah-buahan di surga) maksudnya, buah-..."
2,How many deaths and how many lives do humans h...,"And you were dead, and He gave you life, then ...",Berapa banyak kematian dan berapa banyak nyawa...
3,How many heavens are there?,He it is Who created for you all that is in th...,Berapa banyak surga yang ada?
4,What did Adam (peace be upon him) learn from A...,"He taught Adam the names of all things, then H...",Adam tidak pernah disentuh oleh malaikat kecua...
...,...,...,...
1219,"Indeed, Abraham was forbearing, often turning ...","Abraham, peace be upon him, is patient, dislik...",Sesungguhnya Ibrahim itu benar-benar seorang y...
1220,Why did Lot (peace be upon him) grieve when th...,He feared for them because they were handsome-...,Nabi Luth merasa takut kaumnya akan melakukan ...
1221,What does Lut (peace be upon him) mean by his ...,"If I had power and supporters among you, or if...",(Mengapa) lafal Bal di sini menunjukkan makna ...
1222,"And he said to his young men, 'Put their merch...",Yusuf said to his servants: 'Make the price of...,(Dan Yusuf berkata kepada pembantu-pembantunya...


In [19]:
df['answer_id'] = df['answer_en'].progress_apply(translate)
df

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

100%|██████████| 1224/1224 [53:51<00:00,  2.64s/it]   


Unnamed: 0,question_en,answer_en,question_id,answer_id
0,What is the only book that is free from any do...,This is the Book of Allah (the Quran). The evi...,Apa satu - satunya buku yang bebas dari keragu...,(Kitab ini) yakni yang dibaca oleh Muhammad sa...
1,Are the fruits of paradise similar to the frui...,"Yes, and the evidence: 'And give good news to ...","(Apakah buah-buahan di surga) maksudnya, buah-...",(Dan sampaikanlah berita gembira) kabarkanlah ...
2,How many deaths and how many lives do humans h...,"And you were dead, and He gave you life, then ...",Berapa banyak kematian dan berapa banyak nyawa...,"Setelah itu, Dia menghidupkan kalian lagi untu..."
3,How many heavens are there?,He it is Who created for you all that is in th...,Berapa banyak surga yang ada?,(Dialah yang telah menciptakan bagimu segala y...
4,What did Adam (peace be upon him) learn from A...,"He taught Adam the names of all things, then H...",Adam tidak pernah disentuh oleh malaikat kecua...,(Dan diajarkan-Nya kepada Adam nama-nama) maks...
...,...,...,...,...
1219,"Indeed, Abraham was forbearing, often turning ...","Abraham, peace be upon him, is patient, dislik...",Sesungguhnya Ibrahim itu benar-benar seorang y...,Sesungguhnya di antara yang mengikuti jejak da...
1220,Why did Lot (peace be upon him) grieve when th...,He feared for them because they were handsome-...,Nabi Luth merasa takut kaumnya akan melakukan ...,(Maka Yusuf berlaku takabur terhadap mereka ka...
1221,What does Lut (peace be upon him) mean by his ...,"If I had power and supporters among you, or if...",(Mengapa) lafal Bal di sini menunjukkan makna ...,Jika aku mempunyai kekuatan dan dukungan dari ...
1222,"And he said to his young men, 'Put their merch...",Yusuf said to his servants: 'Make the price of...,(Dan Yusuf berkata kepada pembantu-pembantunya...,(Yusuf berkata kepada pembantu-pembantunya) me...


In [24]:
df.to_csv('qa_en_id.csv', sep='|')

# Test

In [None]:
# df = df.drop(['question', 'answer', 'q_word', 'q_topic', 'fine_class', 'class', 'ontology_concept', 'ontology_concept2', 'source', 'chapter_name', 'context_missing_verses'], axis=1)
# df.head()

Unnamed: 0,q_id,q_src_id,quetion_type,chapter_no,verse,answer_en,class_en,fine_class_en,ontology_concept2_en,ontology_concept_en,q_topic_en,q_word_en,question_en,chapter_name_en,verse_list,context,context_data
0,1,1,F,2,12,This is the Book of Allah (the Quran). The evi...,Entity,Holy Book,The Noble Quran,The Quran,The book,Not,What is the only book that is free from any do...,The Cow,"[1, 2]","Quran Surah 2 Ayah 1:\nAlif, Lam, Mim. (Only A...","[{""index"": 0, ""q_src_id"": 1, ""surah"": 2, ""ayah..."
1,2,2,F,2,25,"Yes, and the evidence: 'And give good news to ...",My Report,Yes-No,Her Qualities,Fruits of Paradise,Fruit of Paradise,Are,Are the fruits of paradise similar to the frui...,The Cow,[25],"Quran Surah 2 Ayah 25:\nAnd, (O Beloved Prophe...","[{""index"": 1, ""q_src_id"": 2, ""surah"": 2, ""ayah..."
2,3,3,F,2,28,"And you were dead, and He gave you life, then ...",Number,Number,Man And His Nature,Death,Death and life,How Much,How many deaths and how many lives do humans h...,The Cow,[28],Quran Surah 2 Ayah 28:\nHow Can You Deny Allah...,"[{""index"": 2, ""q_src_id"": 3, ""surah"": 2, ""ayah..."
3,4,4,F,2,29,He it is Who created for you all that is in th...,Number,Number,Facts In The Universe,The heavens,heavens,No,How many heavens are there?,The Cow,[29],Quran Surah 2 Ayah 29:\nHe Is The One Who Crea...,"[{""index"": 3, ""q_src_id"": 4, ""surah"": 2, ""ayah..."
4,5,5,D,2,31,"He taught Adam the names of all things, then H...",Entity,Term,I Know,Names,Nouns,What,What did Adam (peace be upon him) learn from A...,The Cow,[31],Quran Surah 2 Ayah 31:\nAnd Allah Taught Adam ...,"[{""index"": 4, ""q_src_id"": 5, ""surah"": 2, ""ayah..."


In [None]:
# column_names = [
#     # 'answer_en',
#     # 'class_en',
#     # 'fine_class_en',
#     # 'ontology_concept2_en',
#     'ontology_concept_en',
#     'q_topic_en',
#     'q_word_en',
#     'question_en',
#     'chapter_name_en',
#     'verse_list',
#     'context',
#     ]

In [None]:
# for col in column_names:
#     df[f'{col}_id'] = df[col].apply(translate)

In [None]:
# df['ontology_concept_en_id'] = df['ontology_concept_en'].apply(translate)


`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [None]:
# df.to_csv('qa_dataset_en_id.csv', sep='|', index=False)

: 