# Import and environment preparation

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

import html
from urllib.parse import unquote

from google.cloud import translate

import time

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!cat '/content/TA Question Generator-1c1bef1dd25f.json'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/content/TA Question Generator-1c1bef1dd25f.json'
!echo $GOOGLE_APPLICATION_CREDENTIALS

# Data load and cleaning

In [3]:
SQUAD_DATASET_PATH = 'Datasets/SQuAD/v2.0/train-v2.0-translated.json'

df_squad = pd.read_json(SQUAD_DATASET_PATH)
df_squad = df_squad.drop(columns=['version'])
print(df_squad.shape)
df_squad.head()

(442, 1)


Unnamed: 0,data
0,"{'title': 'Beyoncé', 'paragraphs': [{'qas': [{..."
1,"{'title': 'Frédéric_Chopin', 'paragraphs': [{'..."
2,{'title': 'Sino-Tibetan_relations_during_the_M...
3,"{'title': 'IPod', 'paragraphs': [{'qas': [{'qu..."
4,{'title': 'The_Legend_of_Zelda:_Twilight_Princ...


In [None]:
for key in df_squad['data'][0].keys():
    print(key)

title
paragraphs


In [4]:
df_squad['title'] = df_squad['data'].apply(lambda x: x.get('title'))
df_squad['paragraphs'] = df_squad['data'].apply(lambda x: x.get('paragraphs'))
df_squad.drop(columns=['data'], inplace=True)
df_squad.head()

Unnamed: 0,title,paragraphs
0,Beyoncé,[{'qas': [{'question': 'When did Beyonce start...
1,Frédéric_Chopin,"[{'qas': [{'question': ""What was Frédéric's na..."
2,Sino-Tibetan_relations_during_the_Ming_dynasty,[{'qas': [{'question': 'Who were Wang Jiawei a...
3,IPod,[{'qas': [{'question': 'Which company produces...
4,The_Legend_of_Zelda:_Twilight_Princess,[{'qas': [{'question': 'What category of game ...


In [5]:
def replace_underscore_with_space(text):
      return text.replace('_', ' ')

df_squad['title'] = df_squad['title'].apply(unquote)
df_squad['title'] = df_squad['title'].apply(replace_underscore_with_space)
# for title in df_squad.title:
  # print(title)

df_squad['paragraphs'] = df_squad['paragraphs'].apply(html.unescape)
# df_squad['paragraphs']

In [None]:
chosen_idx = 0
paragraphs = df_squad['paragraphs'][chosen_idx]
print(f"Total paragraph of paragraphs[{chosen_idx}]: {len(paragraphs)}")

chosen_paragraph_idx = 0
paragraph = paragraphs[chosen_paragraph_idx]
print()
print('Key(s) of paragraph:')
for key in paragraph:
    print(f'   {key}')

print()
print(f"Length of qas: {len(paragraph['qas'])}. Element(s):")
for key in paragraph['qas']:
    print(f'   {key}')
    
print()
print('Paragraph\'s context is:')
print(paragraph['context'])

Total paragraph of paragraphs[0]: 66

Key(s) of paragraph:
   qas
   context

Length of qas: 15. Element(s):
   {'question': 'Kapan Beyonce mulai menjadi populer?', 'id': '56be85543aeaaa14008c9063', 'answers': [{'text': 'pada akhir 1990-an', 'answer_start': 269}], 'is_impossible': False}
   {'question': 'Di area apa Beyonce berkompetisi ketika dia tumbuh dewasa?', 'id': '56be85543aeaaa14008c9065', 'answers': [{'text': 'bernyanyi dan menari', 'answer_start': 207}], 'is_impossible': False}
   {'question': 'Kapan Beyonce meninggalkan Destiny&#39;s Child dan menjadi penyanyi solo?', 'id': '56be85543aeaaa14008c9066', 'answers': [{'text': '2003', 'answer_start': 526}], 'is_impossible': False}
   {'question': 'Di kota dan negara bagian mana Beyonce tumbuh?', 'id': '56bf6b0f3aeaaa14008c9601', 'answers': [{'text': 'Houston, Texas', 'answer_start': 166}], 'is_impossible': False}
   {'question': 'Di dekade mana Beyonce menjadi terkenal?', 'id': '56bf6b0f3aeaaa14008c9602', 'answers': [{'text': 'ak

## Deep reading one of the data

In [None]:
df_chopin = df_squad.loc[df_squad.title == 'Frédéric_Chopin']
df_chopin

Unnamed: 0,title,paragraphs
1,Frédéric_Chopin,"[{'qas': [{'question': ""What was Frédéric's na..."


In [None]:
chopin_context_0 = df_chopin['paragraphs'].iloc[0][0]['context']
df_chopin_context_0 = pd.DataFrame(df_chopin['paragraphs'].iloc[0][0]['qas'])
df_chopin_context_0.drop(columns=['id', 'is_impossible'], inplace=True)
df_chopin_context_0['answers'] = df_chopin_context_0['answers'].apply(lambda x: x[0]['text'])
print(chopin_context_0)
df_chopin_context_0

Frédéric François Chopin (/ˈʃoʊpæn/; French pronunciation: ​[fʁe.de.ʁik fʁɑ̃.swa ʃɔ.pɛ̃]; 22 February or 1 March 1810 – 17 October 1849), born Fryderyk Franciszek Chopin,[n 1] was a Polish and French (by citizenship and birth of father) composer and a virtuoso pianist of the Romantic era, who wrote primarily for the solo piano. He gained and has maintained renown worldwide as one of the leading musicians of his era, whose "poetic genius was based on a professional technique that was without equal in his generation." Chopin was born in what was then the Duchy of Warsaw, and grew up in Warsaw, which after 1815 became part of Congress Poland. A child prodigy, he completed his musical education and composed his earlier works in Warsaw before leaving Poland at the age of 20, less than a month before the outbreak of the November 1830 Uprising.


Unnamed: 0,question,answers
0,What was Frédéric's nationalities?,Polish and French
1,In what era was Frédéric active in?,Romantic era
2,For what instrument did Frédéric write primari...,solo piano
3,In what area was Frédéric born in?,Duchy of Warsaw
4,At what age did Frédéric depart from Poland?,20
5,What year was Chopin born?,1810
6,What era was Chopin active during?,Romantic era
7,Where did Chopin grow up?,Warsaw
8,What instrument did he mostly compose for?,solo piano
9,At what age did Chopin leave Poland?,20


# Translate

In [None]:
client = translate.Client()
def translate_to_id(text):
    return html.unescape(client.translate(text, 'id')['translatedText'])

In [None]:
df_chopin_context_0['question'] = df_chopin_context_0['question'].apply(translate_to_id)
df_chopin_context_0['answers'] = df_chopin_context_0['answers'].apply(translate_to_id)
df_chopin_context_0

In [None]:
df_squad_translated = df_squad.copy()

In [None]:
df_squad_translated['title'] = df_squad_translated['title'].apply(translate_to_id)
df_squad_translated

Unnamed: 0,title,paragraphs
0,Beyonce,[{'qas': [{'question': 'When did Beyonce start...
1,Frédéric Chopin,"[{'qas': [{'question': ""What was Frédéric's na..."
2,Hubungan Tiongkok-Tibet selama dinasti Ming,[{'qas': [{'question': 'Who were Wang Jiawei a...
3,IPod,[{'qas': [{'question': 'Which company produces...
4,The Legend of Zelda: Twilight Princess,[{'qas': [{'question': 'What category of game ...
...,...,...
437,Infeksi,[{'qas': [{'question': 'Of the huge amount of ...
438,Berburu,[{'qas': [{'question': 'What is the practice o...
439,Kathmandu,[{'qas': [{'question': 'What country is Kathma...
440,Infark miokard,[{'qas': [{'plausible_answers': [{'text': 'Myo...


In [None]:
TRANSLATED_IDX = 255

i = -1
for topic in df_squad_translated['paragraphs']:
    i += 1
    print(f'Now translating {i}')
    if i < TRANSLATED_IDX:
    print('Translated. Skipping...')
    continue
    for paragraphs in topic:
    for qas in paragraphs['qas']:
        try:
            qas['question'] = translate_to_id(qas['question'])
        except:
            time.sleep(100)
            qas['question'] = translate_to_id(qas['question'])
            print('Limit exceeded.. Sleeping for 100 seconds...')
        for answer in qas['answers']:
        try:
            answer['text'] = translate_to_id(answer['text'])
        except:
            time.sleep(100)
            answer['text'] = translate_to_id(answer['text'])
            print('Limit exceeded.. Sleeping for 100 seconds...')
        # print(qas)
    try:
        paragraphs['context'] = translate_to_id(paragraphs['context'])
    except:
        time.sleep(100)
        paragraphs['context'] = translate_to_id(paragraphs['context'])
        print('Limit exceeded.. Sleeping for 100 seconds...')
        # print(context)

Now translating 0
Translated. Skipping...
Now translating 1
Translated. Skipping...
Now translating 2
Translated. Skipping...
Now translating 3
Translated. Skipping...
Now translating 4
Translated. Skipping...
Now translating 5
Translated. Skipping...
Now translating 6
Translated. Skipping...
Now translating 7
Translated. Skipping...
Now translating 8
Translated. Skipping...
Now translating 9
Translated. Skipping...
Now translating 10
Translated. Skipping...
Now translating 11
Translated. Skipping...
Now translating 12
Translated. Skipping...
Now translating 13
Translated. Skipping...
Now translating 14
Translated. Skipping...
Now translating 15
Translated. Skipping...
Now translating 16
Translated. Skipping...
Now translating 17
Translated. Skipping...
Now translating 18
Translated. Skipping...
Now translating 19
Translated. Skipping...
Now translating 20
Translated. Skipping...
Now translating 21
Translated. Skipping...
Now translating 22
Translated. Skipping...
Now translating 23
Tr

In [None]:
df_squad_translated.to_json('Datasets/SQuAD/v2.0/train-v2.0-translated.json')

In [8]:
translator = Translator.translator(src='en', dest='id')

In [23]:
df_squad_translated = pd.read_json('Datasets/SQuAD/v2.0/train-v2.0-translated.json')
print(df_squad['paragraphs'][0][0]['qas'][-2]['question'])
print(df_squad_translated['paragraphs'][0][0]['qas'][-2]['question'])

What was Beyoncé's role in Destiny's Child?
Apa peran Beyonce dalam Destiny's Child?


In [17]:
df_squad['paragraphs'][topic_idx][context_idx]['qas'][0]

{'question': 'At what age did Frédéric move to Paris?',
 'id': '56cbd2f96d243a140015ed70',
 'answers': [{'text': '21', 'answer_start': 14}],
 'is_impossible': False}