In [1]:
import json

In [4]:
import re

def remove_diacritics(text):

  DIACRITICS = [
      '\u0610', '\u0611', '\u0612', '\u0613', '\u0614', '\u0615', '\u0616', '\u0617', '\u0618', '\u0619',
      '\u061A', '\u064B', '\u064C', '\u064D', '\u064E', '\u064F', '\u0650', '\u0651', '\u0652', '\u0653',
      '\u0654', '\u0655', '\u0656', '\u0657', '\u0658', '\u0659', '\u065A', '\u065B', '\u065C', '\u065D',
      '\u065E', '\u065F', '\u0670', '\u06D6', '\u06D7', '\u06D8', '\u06D9', '\u06DA', '\u06DB', '\u06DC',
      '\u06DD', '\u06DE', '\u06DF', '\u06E0', '\u06E1', '\u06E2', '\u06E3', '\u06E4', '\u06E5', '\u06E6',
      '\u06E7', '\u06E8', '\u06E9', '\u06EA', '\u06EB', '\u06EC', '\u06ED', '\u06EE', '\u06EF', '\u06F0',
      '\u06F1', '\u06F2', '\u06F3', '\u06F4', '\u06F5', '\u06F6', '\u06F7', '\u06F8', '\u06F9'
      ]

  diacritics_pattern = re.compile('|'.join(re.escape(diacritic) for diacritic in DIACRITICS))
  return re.sub(diacritics_pattern, "", text)

def clean(text):
  text = text.replace(".", "")
  text = text.replace("،", "")
  text = text.replace("؟", "")
  text = text.replace(":", "")

  text = re.sub(r'^[abcdeABCDEأ ب ج د ه][\.\-\s]{1}', "", text)

  text = re.sub(r"\s+", " ", text)

  text = remove_diacritics(text)

  return text

In [5]:
def clean_qa(qa):
  qa["question"] = clean(qa["question"])
  for option in qa["answer"]:
    if option == "correct_option":
      continue
    qa["answer"][option] = clean(qa["answer"][option])

def remove(qa):
  if "$" in qa["question"]:
    return True
  for i in range(0, len(qa["answer"])-1):
    option = f"option_{i}"
    if "&" in qa["answer"][option]:
      return True
    for c in ["a", "b", "c", "d", "e"]:
      if c in qa["answer"][option]:
        return True
    for c in ["A", "B", "C", "D", "E"]:
      if c in qa["answer"][option]:
        return True
  return False

In [11]:
medicine_qa = []

M2010B = json.load(open("./Medicine/validated/M2010B.json", encoding="utf-8"))
M2015B = json.load(open("./Medicine/validated/M2015B.json", encoding="utf-8"))
M2017B = json.load(open("./Medicine/validated/M2017B.json", encoding="utf-8"))
M2019B = json.load(open("./Medicine/validated/M2019B.json", encoding="utf-8"))
M2023A = json.load(open("./Medicine/validated/M2023A.json", encoding="utf-8"))

M2010B_link = "https://www.mehe.gov.lb/ar/OrganizationalStructureFiles/Higher%20EDU/M2010B.pdf"
M2015B_link = "https://www.mehe.gov.lb/ar/OrganizationalStructureFiles/Higher%20EDU/M2015B.pdf"
M2017B_link = "https://www.mehe.gov.lb/ar/OrganizationalStructureFiles/Higher%20EDU/M2017B.pdf"
M2019B_link = "https://www.mehe.gov.lb/ar/SiteAssets/Pages/Structure/Higher%20EDU/ExamColloqOral/%D8%B7%D8%A8%20%D8%B9%D8%A7%D9%85%20-%202019%20-%20%D8%A7%D9%84%D8%AF%D9%88%D8%B1%D8%A9%20%D8%A7%D9%84%D8%AB%D8%A7%D9%86%D9%8A%D8%A9.pdf"
M2023A_link = "https://www.mehe.gov.lb/ar/Pages/%D9%83%D9%88%D9%84%D9%88%D9%83%D9%8A%D9%88%D9%85/2023/%D8%A7%D9%84%D8%AF%D9%88%D8%B1%D8%A9%20%D8%A7%D9%84%D8%A7%D9%88%D9%84%D9%89/Med2023A.pdf"

for session, link in zip([M2010B, M2015B, M2017B, M2019B, M2023A], [M2010B_link, M2015B_link, M2017B_link, M2019B_link, M2023A_link]):
  for qa in session:
    qa["resource"] = link
    medicine_qa.append(qa)

for qa in medicine_qa:
  clean_qa(qa)

medicine_qa_filtered = []
for qa in medicine_qa:
  if remove(qa):
    pass
  else:
    medicine_qa_filtered.append(qa)


In [12]:
medicine_qa_filtered[0]

{'no': 1,
 'question': 'ممرضة عمرها 40 عاما أدخلت إلى المستشفى بسبب ارتفاع بالحرارة حتى 41°C بالرغم من إجراء ما يلزم في المستشفى لأكثر من ثلاثة أسابيع لم تتضح الأسباب واستمرار ارتفاع الحرارة أكثر من 40°C أن التشخيص الأقل ترجيحا لهذه المريضة هو',
 'answer': {'option_0': 'انتان جرثومي خفي',
  'option_1': 'انفلونزا',
  'option_2': 'ورم لمفاوي',
  'option_3': 'داء ستيللز للكبار',
  'option_4': 'هـ حمى صنعية',
  'correct_option': 1},
 'resource': 'https://www.mehe.gov.lb/ar/OrganizationalStructureFiles/Higher%20EDU/M2010B.pdf'}

In [None]:
json.dump(medicine_qa_filtered, open("./Medicine/medicine_qa.json", "w", encoding="utf-8"))

In [14]:
dentistry_qa = []

D2015B = json.load(open("./Dentistry/validated/D2015B.json", encoding="utf-8"))
D2016B = json.load(open("./Dentistry/validated/D2016B.json", encoding="utf-8"))
D2018A = json.load(open("./Dentistry/validated/D2018A.json", encoding="utf-8"))
D2020A = json.load(open("./Dentistry/validated/D2020A.json", encoding="utf-8"))
D2021B = json.load(open("./Dentistry/validated/D2021B.json", encoding="utf-8"))
D2022A = json.load(open("./Dentistry/validated/D2022A.json", encoding="utf-8"))
D2023A = json.load(open("./Dentistry/validated/D2023A.json", encoding="utf-8"))

D2015B_link = "https://www.mehe.gov.lb/ar/OrganizationalStructureFiles/Higher%20EDU/D2015B.pdf"
D2016B_link = "https://www.mehe.gov.lb/ar/OrganizationalStructureFiles/Higher%20EDU/D2016B.pdf"
D2018A_link = "https://www.mehe.gov.lb/ar/OrganizationalStructureFiles/Higher%20EDU/D2018A.pdf"
D2020A_link = "https://www.mehe.gov.lb/ar/Pages/Publications/Colloquium2020A/D2020A.pdf"
D2021B_link = "https://www.mehe.gov.lb/ar/Pages/%D9%83%D9%88%D9%84%D9%88%D9%83%D9%8A%D9%88%D9%85/2022/D2021B.PDF"
D2022A_link = "https://www.mehe.gov.lb/ar/Pages/%D9%83%D9%88%D9%84%D9%88%D9%83%D9%8A%D9%88%D9%85/2022/2022%20-%20%D8%A7%D9%84%D8%AF%D9%88%D8%B1%D8%A9%20%D8%A7%D9%84%D8%A7%D9%88%D9%84%D9%89/Coll%20Dent%202022A.PDF"
D2023A_link = "https://www.mehe.gov.lb/ar/Pages/%D9%83%D9%88%D9%84%D9%88%D9%83%D9%8A%D9%88%D9%85/2023/%D8%A7%D9%84%D8%AF%D9%88%D8%B1%D8%A9%20%D8%A7%D9%84%D8%A7%D9%88%D9%84%D9%89/Dent2023A.pdf"

for session, link in zip([D2015B , D2016B, D2018A, D2020A, D2021B, D2022A, D2023A], [D2015B_link , D2016B_link, D2018A_link, D2020A_link, D2021B_link, D2022A_link, D2023A_link]):
  for qa in session:
    qa["resource"] = link
    dentistry_qa.append(qa)

for qa in dentistry_qa:
  clean_qa(qa)

dentistry_qa_filtered = []
for qa in dentistry_qa:
  if remove(qa):
    pass
  else:
    dentistry_qa_filtered.append(qa)

In [15]:
dentistry_qa_filtered[0]

{'no': 1,
 'question': 'تتم برمجة المطبق الشبه للتكييف من خلال',
 'answer': {'option_0': 'استعمال سجل للعلاقة المركزية',
  'option_1': 'استعمال سجل الحركة اللامركزية',
  'option_2': 'استعمال سجل حالة تفاعس الفك',
  'option_3': 'استعمال سجل التشابك الأقصى للحدبات',
  'option_4': 'استعمال سجل البروز',
  'correct_option': 1},
 'resource': 'https://www.mehe.gov.lb/ar/OrganizationalStructureFiles/Higher%20EDU/D2015B.pdf'}

In [None]:
json.dump(dentistry_qa_filtered, open("./Dentistry/dentistry_qa.json", "w", encoding="utf-8"))

In [17]:
pharmacy_qa = []

P2010A = json.load(open("./Pharmacy/validated/P2010A.json", encoding="utf-8"))
P2012B = json.load(open("./Pharmacy/validated/P2012B.json", encoding="utf-8"))
P2020A = json.load(open("./Pharmacy/validated/P2020A.json", encoding="utf-8"))
P2022A = json.load(open("./Pharmacy/validated/P2022A.json", encoding="utf-8"))

P2010A_link= "https://www.mehe.gov.lb/ar/OrganizationalStructureFiles/Higher%20EDU/P2010A.pdf"
P2012B_link= "https://www.mehe.gov.lb/ar/OrganizationalStructureFiles/Higher%20EDU/P2012B.pdf"
P2020A_link= "https://www.mehe.gov.lb/ar/Pages/Publications/Colloquium2020A/P2020A.pdf"
P2022A_link= "https://www.mehe.gov.lb/ar/Pages/%D9%83%D9%88%D9%84%D9%88%D9%83%D9%8A%D9%88%D9%85/2022/2022%20-%20%D8%A7%D9%84%D8%AF%D9%88%D8%B1%D8%A9%20%D8%A7%D9%84%D8%A7%D9%88%D9%84%D9%89/Coll%20Pharm%202022A.PDF"

for session, link in zip([P2010A, P2012B, P2020A, P2022A], [P2010A_link, P2012B_link, P2020A_link, P2022A_link]):
  for qa in session:
    qa["resource"] = link
    pharmacy_qa.append(qa)

for qa in pharmacy_qa:
  clean_qa(qa)

pharmacy_qa_filtered = []
for qa in pharmacy_qa:
  if remove(qa):
    pass
  else:
    pharmacy_qa_filtered.append(qa)

In [18]:
pharmacy_qa_filtered[0]

{'no': 2,
 'question': 'تعد أدوية أو بحكم أدوية (حدد الإجابة الخطأ)',
 'answer': {'option_0': 'البدائل الاصطناعية للجهاز العظمي',
  'option_1': 'الأدوات المعقمة ذات المنافع الطبية',
  'option_2': 'المياه المعدنية الطبية',
  'option_3': 'الأمصال والتلاقيح',
  'correct_option': 0},
 'resource': 'https://www.mehe.gov.lb/ar/OrganizationalStructureFiles/Higher%20EDU/P2010A.pdf'}

In [None]:
json.dump(pharmacy_qa_filtered, open("./pharmacy_qa.json", "w", encoding="utf-8"))