### Generate Links

In [None]:
import requests
from bs4 import BeautifulSoup as bs4

In [2]:
# Base URL of the Beadaya tests section; the crawl cell below appends each
# numeric page id from `pages` to it with urljoin.
beadaya_tests = "https://beadaya.com/tests/"

In [5]:
# Listing pages to crawl, keyed by grade and then by subject. Each number is a
# page id that the crawl cell appends to `beadaya_tests` to fetch one listing page.
# Grades 10-12 share a single "10-11-12" entry.
# NOTE(review): why each grade/subject has several ids is not visible here
# (presumably one per term or unit) -- confirm against the site.
pages = {
           "1": {"math": [160, 205, 356], "science": [55, 206, 358]}, # Correction: the third math id was previously 256 and has been fixed to 356
           "2": {"math": [161, 210, 362], "science": [65, 209, 364]},
           "3": {"math": [162, 213, 368], "science": [99, 211, 371]},
           "4": {"math": [119, 219, 376], "science": [39, 215, 380]},
           "5": {"math": [120, 226, 386], "science": [9, 224, 389]},
           "6": {"math": [121, 235, 397], "science": [101, 231, 401]},
           "7": {"math": [122, 244, 410], "science": [107, 237, 414]},
           "8": {"math": [123, 196, 421], "science": [126, 256, 422]},
           "9": {"math": [124, 180, 433], "science": [89, 259, 440]},
           "10-11-12": {"math": [343, 345, 344, 346],
                        "physics": [331, 333, 332],
                        "chemistry": [340, 341, 342]}
           }

In [None]:
from collections import defaultdict
from urllib.parse import urljoin
import json

# Crawl every listing page in `pages` and collect the test links it exposes.
# Result: links["<grade>-<subject>"] -> list of href strings, in page order.
links = defaultdict(list)
for grade, subjects in pages.items():
  for subject, page_ids in subjects.items():
    for page in page_ids:
      print(f"{grade}-{subject}-{page}")
      # requests never times out on its own; bound the wait so a single stalled
      # connection cannot hang the whole crawl.
      response = requests.get(urljoin(beadaya_tests, str(page)), timeout=30)
      html = bs4(response.content)
      questions_groups = html.find("ul", {"class":"category-lessons"})
      if questions_groups is None:
        # No lessons list on this page (error page, changed layout, ...):
        # report it and keep going instead of aborting with AttributeError.
        print(f"  skipped: no lessons list found on page {page}")
        continue
      for group in questions_groups.find_all("li"):
        anchor = group.find("a")
        # Some <li> entries carry no usable link; ignore them.
        if anchor is None or not anchor.get("href"):
          continue
        links[f"{grade}-{subject}"].append(anchor["href"])

In [None]:
# Persist the collected links. The mode "w" belongs to open(), not json.dump():
# the original call opened the file read-only and then failed inside json.dump.
with open("beadaya_links.json", "w") as links_file:
  json.dump(links, links_file)

### Generate QA Raw

In [1]:
import json
import requests
from bs4 import BeautifulSoup as bs4
from collections import defaultdict

In [2]:

def parse_question(beadaya_html_question):
  """Look up the order id and sub-items container of one quiz question element.

  NOTE(review): this helper appears unfinished -- it builds local values but
  has no return statement (so it returns None), and nothing in this notebook
  calls it. Confirm whether it should be completed or removed.

  Args:
    beadaya_html_question: parsed element of a single quiz question; must
      contain a hidden <input> and a "zad-quiz-test-content-question" div,
      otherwise the lookups below raise.
  """
  hidden_input = beadaya_html_question.find("input", {"type":"hidden"})
  question_data = {'zad_quiz_questions_order[]': hidden_input["value"]}
  question_div = beadaya_html_question.find("div", {"class": "zad-quiz-test-content-question"})
  # Sub-items container nested in the question div, if any (currently unused).
  has_sub_items = question_div.find("div", {"class": "zad-quiz-test-content-sub-items"})

def get_questions_payload(beadaya_html_test_form):
  """Build the POST payload that submits a quiz form with placeholder answers.

  For every question (and sub-question) the first <input> of its answer block
  is selected. The payload is only meant to get the form accepted so the
  answers page can be fetched afterwards.

  Args:
    beadaya_html_test_form: parsed <form> element of the quiz page. It must
      contain an <input name="questions_list_id"> and every question item must
      contain a hidden <input>; otherwise the lookups raise.

  Returns:
    dict with, in insertion order: "questions_list_id", the list
    "zad_quiz_questions_order[]" (one order id per question, including
    answerless ones), one "<input name>: <value>" entry per answered question,
    and "send": 1.
  """
  form = beadaya_html_test_form
  payload = {}
  payload["questions_list_id"] = form.find("input", {"name": "questions_list_id"})["value"]

  # Flatten top-level questions and their sub-questions into one ordered list:
  # a parent question is listed first, immediately followed by its sub-questions.
  final_questions = []
  for question in form.find_all("div", {"class": "zad-quiz-test-content-item"}, recursive=False):
    final_questions.append(question)
    sub_questions_div = question.find("div", {"class": "zad-quiz-test-content-sub-items"})
    if sub_questions_div is not None:
      final_questions.extend(sub_questions_div.find_all("div", {"class": "zad-quiz-test-content-item"}))

  payload["zad_quiz_questions_order[]"] = []
  for question in final_questions:
    # Hidden order variable: sent for every item, answered or not.
    order = question.find("input", {"type":"hidden"})["value"]
    payload["zad_quiz_questions_order[]"].append(order)

    # Some items are hidden placeholders with no answer block or no input.
    answer_div = question.find("div", {"class": "zad-quiz-test-content-answer"})
    if answer_div is None:
      continue
    answer_input = answer_div.find("input")
    if answer_input is None:
      continue

    # Free-text questions have an empty (or absent) value; submit "default".
    # The original stored the fallback on the tag after the empty value had
    # already been captured, so the payload never received it.
    payload[answer_input["name"]] = answer_input.get("value") or "default"

  payload["send"] = 1
  return payload

def _extract_question_content(question_div):
  """Return the {"img", "text", "math"} fields of a question header div."""
  content = {"img": "", "text": "", "math": ""}
  if question_div is None:
    # Header div missing: keep the all-empty record instead of crashing.
    return content
  img = question_div.find("img")
  span = question_div.find("span")
  math = question_div.find("math")
  if img is not None:
    content["img"] = img["src"]
  if span is not None:
    content["text"] = span.text.strip()
  if math is not None:
    content["math"] = math.prettify()
  return content


def _extract_options(container):
  """Return the "answer" dict (option_<i> entries plus correct_option) found in `container`."""
  answer = {}
  correct_option = -1  # stays -1 when no option carries the check icon
  options = container.find_all("div", {"class": "zad-quiz-test-answer-option"})
  for i, option in enumerate(options):
    img = option.find("img")
    math = option.find("math")
    answer[f"option_{i}"] = {
      "img": img["src"] if img is not None else "",
      "text": option.text.strip(),
      "math": str(math) if math is not None else "",
    }
    # If several options carry the icon, the last one wins.
    if option.find("i", {"class": "fa fa-check"}) is not None:
      correct_option = i
  answer["correct_option"] = correct_option
  return answer


def process_question_answer(beadaya_question_answer_html):
  """Parse one item of the corrected-answers page into plain dict(s).

  Args:
    beadaya_question_answer_html: parsed "zad-quiz-test-content-item" element
      taken from the answers page.

  Returns:
    * {} when the item has no answers container or the container is empty
      (the original raised AttributeError when the container was missing);
    * a dict {"question": ..., "answer": ...} for a plain question;
    * a list of dicts {"question": ..., "sub_question": ..., "answer": ...},
      one per sub-question, when the item has a sub-items container.
    "question"/"sub_question" hold "img", "text" and "math" strings ("" when
    absent). "answer" holds one "option_<i>" dict per option plus
    "correct_option", the index of the option containing an
    <i class="fa fa-check"> element, or -1 if none does.
  """
  item = beadaya_question_answer_html

  answers_container = item.find("div", {"class":"zad-quiz-test-content-answers"})
  if answers_container is None or not answers_container.find_all():
    return {}

  question = _extract_question_content(
    item.find("div", {"class": "zad-quiz-test-content-question"}))

  sub_items = item.find("div", {"class":"zad-quiz-test-content-sub-items"})
  if sub_items is None:
    return {"question": question, "answer": _extract_options(item)}

  records = []
  for sub_item in sub_items.find_all("div", {"class": "zad-quiz-test-content-item"}):
    records.append({
      # Copy so each record owns its parent-question dict.
      "question": dict(question),
      "sub_question": _extract_question_content(
        sub_item.find("div", {"class": "zad-quiz-test-content-question"})),
      "answer": _extract_options(sub_item),
    })
  return records

In [None]:
# Reload the links saved by the "Generate Links" section; the context manager
# closes the file handle instead of leaving it to the garbage collector.
with open("beadaya_links.json", "r") as links_file:
  links = json.load(links_file)

In [None]:
# For every test link: fetch the quiz form, submit it with placeholder answers,
# then parse the corrected-answers page that comes back.
# Result: total_question_answers["<grade>-<subject>"] -> list of raw records,
# each tagged with the "link" it came from.
total_question_answers = defaultdict(list)
for grade_subject, test_links in links.items():
  print(grade_subject)
  for link in test_links:
    print(link)
    # One session per test so the GET and the POST share cookies
    # (presumably required by the site -- TODO confirm). The context manager
    # releases the connection; the timeouts keep one stalled request from
    # hanging the crawl.
    with requests.Session() as session:
      response = session.get(link, timeout=30)
      form = bs4(response.content).find("form", {"id":"zad_quiz_form"})
      if form is None:
        # No quiz form on this page: skip it instead of crashing inside
        # get_questions_payload.
        print("  skipped: quiz form not found")
        continue
      payload = get_questions_payload(form)
      # Submitting the form returns the page with the correct answers marked.
      response = session.post(link, data=payload, timeout=30)
      answers_html = bs4(response.content)

    answers_div = answers_html.find("div", {"class": "test-answers"})
    if answers_div is None:
      continue
    answer_items = answers_div.find_all("div", {"class": "zad-quiz-test-content-item"}, recursive=False)
    for answer_div in answer_items:
      parsed = process_question_answer(answer_div)
      # A question with sub-questions yields a list; a plain one yields a dict.
      records = parsed if isinstance(parsed, list) else [parsed]
      for record in records:
        record["link"] = link
      total_question_answers[grade_subject].extend(records)

In [None]:
# Persist the raw scrape; the context manager guarantees the file is flushed
# and closed before the next section reads it back.
with open("beadaya_qa_raw.json", "w") as raw_file:
  json.dump(total_question_answers, raw_file)

### Generate QA

In [6]:
import pandas as pd
import json

In [None]:
# Load the raw scrape written by the "Generate QA Raw" section; the context
# manager closes the file handle instead of leaking it.
with open("beadaya_qa_raw.json", "r") as raw_file:
  beadaya_qa = json.load(raw_file)

In [8]:
# Flatten every raw record into one flat dict (one future DataFrame row).
# Column order: grade, subject, question_*, sub_question_*, option_0..4_*,
# correct_option, link -- where * is img / text / math.
beadaya_columnar = []
for grade_subject, raw_records in beadaya_qa.items():
  # "10-11-12" would break the "<grade>-<subject>" split, so rename it first.
  grade, subject = grade_subject.replace("10-11-12", "secondary").split("-")
  for qa in raw_records:

    # Records without a "question" key are empty placeholders: skip them.
    if "question" not in qa:
      continue

    columnar = {"grade": grade, "subject": subject}

    for field in ("img", "text", "math"):
      columnar[f"question_{field}"] = qa["question"][field]

    # Sub-question columns are blank for plain questions.
    for field in ("img", "text", "math"):
      columnar[f"sub_question_{field}"] = qa["sub_question"][field] if "sub_question" in qa else ""

    # Always emit five option slots; options that are absent become "".
    for option_index in range(5):
      option_key = f"option_{option_index}"
      for field in ("img", "text", "math"):
        columnar[f"{option_key}_{field}"] = qa["answer"][option_key][field] if option_key in qa["answer"] else ""

    columnar["correct_option"] = qa["answer"]["correct_option"]
    columnar["link"] = qa["link"]

    beadaya_columnar.append(columnar)

In [9]:
# One row per flattened question record; columns come from the dict keys
# built in the previous cell.
beadaya_df = pd.DataFrame(beadaya_columnar)

In [None]:
# Keep only text-only multiple-choice questions with a known correct answer.
# NOTE: this cell rebinds `beadaya_df` and drops columns, so it is not
# idempotent -- rebuild `beadaya_df` from `beadaya_columnar` before re-running it.
content_prefixes = ["question", "sub_question"] + [f"option_{i}" for i in range(5)]
img_columns = [f"{prefix}_img" for prefix in content_prefixes]
math_columns = [f"{prefix}_math" for prefix in content_prefixes]
option_text_columns = [f"option_{i}_text" for i in range(5)]

# Remove questions with images (in the question, sub-question or any option)
beadaya_df = beadaya_df[(beadaya_df[img_columns] == "").all(axis=1)].drop(columns=img_columns)

# Remove questions with math jax
beadaya_df = beadaya_df[(beadaya_df[math_columns] == "").all(axis=1)].drop(columns=math_columns)

# Remove questions with no correct answer
beadaya_df = beadaya_df[beadaya_df["correct_option"] != -1]

# Remove ordering questions (the student should reorder boxes as an answer).
# Both imperative forms are needed: "رتبي " does not contain "رتب " as a substring.
is_ordering_question = (
  beadaya_df["question_text"].str.contains("رتب ", regex=False)
  | beadaya_df["question_text"].str.contains("رتبي ", regex=False)
)
beadaya_df = beadaya_df[~is_ordering_question]

# Combine each question with its sub question. `.assign` builds a new frame,
# which avoids SettingWithCopyWarning from assigning into a filtered slice.
beadaya_df = beadaya_df.assign(
  question_text=beadaya_df["question_text"] + " " + beadaya_df["sub_question_text"]
).drop(columns=["sub_question_text"])

# Remove rows whose combined question text is empty
# (" " is what two empty parts produce after the join above).
beadaya_df = beadaya_df[~beadaya_df["question_text"].isin(["", " "])]

# Remove those with 0, 1, and 5 non-empty options
# (the export cell below only keeps option_0..option_3).
n_options = (beadaya_df[option_text_columns] != "").sum(axis=1)
beadaya_df = beadaya_df.loc[~n_options.isin([0, 1, 5]), :]

beadaya_df

In [11]:
# Convert every cleaned row into the final nested record:
# question text, the first four options plus the correct index, and metadata.
questions = [
  {
    "question": row["question_text"].strip(),
    "answer": {
      **{f"option_{option_index}": row[f"option_{option_index}_text"] for option_index in range(4)},
      "correct_option": row["correct_option"],
    },
    "grade": row["grade"],
    "subject": row["subject"],
    "resource": row["link"],
  }
  for _row_label, row in beadaya_df.iterrows()
]

In [12]:
# Spot-check the shape of the first exported record.
questions[0]

{'question': 'كان في الماء 8 بطات ، ذهب 6 منهم ، كم بطة بقيت في الماء ؟',
 'answer': {'option_0': '3',
  'option_1': '4',
  'option_2': '2',
  'option_3': '',
  'correct_option': 2},
 'grade': '1',
 'subject': 'math',
 'resource': 'https://beadaya.com/test/15311/'}

In [None]:
# ensure_ascii=False writes the Arabic text as-is, so the file must be opened
# as UTF-8 explicitly: the platform default encoding (e.g. cp1252 on Windows)
# cannot encode it. The context manager also flushes and closes the file.
with open("beadaya_qa.json", "w", encoding="utf-8") as qa_file:
  json.dump(questions, qa_file, ensure_ascii=False)