In [None]:

!pip install requests beautifulsoup4
!pip install openai



In [133]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

## Function to load html from url

In [136]:
def get_html_content(url):

  try:
      response = requests.get(url)
      response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)

      soup = BeautifulSoup(response.content, "html.parser")

      # Find the element with id "QuestionList"
      question_list = soup.find(id="QuestionList")

      if question_list:
          questions = question_list.find_all("div", class_="single-question mb-3")
          return questions
      else:
          print("Element with id 'QuestionList' not found.")

  except requests.exceptions.RequestException as e:
      print(f"An error occurred: {e}")
  except Exception as e:
      print(f"An unexpected error occurred: {e}")

In [None]:
def number_to_ordinal(n):
  """Converts a number to its ordinal form (e.g., 1 to 1st, 2 to 2nd)."""
  suffix = {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th")
  return str(n) + suffix

# Example usage:
for i in range(11, 43):
  print(number_to_ordinal(i))

## Collect all the questions

In [166]:
base_url = "https://uttoron.academy/QuestionBank/QuestionPreview/"

questions_html = []
for i in range(10, 43):
  url = base_url + str(number_to_ordinal(i)) + "-BCS"
  print(url)
  year_questions = get_html_content(url)
  questions_html.extend(year_questions)

print(f"Total number of questions: {len(questions_html)}")


https://uttoron.academy/QuestionBank/QuestionPreview/10th-BCS
https://uttoron.academy/QuestionBank/QuestionPreview/11st-BCS
https://uttoron.academy/QuestionBank/QuestionPreview/12nd-BCS
https://uttoron.academy/QuestionBank/QuestionPreview/13rd-BCS
https://uttoron.academy/QuestionBank/QuestionPreview/14th-BCS
https://uttoron.academy/QuestionBank/QuestionPreview/15th-BCS
https://uttoron.academy/QuestionBank/QuestionPreview/16th-BCS
https://uttoron.academy/QuestionBank/QuestionPreview/17th-BCS
https://uttoron.academy/QuestionBank/QuestionPreview/18th-BCS
https://uttoron.academy/QuestionBank/QuestionPreview/19th-BCS
https://uttoron.academy/QuestionBank/QuestionPreview/20th-BCS
https://uttoron.academy/QuestionBank/QuestionPreview/21st-BCS
https://uttoron.academy/QuestionBank/QuestionPreview/22nd-BCS
https://uttoron.academy/QuestionBank/QuestionPreview/23rd-BCS
https://uttoron.academy/QuestionBank/QuestionPreview/24th-BCS
https://uttoron.academy/QuestionBank/QuestionPreview/25th-BCS
https://

## Function to extract different information from html

In [151]:
def extract_quests(html_content):
  description = question.find("p", class_="description").text.strip()
  data_answer = question.get("data-answer")
  data_subject = question.get("data-subject")
  extracted = {
        'Question': description,
        'Subject': data_subject,
        'Answer': data_answer.capitalize()
  }

  options_list = []
  lis = question.find_all('li')

  for li in question.find_all('li'):
      option_text = li.find_all('span')[-1].text.strip()
      options_list.append(option_text)

  for i, option in enumerate(options_list, start=1):
        extracted['A'] = options_list[0]
        extracted['B'] = options_list[1]
        extracted['C'] = options_list[2]
        extracted['D'] = options_list[3]

  return extracted

## Extract all the questions and store in csv file

In [167]:

quests = []
try:
  for question in questions_html:
    quests.append(extract_quests(question))
except NameError:
  print("'questions' variable not found. Please make sure the previous code block has executed successfully.")

df = pd.DataFrame(quests)
df.to_csv('questions_answers_10-42_BCS.csv', index=False, encoding='utf-8')
df
# print('\n'.join(map(str, quests[:5])))


Unnamed: 0,Question,Subject,Answer,A,B,C,D
0,‘আনারস’ এবং ‘চাবি’ শব্দ দুটি বাংলা ভাষা গ্রহণ ...,BANGLA-LANGUAGE,A,পর্তুগিজ ভাষা হতে,আরবি ভাষা হতে,দেশী ভাষা হতে,ওলন্দাজ ভাষা হতে
1,শুদ্ধ বানান কোনটি?,BANGLA-LANGUAGE,B,মূমুর্ষু,মুমূর্ষু,মূমূর্ষ,মুমূর্ষ
2,গুরুচণ্ডালী দোষমুক্ত কোনটি?,BANGLA-LANGUAGE,C,শবপোড়া,মড়াদাহ,শবদাহ,শবমড়া
3,‘কবর’ নাটকটির লেখক-,BANGLA-LITERATURE,C,জসীমউদ্দীন,নজরুল ইসলাম,মুনীর চৌধুরী,দ্বিজেন্দ্রলাল রায়
4,‘উভয়কূল রক্ষা’ অর্থে ব্যবহৃত প্রবচন কোনটি?,BANGLA-LANGUAGE,C,"কারো পৌষ মাস, কারো সর্বনাস","চাল না চুলো, ঢেঁকী না কুলো","সাপও মরে, লাঠিও না ভাঙ্গে","বোঝার উপর, শাকের আঁটি"
...,...,...,...,...,...,...,...
3945,Liza had given me two:,ENGLISH-LANGUAGE,C,pair of jean,pairs of jean,pair of jeans,pairs of jeans
3946,গণপ্রজাতন্ত্রী বাংলাদেশের সংবিধানের খসড়া সর্বপ...,BANGLADESH-AFFAIRS,B,১১ নভেম্বর,১২ অক্টোবর,১৬ ডিসেম্বর,৩ মার্চ
3947,অধিত্যকা এর বিপরীতার্থক শব্দ কোনটি?,BANGLA-LANGUAGE,A,উপত্যকা,ধিত্যকা,পার্বত্য,সমতল
3948,নিচের প্রশ্নবোধক স্থানে কোনটি বসবে? \({\mathrm...,MENTAL-ABILITY,D,\(54\over \mathrm N\),\(\mathrm T \over 19\),\(\mathrm L \over 52\),\(\mathrm V \over 36\)


## Some miscellenious data checking

In [120]:
try:
    # Find rows where the 'Answer' column is empty or contains only whitespace
    empty_answer_rows = df[df['Answer'].str.strip() == '']

    if not empty_answer_rows.empty:
        print("Rows with empty answers:")
        print(empty_answer_rows)
    else:
        print("No rows with empty answers found.")

except KeyError:
    print("The 'Answer' column does not exist in the DataFrame.")
except AttributeError:
    print("The 'Answer' column is not of a string type.")


Rows with empty answers:
                                            Question                 Subject  \
8  ১ হতে বড় ১০০০ এর মধ্যে কতগুলো সংখ্যা আছে যারা ...  MATHEMATICAL-REASONING   

  Answer       A       B       C       D  
8         \(33\)  \(35\)  \(37\)  \(41\)  


In [62]:
from openai import OpenAI
client = OpenAI(api_key="<<YOUR SECRET>>", base_url="https://api.deepseek.com")


## Testing Round:



*   Creating instance of DeepSeek
*   Calling api to solve every question
*   Comparing answers



In [78]:
def extract_content_by_deepseek(html):

  prompt = """
  ${content}

  Extract question and answer options from this html. Please return as string format so I can store in csv file. Do not translate and return as it is.

  Response structure in two lines:
  Extracted Question Here
  Extracted Options here with seperated by ###
  """

  html_string = str(html)
  prompt_with_html = prompt.replace("${content}", html_string)

  response = client.chat.completions.create(
      model="deepseek-chat",
      messages=[
          {"role": "system", "content": "You are a world class html parser."},
          {"role": "user", "content": prompt_with_html},
      ],
      stream=False
  )

  extracted_content = response.choices[0].message.content
  question_options = extracted_content.split('\n')

  try:
    options = question_options[1].split("###")
    extracted = {
      'Question': question_options[0],
      'A': options[0],
      'B': options[1],
      'C': options[2],
      'D': options[3]
    }
    return extracted
  except:
    print("No options found")