In [3]:
import glob
import re
import ast
import json
from bs4 import BeautifulSoup

In [4]:
def parse_tags(tag_string):
    """
    The HTML often shows something like ['Finance'] or ['Combinatorics'].
    We'll attempt to parse that as a Python list using ast.literal_eval().
    If it fails, we'll just return the raw string.
    """
    try:
        return ast.literal_eval(tag_string)
    except:
        return [tag_string]

def parse_companies(companies_string):
    """
    Similarly, companies might be something like ['Akuna', 'Citadel'] or [].
    We'll parse that too.
    """
    try:
        return ast.literal_eval(companies_string)
    except:
        return [companies_string]

In [5]:
all_questions = []
question_id = 1  # We'll assign incremental IDs

In [6]:
glob.glob("questions_*.html")

['questions_3.html',
 'questions_2.html',
 'questions_5.html',
 'questions_9.html',
 'questions_8.html',
 'questions_4.html',
 'questions_7.html',
 'questions_6.html',
 'questions_1.html']

In [7]:
# Loop over all files named questions_1.html, questions_2.html, etc.
for filename in glob.glob("questions_*.html"):
    print(f"Parsing file: {filename}")
    with open(filename, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

        # Each question is inside a <div style='max-width: 65%; ...'>
        question_divs = soup.find_all("div", style=re.compile("max-width: 65%;"))
        for div in question_divs:
            # Extract Title
            h2 = div.find("h2")
            title = h2.get_text(strip=True) if h2 else "Untitled"

            # Extract URL (from the <p> that starts with 'URL:')
            url_paragraph = div.find("p", text=re.compile("URL:"))
            # If we can’t find that exact text, we can just look for the <a> link
            a_tag = div.find("a")
            url = a_tag.get("href") if a_tag else ""

            # Extract table info
            # Typically, the table has rows: Topic, Tags, Difficulty, Companies, ...
            table = div.find("table")
            topic = ""
            tags = []
            difficulty = ""
            companies = []
            last_edited_at = ""
            last_edited_by = ""
            internal_difficulty = ""

            if table:
                rows = table.find_all("tr")
                # We'll rely on the row text, e.g. "Topic:", "Tags:", ...
                for row in rows:
                    cols = row.find_all("td")
                    if len(cols) == 2:
                        key = cols[0].get_text(strip=True).lower()
                        val = cols[1].get_text(strip=True)
                        if key == "topic:":
                            topic = val
                        elif key == "tags:":
                            tags = parse_tags(val)
                        elif key == "difficulty:":
                            difficulty = val
                        elif key == "companies:":
                            companies = parse_companies(val)
                        elif key == "last edited at:":
                            last_edited_at = val
                        elif key == "last edited by:":
                            last_edited_by = val
                        elif key == "internal difficulty:":
                            internal_difficulty = val

            # Extract the "Task" text: <h3>Task</h3> followed by <p> (or multiple <p>).
            # In your snippet, there's exactly one <p> for the task (sometimes multiple lines).
            # We'll store it as HTML so we can display it later.
            taskHTML = ""
            task_header = div.find("h3", string=re.compile("Task", re.IGNORECASE))
            if task_header:
                # The next sibling that is a <p> might contain the task
                # or sometimes there's more than one <p> until the next <details>.
                # For simplicity, let's just grab everything until the next <details>.
                task_elems = []
                nxt = task_header.find_next_sibling()
                while nxt and nxt.name != "details":
                    # We'll keep appending paragraphs or text
                    task_elems.append(str(nxt))
                    nxt = nxt.find_next_sibling()
                taskHTML = "\n".join(task_elems)

            # For the <details> blocks: "Hint", "Solution", "Answer"
            # We'll store them as hintHTML, solutionHTML, answerHTML
            hintHTML = ""
            solutionHTML = ""
            answerHTML = ""

            details_tags = div.find_all("details")
            for dtag in details_tags:
                summary = dtag.find("summary")
                if summary:
                    summary_text = summary.get_text(strip=True).lower()
                    # We match by substring
                    if "hint" in summary_text:
                        hintHTML = str(dtag)  # Entire <details> block or just the content
                    elif "solution" in summary_text:
                        solutionHTML = str(dtag)
                    elif "answer" in summary_text:
                        answerHTML = str(dtag)

            # Build a question dict
            question_data = {
                "id": question_id,
                "title": title,
                "url": url,
                "topic": topic.lower(),  # normalize
                "tags": tags,
                "difficulty": difficulty.lower(),
                "companies": companies,
                "lastEditedAt": last_edited_at,
                "lastEditedBy": last_edited_by,
                "internalDifficulty": internal_difficulty,
                "taskHTML": taskHTML.strip(),
                "hintHTML": hintHTML.strip(),
                "solutionHTML": solutionHTML.strip(),
                "answerHTML": answerHTML.strip()
            }
            all_questions.append(question_data)
            question_id += 1

Parsing file: questions_3.html


  url_paragraph = div.find("p", text=re.compile("URL:"))


Parsing file: questions_2.html
Parsing file: questions_5.html
Parsing file: questions_9.html
Parsing file: questions_8.html
Parsing file: questions_4.html
Parsing file: questions_7.html
Parsing file: questions_6.html
Parsing file: questions_1.html


In [9]:
len(all_questions)

1204

In [10]:
# Write out to JSON
with open("questionsData.json", "w", encoding="utf-8") as out:
    json.dump(all_questions, out, indent=2, ensure_ascii=False)

In [11]:
print(f"Done! Extracted {len(all_questions)} questions into questionsData.json.")

Done! Extracted 1204 questions into questionsData.json.


In [11]:
from PIL import Image
import pillow_heif

def resize_image(input_image_path, output_image_path):
    # Open HEIC image using pillow-heif
    if input_image_path.lower().endswith('.heic'):
        heif_image = pillow_heif.open_heif(input_image_path)
        img = Image.frombytes(heif_image.mode, heif_image.size, heif_image.data)
    else:
        img = Image.open(input_image_path)
    
    # Resize the image (downscale by 2x on both axes)
    new_size = (img.width // 2, img.height // 2)
    img_resized = img.resize(new_size)

    # Save as PNG
    img_resized.save(output_image_path, format="PNG")
    print(f"Image saved as {output_image_path}")

In [13]:
# Example usage
input_image_path = "/Users/andranikmovsisyan/Downloads/IMG_4474.heic"  # Replace with the input image name
output_image_path = "/Users/andranikmovsisyan/Downloads/face1.png"  # Replace with the desired output image name
resize_image(input_image_path, output_image_path)

Image saved as /Users/andranikmovsisyan/Downloads/face1.png
