In [1]:
import json
import requests
from requests.exceptions import ConnectionError, Timeout
from time import sleep
import re
from bs4 import BeautifulSoup

In [2]:
# JSON file that is loaded as the list of entries to enrich.
file_path = 'updated_oxford_dictionary.json'

# Request headers passed to every fetch; only a browser-like User-Agent is set.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.3'
    ),
}

In [3]:
# Read the whole input file as UTF-8 text and parse it into `data`.
with open(file_path, mode='r', encoding='utf-8') as source:
    data = json.loads(source.read())


In [4]:
def fetch_url(url, cookies, headers, retries=3, backoff_factor=0.3):
    """GET ``url`` and return the response, retrying on connection problems.

    Args:
        url: Address to request.
        cookies: Cookies forwarded to ``requests.get`` (may be None).
        headers: HTTP headers forwarded to ``requests.get``.
        retries: Maximum number of attempts.
        backoff_factor: Base delay in seconds; the wait before the next
            attempt is ``backoff_factor * 2 ** attempt_index``.

    Returns:
        The response object when the server answers with status 200.
        None when the server answers with any other status (this is not
        retried), or when every attempt raised ``ConnectionError`` or
        ``Timeout``.
    """
    for i in range(retries):
        try:
            response = requests.get(url, cookies=cookies, headers=headers, timeout=60)
        except (ConnectionError, Timeout) as e:
            print(f"Attempt {i+1} failed: {e}")
            # Only back off when another attempt will actually follow;
            # sleeping after the final failure just delays returning None.
            if i < retries - 1:
                sleep(backoff_factor * (2 ** i))  # Exponential backoff
            continue

        if response.status_code == 200:
            print("Request was successful")
            return response

        # A reply with a non-200 status is treated as final and not retried.
        print(f"Request failed with status code: {response.status_code}")
        return None

    print("All attempts to connect failed.")
    return None

In [5]:
def _level_from_symbols(symbols_tag):
    """Return the ``level=<value>`` parameter from a ``symbols`` block, or None.

    The value is read from the ``href`` of the first ``<a>`` inside the block.
    None is returned when the block, the link, its ``href`` attribute, or the
    ``level=`` parameter is missing, so callers never dereference None.
    """
    if symbols_tag is None:
        return None
    link = symbols_tag.find('a')
    if link is None:
        return None
    match = re.search(r'level=(\w+)', link.get('href', ''))
    return match.group(1) if match else None


# Enrich every entry of `data` in place with fields scraped from its page.
for entry in data:
    url = entry['href']
    print(f"Fetching {url}")
    response = fetch_url(url, None, headers)
    if response is None:
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    # Get all content of the word
    entryContent = soup.find('div', class_="oald")
    if not entryContent:
        print(f"Entry content not found for {url}")
        continue

    # Get level of the word (the key is only written when a level was found)
    level = _level_from_symbols(entryContent.find('div', class_="symbols"))
    if level is not None:
        entry['level'] = level

    # Get pronunciation of the word; the phonetics block may be absent
    phonetics = entryContent.find('span', class_="phonetics")
    if phonetics is not None:
        for phon_key, label in (('phons_br', "phon_br: "), ('phons_n_am', "phon_n_am: ")):
            phon_block = phonetics.find('div', class_=phon_key)
            if phon_block is None:
                continue
            phon_span = phon_block.find('span')
            if phon_span is not None:
                entry[phon_key] = phon_span.text
                print(label, phon_span.text)

    # Get topic of the word
    topic_name = entryContent.find('span', class_="topic_name")
    if topic_name:
        entry['topic_name'] = topic_name.text

    topic_cefr = entryContent.find('span', class_="topic_cefr")
    if topic_cefr:
        entry['topic_cefr'] = topic_cefr.text

    # Text of the first span with class "un" is stored under 'reminder'
    un = entryContent.find('span', class_="un")
    if un:
        entry['reminder'] = un.text

    # Get means of the word: prefer the multi-sense list, else the single one
    senses_html = entryContent.find('ol', class_="senses_multiple")
    if not senses_html:
        senses_html = entryContent.find('ol', class_="sense_single")
    if not senses_html:
        print(f"No senses found for {url}")
        continue

    senses = senses_html.find_all('li', class_="sense")
    if not senses:
        print(f"No senses found for {url}")
        continue

    means = []

    for order_sense, sense in enumerate(senses, start=1):
        # Get level of the sense
        example_level = _level_from_symbols(sense.find('div', class_="symbols"))

        definition_tag = sense.find('span', class_="def")
        if definition_tag is None:
            print(f"Definition not found for {url}")
            # NOTE(review): this stops at the first sense without a definition,
            # so later senses of the same word are not collected - confirm
            # that this is intended rather than skipping just this sense.
            break
        definition = definition_tag.text

        # A synonym string is built only when both cross-reference parts exist
        synonym = None
        xrefs = sense.find('span', class_="xrefs")
        if xrefs is not None:
            xh = xrefs.find('span', class_="xh")
            prefix = xrefs.find('span', class_="prefix")
            if xh and prefix:
                synonym = prefix.text + ": " + xh.text

        # NOTE(review): senses without an examples list are not recorded at
        # all - confirm that dropping them is intended.
        examples = sense.find('ul', class_="examples")
        if examples is not None:
            means.append({
                "order_sense": order_sense,
                "definition": definition,
                "example": [ex.text for ex in examples.find_all('li')],
                "example_level": example_level,
                "synonym": synonym
            })

    entry['means'] = means
    print("means: ", means)
# Write `data` to a separate output file as indented, non-ASCII-escaped JSON.
new_file_path = 'oxford_dictionary_with_means_v2.json'

with open(new_file_path, mode='w', encoding='utf-8') as out_file:
    json.dump(data, fp=out_file, ensure_ascii=False, indent=4)


print("Done")
        
        
        


Fetching https://www.oxfordlearnersdictionaries.com/definition/english/a_1
Request was successful
phon_br:  /ə/
phon_n_am:  /ə/
means:  [{'order_sense': 1, 'definition': 'used before countable or singular nouns referring to people or things that have not already been mentioned', 'example': ['a man/horse/unit', 'an aunt/egg/hour/X-ray', 'I can only carry two at a time.', "There's a visitor for you.", "She's a friend of my father's (= one of my father's friends)."], 'example_level': 'a1', 'synonym': None}, {'order_sense': 2, 'definition': 'used to show that somebody/something is a member of a group or profession', 'example': ["Their new car's a BMW.", "She's a Buddhist.", "He's a teacher.", 'Is that a Monet (= a painting by Monet)?'], 'example_level': 'a1', 'synonym': None}, {'order_sense': 3, 'definition': 'any; every', 'example': ['A lion is a dangerous animal.'], 'example_level': 'a1', 'synonym': None}, {'order_sense': 4, 'definition': 'used before uncountable nouns when these have an