In [107]:
import json
import requests
from requests.exceptions import ConnectionError, Timeout
from time import sleep
import re
from bs4 import BeautifulSoup

In [108]:
# Input JSON that holds the dictionary entries to be enriched.
file_path = 'oxford_dictionary.json'

# Browser-like User-Agent sent with every request; the literal is split across
# lines only for readability and concatenates to a single string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.3'
    ),
}

In [109]:
# Read the whole input file and decode it as JSON into `data`.
with open(file_path, mode='r', encoding='utf-8') as dictionary_file:
    data = json.loads(dictionary_file.read())


In [110]:
def fetch_url(url, cookies, headers, retries=3, backoff_factor=0.3):
    """Fetch ``url`` with an HTTP GET, retrying transient failures.

    Args:
        url: Address to request.
        cookies: Cookies forwarded to ``requests.get`` (may be ``None``).
        headers: HTTP headers forwarded to ``requests.get``.
        retries: Maximum number of attempts.
        backoff_factor: Base delay in seconds; the wait before attempt
            ``i + 2`` is ``backoff_factor * 2 ** i``.

    Returns:
        The response object when the server answers with status 200,
        otherwise ``None`` (non-retryable status, or every attempt failed).
    """
    # Statuses that usually signal a temporary condition worth retrying.
    transient_statuses = {429, 500, 502, 503, 504}

    for i in range(retries):
        try:
            response = requests.get(url, cookies=cookies, headers=headers, timeout=60)
        except (ConnectionError, Timeout) as e:
            print(f"Attempt {i+1} failed: {e}")
        else:
            if response.status_code == 200:
                print("Request was successful")
                return response
            print(f"Request failed with status code: {response.status_code}")
            if response.status_code not in transient_statuses:
                # Permanent failure (e.g. 404): retrying would not help.
                return None

        # Back off only when another attempt follows; sleeping after the
        # final attempt would just delay reporting the failure.
        if i < retries - 1:
            sleep(backoff_factor * (2 ** i))  # Exponential backoff

    print("All attempts to connect failed.")
    return None

In [111]:
# For every entry in `data`, download its page and store the parsed senses
# (definition plus first example) under the entry's 'means' key.
for index, entry in enumerate(data):
    url = entry['href']
    print(f"Fetching {url}")
    response = fetch_url(url, None, headers)
    if response is None:
        # The fetch failed (fetch_url already reported why); the entry is
        # left without a 'means' key.
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    # Look for the `senses_multiple` list first, then fall back to `sense_single`.
    senses_html = (
        soup.find('ol', class_="senses_multiple")
        or soup.find('ol', class_="sense_single")
    )
    senses = senses_html.find_all('li', class_="sense") if senses_html else []
    if not senses:
        print(f"No senses found for {url}")
        continue

    means = []

    for idx_sense, sense in enumerate(senses):
        definition = sense.find('span', class_="def")
        if definition is None:
            # Skip only this sense; later senses of the same entry may still
            # carry a definition.
            print(f"Definition not found for {url}")
            continue

        examples = sense.find('ul', class_="examples")
        # Guard against an examples list that contains no <li>, which would
        # otherwise raise AttributeError and abort the whole run.
        first_example = examples.find('li') if examples is not None else None
        if first_example is None:
            # Senses that have no example list item are not recorded.
            continue

        means.append({
            # 1-based position of the sense among the <li class="sense"> items.
            "order_sense": idx_sense + 1,
            "definition": definition.text,
            "example": first_example.text
        })

    entry['means'] = means

# Destination for the enriched entries.
new_file_path = 'oxford_dictionary_with_means.json'

# Serialise `data` as indented JSON, keeping non-ASCII characters unescaped.
with open(new_file_path, mode='w', encoding='utf-8') as output_file:
    json.dump(data, output_file, ensure_ascii=False, indent=4)

print("Done")
        
        
        


Fetching https://www.oxfordlearnersdictionaries.com/definition/english/a_1
Request was successful
Fetching https://www.oxfordlearnersdictionaries.com/definition/english/abandon_1
Request was successful
Fetching https://www.oxfordlearnersdictionaries.com/definition/english/ability_1
Request was successful
Fetching https://www.oxfordlearnersdictionaries.com/definition/english/able_1
Request was successful
Fetching https://www.oxfordlearnersdictionaries.com/definition/english/abolish
Request was successful
Fetching https://www.oxfordlearnersdictionaries.com/definition/english/abortion
Request was successful
Fetching https://www.oxfordlearnersdictionaries.com/definition/english/about_2
Request was successful
Fetching https://www.oxfordlearnersdictionaries.com/definition/english/about_1
Request was successful
Fetching https://www.oxfordlearnersdictionaries.com/definition/english/above_2
Request was successful
Fetching https://www.oxfordlearnersdictionaries.com/definition/english/above_1
Req