In [1]:
import json
import os
import re
from time import sleep

import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError, Timeout


# Word-list page that is fetched and parsed further down.
url = 'https://www.oxfordlearnersdictionaries.com/wordlists/oxford-phrase-list'

# No cookies are sent with the request; the mapping is passed through as-is.
cookies = {}

# Headers sent with every request made by the helpers below.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.3'
    ),
}

def clean_filename(filename):
    """Turn a phrase into a string usable as a file name.

    Spaces become hyphens and each of the characters ``\\ / * ? : " < > |``
    is removed. All other characters are left unchanged.

    Args:
        filename: The text to sanitise.

    Returns:
        The sanitised string.
    """
    # One translation table does both jobs: map ' ' -> '-' and delete the
    # characters listed in the third argument.
    table = str.maketrans(' ', '-', '\\/*?:"<>|')
    return filename.translate(table)


def fetch_url(url, cookies, headers, retries=3, backoff_factor=0.3):
    """GET ``url`` and return the response, retrying on connection problems.

    Only ``ConnectionError`` and ``Timeout`` (the names in scope in this file)
    trigger another attempt. A reply with any status other than 200 is
    reported and ends the function immediately, without retrying.

    Args:
        url: Address to request.
        cookies: Cookies passed to ``requests.get``.
        headers: HTTP headers passed to ``requests.get``.
        retries: Maximum number of attempts.
        backoff_factor: Base delay in seconds; the wait after failed attempt
            number ``k`` (counting from 0) is ``backoff_factor * 2 ** k``.

    Returns:
        The response object when the status code is 200, otherwise ``None``.
    """
    for attempt_index in range(retries):
        try:
            reply = requests.get(url, cookies=cookies, headers=headers, timeout=60)
        except (ConnectionError, Timeout) as error:
            print(f"Attempt {attempt_index+1} failed: {error}")
            # Exponential backoff: the pause doubles after every failure
            # (and is also taken after the last failed attempt).
            delay = backoff_factor * (2 ** attempt_index)
            sleep(delay)
            continue

        if reply.status_code != 200:
            print(f"Request failed with status code: {reply.status_code}")
            return None

        print("Request was successful")
        return reply

    print("All attempts to connect failed.")
    return None

# Records collected for every phrase. Created before the fetch so the name
# exists (as an empty list) even when the request fails and the block below
# is skipped.
dictionaries = []

response = fetch_url(url, cookies, headers)

if response:
    soup = BeautifulSoup(response.content, 'html.parser')

    # The entries are read from the first <ul class="top-g"> element.
    pages = soup.find('ul', class_='top-g')

    # soup.find() returns None when nothing matches; fall back to an empty
    # entry list instead of failing on pages.find_all().
    if pages is None:
        print("Could not find the phrase list (ul.top-g) in the page.")
        li = []
    else:
        li = pages.find_all('li')
    
    def download_audio(url, path, headers, retries=5, backoff_factor=0.3):
        """Download ``url`` and write the response body to ``path``.

        Only ``requests.ConnectionError`` and ``requests.Timeout`` lead to a
        retry. A reply with a status other than 200 is reported and ends the
        function immediately.

        Args:
            url: Address of the file to download.
            path: Destination file, opened in binary write mode.
            headers: HTTP headers passed to ``requests.get``.
            retries: Maximum number of failed attempts before giving up.
            backoff_factor: Base delay in seconds; after the ``k``-th failure
                (counting from 1) the wait is ``backoff_factor * 2 ** k``.

        Returns:
            ``True`` when the file was written, ``False`` otherwise.
        """
        failures = 0
        while failures < retries:
            try:
                reply = requests.get(url, headers=headers, timeout=10)
            except (requests.ConnectionError, requests.Timeout) as error:
                failures += 1
                print(f"Attempt {failures} failed: {error}")
                # Exponential backoff: the pause doubles after every failure.
                sleep(backoff_factor * (2 ** failures))
                continue

            if reply.status_code != 200:
                print(f"Request failed with status code: {reply.status_code}")
                return False

            with open(path, 'wb') as audio_file:
                audio_file.write(reply.content)
            return True

        print("All attempts to connect failed.")
        return False

    # Output folders for the two recordings; exist_ok makes re-running safe.
    os.makedirs('audio/uk', exist_ok=True)
    os.makedirs('audio/us', exist_ok=True)

    # Site root that the relative hrefs and audio sources are appended to.
    base_url = "https://www.oxfordlearnersdictionaries.com"

    # Walk every <li> of the phrase list, download its audio clips and append
    # one record per entry to `dictionaries`.
    for i, item in enumerate(li):
        anchor = item.find('a')
        vocabulary = anchor.text
        # Text of the first <span> inside the entry.
        lexical_categories = item.find('span').text
        belong_to = item.find('span', class_='belong-to')
        audio = item.find_all('div', class_='audio_play_button')

        # The buttons are read by position: the first one is stored as the UK
        # clip, the second as the US clip. Each index is guarded on its own so
        # an entry with a single button still yields its first clip.
        # NOTE(review): assumes a lone button is the UK recording - confirm
        # against the page markup.
        audio_uk = audio[0]['data-src-mp3'] if len(audio) > 0 else None
        audio_us = audio[1]['data-src-mp3'] if len(audio) > 1 else None

        clean_vocabulary = clean_filename(vocabulary)

        # A local path is recorded only when the file was actually written, so
        # the JSON never points at a file that does not exist.
        audio_uk_path = None
        if audio_uk:
            uk_target = f'audio/uk/oxford-uk-{clean_vocabulary}.mp3'
            if download_audio(base_url + audio_uk, uk_target, headers):
                audio_uk_path = uk_target

        audio_us_path = None
        if audio_us:
            us_target = f'audio/us/oxford-us-{clean_vocabulary}.mp3'
            if download_audio(base_url + audio_us, us_target, headers):
                audio_us_path = us_target

        dictionaries.append({
            'order': i + 1,  # 1-based position in the list
            'vocabulary': vocabulary,
            'href': base_url + anchor['href'],
            'lexical_categories': lexical_categories,
            'belong_to': belong_to.text if belong_to else None,
            'audio_uk': audio_uk,
            'audio_us': audio_us,
            "audio_uk_path": audio_uk_path,
            "audio_us_path": audio_us_path,
        })
    
    
    

Request was successful
Attempt 1 failed: HTTPSConnectionPool(host='www.oxfordlearnersdictionaries.com', port=443): Max retries exceeded with url: /media/english/uk_pron/a/a_f/a_few/a_few_1_gb_1.mp3 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x00000284FD751590>, 'Connection to www.oxfordlearnersdictionaries.com timed out. (connect timeout=10)'))
Attempt 1 failed: HTTPSConnectionPool(host='www.oxfordlearnersdictionaries.com', port=443): Max retries exceeded with url: /media/english/uk_pron/a/as_/as_fa/as_far_as_i_am_concerned_1_gb_1.mp3 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x00000284FD751090>, 'Connection to www.oxfordlearnersdictionaries.com timed out. (connect timeout=10)'))
Attempt 1 failed: HTTPSConnectionPool(host='www.oxfordlearnersdictionaries.com', port=443): Max retries exceeded with url: /media/english/uk_pron/d/dep/depen/depending_on_1_gb_1.mp3 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConn

In [2]:
# Save the collected records as UTF-8 JSON indented by four spaces.
# ensure_ascii=False writes non-ASCII characters as they are instead of
# \uXXXX escapes.
with open('oxford_dictionary.json', 'w', encoding='utf-8') as output:
    output.write(json.dumps(dictionaries, ensure_ascii=False, indent=4))