In [10]:
import requests
from bs4 import BeautifulSoup

# URL of the webpage
url = 'https://cloud.google.com/speech-to-text/v2/docs/speech-to-text-supported-languages'

# Send a GET request to the webpage.
# An explicit timeout (in seconds) is passed because requests does not time
# out by default; without it a stalled connection would hang this cell forever.
response = requests.get(url, timeout=30)
response.raise_for_status()  # will raise an exception for bad requests

# Parse the HTML content of the page
soup = BeautifulSoup(response.text, 'html.parser')

# Find the table of supported languages.
# The table is identified by its column header titles rather than by a CSS
# class or id, so no specific attribute on the <table> element is assumed.
# If the page layout changes, inspect the actual HTML and adjust the header
# titles that are checked below.
# Function to find the correct table based on header titles
def find_correct_table(soup):
    """Return the first table whose header cells include the expected titles.

    Every ``<table>`` found in ``soup`` is examined in document order. A table
    matches when the stripped, lower-cased text of its ``<th>`` cells contains
    all of 'location', 'name' and 'model' (in any order, possibly alongside
    other headers).

    Args:
        soup: Parsed document (or any object exposing ``find_all``) to search.

    Returns:
        The first matching table element, or ``None`` when no table matches.
    """
    required_titles = ('location', 'name', 'model')
    for candidate in soup.find_all('table'):
        # Normalise header text so matching ignores case and surrounding whitespace.
        header_titles = {
            cell.get_text(strip=True).lower()
            for cell in candidate.find_all('th')
        }
        if all(title in header_titles for title in required_titles):
            return candidate
    return None

# Use the function to get the correct table
table = find_correct_table(soup)

# Dictionary to store BCP-47 codes and corresponding Names
language_dictionary = {}

# Zero-based positions of the <td> cells read from each table row.
NAME_COLUMN = 1
BCP_47_COLUMN = 2
MODEL_COLUMN = 3

if table is not None:
    # Look the body up once; only a <tbody class="list"> is considered.
    table_body = table.find('tbody', class_='list')
    rows = table_body.find_all('tr') if table_body is not None else []
    # Loop through each row in the table
    for row in rows:
        # Find all columns in this row
        columns = row.find_all('td')
        # Skip empty or short rows: indexing the model column on a row with
        # fewer cells would raise IndexError and abort the whole extraction.
        if len(columns) <= MODEL_COLUMN:
            continue
        # Only the first <code> element of the model cell is inspected.
        model_code = columns[MODEL_COLUMN].find('code')
        if model_code is None:
            continue
        if 'chirp' in model_code.get_text(strip=True).lower():
            # Extract BCP-47 code and Name
            bcp_47_code = columns[BCP_47_COLUMN].get_text(strip=True)
            name = columns[NAME_COLUMN].get_text(strip=True)
            language_dictionary[bcp_47_code] = name

# Output the dictionary
print(language_dictionary)

{'af-ZA': 'Afrikaans (South Africa)', 'sq-AL': 'Albanian (Albania)', 'am-ET': 'Amharic (Ethiopia)', 'ar-EG': 'Arabic (Egypt)', 'hy-AM': 'Armenian (Armenia)', 'as-IN': 'Assamese (India)', 'ast-ES': 'Asturian (Spain)', 'az-AZ': 'Azerbaijani (Azerbaijan)', 'eu-ES': 'Basque (Spain)', 'be-BY': 'Belarusian (Belarus)', 'bs-BA': 'Bosnian (Bosnia and Herzegovina)', 'bg-BG': 'Bulgarian (Bulgaria)', 'my-MM': 'Burmese (Myanmar)', 'ca-ES': 'Catalan (Spain)', 'ceb-PH': 'Cebuano (Philippines)', 'ckb-IQ': 'Central Kurdish (Iraq)', 'yue-Hant-HK': 'Chinese, Cantonese (Traditional Hong Kong)', 'zh-TW (cmn-Hant-TW)': 'Chinese, Mandarin (Traditional, Taiwan)', 'hr-HR': 'Croatian (Croatia)', 'cs-CZ': 'Czech (Czech Republic)', 'da-DK': 'Danish (Denmark)', 'nl-NL': 'Dutch (Netherlands)', 'en-AU': 'English (Australia)', 'en-IN': 'English (India)', 'en-GB': 'English (United Kingdom)', 'en-US': 'English (United States)', 'et-EE': 'Estonian (Estonia)', 'fil-PH': 'Filipino (Philippines)', 'fi-FI': 'Finnish (Finlan

In [11]:
import json
# Destination of the exported language mapping
json_file_path = 'language_data.json'

# Serialise first, then write the text out in one go.
# ensure_ascii=False keeps non-ASCII characters readable in the file, which is
# why the file is opened explicitly as UTF-8.
serialized = json.dumps(language_dictionary, ensure_ascii=False, indent=4)
with open(json_file_path, 'w', encoding='utf-8') as json_file:
    json_file.write(serialized)

print('Dictionary saved to JSON file ' + json_file_path)

Dictionary saved to JSON file language_data.json
