## Scrape Glosbe:

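Glosbe does not let you browse all of its sentences; example sentences are only returned in response to word queries. The 50 most frequent words appear in about 91% of the lines in our existing Amis corpus (measured in the next cell), so by querying Glosbe for each of those words we should be able to get about 91% of its sentences. The same sentence can be returned by several queries, so duplicates are removed afterwards.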

In [None]:
!pip install deepl

Collecting deepl
  Downloading deepl-1.17.0-py3-none-any.whl (35 kB)
Installing collected packages: deepl
Successfully installed deepl-1.17.0


In [None]:
import os
import glob

from google.colab import drive

# The 50 most frequent words, as (word, occurrence count) pairs
word_freq_list = [('to', 10711), ('i', 10252), ('ko', 10188), ('a', 8331), ('no', 7030), ('o', 5346), ('sa', 3452), ('haw', 3330), ('ku', 3062), ('han', 2867), ('tu', 2765), ('ako', 2564), ('ira', 2501), ('kako', 2426), ('nu', 2137), ('u', 2132), ('ho', 2074), ('hay', 1577), ('ano', 1516), ('kora', 1483), ('caay', 1423), ('itini', 1379), ('itiya', 1350), ('kiya', 1335), ('nira', 1251), ('kami', 1218), ('mako', 1144), ('kira', 1133), ('niyam', 1108), ('saan', 1058), ('wawa', 1019), ('ci', 1017), ('san', 994), ('saka', 980), ('sato', 972), ('ya', 969), ('awaay', 938), ('sanay', 917), ('ka', 898), ('hananay', 884), ('kita', 827), ('mita', 781), ('ta', 771), ('matoʼasay', 768), ('sowal', 761), ('niyaro', 742), ('aca', 730), ('anini', 674), ('tayra', 670), ('ha', 665)]
# Only the words themselves are needed for membership checks
top_words = set(word for word, _ in word_freq_list)

# Glob pattern selecting the indigenous-language side of each video transcript
path_to_files = '/content/drive/MyDrive/formosan_mt_project/translations/amis_videos/*indigenous.txt'

# The transcripts live on Google Drive, so it has to be mounted before they are read
drive.mount('/content/drive', force_remount=True)


# Count how many corpus lines contain at least one of the top words.
total_lines = 0
lines_with_top_words = 0

# Process each file matched by the glob pattern
for filepath in glob.glob(path_to_files):
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            total_lines += 1
            # Tokenise the line once (lower-cased, whitespace-separated) and test
            # it against the whole set in a single pass, instead of re-splitting
            # the line for every one of the top words.
            if not top_words.isdisjoint(line.lower().split()):
                lines_with_top_words += 1

# Calculate the overall percentage
if total_lines > 0:  # Check to avoid division by zero
    percentage = (lines_with_top_words / total_lines) * 100
    print(f"Overall, {percentage:.2f}% of lines contain at least one of the top words across all files.")
else:
    print("No lines were found in the files.")


Mounted at /content/drive
Overall, 91.64% of lines contain at least one of the top words across all files.


## Scrape the page

In [None]:
import requests
from bs4 import BeautifulSoup
import os
from google.colab import drive

# Output files for the two sides of the parallel corpus (same line number = same pair)
indigenous_path = '/content/drive/MyDrive/formosan_mt_project/translations/amis_glosbe/glosbe-indigenous.txt'
chinese_path = '/content/drive/MyDrive/formosan_mt_project/translations/amis_glosbe/glosbe-chinese.txt'

# Glosbe Amis -> Chinese dictionary URL; the query word is appended to it
base_url = "https://glosbe.com/ami/zh/"

# Both output files are on Google Drive, so mount it before anything is written
drive.mount('/content/drive', force_remount=True)

# Helper functions to save sentences to files ensuring parallel corpus structure
def _single_line(text):
    """Return ``text`` with embedded line breaks replaced by single spaces.

    The corpus files store exactly one sentence per line, so a line break
    inside a sentence would shift every following pair out of alignment.
    """
    pieces = (piece.strip() for piece in text.splitlines())
    return ' '.join(piece for piece in pieces if piece)


def save_sentences(indigenous_sentences, chinese_sentences,
                   indigenous_file=None, chinese_file=None):
    """Append sentence pairs to the two parallel corpus files.

    The i-th indigenous sentence and the i-th Chinese sentence are written to
    the same line number of their respective files. Each sentence is flattened
    to a single line first so that the two files always grow by the same
    number of lines.

    Args:
        indigenous_sentences: Iterable of indigenous-language sentences.
        chinese_sentences: Iterable of the matching Chinese sentences. The two
            iterables are paired with ``zip``, so surplus items in the longer
            one are ignored.
        indigenous_file: Path to append the indigenous sentences to. Defaults
            to the module-level ``indigenous_path``.
        chinese_file: Path to append the Chinese sentences to. Defaults to the
            module-level ``chinese_path``.
    """
    ind_target = indigenous_path if indigenous_file is None else indigenous_file
    chi_target = chinese_path if chinese_file is None else chinese_file

    # Append mode: every call adds to whatever the files already contain.
    with open(ind_target, 'a', encoding='utf-8') as fi, open(chi_target, 'a', encoding='utf-8') as fc:
        for ind, chi in zip(indigenous_sentences, chinese_sentences):
            fi.write(_single_line(ind) + '\n')
            fc.write(_single_line(chi) + '\n')

# Function to process each word and maintain parallel structure
def scrape_for_word(word, timeout=30):
    """Scrape every Amis/Chinese example-sentence pair Glosbe lists for ``word``.

    The word's result page is fetched first, then each "load more" fragment
    it links to, until no further loader button is found. Pairs are
    de-duplicated within this one word and handed to ``save_sentences``.

    Args:
        word: The query word; it is URL-quoted and appended to ``base_url``.
        timeout: Seconds to wait for each HTTP request before giving up.

    Raises:
        requests.RequestException: If a request times out, fails, or returns
            an HTTP error status. Pairs collected before the failure are still
            saved.
    """
    from urllib.parse import quote, urljoin

    # Data structures to track sentences and ensure uniqueness
    seen_pairs = set()
    indigenous_sentences = []
    chinese_sentences = []

    def fetch_soup(url):
        """Download ``url`` and parse it, failing loudly on HTTP errors."""
        response = requests.get(url, timeout=timeout)
        # Without this an error page (e.g. a rate-limit response) would be
        # parsed as a page that simply has no sentences.
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')

    def process_page(soup):
        """Collect the not-yet-seen sentence pairs from one parsed page."""
        for div in soup.find_all('div', class_='flex'):
            ami_text = div.find('div', attrs={'lang': 'ami'})
            zh_text = div.find('div', attrs={'lang': 'zh'})
            if ami_text and zh_text:
                pair = (ami_text.text.strip(), zh_text.text.strip())
                if pair not in seen_pairs:
                    seen_pairs.add(pair)
                    indigenous_sentences.append(pair[0])
                    chinese_sentences.append(pair[1])

    visited_urls = set()
    try:
        soup = fetch_soup(base_url + quote(word, safe=''))
        process_page(soup)

        # Manage "Load More" functionality
        load_more = soup.find('button', attrs={'data-element': 'fragment-loader'})
        while load_more:
            fragment_url = load_more.get('data-fragment-url')
            if not fragment_url:
                break  # loader button without a target: nothing more to fetch
            more_url = urljoin('https://glosbe.com', fragment_url)
            if more_url in visited_urls:
                break  # the site pointed back at a fragment already fetched
            visited_urls.add(more_url)

            more_soup = fetch_soup(more_url)
            process_page(more_soup)
            load_more = more_soup.find('button', attrs={'data-element': 'fragment-loader'})
    finally:
        # Save the sentences ensuring parallel structure, even when a later
        # request failed, so earlier pages are not thrown away.
        save_sentences(indigenous_sentences, chinese_sentences)

# Processing each word.
# top_words is a set of strings, whose iteration order differs between kernel
# sessions; sorting makes the query order, and so the line order of the output
# files, the same on every run.
for word in sorted(top_words):
    scrape_for_word(word)

print("Scraping complete. Files saved.")


Mounted at /content/drive
Scraping complete. Files saved.


In [None]:
# translate
import os
import deepl

from google.colab import drive
drive.mount('/content/drive')

# Replace with the path to your directory containing the files to be translated
directory_path = '/content/drive/MyDrive/formosan_mt_project/translations/amis_glosbe/'

# The DeepL key is never stored in the notebook: it is read from the
# DEEPL_AUTH_KEY environment variable, or typed in at a hidden prompt.
# NOTE(review): the key that used to be hard-coded here was saved with the
# notebook and should be revoked in the DeepL account.
from getpass import getpass

auth_key = os.environ.get("DEEPL_AUTH_KEY") or getpass("DeepL API key: ")
translator = deepl.Translator(auth_key)

def translate_file(file_path, translator_client=None, output_dir=None):
    """Translate a ``<prefix>-chinese.txt`` file into ``<prefix>-english.txt``.

    The output has exactly one line per input line, so it stays aligned with
    the other files of the parallel corpus. Blank input lines are not sent to
    the translation service; they are written back as blank lines in the same
    position. Nothing is done if the English file already exists.

    Args:
        file_path: Path of the Chinese source file. The part of its file name
            before the first ``-`` becomes the prefix of the output file.
        translator_client: Object with a DeepL-style
            ``translate_text(texts, source_lang=..., target_lang=...)`` method
            returning results that expose ``.text``. Defaults to the
            module-level ``translator``.
        output_dir: Directory for the English file. Defaults to the
            module-level ``directory_path``.
    """
    client = translator if translator_client is None else translator_client
    target_dir = directory_path if output_dir is None else output_dir

    base_name = os.path.basename(file_path)
    file_number = base_name.split('-')[0]  # Everything before the first '-' in the file name
    output_file_name = f"{file_number}-english.txt"
    output_file_path = os.path.join(target_dir, output_file_name)

    # Check if the English translation file already exists
    if os.path.exists(output_file_path):
        print(f"Skipping translation for {file_number}: English file already exists.")
        return

    with open(file_path, 'r', encoding='utf-8') as input_file:
        source_lines = [line.strip() for line in input_file]

    # Remember where each non-blank line came from so its translation can be
    # put back at the same line number; blank lines keep an empty placeholder.
    pending = [(index, text) for index, text in enumerate(source_lines) if text]
    translated_lines = [''] * len(source_lines)

    # DeepL API supports up to 50 texts in one request
    batch_size = 50
    for start in range(0, len(pending), batch_size):
        batch = pending[start:start + batch_size]
        results = client.translate_text(
            [text for _, text in batch], source_lang="ZH", target_lang="EN-US"
        )
        for (index, _), result in zip(batch, results):
            translated_lines[index] = result.text

    # The file is only written once every batch has succeeded, so a failed run
    # never leaves a partial file that the exists-check above would skip.
    # Every line is newline-terminated, like the other corpus files.
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.writelines(line + '\n' for line in translated_lines)

# Run the translation for every Chinese corpus file in the target directory
for file in os.listdir(directory_path):
    if not file.endswith("-chinese.txt"):
        continue  # not a Chinese source file
    print(f"Processing {file}...")
    file_path = os.path.join(directory_path, file)
    translate_file(file_path)
    print(f"Finished processing {file}.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Processing glosbe-chinese.txt...
Finished processing glosbe-chinese.txt.
