In [None]:
# Mount Google Drive at /content/drive so later cells can read and write project files there
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Create the XML
import os
import xml.etree.ElementTree as ET
import xml.dom.minidom
from collections import defaultdict

def group_files_by_prefix(files):
    """Bucket filenames by the text that precedes their first '-'.

    Args:
        files: Iterable of filename strings.

    Returns:
        A ``defaultdict(list)`` mapping each prefix to the filenames that
        share it, in the order they were encountered. A name that contains
        no '-' is keyed by the whole name.
    """
    buckets = defaultdict(list)
    for name in files:
        # partition() yields the whole string as the head when '-' is absent.
        key, _dash, _remainder = name.partition('-')
        buckets[key].append(name)
    return buckets

def read_translations(file_path):
    """Load a UTF-8 text file and return its content as a list of lines.

    Whitespace at the very start and very end of the file is stripped before
    the text is split on newline characters, so blank lines in the middle of
    the file are kept as empty strings while leading/trailing blank lines are
    dropped. An empty file yields a list holding one empty string.

    Args:
        file_path: Path of the file to read.

    Returns:
        List of line strings.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    trimmed = contents.strip()
    return trimmed.split('\n')

def generate_xml_for_group(files, base_dir, output_dir):
    """Write one XML document combining the parallel text files of a video.

    Each file in ``files`` is read with ``read_translations`` and assigned to
    a language by its extension ('txt' -> Amis source, 'zh' and 'en' ->
    translations). Line ``i`` of every language becomes a child of the
    ``<S id="i">`` element: the Amis line as ``<FORM>``, the others as
    ``<TRANSL xml:lang="..">``. The result is pretty-printed and saved as
    ``<output_dir>/<video_id>.xml``, where the video id is the part of the
    first filename before its first '-'.

    Args:
        files: Filenames (relative to ``base_dir``) that belong to one video.
        base_dir: Directory containing the input files.
        output_dir: Directory that receives the XML file; created if missing.

    Returns:
        None. Nothing is written when ``files`` is empty or none of the files
        has a recognised extension.

    Notes:
        * Files with an unrecognised extension are reported and skipped
          instead of raising ``KeyError``.
        * When the languages have different line counts a warning is printed
          and the longest file determines the number of ``<S>`` elements;
          a language that has no line ``i`` simply contributes no child to
          that sentence (no ``IndexError``, no silently dropped lines).
        * Children are always emitted as FORM, zh, en regardless of the
          order of ``files``.
    """
    if not files:
        return

    # Extension -> language label. Insertion order doubles as the order in
    # which children are written inside every <S>; the non-Amis labels are
    # used verbatim as the xml:lang value of <TRANSL>.
    languages = {'txt': 'amis', 'zh': 'zh', 'en': 'en'}
    sentences = {}

    # Read files and sort content by language
    for file in files:
        extension = file.rsplit('.', 1)[-1]
        language = languages.get(extension)
        if language is None:
            print(f"Skipping {file}: unrecognised extension")
            continue
        sentences[language] = read_translations(os.path.join(base_dir, file))

    if not sentences:
        return

    video_id = files[0].split('-')[0]  # Video ID is the prefix before the first dash in the filename
    source_url = f"https://ailt.ilrdf.org.tw/colloquial/{video_id}"

    # The files are expected to be line-aligned; surface any mismatch rather
    # than crashing or truncating.
    line_counts = {lang: len(lines) for lang, lines in sentences.items()}
    num_sentences = max(line_counts.values())
    if len(set(line_counts.values())) > 1:
        print(f"Warning: line counts differ for video {video_id}: {line_counts}")

    ordered_languages = [lang for lang in languages.values() if lang in sentences]

    # Create XML structure
    root = ET.Element("TEXT", {"xml:lang": "amis", "source": source_url})
    for i in range(num_sentences):
        s_element = ET.SubElement(root, "S", id=str(i))
        for lang in ordered_languages:
            sents = sentences[lang]
            if i >= len(sents):
                continue  # this language has fewer lines than the longest file
            if lang == 'amis':
                child = ET.SubElement(s_element, "FORM")
            else:
                child = ET.SubElement(s_element, "TRANSL", {"xml:lang": lang})
            child.text = sents[i]

    # Convert to pretty XML
    xml_str = xml.dom.minidom.parseString(ET.tostring(root)).toprettyxml(indent="    ")

    # Save XML to file
    output_path = os.path.join(output_dir, f"{video_id}.xml")
    parent_dir = os.path.dirname(output_path)
    if parent_dir:  # os.makedirs('') raises, so skip when writing to the CWD
        os.makedirs(parent_dir, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(xml_str)

# Example usage: build one XML file per video from the translation files on Drive
base_dir = '/content/drive/MyDrive/formosan_mt_project/translations/amis/amis_videos'
output_dir = '/content/drive/MyDrive/formosan_mt_project/xml/amis_videos'

# os.listdir returns entries in arbitrary order and also lists sub-directories.
# Keep only regular files and sort them so every run feeds the same inputs,
# in the same order, to the grouping and XML steps.
files = sorted(
    entry for entry in os.listdir(base_dir)
    if os.path.isfile(os.path.join(base_dir, entry))
)
grouped_files = group_files_by_prefix(files)

for prefix, group in grouped_files.items():
    generate_xml_for_group(group, base_dir, output_dir)


In [None]:
import os

def count_english_words_in_files(directory):
    """Print the word count of every ``.en`` file under ``directory``.

    The tree is walked recursively with ``os.walk``. Each file whose name
    ends in '.en' is read as UTF-8 and split on whitespace; its path and
    word count are printed, and a grand total is printed at the end. Any
    exception raised while handling a file is reported and that file is
    left out of the total.

    Args:
        directory: Root directory to search.

    Returns:
        None; all results are printed.
    """
    running_total = 0

    for folder, _subfolders, filenames in os.walk(directory):
        english_names = [name for name in filenames if name.endswith('.en')]
        for name in english_names:
            full_path = os.path.join(folder, name)
            try:
                with open(full_path, 'r', encoding='utf-8') as handle:
                    tokens = handle.read().split()
                word_count = len(tokens)
                running_total += word_count
                print(f"{full_path}: {word_count} words")
            except Exception as err:
                print(f"Failed to read {name}: {err}")

    # Grand total across the directory and all of its subdirectories
    print(f"Total words in directory and subdirectories: {running_total}")

# Usage example: print per-file and total English word counts for every .en file under the Amis translations folder
directory = '/content/drive/MyDrive/formosan_mt_project/translations/amis'
count_english_words_in_files(directory)


/content/drive/MyDrive/formosan_mt_project/translations/amis/amis_bible/1-bible-english.en: 23824 words
/content/drive/MyDrive/formosan_mt_project/translations/amis/amis_videos/719-video-english.en: 1846 words
/content/drive/MyDrive/formosan_mt_project/translations/amis/amis_videos/57-video-english.en: 965 words
/content/drive/MyDrive/formosan_mt_project/translations/amis/amis_videos/759-video-english.en: 1520 words
/content/drive/MyDrive/formosan_mt_project/translations/amis/amis_videos/758-video-english.en: 1668 words
/content/drive/MyDrive/formosan_mt_project/translations/amis/amis_videos/744-video-english.en: 1626 words
/content/drive/MyDrive/formosan_mt_project/translations/amis/amis_videos/741-video-english.en: 1861 words
/content/drive/MyDrive/formosan_mt_project/translations/amis/amis_videos/25-video-english.en: 286 words
/content/drive/MyDrive/formosan_mt_project/translations/amis/amis_videos/731-video-english.en: 1115 words
/content/drive/MyDrive/formosan_mt_project/translati