In [1]:
import os
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

# Build a T5 fine-tuning CSV from the aligned sentence pairs stored in the
# corpus' .sgml files. Each <spair> element is expected to hold one <s> whose
# id starts with 'en' and one whose id starts with 'vn'.

# Path to the folder containing all .sgml files
folder_path = 'EVBCorpus_EVBNews_v2.0/'

# Column names of the exported dataset (input text, expected translation)
pair_columns = ['source', 'target']


def _find_sentence(spair, id_prefix):
    """Return the first <s> tag inside `spair` whose id starts with `id_prefix`.

    Returns None when the pair has no such sentence.
    """
    # The `x and ...` guard skips <s> tags that have no id attribute at all.
    return spair.find('s', {'id': lambda x: x and x.startswith(id_prefix)})


# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the exported CSV identical between runs and machines. Filtering up front also
# makes the progress bar count only the files that are actually parsed.
sgml_filenames = sorted(
    name for name in os.listdir(folder_path) if name.endswith('.sgml')
)

# Initialize list to collect all pairs
all_pairs = []

# Loop through all .sgml files
for filename in tqdm(sgml_filenames, desc='Processing files'):
    file_path = os.path.join(folder_path, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    soup = BeautifulSoup(content, 'xml')

    # Extract English-Vietnamese pairs from each file
    for spair in soup.find_all('spair'):
        en = _find_sentence(spair, 'en')
        vn = _find_sentence(spair, 'vn')
        # Skip incomplete pairs; find() signals "not found" with None.
        if en is None or vn is None:
            continue
        all_pairs.append({
            'source': en.text.strip(),
            'target': vn.text.strip()
        })

# Convert all pairs to DataFrame. Passing the columns explicitly keeps the
# schema intact when no pair was found, so the prefix step below cannot raise
# a KeyError on an empty corpus.
df = pd.DataFrame(all_pairs, columns=pair_columns)

# Add T5-style translation prefix
df['source'] = 'translate English to Vietnamese: ' + df['source']

# Export to CSV ready for T5 fine-tuning
output_csv_path = 't5_training_data_full.csv'
df.to_csv(output_csv_path, index=False, encoding='utf-8')

print(f"✅ Dataset saved to {output_csv_path} with {len(df)} sentence pairs.")


Processing files: 100%|██████████| 1000/1000 [00:37<00:00, 26.83it/s]


✅ Dataset saved to t5_training_data_full.csv with 45308 sentence pairs.
