In [8]:
import pysbd
import json
import re
import argparse

In [9]:
def segment_text(text, lang):
    seg = pysbd.Segmenter(language=lang, clean=False)
    return seg.segment(text)

def preprocess_data(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    processed_data = []
    for item in data:
        input_text = item['input']
        output_text = item['output']

        input_text = re.sub(r'(?<=\.)(?=[a-z0-9]+\.)', '\n', input_text)
        output_text = re.sub(r'(?<=\.)(?=[a-z0-9]+\.)', '\n', output_text)

        input_sentences = segment_text(input_text, 'zh')
        output_sentences = segment_text(output_text, 'en')

        processed_item = {
            'input': input_sentences,
            'output': output_sentences,
            'id': item['id']
        }
        processed_data.append(processed_item)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(processed_data, f, ensure_ascii=False, indent=2)

In [10]:
preprocess_data("data/scidb_cn_zh_en.json", "data/processed_data.json")

In [11]:
# embedding.py
import numpy as np
from sentence_transformers import SentenceTransformer
import config

def get_model():
    print(f"Loading model: {config.MODEL_NAME}...")
    return SentenceTransformer(config.MODEL_NAME)

def create_embeddings(model, sentences):
    """
    Input: List of strings
    Output: Numpy array (float32)
    """
    print(f"Encoding {len(sentences)} sentences...")
    # Encode batch
    embeddings = model.encode(
        sentences, 
        batch_size=config.BATCH_SIZE, 
        show_progress_bar=True, 
        convert_to_numpy=True
    )
    return embeddings

def save_binary(embeddings, filepath):
    """
    Lưu numpy array xuống file binary chuẩn float32 cho Vecalign
    """
    # Vecalign bắt buộc input là raw float32
    embeddings.astype('float32').tofile(filepath)
    print(f"Saved binary embeddings to: {filepath}")

In [14]:
# prepare_vecalign.py
import json
import config
import preprocess
import embedding

def load_data():
    # TODO: Thay hàm này bằng code đọc file json thật của bạn
    with open("data\processed_data.json", 'r', encoding='utf-8') as f:
        dat = json.load(f)
    return dat

# 1. Load Data
records = load_data() # Thay bằng hàm load file thật
print(f"Total records to process: {len(records)}")

# Lists tổng chứa toàn bộ corpus đã stitch
all_src_sents = []
all_tgt_sents = []

# List lưu map để sau này biết câu thứ i thuộc record nào
# Format: (sentence_index, record_id, is_barrier)
src_map = [] 

# 2. Preprocess & Stitching (Khâu dữ liệu & Chèn Rào chắn)
print("--- Preprocessing & Segmenting ---")

for rec in records:
    rec_id = rec['id']
    
    # Tách câu
    src_seg = preprocess.segment_text(rec['zh_text'], lang='zh')
    tgt_seg = preprocess.segment_text(rec['en_text'], lang='en')
    
    # Add vào list tổng
    for s in src_seg:
        all_src_sents.append(s)
        src_map.append(f"{rec_id}") # Lưu ID
        
    for s in tgt_seg:
        all_tgt_sents.append(s)
        
    # --- CRITICAL: CHÈN BARRIER ---
    # Chèn vào cả 2 bên để vecalign neo lại tại đây
    all_src_sents.append(config.BARRIER_TOKEN)
    all_tgt_sents.append(config.BARRIER_TOKEN)
    src_map.append("BARRIER") # Đánh dấu đây là rào chắn

# 3. Export Text Files (Vecalign cần text để tham chiếu dòng)
print("--- Saving Text Files ---")
with open(config.SRC_TEXT_FILE, 'w', encoding='utf-8') as f:
    f.write('\n'.join(all_src_sents))
    
with open(config.TGT_TEXT_FILE, 'w', encoding='utf-8') as f:
    f.write('\n'.join(all_tgt_sents))
    
with open(config.MAP_FILE, 'w', encoding='utf-8') as f:
    f.write('\n'.join(src_map))

# 4. Generate Embeddings
print("--- Generating Embeddings ---")
model = embedding.get_model()

src_emb = embedding.create_embeddings(model, all_src_sents)
tgt_emb = embedding.create_embeddings(model, all_tgt_sents)

# 5. Save Binary Files
embedding.save_binary(src_emb, config.SRC_EMB_FILE)
embedding.save_binary(tgt_emb, config.TGT_EMB_FILE)

print("\n[DONE] Data prepared successfully!")
print(f"Dimensions: {src_emb.shape}")
print("You can now run 'sh run_vecalign.sh'")


Total records to process: 10
--- Preprocessing & Segmenting ---


AttributeError: module 'preprocess' has no attribute 'segment_text'