In [2]:
import pysbd
import json
import re
import argparse

In [3]:
# One pysbd segmenter per language code, built lazily on first use so that
# repeated calls (two per record in preprocess_data) reuse the same instance.
_SEGMENTERS = {}


def segment_text(text, lang):
    """Split ``text`` into sentences with pysbd.

    Args:
        text: The string to segment.
        lang: Language code handed to ``pysbd.Segmenter`` (this notebook
            passes 'zh' and 'en').

    Returns:
        The value returned by ``Segmenter.segment(text)``.
    """
    segmenter = _SEGMENTERS.get(lang)
    if segmenter is None:
        # clean=False is the same setting the segmenter was always built with.
        segmenter = pysbd.Segmenter(language=lang, clean=False)
        _SEGMENTERS[lang] = segmenter
    return segmenter.segment(text)

def preprocess_data(input_file, output_file):
    """Segment every record of a JSON file into sentence lists.

    ``input_file`` is parsed as JSON and iterated; each item is indexed with
    the keys 'input', 'output' and 'id'. In both texts a newline is inserted
    directly after a "." whenever what follows is a run of lowercase
    letters/digits ending in another ".". The 'input' text is then segmented
    with language 'zh' and the 'output' text with language 'en'.

    Args:
        input_file: Path of the UTF-8 JSON file to read.
        output_file: Path of the UTF-8 JSON file to write (indent=2,
            non-ASCII characters kept as-is).
    """
    # Zero-width position: preceded by "." and followed by "[a-z0-9]+.".
    marker_boundary = re.compile(r'(?<=\.)(?=[a-z0-9]+\.)')

    with open(input_file, 'r', encoding='utf-8') as reader:
        records = json.load(reader)

    processed = []
    for record in records:
        zh_raw, en_raw = record['input'], record['output']

        zh_split = marker_boundary.sub('\n', zh_raw)
        en_split = marker_boundary.sub('\n', en_raw)

        zh_sentences = segment_text(zh_split, 'zh')
        en_sentences = segment_text(en_split, 'en')

        processed.append({
            'input': zh_sentences,
            'output': en_sentences,
            'id': record['id'],
        })

    with open(output_file, 'w', encoding='utf-8') as writer:
        json.dump(processed, writer, ensure_ascii=False, indent=2)

In [4]:
# Segment the raw zh/en corpus; the processed file is loaded again by a later cell.
preprocess_data(
    input_file="data/scidb_cn_zh_en.json",
    output_file="data/processed_data.json",
)

In [5]:
# embedding.py
import numpy as np
from sentence_transformers import SentenceTransformer
import config

def get_model():
    """Instantiate the SentenceTransformer named by ``config.MODEL_NAME``.

    Prints a loading message first, then returns the new model object.
    """
    model_name = config.MODEL_NAME
    print(f"Loading model: {model_name}...")
    return SentenceTransformer(model_name)

def create_embeddings(model, sentences):
    """Encode a batch of sentences with ``model``.

    Args:
        model: Object exposing ``encode(sentences, batch_size=...,
            show_progress_bar=..., convert_to_numpy=...)``.
        sentences: Sized collection of strings to encode.

    Returns:
        Whatever ``model.encode`` returns when called with
        ``convert_to_numpy=True``.
    """
    print(f"Encoding {len(sentences)} sentences...")
    # Batch size comes from the shared config; the progress bar is always shown.
    encode_options = {
        "batch_size": config.BATCH_SIZE,
        "show_progress_bar": True,
        "convert_to_numpy": True,
    }
    return model.encode(sentences, **encode_options)

def save_binary(embeddings, filepath):
    """Write a numpy array to ``filepath`` as raw float32 values for Vecalign.

    Args:
        embeddings: Array exposing ``astype``; it is converted to float32
            before being written.
        filepath: Destination path of the binary file.
    """
    # Vecalign requires its embedding input to be raw float32.
    as_float32 = embeddings.astype('float32')
    as_float32.tofile(filepath)
    print(f"Saved binary embeddings to: {filepath}")

In [11]:
# prepare_vecalign.py
import os
import shutil
import config
import preprocess
import embedding

def load_data_mock():
    """Load the records stored in ``data/processed_data.json``.

    The path is relative to the current working directory.

    Returns:
        The parsed JSON content of the file.
    """
    # Build the path with os.path.join: a literal "data\processed_data.json"
    # contains the invalid escape "\p" and only resolves on Windows.
    data_path = os.path.join("data", "processed_data.json")
    with open(data_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def main():
    """Prepare one Vecalign input folder per record under ``config.OUTPUT_DIR``.

    Any existing ``config.OUTPUT_DIR`` is deleted first. Then, for every
    record returned by ``load_data_mock()`` whose 'input' and 'output'
    sentence lists are both non-empty, the folder ``<OUTPUT_DIR>/<id>/``
    receives:

    * ``src.txt`` / ``tgt.txt``: the sentences, one per line;
    * ``src.overlaps.txt``: output of ``vecalign/overlap.py`` run on src.txt;
    * ``src.emb`` / ``tgt.emb``: embeddings written by
      ``embedding.save_binary``.

    Records with an empty side are skipped and get no folder. A failing
    ``overlap.py`` run is reported but does not stop the loop.
    """
    # Local imports keep this cell self-contained.
    import subprocess
    import sys

    # 1. Setup
    records = load_data_mock()  # Replace with your real data-loading logic
    print(f"Total records to process: {len(records)}")

    # Load the model only once to save RAM and time
    model = embedding.get_model()

    # Remove the previous output folder so every run starts clean
    if os.path.exists(config.OUTPUT_DIR):
        print(f"Cleaning old output dir: {config.OUTPUT_DIR}")
        shutil.rmtree(config.OUTPUT_DIR)

    # 2. Loop through each record
    for rec in records:
        rec_id = rec['id']
        print(f"\nProcessing Record: {rec_id}...")

        # --- A. Preprocess ---
        src_sents = rec['input']
        tgt_sents = rec['output']

        # Skip empty records before creating anything on disk, so that
        # skipped records do not leave empty folders behind.
        if not src_sents or not tgt_sents:
            print(f"Skipping {rec_id} due to empty content.")
            continue

        # Dedicated folder for this record, e.g. ./vecalign_input/REC_001
        rec_dir = os.path.join(config.OUTPUT_DIR, str(rec_id))
        os.makedirs(rec_dir, exist_ok=True)

        # --- B. Save Text Files (src.txt, tgt.txt) ---
        # Written straight into the record's folder
        src_txt_path = os.path.join(rec_dir, "src.txt")
        tgt_txt_path = os.path.join(rec_dir, "tgt.txt")

        with open(src_txt_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(src_sents))

        with open(tgt_txt_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(tgt_sents))

        # Run overlap.py with the kernel's own interpreter instead of an
        # IPython "!" shell escape: this is plain Python and does not depend
        # on a shebang/exec bit (unavailable on Windows).
        # overlap.py needs an explicit output file (-o) to write to.
        # NOTE(review): only the source side is processed here, as before;
        # confirm whether tgt.txt needs an overlaps file as well.
        src_overlap_path = os.path.join(rec_dir, "src.overlaps.txt")
        overlap_cmd = [
            sys.executable,
            os.path.join("vecalign", "overlap.py"),
            "-i", src_txt_path,
            "-o", src_overlap_path,
        ]
        overlap_run = subprocess.run(
            overlap_cmd, capture_output=True, text=True, errors="replace"
        )
        if overlap_run.stdout:
            print(overlap_run.stdout, end="")
        if overlap_run.returncode != 0:
            # Best effort: report the failure and carry on with the record.
            print(
                f"Warning: overlap.py failed for {rec_id} "
                f"(exit code {overlap_run.returncode}):\n{overlap_run.stderr}"
            )

        # --- C. Generate & Save Embeddings ---
        # Binary files go into the same record folder
        src_emb_path = os.path.join(rec_dir, "src.emb")
        tgt_emb_path = os.path.join(rec_dir, "tgt.emb")

        src_emb = embedding.create_embeddings(model, src_sents)
        tgt_emb = embedding.create_embeddings(model, tgt_sents)

        embedding.save_binary(src_emb, src_emb_path)
        embedding.save_binary(tgt_emb, tgt_emb_path)

        print(f"-> Saved data to: {rec_dir}")

    print("\n[DONE] All records prepared individually.")
    print(f"Check folder '{config.OUTPUT_DIR}' to see sub-folders.")

In [12]:
# Run the full preparation: one sub-folder per record under config.OUTPUT_DIR.
main()

Total records to process: 10
Loading model: sentence-transformers/LaBSE...
Cleaning old output dir: ./vecalign_input

Processing Record: 1...
Encoding 5 sentences...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Encoding 3 sentences...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Saved binary embeddings to: ./vecalign_input\1\src.emb
Saved binary embeddings to: ./vecalign_input\1\tgt.emb
-> Saved data to: ./vecalign_input\1

Processing Record: 2...
Encoding 4 sentences...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Encoding 3 sentences...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Saved binary embeddings to: ./vecalign_input\2\src.emb
Saved binary embeddings to: ./vecalign_input\2\tgt.emb
-> Saved data to: ./vecalign_input\2

Processing Record: 3...
Encoding 3 sentences...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Encoding 3 sentences...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Saved binary embeddings to: ./vecalign_input\3\src.emb
Saved binary embeddings to: ./vecalign_input\3\tgt.emb
-> Saved data to: ./vecalign_input\3

Processing Record: 4...
Encoding 4 sentences...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Encoding 3 sentences...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Saved binary embeddings to: ./vecalign_input\4\src.emb
Saved binary embeddings to: ./vecalign_input\4\tgt.emb
-> Saved data to: ./vecalign_input\4

Processing Record: 5...
Encoding 6 sentences...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Encoding 3 sentences...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Saved binary embeddings to: ./vecalign_input\5\src.emb
Saved binary embeddings to: ./vecalign_input\5\tgt.emb
-> Saved data to: ./vecalign_input\5

Processing Record: 6...
Encoding 10 sentences...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Encoding 6 sentences...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Saved binary embeddings to: ./vecalign_input\6\src.emb
Saved binary embeddings to: ./vecalign_input\6\tgt.emb
-> Saved data to: ./vecalign_input\6

Processing Record: 7...
Encoding 4 sentences...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Encoding 4 sentences...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Saved binary embeddings to: ./vecalign_input\7\src.emb
Saved binary embeddings to: ./vecalign_input\7\tgt.emb
-> Saved data to: ./vecalign_input\7

Processing Record: 8...
Encoding 4 sentences...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Encoding 4 sentences...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Saved binary embeddings to: ./vecalign_input\8\src.emb
Saved binary embeddings to: ./vecalign_input\8\tgt.emb
-> Saved data to: ./vecalign_input\8

Processing Record: 9...
Encoding 4 sentences...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Encoding 4 sentences...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Saved binary embeddings to: ./vecalign_input\9\src.emb
Saved binary embeddings to: ./vecalign_input\9\tgt.emb
-> Saved data to: ./vecalign_input\9

Processing Record: 10...
Encoding 4 sentences...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Encoding 4 sentences...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Saved binary embeddings to: ./vecalign_input\10\src.emb
Saved binary embeddings to: ./vecalign_input\10\tgt.emb
-> Saved data to: ./vecalign_input\10

[DONE] All records prepared individually.
Check folder './vecalign_input' to see sub-folders.
