In [1]:
import os

# === Recursively get all CPG files in a directory ===
def get_cpg_files_from_directory(root_dir):
    cpg_files = []
    for root, _, files in os.walk(root_dir):
        for file in files:
            cpg_files.append(os.path.join(root, file))
    return cpg_files

# === Extract sentences from CPG files ===
def extract_cpg_sentences(cpg_paths):
    all_sentences = []
    for cpg_path in cpg_paths:
        sentences = []
        sentence_tokens = []
        inside_sentence = False

        with open(cpg_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line.startswith("<Sentence id="):
                    inside_sentence = True
                    sentence_tokens = []
                elif line.startswith("</Sentence>"):
                    if sentence_tokens:
                        clean_tokens = [w for w in sentence_tokens if w not in {'(', ')', '((', '))'}]
                        full_sentence = ' '.join(clean_tokens)
                        sentences.append(full_sentence)
                    inside_sentence = False
                    sentence_tokens = []
                elif inside_sentence and line and not line.startswith("<"):
                    parts = line.split('\t')
                    if len(parts) > 1:
                        token = parts[1]
                        if token not in {'(', ')', '((', '))'}:
                            sentence_tokens.append(token)

        all_sentences.extend(sentences)
    return all_sentences

# === Parse CPG sentence blocks ===
def parse_cpg_blocks(cpg_files):
    all_blocks = []
    for cpg_path in cpg_files:
        blocks = []
        with open(cpg_path, 'r', encoding='utf-8') as f:
            block = []
            for line in f:
                if "<Sentence id=" in line:
                    block = [line.strip()]
                elif "</Sentence>" in line:
                    block.append(line.strip())
                    blocks.append(block)
                elif block:
                    block.append(line.strip())
        all_blocks.extend(blocks)
    return all_blocks

# === Parse UD blocks from .conllu files ===
def parse_ud_blocks(ud_paths):
    all_blocks = []
    for ud_path in ud_paths:
        with open(ud_path, 'r', encoding='utf-8') as f:
            blocks = []
            block = []
            sentence_text = ""
            for line in f:
                if line.startswith("# text ="):
                    sentence_text = line.strip().replace("# text = ", "")
                    block = [line.strip()]
                elif line.strip() == "":
                    if block:
                        blocks.append((sentence_text, block))
                        block = []
                else:
                    block.append(line.strip())
            if block:
                blocks.append((sentence_text, block))
        all_blocks.extend(blocks)
    return all_blocks

# === Align sentences using exact matching and output ===
def align_and_output_exact(sentences, cpg_blocks, ud_blocks, output_path):
    ud_sentence_map = {s.strip().lower(): (s, b) for s, b in ud_blocks}

    with open(output_path, 'w', encoding='utf-8') as out:
        match_count = 0
        for idx, sentence in enumerate(sentences):
            norm_sentence = sentence.strip().lower()
            if norm_sentence in ud_sentence_map:
                matched_sentence, ud_block = ud_sentence_map[norm_sentence]
                cpg_block = cpg_blocks[idx] if idx < len(cpg_blocks) else []

                out.write(f"### Sentence ID: {idx + 1}\n")
                out.write(f"# text = {sentence}\n")
                out.write(f"# Matched UD: {matched_sentence} (Exact Match)\n")
                out.write("==UD==\n")
                for line in ud_block:
                    out.write(line + "\n")
                out.write("==CPG==\n")
                for line in cpg_block:
                    out.write(line + "\n")
                out.write("-------------------------------------------------------\n")
                match_count += 1

    print(f"✅ Exact matching complete. {match_count} sentence(s) matched.")
    print(f"📄 Output written to {output_path}")

# === Main Execution ===
if __name__ == "__main__":
    # Root directory for CPG files
    cpg_root_directory = "/content/drive/MyDrive/Training_Data_BTP/HINDI-DEPENDENCY-ALL-DOMAINS-LATEST/Data/"

    # Get all CPG files
    cpg_files = get_cpg_files_from_directory(cpg_root_directory)

    # Paths to UD files
    ud_files = [
        "/content/drive/MyDrive/Training_Data_BTP/UD_Hindi-HDTB/hi_hdtb-ud-dev.conllu",
        "/content/drive/MyDrive/Training_Data_BTP/UD_Hindi-HDTB/hi_hdtb-ud-train.conllu",
        "/content/drive/MyDrive/Training_Data_BTP/UD_Hindi-HDTB/hi_hdtb-ud-test.conllu"
    ]

    # Output path
    output_file = "/content/drive/MyDrive/Training_Data_BTP/parallel_data_exact_match1.txt"

    # Extract and process
    sentences = extract_cpg_sentences(cpg_files)
    cpg_blocks = parse_cpg_blocks(cpg_files)
    ud_blocks = parse_ud_blocks(ud_files)

    # Perform exact alignment
    align_and_output_exact(sentences, cpg_blocks, ud_blocks, output_file)

✅ Exact matching complete. 9993 sentence(s) matched.
📄 Output written to /content/drive/MyDrive/Training_Data_BTP/parallel_data_exact_match1.txt


In [None]:
!pip install re
import re
import json

def convert_parallel_txt_to_jsonl(txt_path, jsonl_path):
    """
    Reads a file where each entry is:
      ### Sentence ID: N
      ...
      ==UD==
      [UD lines]
      ==CPG==
      [CPG lines]

    and writes out JSONL where each line is:
      { "input": "<all-of-CPG>", "target": "<all-of-UD>" }
    """
    with open(txt_path, 'r', encoding='utf-8', errors='replace') as fin, \
         open(jsonl_path, 'w', encoding='utf-8') as fout:
        content = fin.read().split("### Sentence ID:")
        for block in content:
            if not block.strip():
                continue
            # isolate UD and CPG
            ud_match = re.search(r"==UD==\n(.+?)\n==CPG==", block, re.DOTALL)
            cpg_match = re.search(r"==CPG==\n(.+?)(?:\n### Sentence ID:|\Z)", block, re.DOTALL)
            if not ud_match or not cpg_match:
                continue
            ud_text  = ud_match.group(1).strip()
            cpg_text = cpg_match.group(1).strip()
            # JSONL line
            j = { "input": cpg_text, "target": ud_text }
            fout.write(json.dumps(j, ensure_ascii=False) + "\n")
    print(f"→ Converted to {jsonl_path}")

# Example usage
convert_parallel_txt_to_jsonl("/content/drive/MyDrive/Training_Data_BTP/parallel_data_exact_match1.txt", "/content/drive/MyDrive/Training_Data_BTP/Training_Data.jsonl")


[31mERROR: Could not find a version that satisfies the requirement re (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for re[0m[31m
[0m[31mERROR: Could not find a version that satisfies the requirement json (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for json[0m[31m
[0m→ Converted to /content/drive/MyDrive/Training_Data_BTP/Training_Data.jsonl
