In [23]:
import gzip
import json

def extract_sentences_as_list(gz_file, lang_name, num_sentences=50, output_file="output.json"):
    """Write the first non-blank lines of a gzipped text file to a JSON file.

    Each non-blank line (after stripping surrounding whitespace) is treated
    as one sentence.

    Args:
        gz_file: Path to a gzip-compressed text file, decoded as UTF-8.
        lang_name: Key under which the sentence list is stored in the JSON object.
        num_sentences: Maximum number of sentences to keep. Zero or a negative
            value yields an empty list.
        output_file: Path of the JSON file to write; overwritten if it exists.

    The JSON document has the shape ``{lang_name: [sentence, ...]}``.
    """
    sentences = []

    with gzip.open(gz_file, 'rt', encoding='utf-8') as f:
        for line in f:
            # Count kept sentences, not raw lines: blank lines in the input
            # must not eat into the requested quota.
            if len(sentences) >= num_sentences:
                break
            sentence = line.strip()
            if sentence:
                sentences.append(sentence)

    data = {
        lang_name: sentences  # list of strings
    }

    with open(output_file, 'w', encoding='utf-8') as out_f:
        # ensure_ascii=False keeps non-ASCII text readable instead of \u escapes.
        json.dump(data, out_f, ensure_ascii=False, indent=2)

# Example usage: keep up to 50 lines from the Yoruba dump under the key
# "Yoruba"; the result goes to the function's default output file.
extract_sentences_as_list(
    gz_file=r"C:\Users\Aish\Downloads\yo (1).txt.gz",
    lang_name="Yoruba",
    num_sentences=50,
)


In [9]:
import bz2
import csv
import json

def extract_long_bengali_sentences_from_bz2(input_file, output_file="bengali_sentences.json", num_sentences=50):
    """Write the longest sentences of a bz2-compressed TSV file to a JSON file.

    Rows with fewer than three tab-separated columns are skipped. For the
    remaining rows, everything from the third column onwards is joined with
    single spaces to form the sentence. Sentences are ranked by
    whitespace-separated word count, longest first.

    Args:
        input_file: Path to a bz2-compressed TSV file, decoded as UTF-8.
        output_file: Path of the JSON file to write; overwritten if it exists.
        num_sentences: How many of the longest sentences to keep.

    The JSON document has the shape ``{"Bengali": [sentence, ...]}``.
    """
    all_sentences = []

    with bz2.open(input_file, 'rt', encoding='utf-8') as f:
        # Drop NUL characters before parsing; some Python versions make the
        # csv module reject lines that contain them.
        cleaned_lines = (line.replace('\x00', '') for line in f)
        # QUOTE_NONE: the data is plain tab-separated text, so a double quote
        # is ordinary sentence content. With the default quoting, a sentence
        # starting with '"' would lose its quotes or swallow following rows.
        reader = csv.reader(cleaned_lines, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            if len(row) < 3:
                continue
            # Join tokens from column 3 onwards (skip ID and 'ben')
            sentence = " ".join(row[2:]).strip()
            if sentence:
                all_sentences.append(sentence)

    # Sort by length (number of words), descending. The sort is stable, so
    # sentences with equal word counts keep their file order.
    all_sentences.sort(key=lambda s: len(s.split()), reverse=True)

    # Take top N
    selected_sentences = all_sentences[:num_sentences]

    data = {"Bengali": selected_sentences}

    with open(output_file, 'w', encoding='utf-8') as out_f:
        # ensure_ascii=False keeps the Bengali text readable instead of \u escapes.
        json.dump(data, out_f, ensure_ascii=False, indent=2)

# Example usage: rank the Bengali TSV dump and write the longest sentences
# using the function's default output file and sentence count.
extract_long_bengali_sentences_from_bz2(
    input_file=r"C:\Users\Aish\Downloads\ben_sentences.tsv.bz2",
)


In [30]:
import bz2
import gzip
import csv
import json
import os

def extract_sentences_from_gz(file_path, lang_name, num_sentences=50, min_length=20):
    """Return the first sufficiently long lines of a gzipped text file.

    Each line is stripped of surrounding whitespace and kept only if it is
    non-empty and at least ``min_length`` characters long.

    Args:
        file_path: Path to a gzip-compressed text file. It is decoded as
            UTF-8 and undecodable bytes are silently dropped.
        lang_name: Not used by this function; accepted so the signature
            parallels the other extractors in this notebook.
        num_sentences: Maximum number of sentences to return. Zero or a
            negative value yields an empty list.
        min_length: Minimum sentence length in characters (default 20).

    Returns:
        A list of at most ``num_sentences`` strings, in file order.
    """
    sentences = []
    with gzip.open(file_path, 'rt', encoding='utf-8', errors='ignore') as f:
        for line in f:
            # Check the quota before consuming a line so that a request for
            # zero sentences really returns none.
            if len(sentences) >= num_sentences:
                break
            sentence = line.strip()
            if sentence and len(sentence) >= min_length:
                sentences.append(sentence)
    return sentences

def extract_sentences_from_bz2_tsv(file_path, lang_name, num_sentences=50):
    """Return the longest sentences found in a bz2-compressed TSV file.

    Rows with fewer than three tab-separated columns are skipped. For the
    remaining rows, everything from the third column onwards is joined with
    single spaces to form the sentence. Sentences are ranked by
    whitespace-separated word count, longest first.

    Args:
        file_path: Path to a bz2-compressed TSV file. It is decoded as UTF-8
            and undecodable bytes are silently dropped.
        lang_name: Not used by this function; accepted so the signature
            parallels ``extract_sentences_from_gz``.
        num_sentences: How many of the longest sentences to return.

    Returns:
        A list of strings ordered by descending word count; ties keep file order.
    """
    all_sentences = []
    with bz2.open(file_path, 'rt', encoding='utf-8', errors='ignore') as f:
        # NUL characters are stripped before the csv module sees the line.
        cleaned_lines = (line.replace('\x00', '') for line in f)
        # QUOTE_NONE: the data is plain tab-separated text, so a double quote
        # is ordinary sentence content. With the default quoting, a sentence
        # starting with '"' would lose its quotes or swallow following rows.
        reader = csv.reader(cleaned_lines, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            if len(row) < 3:
                continue
            sentence = " ".join(row[2:]).strip()
            if sentence:
                all_sentences.append(sentence)
    # Stable sort: sentences with the same word count stay in file order.
    all_sentences.sort(key=lambda s: len(s.split()), reverse=True)
    return all_sentences[:num_sentences]

def process_all_languages(file_info, output_file="final_multilang_sentences.json", num_sentences=50):
    """Extract sentences for several languages and write them to one JSON file.

    Args:
        file_info: Iterable of ``(lang_name, file_path)`` pairs. Paths ending
            in ``.gz`` are read with ``extract_sentences_from_gz`` and paths
            ending in ``.bz2`` with ``extract_sentences_from_bz2_tsv``. The
            extension check is case-insensitive and accepts ``str`` or
            path-like objects.
        output_file: Path of the JSON file to write; overwritten if it exists.
        num_sentences: Maximum number of sentences requested per language.

    Entries with an unsupported extension are reported and left out of the
    output. If extraction raises, the error is reported and the language is
    stored with an empty list, so one bad file does not abort the run.

    The JSON document has the shape ``{lang_name: [sentence, ...], ...}``.
    """
    final_data = {}

    for lang_name, file_path in file_info:
        print(f"📂 Processing {lang_name}...")
        try:
            # Compare a lower-cased string form so ".GZ"/".BZ2" files and
            # pathlib.Path inputs are recognised, not only lower-case str paths.
            path_text = str(file_path).lower()
            if path_text.endswith('.gz'):
                sentences = extract_sentences_from_gz(file_path, lang_name, num_sentences)
            elif path_text.endswith('.bz2'):
                sentences = extract_sentences_from_bz2_tsv(file_path, lang_name, num_sentences)
            else:
                print(f"⚠️ Unsupported format for {lang_name}: {file_path}")
                continue

            final_data[lang_name] = sentences
        except Exception as e:
            # Deliberate best-effort: report the failure and keep an empty
            # entry so the remaining languages are still processed.
            print(f"❌ Error processing {lang_name}: {e}")
            final_data[lang_name] = []

    # Write to final JSON
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(final_data, f, ensure_ascii=False, indent=2)
    print(f"\n✅ Final JSON written to {output_file}")

# Example input mapping: language label -> compressed corpus on disk.
# The dict literal keeps insertion order, so converting its items to a list
# yields (language, path) tuples in the order written here.
file_info = list({
    "Arabic": r"C:\Users\Aish\Downloads\ar.txt.gz",
    "Mandarin_Chinese": r"C:\Users\Aish\Downloads\zh.txt.gz",
    "Russian": r"C:\Users\Aish\Downloads\ru.txt.gz",
    "Hindi": r"C:\Users\Aish\Downloads\hi.txt.gz",
    "Japanese": r"C:\Users\Aish\Downloads\ja.txt.gz",
    "Swahili": r"C:\Users\Aish\Downloads\sw.txt.gz",
    "Yoruba": r"C:\Users\Aish\Downloads\yor_sentences.tsv.bz2",
    "Turkish": r"C:\Users\Aish\Downloads\tr.txt.gz",
    "Bengali": r"C:\Users\Aish\Downloads\ben_sentences.tsv.bz2",
}.items())

# Run the processor over every configured language and write a single
# combined JSON file.
process_all_languages(
    file_info,
    num_sentences=50,
    output_file="final_multilang_sentences.json",
)


📂 Processing Arabic...
📂 Processing Mandarin_Chinese...
📂 Processing Russian...
📂 Processing Hindi...
📂 Processing Japanese...
📂 Processing Swahili...
📂 Processing Yoruba...
📂 Processing Turkish...
📂 Processing Bengali...

✅ Final JSON written to final_multilang_sentences.json
