In [None]:
import itertools
import json
import math
import os
import random
import re
import time
import zlib
from getpass import getpass
from itertools import islice
from typing import List, Dict

import nltk
import numpy as np
from datasets import load_dataset, Dataset, interleave_datasets
from datasets import get_dataset_config_names
from google import genai
from tokenizers import Tokenizer

In [None]:
# Download the 'punkt' and 'punkt_tab' NLTK resources.
# Presumably these back the nltk.sent_tokenize calls in prepare_batch_requests — verify.
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
# Load the tokenizer from a local JSON file and declare the special-token ids.
# NOTE(review): the ids below presumably mirror the vocabulary stored in
# tokenizer_clean.json — confirm against that file.
tokenizer = Tokenizer.from_file("tokenizer_clean.json")
PAD_TOKEN_ID = 0
UNK_TOKEN_ID = 1  # counted per text by tokenizer_test to reject texts with many unknowns
EN_TOKEN_ID = 2
DE_TOKEN_ID = 3
EOS_TOKEN_ID = 4
MASK_TOKEN_ID = 5

In [None]:
# Similar to the corresponding code in model.ipynb
# Load the English and German configurations of OSCAR as streaming datasets
en = load_dataset(
    'oscar',
    name='unshuffled_deduplicated_en',
    split='train',
    streaming=True
)

de = load_dataset(
    'oscar',
    name='unshuffled_deduplicated_de',
    split='train',
    streaming=True
)

# Label every example with its language code; filter_texts and
# prepare_batch_requests read this "lang" field later on.
en = en.map(lambda ex: {"text": ex["text"], "lang": "en"})
de = de.map(lambda ex: {"text": ex["text"], "lang": "de"})

# Shuffle both streams with the same seed and buffer size
buffer_size = 10000
seed = 42
en = en.shuffle(seed=seed, buffer_size=buffer_size)
de = de.shuffle(seed=seed, buffer_size=buffer_size)

# Interleave both languages into one dataset, sampling each with probability 0.5;
# iteration stops as soon as either source is exhausted ("first_exhausted").
streaming_dataset = interleave_datasets(
    [en, de],
    probabilities=[0.5, 0.5],
    stopping_strategy="first_exhausted",
    seed=seed
)

# Stop words and patterns: spam/boilerplate vocabulary (English and German),
# runs of separator characters, and long comma-separated keyword lists.
#
# The pattern is compiled with re.VERBOSE, which discards unescaped whitespace
# inside the pattern. Multi-word phrases therefore spell their gaps as \s+;
# written with plain spaces they would only match the words glued together
# (e.g. "clickhere") and never the real phrase.
GENERAL_BAD_PATTERNS = re.compile(
    r'''
    \b(
        # gambling
        casino|gambling|poker|betting|slots?|roulette|blackjack|baccarat|craps|freespins|bonus|jackpot|wager|no\s+deposit|ohne\s+einzahlung|kostenlos\s+spielen|echtes\s+geld|spielautomaten|spielhalle|spielbank|willkommensbonus|startguthaben|casinospiele|
        # adult content
        porn|porno|escort|erotic|hookup|onlyfans|nudes?|camgirls?|sexkontakte|erotik|sexchat|live\s+sex|stripchat|webcamsex|geschlechtsverkehr|selbstbefriedigung|masturbation|pornos|pornhub|xvideos|xnxx|vibrators?|dicks?|cums?|
        # loans and quick credit
        fast\s+cash|bad\s+credits?|zinsfrei|geld\s+leihen|kredit\s+aufnehmen|ratenzahlung|schnellkredit|binary\s+options?|payday\s+loans?|payday\s+advance|cash\s+advance|short-term\s+loans?|no\s+credit\s+check|guaranteed\s+loan|Kurzzeitkredite?|Minikredite?|Sofortkredite?|Kredit\s+ohne\s+Schufa|schnelles\s+Geld|Geld\s+sofort|
        # dating
        tinder|badoo|parship|elitepartner|lovoo|flirt|verlieben|
        # exam cheating
        test\s+answers?|cheat\s+sheet|homework\s+help|buy\s+answers?|buy\s+exam|abitur\s+lösung|prüfung\s+antworten|examen\s+lösung|
        # crypto
        bitcoin|ethereum|blockchain|nft|ico|airdrop|pump\s+and\s+dump|binance|coinbase|kraken|crypto\s+trading|krypto|kryptowährung|
        # legal boilerplate
        privacy\s+policy|terms\s+of\s+use|terms\s+and\s+conditions|all\s+rights\s+reserved|copyright|impressum|datenschutz|nutzungsbedingungen|alle\s+rechte\s+vorbehalten|cookie\s+policy|agb|rechtliche\s+hinweise|haftungsausschluss|
        # sexual-health products
        viagra|levitra|cialis|penis|enlargement|erection|erektionsstörung|potenzmittel|libido|sexualstörung|
        # weight loss
        weight\s+loss|fat\s+burning|diet\s+pills|appetite\s+suppressant|abnehmen|diätpillen|fettverbrennung|schnell\s+abnehmen|
        # get-rich-quick
        make\s+money\s+online|side\s+hustle|get\s+rich\s+quick|passives\s+einkommen|geld\s+verdienen|heimarbeit|schnell\s+reich\s+werden|
        # calls to action
        click\s+here|buy\s+now|order\s+now|free\s+trial|limited\s+offer|jetzt\s+kaufen|hier\s+klicken|jetzt\s+abonnieren|kostenlos\s+testen|nur\s+heute
    )\b
    |
    # three or more separator characters in a row
    -{3,}|={3,}|\*{3,}|
    # eleven or more comma-separated words in a row
    (?:(?:\w+\s*,\s*){10,}\w+)
    ''',
    re.IGNORECASE | re.VERBOSE
)

# Stop header words: boilerplate phrases (English and German) that must not open
# a line. The pattern is anchored with ^ under re.MULTILINE, so it is tried at
# the start of every line (after optional whitespace), not only at the start of
# the text. filter_texts applies it to the first `header_check_length` characters.
# NOTE(review): the alternatives carry no trailing \b, so short entries such as
# "call", "easy" or "help" also match longer words that start with them
# ("called", "helpful") — confirm this is intended.
# NOTE(review): "register", "cookies?" and "einfach" are listed twice; harmless.
BOILERPLATE_HEADER_PATTERNS = re.compile(
    r'^(?:\s*)'
    r'(you are not logged in|you do not have permission|access this page|'
    r'terms of use|privacy policy|cookies?|all rights reserved|'
    r'sign in|log in|register|create an account|register|'
    r'skip to content|main navigation|toggle navigation|'
    r'select language|choose your region|'
    r'cheap|discounts?|easy|billige|rabatte|einfach|'
    r'sie sind nicht angemeldet|kein zugriff|'
    r'zur hauptnavigation|navigation überspringen|'
    r'anmelden|einloggen|registrieren|konto erstellen|'
    r'nutzungsbedingungen|datenschutz|cookies?|'
    r'alle rechte vorbehalten|sprache auswählen|region wählen|'
    r'günstig|rabatt|einfach|schnell|kostenlos|angebot|aktionen|'
    r'jetzt anmelden|mehr erfahren|hier klicken|'
    r'help|hilfe|assist|unterstützen|call|anrufen|send|senden|respond|antworten|fill|ausfüllen)',

    re.IGNORECASE | re.MULTILINE
)

# One of the many tested filters. No useful patterns were found.
def entropy(text):
    """Return the Shannon entropy of ``text`` in bits per character.

    The entropy is computed over the relative frequency of each distinct
    character: ``-sum(p * log2(p))``.

    Args:
        text: string (or any sequence of hashable symbols) to measure.

    Returns:
        The entropy as a float; ``0.0`` for empty input.
    """
    total = len(text)
    if total == 0:
        # No symbols, no uncertainty; also avoids dividing by zero below.
        return 0.0

    freqs = {}
    for char in text:
        freqs[char] = freqs.get(char, 0) + 1

    # Requires the top-level `import math`, which the original notebook lacked.
    return -sum((count / total) * math.log2(count / total) for count in freqs.values())


def filter_texts(example: dict,
                 min_num_of_words = 100,
                 max_digit_ratio=0.18,
                 min_alpha_word_ratio=0.75,
                 max_symbol_ratio=0.1,
                 header_check_length=200,
                 allowed_uppercase_ratio=0.07,
                 logging=False) -> bool:
    """Heuristic quality filter for a single text example.

    Checks are applied in order; the first failing check rejects the example.

    Args:
        example: mapping with a ``"text"`` string and a ``"lang"`` code. The
            code ``"de"`` gets a larger upper bound on the mean word length.
        min_num_of_words: minimum number of whitespace-separated words.
        max_digit_ratio: maximum share of digit characters in the text.
        min_alpha_word_ratio: minimum share of purely alphabetic words
            (backticks and apostrophes are ignored for this test).
        max_symbol_ratio: accepted for backward compatibility; no check uses it.
        header_check_length: number of leading characters scanned with
            ``BOILERPLATE_HEADER_PATTERNS``.
        allowed_uppercase_ratio: maximum share of uppercase characters.
        logging: when true, print the name of the check that rejected the text.

    Returns:
        True if the text passes every check, False otherwise.
    """
    def reject(reason: str) -> bool:
        # Single exit point for rejections so the optional logging stays uniform.
        if logging:
            print(reason)
        return False

    text = example["text"]
    lang = example["lang"]

    # Base length filter (never logged, as before).
    if not text or len(text) < 384:
        return False

    # Stop patterns filter
    if GENERAL_BAD_PATTERNS.search(text):
        return reject("general bad pattern")

    # Header stop words filter
    if BOILERPLATE_HEADER_PATTERNS.search(text[:header_check_length]):
        return reject("header bad pattern")

    words = text.lower().split()
    num_words = len(words)

    # Filter by total number of words. The explicit zero check keeps the ratios
    # below well-defined even when min_num_of_words is set to 0.
    if num_words == 0 or num_words < min_num_of_words:
        return reject("num words")

    # Type-token ratio: DISTINCT words over all words. The set is essential;
    # counting the cleaned list itself always yields 1.0 and disables the check.
    unique_words = {word.strip(".,!?;:`'\"") for word in words}
    ttr = len(unique_words) / num_words
    if num_words < 500:
        ttr_threshold = 0.4
    elif num_words < 2000:
        ttr_threshold = 0.35
    else:
        ttr_threshold = 0.3
    if ttr < ttr_threshold:
        return reject("unique words")

    # Filter by digit share and by share of alphabetic words
    num_digits = sum(c.isdigit() for word in words for c in word)
    num_alpha_words = sum(
        1 for word in words
        if word.replace("`", "").replace("'", "").isalpha()
    )
    if (num_digits / len(text)) > max_digit_ratio:
        return reject("digit words")
    if (num_alpha_words / num_words) < min_alpha_word_ratio:
        return reject("alpha ratio")

    # Filter by mean word length
    mean_word_len = sum(len(w) for w in words) / num_words
    max_mean_word_len = 15 if lang == 'de' else 12
    if not (3 < mean_word_len < max_mean_word_len):
        return reject("word len")

    # Filter by uppercase ratio
    uppercase_chars_ratio = sum(1 for c in text if c.isupper()) / len(text)
    if uppercase_chars_ratio > allowed_uppercase_ratio:
        return reject("uppercase ratio")

    return True

def tokenizer_test(batch, max_len=320, min_len=256, max_avg_token_id=7000, min_avg_token_len=3.2, max_avg_token_len=5, max_unk_count=3):
    """Tokenize a batch of texts and flag which ones pass the tokenizer checks.

    Intended for a batched ``map``: it returns two new columns aligned with
    ``batch["text"]``.

    Args:
        batch: mapping whose ``"text"`` entry is a list of strings.
        max_len: token ids are trimmed to ``max_len - 1`` entries
            (presumably leaving room for one special token — verify).
        min_len: minimum number of token ids after trimming.
        max_avg_token_id: maximum allowed mean token id.
        min_avg_token_len: minimum allowed mean token string length.
        max_avg_token_len: maximum allowed mean token string length.
        max_unk_count: a text is rejected once it contains this many
            ``UNK_TOKEN_ID`` tokens (the comparison is ``>=``).

    Returns:
        ``{"tokenizer_test": [bool, ...], "token_ids": [list | None, ...]}``;
        ``token_ids`` is ``None`` for every rejected text.
    """
    # Text standardization. Order matters: "`" becomes "'" before "''" is
    # collapsed into a double quote.
    replacements = {
        "\n": "[NL]",
        "“": '"',
        "”": '"',
        "„": '"',
        "’": "'",
        "—": "-",
        "…": "...",
        "`": "'",
        "''": '"',
        "$": "dollars",
        "€": "euros",
        "½": "1/2"
    }
    texts = []
    for text in batch["text"]:
        for old, new in replacements.items():
            text = text.replace(old, new)
        texts.append(text)

    encodings = tokenizer.encode_batch(texts, add_special_tokens=False)
    max_tokens_per_text = max_len - 1

    def passes(token_ids, tokens_as_strings) -> bool:
        # Filter by minimum length (after trimming)
        if len(token_ids) < min_len:
            return False
        # Filter by unknown tokens
        if sum(1 for t in token_ids if t == UNK_TOKEN_ID) >= max_unk_count:
            return False
        # Filter by average token id
        if np.mean(token_ids) > max_avg_token_id:
            return False
        # Guard against an empty token list before averaging
        if not tokens_as_strings:
            return False
        # Filter by average token length: reject anything OUTSIDE [min, max].
        # The former chained comparison `max < avg < min` could never be true.
        # NOTE(review): this averages over all tokens of the text, not only the
        # trimmed prefix used for the id checks above.
        avg_token_len = sum(len(t) for t in tokens_as_strings) / len(tokens_as_strings)
        if avg_token_len < min_avg_token_len or avg_token_len > max_avg_token_len:
            return False
        return True

    batch_test = []
    batch_token_ids = []
    for enc in encodings:
        # Trim to max_len - 1 ids; shorter sequences are kept whole.
        token_ids = enc.ids[:max_tokens_per_text]
        accepted = passes(token_ids, enc.tokens)
        batch_test.append(accepted)
        batch_token_ids.append(token_ids if accepted else None)

    return {
        "tokenizer_test": batch_test,
        "token_ids": batch_token_ids
    }

def filter_by_tokenizer_test(example):
    """Filter predicate: return the ``"tokenizer_test"`` flag of ``example``.

    Kept apart from ``tokenizer_test`` so that tokenization can run in batches
    and still hand back ``token_ids`` alongside the flag.
    """
    passed = example["tokenizer_test"]
    return passed

# Wire the filters into the streaming pipeline: text heuristics first, then
# batched tokenization, then drop the rows that tokenizer_test flagged as failed.
# NOTE(review): BATCH_SIZE is not defined in any cell visible here — it must be
# defined before this cell runs, otherwise this line raises NameError.
streaming_dataset = streaming_dataset.filter(filter_texts)
streaming_dataset = streaming_dataset.map(tokenizer_test, batched=True, batch_size=BATCH_SIZE*8,)
streaming_dataset = streaming_dataset.filter(filter_by_tokenizer_test)

In [None]:
# Skip the first 85000 examples of the filtered stream, then open an iterator
# over the remainder.
# NOTE(review): this cell rebinds streaming_dataset to its own skipped version,
# so re-running it stacks another skip of 85000 — run it once per kernel session.
streaming_dataset = streaming_dataset.skip(85000)
iterator = iter(streaming_dataset)

In [None]:
# Sanity check: pull one example and display its first 500 characters.
# Note that this consumes the example from the shared iterator.
item = next(iterator)
item["text"][:500]

In [None]:
# Never paste the API key into the notebook. Reuse GEMINI_API_KEY when the
# environment already provides it (the previous unconditional assignment of ''
# wiped such a key); otherwise ask for it interactively without echoing.
if not os.environ.get("GEMINI_API_KEY"):
    os.environ["GEMINI_API_KEY"] = getpass("GEMINI_API_KEY: ")

In [None]:
def prepare_batch_requests(n, iterator, max_chars_per_sentence=108):
    """
    Prepare a list of translation requests for batch processing.

    For every request one example is drawn from ``iterator``, split into
    sentences, and a random window of 1-7 sentences (after skipping the first
    1-5) becomes the text to translate. Examples are drawn until one yields a
    usable window.

    Args:
        n: number of requests to build.
        iterator: iterator of examples with ``"text"`` and ``"lang"``
            (``"en"`` or ``"de"``) entries.
        max_chars_per_sentence: a window is accepted only if its length is
            below ``sent_len * max_chars_per_sentence`` characters.

    Returns:
        ``(batch_requests, request_metadata)``: the request dicts keyed
        ``request_<i>`` and, per key, the original text plus source and target
        language. Both hold fewer than ``n`` entries if ``iterator`` runs dry.

    Raises:
        ValueError: if an example's ``"lang"`` is neither ``"en"`` nor ``"de"``.
    """
    target_language = {"en": "de", "de": "en"}
    batch_requests = []
    request_metadata = {}

    for i in range(n):
        while True:
            sent_len = random.randint(1, 7)
            skip = random.randint(1, 5)
            item = next(iterator, None)
            if item is None:
                # Source exhausted: hand back what was built instead of letting
                # StopIteration escape and discard all prepared requests.
                print(f"Warning: iterator exhausted after {len(batch_requests)} of {n} requests.")
                return batch_requests, request_metadata

            translate_from = item["lang"]
            if translate_from not in target_language:
                raise ValueError(f"Unsupported language: {translate_from!r}")
            translate_to = target_language[translate_from]

            if translate_from == "en":
                sentences = nltk.sent_tokenize(item["text"], language='english')
            else:
                sentences = nltk.sent_tokenize(item["text"], language='german')

            # Slicing never raises, so a document with `skip` or fewer sentences
            # simply yields an empty window; that case must be rejected
            # explicitly or an empty text would be sent for translation.
            text = " ".join(sentences[skip:skip + sent_len]).strip()
            if text and len(text) < sent_len * max_chars_per_sentence:
                break

        prompt = f"""
You are a fluent English and German translator.
Your task is to complete the following form:
Original [{translate_from}] text (may break off):
{text}
Translated to [{translate_to}] version:
Do not explain or comment. Do not add headings or labels. Only output the translated text:
        """

        request_key = f"request_{i}"

        batch_requests.append({
            "key": request_key,
            "request": {
                "contents": [{"parts": [{"text": prompt}]}]
            }
        })

        request_metadata[request_key] = {
            "original_text": text,
            "lang_from": translate_from,
            "lang_to": translate_to
        }

    return batch_requests, request_metadata

In [None]:
def process_batch_results(result_file_content, metadata):
    """
    Turn the raw JSONL result file of a batch job into aligned text pairs.

    Each non-empty line is a JSON object with a "key". Lines whose key has no
    metadata are reported and skipped. A line with a "response" contributes the
    original text (under its source language) and the stripped translation
    (under its target language); a line with a "status" is reported as failed.

    Args:
        result_file_content: UTF-8 encoded bytes of the result file.
        metadata: mapping from request key to a dict with "original_text",
            "lang_from" and "lang_to".

    Returns:
        Dict with "en" and "de" lists of texts.
    """
    pairs = {"en": [], "de": []}
    decoded = result_file_content.decode('utf-8')

    for raw_line in decoded.strip().split('\n'):
        if len(raw_line) == 0:
            continue

        result = json.loads(raw_line)
        request_key = result["key"]

        meta = metadata.get(request_key)
        if not meta:
            print(f"Warning: No metadata found for key {request_key}")
            continue

        if "response" not in result:
            # No payload: report the failure when a status object is present.
            if "status" in result:
                status = result["status"]
                error_message = status.get("message", "No message provided")
                error_code = status.get("code", "N/A")
                print(f"Request with key {request_key} failed. Code: {error_code}, Message: {error_message}")
            continue

        try:
            first_candidate = result["response"]["candidates"][0]
            translated_text = first_candidate["content"]["parts"][0]["text"]
            pairs[meta["lang_from"]].append(meta["original_text"])
            pairs[meta["lang_to"]].append(translated_text.strip())
        except (KeyError, IndexError):
            print(f"Request with key {request_key} had an unexpected response format: {result['response']}")

    return pairs

In [None]:
def send_data_batch(n, iterator):
    """
    Build ``n`` translation requests, upload them and start a Gemini batch job.

    The requests are written as JSON lines to ``batch_requests.jsonl`` in the
    working directory before being uploaded.

    Returns:
        ``(batch_job_name, uploaded_file_name, metadata)`` — the identifiers
        needed to collect the results later, plus the per-request metadata
        produced by ``prepare_batch_requests``.
    """
    client = genai.Client()

    print(f"1. Preparing {n} requests...")
    prepared = prepare_batch_requests(n, iterator)
    requests_to_process = prepared[0]
    metadata = prepared[1]

    # One JSON document per line, as expected for a JSONL input file.
    input_filename = "batch_requests.jsonl"
    with open(input_filename, "w", encoding="utf-8") as jsonl_file:
        jsonl_file.writelines(json.dumps(req) + "\n" for req in requests_to_process)

    print(f"2. Uploading the input file: {input_filename}")
    upload_config = dict(display_name='Batch Translation Requests', mime_type='jsonl')
    uploaded_file = client.files.upload(file=input_filename, config=upload_config)

    print("3. Creating the batch job for model...")
    job_config = dict(display_name="translation-job")
    batch_job = client.batches.create(
        model="gemini-2.5-flash-lite",
        src=uploaded_file.name,
        config=job_config,
    )

    print(f"   Job created with name: {batch_job.name}")
    return batch_job.name, uploaded_file.name, metadata


def collect_data_batch(batch_job_name, uploaded_file_name, metadata, cleanup=False, poll_interval=30):
    """
    Wait for a Gemini batch job to finish and collect its translations.

    Args:
        batch_job_name: name of the job returned by ``send_data_batch``.
        uploaded_file_name: name of the uploaded request file (used for cleanup).
        metadata: per-request metadata returned by ``send_data_batch``.
        cleanup: when true, delete the uploaded file (and, on success, the
            result file) afterwards.
        poll_interval: seconds to sleep between two status checks.

    Returns:
        The dict of ``"en"``/``"de"`` text lists built by
        ``process_batch_results``; both lists are empty if the job did not
        succeed.
    """
    # Every state after which the job can no longer change. JOB_STATE_EXPIRED
    # must be included: without it an expired job is polled forever.
    terminal_states = (
        'JOB_STATE_SUCCEEDED',
        'JOB_STATE_FAILED',
        'JOB_STATE_CANCELLED',
        'JOB_STATE_EXPIRED',
    )
    client = genai.Client()

    print("4. Waiting for the batch job to complete...")
    while True:
        batch_job = client.batches.get(name=batch_job_name)
        if batch_job.state.name in terminal_states:
            break
        print(f"   Current job state: {batch_job.state.name}. Polling again in {poll_interval} seconds...")
        time.sleep(poll_interval)

    print(f"   Job finished with state: {batch_job.state.name}")

    if batch_job.state.name == 'JOB_STATE_SUCCEEDED':
        result_file_name = batch_job.dest.file_name
        print(f"5. Downloading results from: {result_file_name}")
        result_content = client.files.download(file=result_file_name)

        print("6. Processing results...")
        final_pairs = process_batch_results(result_content, metadata)

        if cleanup:
            print("7. Cleaning up uploaded files...")
            client.files.delete(name=uploaded_file_name)
            client.files.delete(name=result_file_name)

        return final_pairs
    else:
        print(f"Batch job failed. Error: {batch_job.error}")
        if cleanup:
            # NOTE(review): this branch strips the "files/" prefix while the
            # success branch passes the name unchanged — confirm which form the
            # SDK expects and unify.
            client.files.delete(name=uploaded_file_name.replace("files/", ""))
        return {"en": [], "de": []}

In [None]:
# Build, upload and submit 14000 translation requests drawn from the shared
# iterator; keep the job name, the uploaded file name and the per-request
# metadata so the results can be collected later.
batch_job_name_new, uploaded_file_name_new, metadata_new = send_data_batch(14000, iterator)

In [None]:
# Bundle everything needed to collect the job later, so it can be written to
# disk and reloaded in another kernel session.
request_config = {
    "batch_job_name": batch_job_name_new,
    "uploaded_file_name": uploaded_file_name_new,
    "metadata": metadata_new
}

In [None]:
# Persist the job configuration; ensure_ascii=False keeps non-ASCII characters
# of the original texts readable in the file.
with open("request_config.json", "w", encoding="utf-8") as f:
    json.dump(request_config, f, ensure_ascii=False, indent=2)

In [None]:
# Reload the job configuration from disk (replaces the in-memory request_config).
with open("request_config.json", "r", encoding="utf-8") as f:
    request_config = json.load(f)

In [None]:
# Unpack the stored job configuration into the names used by collect_data_batch.
batch_job_name = request_config["batch_job_name"]
uploaded_file_name = request_config["uploaded_file_name"]
metadata = request_config["metadata"]

In [None]:
# Block until the batch job reaches a final state, then save the collected pairs.
# NOTE(review): the output name is hard-coded to "translations6.json"; it has to
# be changed by hand for every new batch or an earlier part gets overwritten.
results = collect_data_batch(batch_job_name, uploaded_file_name, metadata)
with open("translations6.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

In [None]:
# Print the first five en/de pairs, each framed by separator lines, as a spot check.
separator = "==============================================================="
for i, pair in enumerate(zip(results["en"], results["de"])):
    if i >= 5:
        continue
    print(separator, pair[0], "", pair[1], separator, sep="\n")

In [None]:
def _load_translation_part(path):
    """Read one saved translation batch from a UTF-8 JSON file."""
    with open(path, "r", encoding="utf-8") as part_file:
        return json.load(part_file)


# Load the six saved batches.
data_part1 = _load_translation_part("translations.json")
data_part2 = _load_translation_part("translations2.json")
data_part3 = _load_translation_part("translations3.json")
data_part4 = _load_translation_part("translations4.json")
data_part5 = _load_translation_part("translations5.json")
data_part6 = _load_translation_part("translations6.json")

# Concatenate the parts per language, preserving part order so that the
# "en" and "de" lists stay aligned index by index.
_all_parts = (data_part1, data_part2, data_part3, data_part4, data_part5, data_part6)
text_pairs = {
    lang: [sentence for part in _all_parts for sentence in part[lang]]
    for lang in ("en", "de")
}

with open("oscar-en-de-synthetic.json", "w", encoding="utf-8") as f:
    json.dump(text_pairs, f, ensure_ascii=False, indent=2)