In [None]:
!pip install -q -U transformers bitsandbytes langchain langchain-community langchain-huggingface langchain-text-splitters pymupdf faiss-cpu sentence-transformers

In [None]:
import json
import re
import glob
import os
import pickle
import concurrent.futures

import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

In [None]:
# --- Language model and competition files ---------------------------------
MODEL_ID = "Qwen/Qwen3-4B-Instruct-2507"
TRAIN_FILE, TEST_FILE = "zno.train.jsonl", "zno.test.jsonl"
OUTPUT_FILE = "submission.csv"
# Default number of rows processed by evaluate() / create_submission().
DEBUG_LIMIT = 100

# --- Retrieval corpus -----------------------------------------------------
DATA_FOLDER = "ubertext_data"
WIKIPEDIA_URL = (
    "https://lang.org.ua/static/downloads/ubertext2.0/wikipedia/cleansed/"
    "ubertext.wikipedia.filter_rus_gcld+short.text_only.txt.bz2"
)

# --- Index artefacts written by build_index() -----------------------------
INDEX_PATH = "faiss_index"
CHUNKS_CACHE = "chunks_cache.pkl"

# --- Embedding / chunking parameters --------------------------------------
EMBEDDING_MODEL = "intfloat/multilingual-e5-small"
TOP_K = 3  # number of chunks retrieved per question
# Chunk sizes are measured in characters (RecursiveCharacterTextSplitter).
# NOTE(review): the embedding model presumably truncates long inputs; confirm
# that 4000-character chunks are fully represented in the embedding.
CHUNK_SIZE = 4000
CHUNK_OVERLAP = CHUNK_SIZE // 10  # 400 characters shared between neighbours

In [None]:
# Fetch and unpack the retrieval corpus referenced by WIKIPEDIA_URL.
# lbzip2 is the decompressor used in the last step below.
!apt-get install -qq lbzip2
# Create the target folder; -p makes this a no-op when it already exists.
!mkdir -p {DATA_FOLDER}
# -nc (no-clobber) skips the download when the archive is already present.
!wget -nc -P {DATA_FOLDER} {WIKIPEDIA_URL}
# -d decompresses, -k keeps the original .bz2 next to the extracted .txt.
# NOTE(review): on a rerun the extracted file already exists; confirm lbzip2
# merely reports and skips it rather than aborting.
!cd {DATA_FOLDER} && lbzip2 -dk *.bz2

In [None]:
# Load the tokenizer and the causal LM that answers the questions.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# torch_dtype="auto" loads the weights in the dtype stored in the checkpoint
# instead of up-casting them; without it a 4B-parameter model may be
# materialised in float32, roughly doubling the GPU memory it needs.
# device_map="auto" lets the loader place the weights on available devices.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",
    device_map="auto",
)

In [None]:
def load_data(filepath, limit=None):
    """Read a JSON Lines file into a list of parsed records.

    Args:
        filepath: Path to a UTF-8 encoded ``.jsonl`` file (one JSON value per line).
        limit: Maximum number of records to return. ``None`` (or any falsy
            value) returns every record.

    Returns:
        list: The parsed records in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # Blank lines (commonly a trailing newline at the end of the file)
            # are not records; json.loads("") would raise on them.
            if not line:
                continue
            data.append(json.loads(line))
            # Stop reading as soon as enough records are collected instead of
            # parsing the entire file and discarding the tail.
            if limit and 0 < limit <= len(data):
                break
    if limit:
        # Kept for non-positive limits, which retain plain slice semantics.
        data = data[:limit]
    return data

In [None]:
# Characters that survive clean_text(); everything else becomes a space.
ALLOWED_CHARS = set(
    "абвгґдеєжзиіїйклмнопрстуфхцчшщьюя"
    "АБВГҐДЕЄЖЗИІЇЙКЛМНОПРСТУФХЦЧШЩЬЮЯ"
    "abcdefghijklmnopqrstuvwxyz"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "0123456789"
    " \t\n"
    ".,;:!?-—–()[]{}\"'«»…"
    # Typographic apostrophes (U+2019, U+02BC) are part of Ukrainian
    # orthography; dropping them would split words such as "пам’ять".
    "\u2019\u02bc"
)

# Matches any single character outside ALLOWED_CHARS. re.escape keeps "-", "]"
# and "^" literal inside the character class.
_DISALLOWED_RE = re.compile("[^" + re.escape("".join(sorted(ALLOWED_CHARS))) + "]")
# Matches runs of spaces only; tabs and newlines are deliberately left alone.
_SPACE_RUN_RE = re.compile(r" +")


def clean_text(text):
    """Replace characters outside ALLOWED_CHARS with spaces and tidy spacing.

    Each disallowed character is replaced by a single space, runs of spaces
    are collapsed into one, and leading/trailing whitespace is stripped.

    Args:
        text: Input string.

    Returns:
        str: The cleaned string (possibly empty).
    """
    cleaned = _DISALLOWED_RE.sub(" ", text)
    cleaned = _SPACE_RUN_RE.sub(" ", cleaned)
    return cleaned.strip()

In [None]:
def is_valid_chunk(text, min_length=50):
    """Decide whether a chunk carries enough text to be worth indexing.

    The chunk is first normalised with clean_text(). It is accepted only when
    the normalised text has at least ``min_length`` characters and at least
    20 of them are alphabetic.

    Args:
        text: Raw chunk text.
        min_length: Minimum length of the cleaned text.

    Returns:
        bool: True when the chunk passes both checks.
    """
    normalized = clean_text(text)
    long_enough = len(normalized) >= min_length
    if not long_enough:
        return False

    letter_count = sum(1 for ch in normalized if ch.isalpha())
    return letter_count >= 20

In [None]:
def build_index(data_folder=DATA_FOLDER, output_path=INDEX_PATH, max_chars=None, use_cache=True):
    """
    Build a FAISS index from the ``*.txt`` files in ``data_folder`` and save it.

    The pipeline has three steps: (1) read and split every file into chunks,
    (2) drop chunks rejected by is_valid_chunk() and normalise the rest with
    clean_text(), (3) embed the chunks and store them in a FAISS index that is
    written to ``output_path``.

    Args:
        data_folder: Folder searched (non-recursively) for ``*.txt`` files.
        output_path: Folder the FAISS index is saved to.
        max_chars: Limit characters per file (e.g., 1_000_000 for ~1MB). None = full file.
        use_cache: Load/save chunks from cache to skip Steps 1-2 on reruns.
            The cache is only used when ``max_chars`` is None.

    Returns:
        The FAISS vector store that was built.

    Raises:
        ValueError: If no text files are found, or no chunk survives filtering.
    """
    import time

    cache_enabled = use_cache and max_chars is None

    # Try to load cached chunks
    if cache_enabled and os.path.exists(CHUNKS_CACHE):
        print(f"Loading cached chunks from {CHUNKS_CACHE}...")
        start = time.time()
        # SECURITY: pickle.load can execute arbitrary code. Only load a cache
        # file that this notebook wrote itself.
        # NOTE: the cache is not keyed by CHUNK_SIZE / CHUNK_OVERLAP / corpus;
        # delete it after changing any of those.
        with open(CHUNKS_CACHE, "rb") as cache_file:
            documents = pickle.load(cache_file)
        print(f"  Loaded {len(documents):,} chunks in {time.time()-start:.1f}s")
    else:
        txt_files = glob.glob(f"{data_folder}/*.txt")
        if not txt_files:
            raise ValueError(f"No text files found in {data_folder}")

        print(f"Found {len(txt_files)} text file(s)")
        for path in txt_files:
            size_mb = os.path.getsize(path) / (1024 * 1024)
            print(f"  - {path}: {size_mb:.1f} MB")

        if max_chars:
            print(f"\n⚠️  TEST MODE: limiting to {max_chars:,} chars per file")

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP,
            separators=["\n\n", "\n", ".", " ", ""]
        )

        # Step 1: Read and chunk files
        print("\n[Step 1/3] Reading and chunking files...")
        all_chunks = []
        for path in txt_files:
            start = time.time()
            print(f"  Reading {os.path.basename(path)}...", end=" ", flush=True)
            with open(path, "r", encoding="utf-8") as text_file:
                text = text_file.read(max_chars) if max_chars else text_file.read()
            print(f"{len(text):,} chars", end=" -> ", flush=True)

            chunks = text_splitter.create_documents([text], metadatas=[{"source": path}])
            print(f"{len(chunks):,} chunks ({time.time()-start:.1f}s)")
            all_chunks.extend(chunks)

        print(f"  Total raw chunks: {len(all_chunks):,}")

        # Step 2: Filter chunks
        print("\n[Step 2/3] Filtering chunks...")
        documents = []
        for c in tqdm(all_chunks, desc="Filtering"):
            if is_valid_chunk(c.page_content):
                c.page_content = clean_text(c.page_content)
                documents.append(c)

        # Empty input files yield zero chunks; avoid dividing by zero here.
        kept_pct = len(documents) / len(all_chunks) * 100 if all_chunks else 0.0
        print(f"  Valid chunks after filtering: {len(documents):,} ({kept_pct:.1f}%)")

        # Cache chunks for future runs (never cache an empty result).
        if cache_enabled and documents:
            print(f"\n  Saving chunks to {CHUNKS_CACHE}...")
            with open(CHUNKS_CACHE, "wb") as cache_file:
                pickle.dump(documents, cache_file)
            print(f"  Cache saved!")

    if not documents:
        # Fail with a clear message instead of an opaque error from the
        # vector store when it is handed nothing to index.
        raise ValueError(f"No valid chunks to index from {data_folder}")

    # Step 3: Build embeddings and index
    print("\n[Step 3/3] Building embeddings and FAISS index...")
    print(f"  This will process {len(documents):,} chunks...")

    # Fall back to the CPU so the notebook still runs without a GPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    embeddings = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL,
        model_kwargs={"device": device},
        encode_kwargs={"normalize_embeddings": True, "batch_size": 256},
        show_progress=True
    )

    start = time.time()
    texts = [doc.page_content for doc in documents]
    metadatas = [doc.metadata for doc in documents]
    # E5 models are trained with role prefixes: queries are embedded as
    # "query: ..." (see get_context) and documents must be embedded as
    # "passage: ...". The prefix is used for the embedding only; the stored
    # text stays unprefixed so retrieved context reads naturally.
    vectors = embeddings.embed_documents([f"passage: {t}" for t in texts])
    vectorstore = FAISS.from_embeddings(
        text_embeddings=list(zip(texts, vectors)),
        embedding=embeddings,
        metadatas=metadatas,
    )
    print(f"  Embeddings + index built in {time.time()-start:.1f}s")

    vectorstore.save_local(output_path)
    print(f"\nIndex saved to {output_path}: {len(documents):,} chunks")
    return vectorstore

In [None]:
def load_index(index_path=INDEX_PATH):
    """Load a FAISS index previously written with ``save_local``.

    The embedding model is re-created with the same model name and
    normalisation setting used at build time so that queries are embedded
    compatibly with the stored vectors.

    Args:
        index_path: Folder containing the saved index.

    Returns:
        The loaded FAISS vector store.
    """
    # Fall back to the CPU so the index can be queried without a GPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    embeddings = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL,
        model_kwargs={"device": device},
        encode_kwargs={"normalize_embeddings": True}
    )
    # SECURITY: the saved index contains a pickle file, and unpickling can
    # execute arbitrary code. allow_dangerous_deserialization is acceptable
    # only for an index this notebook produced itself; never load an index
    # obtained from an untrusted source.
    return FAISS.load_local(index_path, embeddings, allow_dangerous_deserialization=True)

In [None]:
def get_context(vectorstore, question, k=TOP_K):
    """Retrieve the ``k`` chunks most similar to ``question`` as one string.

    The question is sent to the vector store with a ``query: `` prefix. Each
    retrieved chunk is rendered on its own line as ``[n] <chunk text>``, with
    ``n`` counting from 1 in the order the store returned them.

    Args:
        vectorstore: Object exposing ``similarity_search(query, k=...)``.
        question: Question text to search for.
        k: Number of chunks to retrieve.

    Returns:
        str: Newline-joined, numbered chunk texts (empty if nothing is found).
    """
    hits = vectorstore.similarity_search(f"query: {question}", k=k)
    numbered = []
    for rank, doc in enumerate(hits, start=1):
        numbered.append(f"[{rank}] {doc.page_content}")
    return "\n".join(numbered)

In [None]:
def format_prompt(item, tokenizer, context=""):
    """Render a multiple-choice question as a chat prompt for the model.

    Args:
        item: Question record with a ``"question"`` string and an ``"answers"``
            list of dicts holding ``"marker"`` and ``"text"``.
        tokenizer: Object exposing ``apply_chat_template``.
        context: Optional retrieved context appended to the system prompt.

    Returns:
        tuple: ``(prompt_text, valid_markers)`` where ``valid_markers`` lists
        the answer markers of this question in their original order.
    """
    question = item["question"]
    answers = item["answers"]
    valid_markers = [ans["marker"] for ans in answers]
    options_text = "".join(f"{ans['marker']}) {ans['text']}\n" for ans in answers)

    # Tell the model which letters are actually on offer for THIS question
    # rather than always advertising five options. For the common five-option
    # case this produces exactly "А, Б, В, Г або Д".
    marker_names = [str(m) for m in valid_markers]
    if len(marker_names) > 1:
        letters = ", ".join(marker_names[:-1]) + " або " + marker_names[-1]
    elif marker_names:
        letters = marker_names[0]
    else:
        letters = "А, Б, В, Г або Д"

    system_prompt = (
        "Ви спеціаліст в українській літературі, мові та історії. "
        "Ви знаєте як складаються професійні тести на кшталт ЗНО. "
        "Перед вами питання екзамену. Визначьте правильну відповідь. "
        f"Виведіть ТІЛЬКИ одну букву ({letters}) без пояснень і без крапок.\n"
    )

    if context:
        system_prompt = system_prompt + f"Контекст:\n{context}"

    messages = [
        {
            "role": "system",
            "content": system_prompt,
        },
        {
            "role": "user",
            "content": f"Питання:{question}\nВаріанти:\n{options_text}"
        }
    ]

    # add_generation_prompt=True ends the text where the assistant's reply
    # starts, so the first generated token is the answer itself.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    return text, valid_markers

In [None]:
def parse_answer(content, valid_markers):
    """Extract the chosen answer marker from the model's raw output.

    The first occurrence of any of ``valid_markers`` in ``content`` wins, so a
    letter that is not an option of the current question is never returned.
    When ``valid_markers`` is empty or None, the Cyrillic letters А-Д are
    accepted instead.

    Args:
        content: Decoded model output (may be empty).
        valid_markers: Answer markers allowed for the current question.

    Returns:
        str: The matched marker, or the first valid marker (``"А"`` when no
        markers are given) if nothing matches.
    """
    markers = [str(m) for m in (valid_markers or []) if str(m)]
    if markers:
        # Longer markers first so a marker that is a prefix of another cannot
        # shadow it in the alternation.
        ordered = sorted(markers, key=len, reverse=True)
        pattern = "|".join(re.escape(m) for m in ordered)
        fallback = markers[0]
    else:
        pattern = r"[АБВГД]"
        fallback = "А"

    match = re.search(pattern, content or "")
    return match.group(0) if match else fallback

In [None]:
def predict(model, tokenizer, item, vectorstore=None, verbose=False):
    """Predict the answer marker for a single question.

    Args:
        model: Causal LM exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        item: Question record (see format_prompt).
        vectorstore: Optional vector store; when given, retrieved context is
            added to the prompt.
        verbose: When True, write the full prompt and the raw model output
            via ``tqdm.write``. Off by default because the prompt can contain
            thousands of characters of retrieved context per question.

    Returns:
        str: The predicted answer marker.
    """
    context = get_context(vectorstore, item["question"]) if vectorstore else ""
    prompt, valid_markers = format_prompt(item, tokenizer, context)

    if verbose:
        # tqdm.write keeps an enclosing progress bar intact.
        tqdm.write(prompt)

    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
    prompt_len = inputs.input_ids.shape[1]

    with torch.no_grad():
        # Greedy decoding of a single token: the prompt asks for exactly one
        # letter and nothing else.
        outputs = model.generate(
            **inputs,
            max_new_tokens=1,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )

    # Keep only the newly generated tokens (everything after the prompt).
    response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()
    if verbose:
        tqdm.write(f"raw response: {response!r}")
    return parse_answer(response, valid_markers)

In [None]:
def evaluate(model, tokenizer, vectorstore=None, filepath=TRAIN_FILE, limit=DEBUG_LIMIT):
    """Measure prediction accuracy on a labelled JSONL file.

    Args:
        model: Causal LM passed through to predict().
        tokenizer: Tokenizer passed through to predict().
        vectorstore: Optional vector store used for retrieval.
        filepath: Labelled dataset; each record needs ``"correct_answers"``.
        limit: Maximum number of records to evaluate (None = all).

    Returns:
        dict: ``correct``, ``total``, ``accuracy`` (percent) and
        ``predictions`` (a DataFrame with one row per question).
    """
    data = load_data(filepath, limit)
    correct = 0
    total = len(data)
    predictions = []

    for idx, item in enumerate(tqdm(data, desc="Evaluating")):
        predicted = predict(model, tokenizer, item, vectorstore)
        # Only the first listed correct answer is compared.
        expected = item["correct_answers"][0]
        is_correct = predicted == expected
        predictions.append({"id": idx, "predicted": predicted, "expected": expected, "correct": is_correct})
        if is_correct:
            correct += 1
        tqdm.write(f"[{idx}] predicted: {predicted}, expected: {expected}, correct: {is_correct}")

    # An empty dataset would otherwise raise ZeroDivisionError.
    accuracy = correct / total * 100 if total else 0.0
    print(f"Accuracy: {accuracy:.2f}% ({correct}/{total})")
    return {"correct": correct, "total": total, "accuracy": accuracy, "predictions": pd.DataFrame(predictions)}

In [None]:
def create_submission(model, tokenizer, vectorstore=None, filepath=TEST_FILE, output=OUTPUT_FILE, limit=DEBUG_LIMIT):
    """Predict answers for the test file and write them to a CSV.

    Args:
        model: Causal LM passed through to predict().
        tokenizer: Tokenizer passed through to predict().
        vectorstore: Optional vector store used for retrieval.
        filepath: JSONL file with the questions to answer.
        output: Path of the CSV to write (columns ``id``, ``correct_answers``).
        limit: Maximum number of rows to predict. NOTE: the default is
            DEBUG_LIMIT, so pass ``limit=None`` for a complete submission.

    Returns:
        pandas.DataFrame: The rows written to ``output``.
    """
    data = load_data(filepath, limit)
    if limit:
        # Make the truncation visible: a partial file is easy to submit by
        # accident because the default limit is the debug limit.
        print(
            f"WARNING: limit={limit} - submission covers only the first "
            f"{len(data)} row(s); pass limit=None for the full test set"
        )

    results = []
    for idx, item in enumerate(tqdm(data, desc="Creating submission")):
        predicted = predict(model, tokenizer, item, vectorstore)
        # Fall back to the row position when the record has no "id" field.
        results.append({"id": item.get("id", idx), "correct_answers": predicted})

    df = pd.DataFrame(results)
    df.to_csv(output, index=False)
    print(f"Submission saved to {output}")
    return df

In [None]:
def run_pipeline(model, tokenizer, vectorstore=None):
    """Run the evaluation stage of the pipeline and return its results.

    Args:
        model: Causal LM passed through to evaluate().
        tokenizer: Tokenizer passed through to evaluate().
        vectorstore: Optional vector store used for retrieval.

    Returns:
        dict: The result of evaluate() (accuracy figures and per-question
        predictions), so callers can inspect it instead of losing it.
    """
    eval_results = evaluate(model, tokenizer, vectorstore)
    # Submission generation is currently disabled; enable when needed:
    # df = create_submission(model, tokenizer, vectorstore)
    return eval_results

In [None]:
# Reuse a previously saved index when one exists: embedding the full corpus is
# by far the most expensive step of this notebook. Delete the INDEX_PATH folder
# to force a rebuild (e.g. after changing the corpus, CHUNK_SIZE or
# EMBEDDING_MODEL).
if os.path.exists(os.path.join(INDEX_PATH, "index.faiss")):
    vectorstore = load_index()
else:
    # Full run (will cache chunks on first run, then load from cache)
    vectorstore = build_index()

# For testing with small sample (written to a separate folder so the sample
# index is never mistaken for the full one on a later run):
# vectorstore = build_index(output_path="faiss_index_sample", max_chars=1_000_000, use_cache=False)

In [None]:
# Evaluate the model with retrieval over the index prepared above.
run_pipeline(model=model, tokenizer=tokenizer, vectorstore=vectorstore)