# 1. Data Labeling with Large Local Model (Teacher)

This notebook uses a large local model (Teacher) to label raw user messages.
We will use `Qwen/Qwen2.5-14B-Instruct` loaded in 4-bit quantization to fit on the RTX 4070 Super (12GB VRAM).

**Hardware Requirements:**
- GPU: NVIDIA RTX 4070 Super (12GB VRAM) or better.
- RAM: 32GB+ system RAM recommended.

**Inputs:**
- `user_messages.csv`: Raw data.

**Outputs:**
- `labeled_dataset.json`: Labeled data in JSON format for fine-tuning.

In [1]:
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from tqdm.auto import tqdm
import json
import re

# Set random seed
torch.manual_seed(42)

# Check GPU
print(f"CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

CUDA Available: True
GPU: NVIDIA GeForce RTX 4070 SUPER
VRAM: 11.99 GB


In [2]:
# Load Raw Data
csv_path = "user_messages.csv"
try:
    df = pd.read_csv(csv_path)
    print(f"Loaded {len(df)} rows.")
except Exception as e:
    print(f"Error loading CSV: {e}")
    # Create dummy if not found for testing
    df = pd.DataFrame({
        "message_text": [
            "Напомни завтра в 14:30 позвонить маме",
            "Купить хлеб",
            "Встреча с Петром 25.10.2025 каждый вторник"
        ],
        "created_at": ["2026-02-18 10:00:00"] * 3
    })

# Filter empty or very short messages
df = df.dropna(subset=['message_text'])
df = df[df['message_text'].str.len() > 5]  # Filter very short noise
print(f"Rows after filtering: {len(df)}")

# For demonstration/piloting, we can limit the number of samples
# limit = 1000 
# df = df.head(limit)
# print(f"Processing first {len(df)} samples.")

Loaded 5040 rows.
Rows after filtering: 4976


In [3]:
# Load Teacher Model (Qwen 7B) in 4-bit
# Switching to 7B model to avoid memory offloading issues and library conflicts.
# 7B fits comfortably in 12GB VRAM without CPU offloading.
model_id = "Qwen/Qwen2.5-7B-Instruct"

# Configure quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    # llm_int8_enable_fp32_cpu_offload=True # Disabled to avoid TypeError with recent transformers
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

print("Teacher model loaded.")

Loading weights:   0%|          | 0/339 [00:00<?, ?it/s]

Teacher model loaded.


In [4]:
# Labeling Logic
def generate_label(message, created_at, model, tokenizer):
    system_prompt = """Ты — система для извлечения параметров напоминаний из пользовательского текста.
Твоя задача:
1. Определить текст напоминания (что нужно сделать). 
2. Определить дату (date) в формате YYYY-MM-DD.
3. Определить время (time) в формате HH:MM.
4. Определить периодичность (repeat): 'daily', 'weekly', 'monthly', 'yearly' или 'none'.
5. Преобразовать относительные даты ("завтра", "через 2 часа") в абсолютные, используя текущую дату (Context Date).
6. Вернуть результат СТРОГО в формате JSON. Не добавляй никаких объяснений.

Формат JSON:
{
  "text": "...",
  "date": "YYYY-MM-DD",
  "time": "HH:MM",
  "repeat": "none"
}
Если чего-то нет, ставь null.
"""
    
    user_prompt = f"Context Date: {created_at}\nMessage: \"{message}\"\n\nJSON:"
    
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]
    
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=256,
        temperature=0.1,  # Low temperature for deterministic/strict output
        do_sample=False
    )
    
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return response

# Clean/Extract JSON from response
def extract_json(response):
    try:
        # Find JSON block using regex if model chats too much
        match = re.search(r'\{.*\}', response, re.DOTALL)
        if match:
            return json.loads(match.group(0))
        return json.loads(response)
    except:
        return None

In [6]:
# Process Dataset (Batch processing or simple loop)
labeled_data = []
success_count = 0

# Limit to a sample for demonstration if needed, otherwise process all
# process_df = df.iloc[:500] 
sample_size = 15000
if len(df) < sample_size:
    print(f"Dataset size ({len(df)}) is smaller than requested sample ({sample_size}). Using full dataset.")
    process_df = df
else:
    process_df = df.sample(sample_size) # Random sample for initial quality check

print(f"Labeling {len(process_df)} samples...")

for idx, row in tqdm(process_df.iterrows(), total=len(process_df)):
    msg = row['message_text']
    # Use existing 'created_at' or default to today's date context
    created_at = row.get('created_at', '2026-02-18') 
    
    try:
        response = generate_label(msg, created_at, model, tokenizer)
        json_data = extract_json(response)
        
        if json_data:
            # Add original input for training structure
            entry = {
                "input": msg,
                "context_date": str(created_at),2
                "output": json_data
            }
            labeled_data.append(entry)
            success_count += 1
    except Exception as e:
        # print(f"Error processing {idx}: {e}")
        continue

print(f"Finished. Successfully labeled: {success_count}/{len(process_df)}")

# Save to file
with open("labeled_dataset.json", "w", encoding="utf-8") as f:
    json.dump(labeled_data, f, ensure_ascii=False, indent=2)

print("Saved to labeled_dataset.json")

Dataset size (4976) is smaller than requested sample (15000). Using full dataset.
Labeling 4976 samples...


  0%|          | 0/4976 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Finished. Successfully labeled: 4944/4976
Saved to labeled_dataset.json
