In [1]:
import pandas as pd
import ollama
from tqdm import tqdm
import re
import os

In [2]:
# Ollama model tag passed to ollama.generate() for every request.
MODEL_NAME: str = "gemma3:12b"
# Number of question/answer pairs requested per article and kept after parsing.
NUM_QUESTIONS: int = 2

def generate_multiple_qa(text):
    """Ask the Ollama model for question/answer pairs about a news text.

    Builds an Indonesian-language instruction prompt that embeds ``text`` and
    requests ``NUM_QUESTIONS`` pairs, then sends it to ``MODEL_NAME`` through
    ``ollama.generate``.

    Args:
        text: The news article body to generate questions from.

    Returns:
        The raw generated text (``response['response']``), or ``None`` if the
        request raised any exception (the error is printed, not re-raised).
    """
    # NOTE(review): the [INST] / <<SYS>> markers are Llama-2 style tags; they
    # are sent as literal prompt text here. Confirm they are intended for the
    # model named in MODEL_NAME.
    prompt = f"""
    [INST] <<SYS>>
    Anda adalah asisten AI yang ahli dalam bahasa Indonesia. 
    TUGAS: Buat {NUM_QUESTIONS} pertanyaan dan jawaban dari teks berita.
    ATURAN:
    1. GUNAKAN BAHASA INDONESIA SAJA
    2. Format wajib: 
    • Pertanyaan: [teks pertanyaan]
    • Jawaban: [teks jawaban] (maks 30 kata)
    3. JANGAN gunakan bahasa Inggris sama sekali
    4. Jawaban harus singkat dan relevan dengan teks
    <</SYS>>

    Teks berita:
    {text}

    Buat {NUM_QUESTIONS} pertanyaan dan jawaban dalam BAHASA INDONESIA: [/INST]
    """

    try:
        response = ollama.generate(
            model=MODEL_NAME,
            prompt=prompt,
            options={
                "temperature": 0.8,  # slightly more creative output
                # Context window size in tokens. The key was previously
                # misspelled "num_cnt", which is not an Ollama option, so the
                # setting never took effect.
                # NOTE(review): the window must hold the prompt, the article
                # and the reply; raise this value if long articles get cut off.
                "num_ctx": 1024,
                "repeat_penalty": 1.3
            }
        )
        return response['response']
    except Exception as e:
        # Best-effort: report the failure and let the caller skip this record.
        print(f"Error: {e}")
        return None

In [3]:
def parse_multiple_qa(generated_text, max_pairs=None):
    """Extract (question, answer) tuples from the model's raw text output.

    A line is treated as a question when its lowercase form contains
    'pertanyaan', 'question', '1.', '2.' or '3.'. The question text is what
    follows the first ':' on that line, or the line with a leading "N. "
    prefix removed when there is no colon. The pair is kept only if the next
    non-blank line contains 'jawaban' or 'answer'; the answer is what follows
    the first ':' on that line, or the whole line when there is no colon.

    Args:
        generated_text: Raw text returned by the model. ``None`` or an empty
            string yields an empty list.
        max_pairs: Maximum number of pairs to return. Defaults to the
            module-level ``NUM_QUESTIONS`` when omitted.

    Returns:
        A list of ``(question, answer)`` string tuples, at most ``max_pairs``
        long, in the order they appear in the text.
    """
    if not generated_text:
        return []

    limit = NUM_QUESTIONS if max_pairs is None else max_pairs
    question_markers = ('pertanyaan', 'question', '1.', '2.', '3.')
    answer_markers = ('jawaban', 'answer')

    lines = [raw.strip() for raw in generated_text.split('\n')]
    qa_pairs = []

    for i, line in enumerate(lines):
        lowered = line.lower()
        if not any(marker in lowered for marker in question_markers):
            continue

        # Text after the label ("Pertanyaan: ...") or after a "1. " prefix.
        if ':' in line:
            question = line.split(':', 1)[1].strip()
        else:
            question = re.sub(r'^\d+\.\s*', '', line).strip()

        # Models often put an empty line between question and answer, so look
        # at the next non-blank line instead of strictly the following one.
        j = i + 1
        while j < len(lines) and not lines[j]:
            j += 1
        if j >= len(lines):
            continue

        candidate = lines[j]
        if any(marker in candidate.lower() for marker in answer_markers):
            if ':' in candidate:
                answer = candidate.split(':', 1)[1].strip()
            else:
                answer = candidate
            qa_pairs.append((question, answer))

    return qa_pairs[:limit]

In [4]:
def process_csv_with_checkpoint(input_path, output_path, save_every=10):
    """Generate Q&A pairs for every row of a CSV, resumably.

    Reads ``input_path`` (';'-delimited, latin-1, bad lines skipped), sends
    each row's ``content`` column to ``generate_multiple_qa``, parses the reply
    with ``parse_multiple_qa`` and writes one output row per pair to
    ``output_path`` with columns ``content``, ``question``, ``answer``.

    Progress is tracked in ``output_path + '_checkpoint.txt'``, which holds
    the index of the next row to process. The results file and the checkpoint
    are always written together, so a resumed run continues from the rows
    that were actually saved. The checkpoint is deleted once every row has
    been processed.

    Args:
        input_path: Path of the source CSV; must contain a ``content`` column.
        output_path: Path of the CSV to write the Q&A rows to.
        save_every: Number of processed rows between saves (minimum 1).

    Raises:
        ValueError: If an existing checkpoint file does not contain an integer.
        Whatever ``pandas.read_csv`` raises for an unreadable input file.
    """
    columns = ['content', 'question', 'answer']
    save_every = max(1, int(save_every))
    checkpoint_file = output_path + '_checkpoint.txt'

    df = pd.read_csv(input_path, encoding='latin-1', on_bad_lines='skip', delimiter=';')

    start_idx = 0
    results = []

    # Only reuse previous output when resuming. An output file without a
    # checkpoint belongs to a finished run; appending to it would duplicate
    # every row.
    if os.path.exists(checkpoint_file):
        with open(checkpoint_file, 'r') as f:
            start_idx = int(f.read().strip())
        print(f"Resuming from index {start_idx}")

        if os.path.exists(output_path):
            try:
                results = pd.read_csv(output_path, encoding='utf-8').to_dict('records')
            except pd.errors.EmptyDataError:
                # Output written before any pair was produced (no header).
                results = []

    def _persist(next_idx):
        """Write the results, then the checkpoint pointing at ``next_idx``."""
        # Results first: if the process dies between the two writes the worst
        # case is a few re-generated rows rather than lost ones.
        pd.DataFrame(results, columns=columns).to_csv(
            output_path, index=False, encoding='utf-8'
        )
        with open(checkpoint_file, 'w') as f:
            f.write(str(next_idx))

    next_idx = start_idx
    try:
        for idx in tqdm(range(start_idx, len(df)), total=len(df) - start_idx):
            try:
                content = str(df.iloc[idx]['content'])

                # Skip empty cells; missing values stringify to 'nan'.
                if content.strip() and content != 'nan':
                    generated = generate_multiple_qa(content)
                    if generated:
                        new_rows = [
                            {'content': content, 'question': q, 'answer': a}
                            for q, a in parse_multiple_qa(generated)
                        ]
                        # Add all pairs of a record at once so an interrupt
                        # cannot leave a half-recorded article behind.
                        results.extend(new_rows)
            except Exception as e:
                # Best-effort: report the row and move on to the next one.
                print(f"Error at index {idx}: {e}")

            next_idx = idx + 1
            if next_idx % save_every == 0:
                _persist(next_idx)
    finally:
        # Runs on normal completion and on interruption (including
        # KeyboardInterrupt), so finished rows are never lost.
        _persist(next_idx)

    # Reached only when the loop finished: the run is complete.
    if os.path.exists(checkpoint_file):
        os.remove(checkpoint_file)

In [5]:
# Run the whole pipeline; on failure print a short message instead of a traceback.
try:
    process_csv_with_checkpoint('kompas_regional.csv', 'kompas_regional_qa.csv')
except Exception as exc:
    print(
        f"Process stopped with error: {exc}",
        "",
        "But partial results have been saved.",
        sep="\n",
    )

Resuming from index 302


100%|██████████| 8/8 [08:35<00:00, 64.39s/it]
