In [11]:
# Patch asyncio via nest_asyncio; presumably needed so an event loop that is
# already running inside the Jupyter kernel can be re-entered.
# NOTE(review): no async code is visible in this notebook — confirm this
# patch is still required.
import nest_asyncio
nest_asyncio.apply()

In [12]:
import re
import time

import pandas as pd
from openai import OpenAI
from tqdm import tqdm

## LM Studio connection

In [13]:
# Address of the machine running the LM Studio server.
WINDOWS_IP = "192.168.1.33"
PORT = 1234

# Build the client and probe the server once, so a wrong address or a stopped
# server fails here instead of in the middle of the classification loop.
try:
    client = OpenAI(
        api_key="not-needed",  # placeholder value; presumably the local server ignores it
        base_url=f"http://{WINDOWS_IP}:{PORT}/v1"
    )
    models = client.models.list()
except Exception as e:
    print(f"✗ Connection error: {e}")
    # Re-raise instead of calling exit(): in a notebook, exit() acts on the
    # kernel session rather than marking this cell as failed, and it hides
    # the traceback. Raising stops "Run All" at this cell with full context.
    raise
else:
    print("✓ Connected to LM Studio")
    print(f"✓ Available model: {models.data[0].id if models.data else 'None'}")

✓ Connected to LM Studio
✓ Available model: qwen/qwen3-4b-thinking-2507


## Normalize LLM responses

In [14]:
# Keyword patterns are anchored on word boundaries. The longer keywords also
# accept inflected forms (e.g. "SUPPORTS", "ATTACKING"), while the short ones
# ("FOR", "PRO") must stand alone so that words such as "INFORMATION" or
# "PROBLEM" are not mistaken for a label.
_ATTACK_PATTERN = re.compile(r"\b(?:ATTACK\w*|OPPOSE\w*|AGAINST|DISAGREE\w*)\b")
_SUPPORT_PATTERN = re.compile(r"\b(?:SUPPORT\w*|AGREE\w*|FOR|PRO)\b")


def normalize_llm_response(text):
    """
    Normalize LLM output to 'Attack' or 'Support'.

    Matching is case-insensitive and works on whole words, including common
    synonyms (oppose/against/disagree, agree/for/pro).

    Args:
        text: Raw text returned by the model. Non-string values (e.g. None)
            are treated as an unusable answer.

    Returns:
        "Attack" if an attack keyword is present, otherwise "Support" if a
        support keyword is present, otherwise "Unknown". Attack keywords take
        priority when both kinds appear.
    """
    if not isinstance(text, str):
        return "Unknown"

    text = text.upper()

    if _ATTACK_PATTERN.search(text):
        return "Attack"
    if _SUPPORT_PATTERN.search(text):
        return "Support"

    return "Unknown"

## Primer: 4 RBAM-style few-shot examples

In [None]:
# Few-shot primer: (Arg1, Arg2, label) triples that classify_relation
# prepends to every prompt ahead of the pair being classified.
PRIMER = [
    (
        "We should ban plastic bags",
        "Plastic bags harm the environment",
        "Support",
    ),
    (
        "We should ban plastic bags",
        "Plastic bags are convenient for shopping",
        "Attack",
    ),
    (
        "Smoking is harmful",
        "Governments should ban cigarettes",
        "Support",
    ),
    (
        "Immigration is beneficial",
        "Border restrictions harm the economy",
        "Support",
    ),
]

## Classify function

In [16]:
def classify_relation(arg1, arg2, model="qwen/qwen3-4b-thinking-2507", primer=None):
    """
    Classify how ``arg2`` relates to ``arg1`` with a few-shot completion prompt.

    Applies the 4-step RBAM-style approach:
    1. Add primer examples.
    2. Add test pair.
    3. Feed prompt to LLM.
    4. Normalize output.

    Args:
        arg1: Text inserted after "Arg1:" for the pair being classified.
        arg2: Text inserted after "Arg2:" for the pair being classified.
        model: Model identifier passed to the completions endpoint.
        primer: Optional iterable of (arg1, arg2, label) examples. When None,
            the module-level PRIMER is used.

    Returns:
        The result of normalize_llm_response on the model's text, or
        "Unknown" if the request (or normalization) raises.
    """
    examples = PRIMER if primer is None else primer

    # Steps 1 & 2: instruction, primer examples, then the open-ended test
    # pair, separated by blank lines.
    blocks = ["Classify the relationship as 'ATTACK' or 'SUPPORT':"]
    blocks.extend(
        f"Arg1: {ex1}\nArg2: {ex2}\nClassification: {label.upper()}"
        for ex1, ex2, label in examples
    )
    blocks.append(f"Arg1: {arg1}\nArg2: {arg2}\nClassification:")
    prompt = "\n\n".join(blocks)

    try:
        # Step 3: call the model
        response = client.completions.create(
            model=model,
            prompt=prompt,
            max_tokens=8,
            temperature=0.0,
            stop=["Arg1", "\n\n"]  # stop if new block starts
        )

        # Step 4: normalize the raw completion text
        raw_text = response.choices[0].text
        return normalize_llm_response(raw_text)

    except Exception as e:
        # Best effort: a failed request is reported and counted as "Unknown"
        # so one bad call does not abort a long batch run.
        print(f"Inference error: {e}")
        return "Unknown"

## Test on small examples

In [17]:
# Quick smoke test: run classify_relation on a few hand-written pairs and
# print one line per pair.
banner = "=" * 60
print(f"\n{banner}")
print("TESTING SAMPLE PAIRS")
print(banner)

# NOTE(review): the third pair is essentially one of the PRIMER examples, so
# it does not show how the model handles unseen input.
test_pairs = [
    ("Smoking causes health issues.", "Tobacco should be banned."),
    ("Vaccines are safe.", "We should mandate vaccinations."),
    ("Immigration is beneficial.", "Border restrictions harm the economy."),
]

for arg1, arg2 in test_pairs:
    result = classify_relation(arg1, arg2)
    print(f"Result: {result} | Arg1: {arg1} | Arg2: {arg2}")


TESTING SAMPLE PAIRS
Result: Support | Arg1: Smoking causes health issues. | Arg2: Tobacco should be banned.
Result: Support | Arg1: Vaccines are safe. | Arg2: We should mandate vaccinations.
Result: Support | Arg1: Immigration is beneficial. | Arg2: Border restrictions harm the economy.


## Load dataset and shuffle

In [None]:
# Read the pair dataset, shuffle it with a fixed seed, and keep the first
# 500 rows as a small demo batch.
df = (
    pd.read_csv('../../data/kialo/kialo-pairs-50k.csv')
    .sample(frac=1, random_state=42)  # shuffle
    .reset_index(drop=True)
    .head(500)  # small batch for demo
)

relation_counts = df["relation"].value_counts()
print(f'\nDataset shape: {df.shape}')
print(f'Class distribution:\n{relation_counts}')


Dataset shape: (500, 3)
Class distribution:
relation
Support    253
Attack     247
Name: count, dtype: int64


## Classify all pairs with progress bar

In [19]:
# Classify every row, recording the prediction and the wall-clock time spent
# on each pair.
preds, times = [], []

progress = tqdm(df.iterrows(), total=len(df), desc="Classifying")
for _, row in progress:
    start = time.time()
    pred = classify_relation(row["parent_clean"], row["child_clean"])
    preds.append(pred)
    elapsed = time.time() - start
    times.append(elapsed)

# Store predictions next to the gold labels for evaluation.
df["pred"] = preds

Classifying: 100%|██████████| 500/500 [00:27<00:00, 17.93it/s]


## Statistics

In [20]:
# How often each label was predicted (including "Unknown", if any).
print("\n=== Distribution of predictions ===")
pred_counts = df["pred"].value_counts()
print(pred_counts)

# Rows the normalizer could not map to a label are left out of the report.
df_eval = df.loc[df["pred"].ne("Unknown")]

from sklearn.metrics import classification_report
print("\n=== Classification Report (Unknown excluded) ===")
report = classification_report(df_eval["relation"], df_eval["pred"])
print(report)

# Timing summary over all classified pairs.
total_time = sum(times)
avg_time = total_time / len(times)
print(f"\nAverage inference time: {avg_time:.2f} sec per pair")
print(f"Total time for {len(df)} examples: {total_time/60:.1f} min")


=== Distribution of predictions ===
pred
Support    292
Attack     208
Name: count, dtype: int64

=== Classification Report (Unknown excluded) ===
              precision    recall  f1-score   support

      Attack       0.71      0.60      0.65       247
     Support       0.66      0.76      0.70       253

    accuracy                           0.68       500
   macro avg       0.68      0.68      0.68       500
weighted avg       0.68      0.68      0.68       500


Average inference time: 0.06 sec per pair
Total time for 500 examples: 0.5 min
