In [14]:
import os

# Process-wide settings: expose GPUs 0 and 1 to CUDA and point the
# Hugging Face cache at the shared directory, then echo the cache location.
os.environ.update({
    "CUDA_VISIBLE_DEVICES": "0,1",
    "HF_HOME": "/home/shared/huggingface",
})
print(os.environ.get("HF_HOME"))
# Optional allocator setting, currently disabled:
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:true"


/home/shared/huggingface


In [15]:
import torch

# Sanity check of the CUDA runtime: availability flag first, then the device count.
cuda_ready = torch.cuda.is_available()
print(cuda_ready)
visible_gpu_count = torch.cuda.device_count()
print(visible_gpu_count)


True
2


In [16]:
import pandas as pd
import json
import random
from typing import List, Dict, Tuple
import numpy as np
from sklearn.utils import shuffle
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm
import time
import csv
from sentence_transformers import SentenceTransformer

# FEW-SHOT EXAMPLE BY EMBEDDING

## Utilities

In [17]:
def read_jsonl(path):
    """Parse a UTF-8 JSON Lines file into a list.

    Lines that are empty or whitespace-only are ignored; every other line is
    decoded with ``json.loads`` and appended in file order.
    """
    with open(path, "r", encoding="utf-8") as handle:
        return [json.loads(raw_line) for raw_line in handle if raw_line.strip()]

def write_jsonl(objs, outpath):
    """Serialize each item of ``objs`` as one JSON line in a UTF-8 file.

    Non-ASCII characters are written as-is (``ensure_ascii=False``); the
    target file is overwritten.
    """
    with open(outpath, "w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in objs
        )

def read_csv_rows(path, text_col="preprocessed", label_col="compact_label", id_col="id"):
    """Read a CSV file into a list of ``{"id", "text", "label"}`` dicts.

    Args:
        path: Path to a UTF-8 encoded CSV file with a header row.
        text_col: Name of the column holding the text.
        label_col: Name of the column holding the label.
        id_col: Name of the column holding the row identifier.

    Returns:
        One dict per data row. ``text`` and ``label`` are stripped of
        surrounding whitespace. A column that is absent, or a field missing
        from a short row, yields an empty string.
    """
    def _cell(record, column):
        # csv.DictReader fills fields missing from a short row with None,
        # which would break a bare ``.strip()`` call.
        value = record.get(column)
        return "" if value is None else value

    rows = []
    # newline="" lets the csv module handle line endings itself, as its
    # documentation recommends (quoted fields may contain newlines).
    with open(path, "r", encoding="utf-8", newline="") as f:
        for r in csv.DictReader(f):
            rows.append({
                "id": _cell(r, id_col),
                "text": _cell(r, text_col).strip(),
                "label": _cell(r, label_col).strip()
            })
    return rows

## GET_EXAMPLE_BY_EMBEDDING

In [18]:
import pandas as pd
import json
import os
import numpy as np
from sentence_transformers import SentenceTransformer, SimilarityFunction

def _top_k_most_similar(scores, k):
    """Return the indices of the ``k`` highest scores, best first."""
    # Reversing before slicing also behaves for k == 0 (empty result),
    # unlike ``[-k:]`` which would return the whole array.
    return np.argsort(scores)[::-1][:k]


def get_example_bert_multilabel(test_file: str, 
                                 train_file: str, 
                                 num_of_example: int = 10, 
                                 output_file: str = "test_with_example_bert.jsonl",
                                 bert_model: str = "paraphrase-multilingual-MiniLM-L12-v2",
                                 sim_function: str = "cosine",
                                 drop_duplicate_text: bool = True):
    """
    Generate few-shot examples using BERT-based semantic similarity for multi-label classification.

    For every test row, the ``num_of_example`` most similar training texts are
    retrieved separately from each of the four (hate speech, abusive language)
    label combinations and written, together with the test text and its gold
    label, as one JSON object per line.

    Each output object has the keys, in this order: ``idx``, ``text``,
    ``label``, ``bert_ex_hsal``, ``bert_ex_hsnal``, ``bert_ex_nhsal``,
    ``bert_ex_nhsnal``. Every ``bert_ex_*`` value is a list of
    ``(text, compact_label)`` pairs ordered from most to least similar.

    Args:
        test_file: Path to test CSV file (columns used: 'preprocessed', 'text',
            'final_label_hs', 'final_label_al')
        train_file: Path to train CSV file (columns used: 'preprocessed',
            'final_label_hs', 'final_label_al')
        num_of_example: Number of examples to select for each label combination
        output_file: Path to output JSONL file
        bert_model: SentenceTransformer model name
        sim_function: Similarity function ('cosine', 'dot', 'euclidean', 'manhattan')
        drop_duplicate_text: Whether to drop duplicate texts from examples

    Raises:
        ValueError: If ``sim_function`` is not one of the supported names, or a
            label combination has fewer than ``num_of_example`` training texts.
    """
    # Fail fast on a bad similarity name, before any file or model is loaded.
    if sim_function not in ("cosine", "dot", "euclidean", "manhattan"):
        raise ValueError("Wrong 'sim_function' parameter. Only 'cosine', 'dot', 'euclidean', or 'manhattan' is allowed.")
    similarity_fns = {
        "cosine": SimilarityFunction.COSINE,
        "dot": SimilarityFunction.DOT_PRODUCT,
        "euclidean": SimilarityFunction.EUCLIDEAN,
        "manhattan": SimilarityFunction.MANHATTAN,
    }

    # Load the datasets
    test_df = pd.read_csv(test_file)
    train_df = pd.read_csv(train_file)

    # Drop NaN in 'preprocessed' from train set only
    train_df = train_df.dropna(subset=['preprocessed'])

    # For test set, replace NaN in 'preprocessed' with 'text'
    missing_text = test_df['preprocessed'].isna()
    if missing_text.any():
        test_df.loc[missing_text, 'preprocessed'] = test_df.loc[missing_text, 'text']

    # Define label mapping
    label_mapping_hsal = {
        (1, 1): "HsAl",
        (1, 0): "HsnAl",
        (0, 1): "nHsAl",
        (0, 0): "nHsnAl"
    }

    # (output key, hate-speech flag, abusive-language flag); the order here is
    # the key order of each output record.
    category_specs = [
        ("bert_ex_hsal", 1, 1),
        ("bert_ex_hsnal", 1, 0),
        ("bert_ex_nhsal", 0, 1),
        ("bert_ex_nhsnal", 0, 0),
    ]

    # Build one candidate pool of training texts per label combination.
    pools = []
    for out_key, hs_flag, al_flag in category_specs:
        mask = (train_df['final_label_hs'] == hs_flag) & (train_df['final_label_al'] == al_flag)
        pool_df = train_df[mask]
        if drop_duplicate_text:
            pool_df = pool_df.drop_duplicates(subset=['preprocessed'], keep='first')
        cat_name = label_mapping_hsal[(hs_flag, al_flag)]
        if len(pool_df) < num_of_example:
            raise ValueError(f"Not enough examples for {cat_name}: found {len(pool_df)}, need {num_of_example}")
        pools.append((out_key, cat_name, pool_df['preprocessed'].tolist()))

    # Initialize model
    print(f"Loading BERT model: {bert_model}")
    model = SentenceTransformer(bert_model, similarity_fn_name=similarity_fns[sim_function])

    # BATCH ENCODE ALL TEXTS ONCE
    print(f"Encoding {len(test_df)} test instances...")
    test_embeddings = model.encode(test_df['preprocessed'].tolist(), show_progress_bar=True)

    print(f"Encoding training examples for each category...")
    pool_embeddings = [
        model.encode(texts, show_progress_bar=True) for _, _, texts in pools
    ]

    # COMPUTE ALL SIMILARITIES IN BATCH
    print("Computing similarity scores...")
    pool_similarities = [
        model.similarity(test_embeddings, embeddings).cpu().numpy()
        for embeddings in pool_embeddings
    ]

    # Ensure the output directory exists
    os.makedirs(os.path.dirname(output_file) or '.', exist_ok=True)

    print(f"Generating examples for {len(test_df)} test instances...")

    # Process each test instance
    with open(output_file, "w", encoding="utf-8") as f:
        # ``pos`` is the row position in the similarity matrices; ``idx`` is the
        # DataFrame index label that is reported in the output.
        for pos, (idx, row) in enumerate(test_df.iterrows()):
            output = {
                "idx": int(idx),
                "text": row['preprocessed'],
                "label": label_mapping_hsal[(row['final_label_hs'], row['final_label_al'])],
            }

            # model.similarity() returns a *similarity* for every supported
            # function: for 'euclidean' and 'manhattan' it is the negative
            # distance. Higher is therefore always better, so the top-k
            # selection is the same for all four functions.
            for (out_key, cat_name, texts), similarities in zip(pools, pool_similarities):
                best = _top_k_most_similar(similarities[pos], num_of_example)
                output[out_key] = [(texts[i], cat_name) for i in best]

            f.write(json.dumps(output, ensure_ascii=False) + "\n")

            # Progress indicator
            if (pos + 1) % 100 == 0:
                print(f"Processed {pos + 1}/{len(test_df)} instances")

    print(f"✓ {output_file} has been created successfully with BERT-based examples.")

In [19]:
# Build the few-shot retrieval file for the HsAl test split: 5 nearest training
# examples per label combination, using IndoBERT embeddings and cosine similarity.
hsal_dataset_dir = "/home/alvaro_luqman/resource/Datasets/HsAl"
hsal_result_dir = "/home/alvaro_luqman/resource/Inference Results/HsAl/SahabatAI"

get_example_bert_multilabel(
    test_file=f"{hsal_dataset_dir}/preprocessed_test.csv",
    train_file=f"{hsal_dataset_dir}/preprocessed_train.csv",
    num_of_example=5,
    output_file=f"{hsal_result_dir}/test_with_example_bert.jsonl",
    bert_model="indobenchmark/indobert-large-p2",  # Indonesian BERT checkpoint
    sim_function="cosine",  # alternatives: "dot", "euclidean", "manhattan"
    drop_duplicate_text=True,
)

Loading BERT model: indobenchmark/indobert-large-p2


No sentence-transformers model found with name indobenchmark/indobert-large-p2. Creating a new one with mean pooling.


Encoding 4634 test instances...


Batches: 100%|██████████| 145/145 [00:08<00:00, 17.19it/s]


Encoding training examples for each category...


Batches: 100%|██████████| 144/144 [00:08<00:00, 16.56it/s]
Batches: 100%|██████████| 57/57 [00:03<00:00, 18.95it/s]
Batches: 100%|██████████| 164/164 [00:07<00:00, 20.88it/s]
Batches: 100%|██████████| 202/202 [00:13<00:00, 15.12it/s]


Computing similarity scores...
Generating examples for 4634 test instances...
Processed 100/4634 instances
Processed 200/4634 instances
Processed 300/4634 instances
Processed 400/4634 instances
Processed 500/4634 instances
Processed 600/4634 instances
Processed 700/4634 instances
Processed 800/4634 instances
Processed 900/4634 instances
Processed 1000/4634 instances
Processed 1100/4634 instances
Processed 1200/4634 instances
Processed 1300/4634 instances
Processed 1400/4634 instances
Processed 1500/4634 instances
Processed 1600/4634 instances
Processed 1700/4634 instances
Processed 1800/4634 instances
Processed 1900/4634 instances
Processed 2000/4634 instances
Processed 2100/4634 instances
Processed 2200/4634 instances
Processed 2300/4634 instances
Processed 2400/4634 instances
Processed 2500/4634 instances
Processed 2600/4634 instances
Processed 2700/4634 instances
Processed 2800/4634 instances
Processed 2900/4634 instances
Processed 3000/4634 instances
Processed 3100/4634 instances
P

# **EXAMPLE_GENERATOR.PY**

In [20]:
def example_generator_hsal(ex_hsal, ex_hs, ex_al, ex_non_hsal, prompt_variant):
    """Format few-shot examples for the four-way HS/AL prompt.

    The third character of ``str(prompt_variant)`` is the number of examples
    taken from each of the four pools. Examples are interleaved round-robin
    (HsAl, HsnAl, nHsAl, nHsnAl, then the next round) and rendered as numbered
    lines. Each example is a ``(text, compact_label)`` pair; any label other
    than 'HsAl', 'HsnAl' or 'nHsAl' is rendered as
    'bukan_ujaran_kebencian_kasar'.
    """
    per_category = int(str(prompt_variant)[2])

    # Take the requested number of examples from every category.
    pools = [
        pool[:per_category]
        for pool in (ex_hsal, ex_hs, ex_al, ex_non_hsal)
    ]

    # Interleave: one example per category per round, skipping exhausted pools.
    interleaved = [
        pool[rank]
        for rank in range(per_category)
        for pool in pools
        if rank < len(pool)
    ]

    label_names = {
        'HsAl': 'ujaran_kebencian_kasar',
        'HsnAl': 'ujaran_kebencian',
        'nHsAl': 'ujaran_kasar',
    }

    # Render the final numbered list.
    rendered = []
    for number, (teks, label_kode) in enumerate(interleaved, start=1):
        label = label_names.get(label_kode, 'bukan_ujaran_kebencian_kasar')
        rendered.append(f"{number}. Teks: '{teks}'. Jawaban: '{label}'\n")
    return ''.join(rendered)

def example_generator_hs(ex_hs, ex_non_hs, prompt_variant):
    """Format few-shot examples for the binary hate-speech prompt.

    Twice the third character of ``str(prompt_variant)`` examples are taken
    from each pool, alternated (hate speech first) and rendered as numbered
    lines. Each example is a ``(text, label_code)`` pair; the code 'HS' is
    rendered as 'ujaran_kebencian', anything else as 'bukan_ujaran_kebencian'.
    """
    limit = int(str(prompt_variant)[2]) * 2
    positives = ex_hs[:limit]
    negatives = ex_non_hs[:limit]

    # Alternate between the two pools, skipping a pool once it is exhausted.
    alternating = []
    for rank in range(limit):
        for pool in (positives, negatives):
            if rank < len(pool):
                alternating.append(pool[rank])

    # Render the final numbered list.
    rendered = []
    for number, (teks, label_kode) in enumerate(alternating, start=1):
        label = 'ujaran_kebencian' if label_kode == 'HS' else 'bukan_ujaran_kebencian'
        rendered.append(f"{number}. Teks: '{teks}'. Jawaban: '{label}'\n")
    return ''.join(rendered)

def example_generator_al(ex_al, ex_non_al, prompt_variant):
    """Format few-shot examples for the binary abusive-language prompt.

    Twice the third character of ``str(prompt_variant)`` examples are taken
    from each pool, alternated (abusive first) and rendered as numbered lines.
    Each example is a ``(text, label_code)`` pair; the code 'Al' is rendered
    as 'ujaran_kasar', anything else as 'bukan_ujaran_kasar'.
    """
    limit = int(str(prompt_variant)[2]) * 2
    abusive = ex_al[:limit]
    clean = ex_non_al[:limit]

    # Alternate between the two pools, skipping a pool once it is exhausted.
    alternating = []
    for rank in range(limit):
        for pool in (abusive, clean):
            if rank < len(pool):
                alternating.append(pool[rank])

    # Render the final numbered list.
    rendered = []
    for number, (teks, label_kode) in enumerate(alternating, start=1):
        label = 'ujaran_kasar' if label_kode == 'Al' else 'bukan_ujaran_kasar'
        rendered.append(f"{number}. Teks: '{teks}'. Jawaban: '{label}'\n")
    return ''.join(rendered)

# **PROMPT_APPROACH_1_ZERO.PY**

In [21]:
def prompt_approach_1_zero_hs (list_inference_input, prompt_variant):
    """Build the zero-shot hate-speech prompt for the text at ``list_inference_input[1]``.

    Returns None when ``str(prompt_variant)`` does not start with '1'.
    """
    if not str(prompt_variant).startswith('1'):
        return None

    # The prompt is assembled line by line; trailing spaces are significant.
    prompt_lines = [
        "",
        "Tugas: Diberikan sebuah teks pada input berikut, tentukan apakah teks tersebut mengandung ujaran kebencian atau tidak.",
        "",
        "Instruksi:",
        "- Keluaran label jawaban hanya dapat berupa ‘ujaran_kebencian’ untuk teks yang mengandung ujaran kebencian, atau ‘bukan_ujaran_kebencian’ untuk teks yang tidak mengandung ujaran kebencian.",
        "- Mohon untuk tidak memberikan penjelasan atas jawaban Anda.",
        "",
        "Giliran Anda:",
        "Input: ",
        f"- Teks: '{list_inference_input[1]}' ",
        "Jawaban:",
        "",
    ]
    return "\n".join(prompt_lines)

def prompt_approach_1_zero_al (list_inference_input, prompt_variant):
    """Build the zero-shot abusive-language prompt for the text at ``list_inference_input[1]``.

    Returns None when ``str(prompt_variant)`` does not start with '1'.
    """
    if not str(prompt_variant).startswith('1'):
        return None

    # The prompt is assembled line by line; trailing spaces are significant.
    prompt_lines = [
        "",
        "Tugas: Diberikan sebuah teks pada input berikut, tentukan apakah teks tersebut mengandung ujaran kasar atau tidak",
        "Instruksi:",
        "- Keluaran label jawaban hanya dapat berupa ‘ujaran_kasar’ untuk teks yang mengandung ujaran kasar, atau ‘bukan_ujaran_kasar’ untuk teks yang tidak mengandung ujaran kasar.",
        "- Mohon untuk tidak memberikan penjelasan atas jawaban Anda.",
        "",
        "Giliran Anda:",
        "Input: ",
        f"- Teks: '{list_inference_input[1]}' ",
        "Jawaban:",
        "",
    ]
    return "\n".join(prompt_lines)

# **PROMPT_APPROACH_2_ZERO.PY**


In [22]:
def prompt_approach_2_zero (list_inference_input, prompt_variant):
    """Build the zero-shot four-way HS/AL prompt for the text at ``list_inference_input[1]``.

    Returns None when ``str(prompt_variant)`` does not start with '1'.
    """
    if not str(prompt_variant).startswith('1'):
        return None

    prompt_lines = [
        "",
        "Tugas: Diberikan sebuah teks pada input berikut, tentukan apakah teks tersebut mengandung ujaran kebencian atau tidak, dan mengandung ujaran kasar atau tidak.",
        "",
        "Instruksi:",
        "- Keluaran label jawaban hanya dapat berupa ‘ujaran_kebencian’ untuk teks yang mengandung ujaran kebencian namun tidak mengandung ujaran kasar, ‘ujaran_kasar’ untuk teks yang mengandung ujaran kasar namun tidak mengandung ujaran kebencian, ‘ujaran_kebencian_kasar’ untuk teks yang mengandung ujaran kebencian dan ujaran kasar, atau ‘bukan_ujaran_kebencian_kasar’ untuk teks yang tidak mengandung ujaran kebencian maupun ujaran kasar.",
        "- Mohon untuk tidak memberikan penjelasan atas jawaban Anda.",
        "",
        "Giliran Anda:",
        "Input:",
        f"- Teks: '{list_inference_input[1]}'",
        "Jawaban:",
        "",
    ]
    return "\n".join(prompt_lines)

# **PROMPT_APPROACH_1_FEW.PY**

In [23]:
def prompt_approach_1_few_hs(list_inference_input,prompt_variant):
    """Build the few-shot hate-speech prompt.

    The examples come from ``example_generator_hs`` applied to
    ``list_inference_input[3]`` and ``list_inference_input[4]``; the text to
    classify is ``list_inference_input[1]``. Returns None when
    ``str(prompt_variant)`` does not start with '1'.
    """
    # The examples are formatted before the variant check, as a bad variant
    # or malformed input should surface here.
    formatted_example = example_generator_hs(
        list_inference_input[3], list_inference_input[4], prompt_variant
    )
    if not str(prompt_variant).startswith('1'):
        return None

    prompt_lines = [
        "",
        "Tugas: Diberikan sebuah teks pada input berikut, tentukan apakah teks tersebut mengandung ujaran kebencian atau tidak. Anda akan diberikan beberapa contoh.",
        "",
        "Instruksi:",
        "- Keluaran label jawaban hanya dapat berupa ‘ujaran_kebencian’ untuk teks yang mengandung ujaran kebencian, atau ‘bukan_ujaran_kebencian’ untuk teks yang tidak mengandung ujaran kebencian.",
        "- Contoh yang diberikan dapat membantu Anda dalam menentukan jawaban.",
        "- Mohon untuk tidak memberikan penjelasan atas jawaban Anda.",
        "",
        "Berikut contohnya:",
        f"{formatted_example}",
        "Giliran Anda:",
        "Input:",
        f"- Teks: '{list_inference_input[1]}'",
        "Jawaban:",
        "",
    ]
    return "\n".join(prompt_lines)

def prompt_approach_1_few_al(list_inference_input,prompt_variant):
    """Build the few-shot abusive-language prompt.

    The examples come from ``example_generator_al`` applied to
    ``list_inference_input[3]`` and ``list_inference_input[4]``; the text to
    classify is ``list_inference_input[1]``. Returns None when
    ``str(prompt_variant)`` does not start with '1'.
    """
    # The examples are formatted before the variant check, as a bad variant
    # or malformed input should surface here.
    formatted_example = example_generator_al(
        list_inference_input[3], list_inference_input[4], prompt_variant
    )
    if not str(prompt_variant).startswith('1'):
        return None

    prompt_lines = [
        "",
        "Tugas: Diberikan sebuah teks pada input berikut, tentukan apakah teks tersebut mengandung ujaran kasar atau tidak. Anda akan diberikan beberapa contoh.",
        "",
        "Instruksi:",
        "- Keluaran label jawaban hanya dapat berupa ‘ujaran_kasar’ untuk teks yang mengandung ujaran kasar, atau ‘bukan_ujaran_kasar’ untuk teks yang tidak mengandung ujaran kasar.",
        "- Contoh yang diberikan dapat membantu Anda dalam menentukan jawaban.",
        "- Mohon untuk tidak memberikan penjelasan atas jawaban Anda.",
        "",
        "Berikut contohnya:",
        f"{formatted_example}",
        "Giliran Anda:",
        "Input:",
        f"- Teks: '{list_inference_input[1]}'",
        "Jawaban:",
        "",
    ]
    return "\n".join(prompt_lines)

# **PROMPT_APPROACH_2_FEW.PY**

In [24]:
def prompt_approach_2_few(list_inference_input, prompt_variant):
    """Build the few-shot four-way HS/AL prompt.

    The examples come from ``example_generator_hsal`` applied to elements 3-6
    of ``list_inference_input``; the text to classify is
    ``list_inference_input[1]``. Returns None when ``str(prompt_variant)``
    does not start with '1'.
    """
    # The examples are formatted before the variant check, as a bad variant
    # or malformed input should surface here.
    ex_hsal, ex_hs, ex_al, ex_non_hsal = (list_inference_input[pos] for pos in (3, 4, 5, 6))
    formatted_example = example_generator_hsal(ex_hsal, ex_hs, ex_al, ex_non_hsal, prompt_variant)

    if not str(prompt_variant).startswith('1'):
        return None

    prompt_lines = [
        "",
        "Tugas: Diberikan sebuah teks pada input berikut, tentukan apakah teks tersebut mengandung ujaran kebencian atau tidak, dan mengandung ujaran kasar atau tidak.",
        "",
        "Instruksi:",
        "- Keluaran label jawaban hanya dapat berupa 'ujaran_kebencian' untuk teks yang mengandung ujaran kebencian namun tidak mengandung ujaran kasar, 'ujaran_kasar' untuk teks yang mengandung ujaran kasar namun tidak mengandung ujaran kebencian, 'ujaran_kebencian_kasar' untuk teks yang mengandung ujaran kebencian dan ujaran kasar, atau 'bukan_ujaran_kebencian_kasar' untuk teks yang tidak mengandung ujaran kebencian maupun ujaran kasar.",
        "- Contoh yang diberikan dapat membantu Anda dalam menentukan jawaban.",
        "- Mohon untuk tidak memberikan penjelasan atas jawaban Anda.",
        "",
        "Berikut contohnya:",
        f"{formatted_example}",
        "Giliran Anda:",
        "Input:",
        f"- Teks: '{list_inference_input[1]}'",
        "Jawaban:",
        "",
    ]
    return "\n".join(prompt_lines)

# **GET_PROMPT.PY**

In [25]:
def get_prompt(list_inference_input,prompt_approach_type,prompt_variant):
    """Build the prompt for one record by dispatching on the approach type.

    Args:
        list_inference_input: List of record attributes handed to the prompt builder.
        prompt_approach_type: One of "approach_1_zero_hs", "approach_1_few_hs",
            "approach_1_zero_al", "approach_1_few_al", "approach_2_zero",
            "approach_2_few".
        prompt_variant: Prompt variant code handed to the prompt builder.

    Returns:
        The prompt string produced by the selected builder.

    Raises:
        ValueError: If ``prompt_approach_type`` is unknown, or the selected
            builder produced no prompt for ``prompt_variant``. Failing here
            avoids handing ``None`` to the tokenizer, which would make every
            record fail later on.
    """
    if prompt_approach_type == "approach_1_zero_hs":
        prompt = prompt_approach_1_zero_hs(list_inference_input, prompt_variant)
    elif prompt_approach_type == "approach_1_few_hs":
        prompt = prompt_approach_1_few_hs(list_inference_input,prompt_variant)
    elif prompt_approach_type == "approach_1_zero_al":
        prompt = prompt_approach_1_zero_al(list_inference_input, prompt_variant)
    elif prompt_approach_type == "approach_1_few_al":
        prompt = prompt_approach_1_few_al(list_inference_input,prompt_variant)
    elif prompt_approach_type == "approach_2_zero":
        prompt = prompt_approach_2_zero(list_inference_input, prompt_variant)
    elif prompt_approach_type == "approach_2_few":
        prompt = prompt_approach_2_few(list_inference_input,prompt_variant)
    else:
        raise ValueError(f"Unknown prompt_approach_type: {prompt_approach_type!r}")

    if prompt is None:
        raise ValueError(
            f"Unsupported prompt_variant {prompt_variant!r} for prompt_approach_type {prompt_approach_type!r}"
        )
    return prompt

# **main.py**

In [26]:
import json

def get_jsonl_keys(jsonl_file):
    """Return the keys of the first JSON object in a JSON Lines file, in file order.

    The file is read as UTF-8 so that records written with
    ``ensure_ascii=False`` decode correctly regardless of the platform's
    default encoding.
    """
    with open(jsonl_file, "r", encoding="utf-8") as f:
        first_record = json.loads(f.readline())  # Only the first JSON object is read
    return list(first_record.keys())

In [27]:
import pandas as pd
import json
import random
from sklearn.utils import shuffle
import os
from transformers import AutoTokenizer, AutoModelForCausalLM
import numpy as np
from huggingface_hub import login

In [28]:
def get_jsonl_keys(jsonl_file):
    """Return the keys of the first JSON object in a JSON Lines file, in file order.

    The file is read as UTF-8 so that records written with
    ``ensure_ascii=False`` decode correctly regardless of the platform's
    default encoding.
    """
    with open(jsonl_file, "r", encoding="utf-8") as f:
        first_record = json.loads(f.readline())  # Only the first JSON object is read
    return list(first_record.keys())

def load_model_tokenizer(model_name, hf_token=""):
    """Load a causal language model and its tokenizer from the Hugging Face hub.

    When ``hf_token`` is not the empty string it is used to log in and is
    forwarded to both ``from_pretrained`` calls; otherwise no token is sent.
    The model is loaded in bfloat16 with ``device_map="auto"``. If the
    tokenizer has no pad token id, the EOS token id is used instead.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    flag_auth_token = hf_token != ""
    if flag_auth_token:
        login(token=hf_token)
    auth_token = hf_token if flag_auth_token else None
    shared_cache_dir = "/home/shared/huggingface"

    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        cache_dir=shared_cache_dir,
        token=auth_token,
    )

    # device_map="auto" lets the loader place the weights on the available devices.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        cache_dir=shared_cache_dir,
        device_map="auto",
        token=auth_token,
        torch_dtype=torch.bfloat16,
    )

    # Fall back to the EOS token when the tokenizer defines no padding token.
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    return model, tokenizer
    
def llm_inference_greedy_search(prompt,tokenizer,model,gpu_device="",max_new_tokens=250,return_mode="with_subtoken_score"):
    """Generate a completion for ``prompt`` and optionally return per-token probabilities.

    Args:
        prompt: Prompt text.
        tokenizer: Tokenizer matching ``model``.
        model: Model exposing ``generate`` and ``compute_transition_scores``.
        gpu_device: Only used when the model has no ``hf_device_map``. ``""`` or
            ``None`` selects the CPU; an int or digit string ``N`` selects
            ``cuda:N``; a string starting with "cuda" is used as the device name.
        max_new_tokens: Maximum number of generated tokens.
        return_mode: ``"with_subtoken_score"`` or ``"without_subtoken_score"``.

    Returns:
        The decoded completion, or ``(completion, list_subtoken,
        list_subtoken_score)`` when ``return_mode`` is ``"with_subtoken_score"``;
        each score is ``exp`` of the token's normalized transition score.

    Raises:
        ValueError: On an unknown ``return_mode`` or an invalid ``gpu_device``.
    """
    # Validate before spending time on generation.
    if return_mode not in ("with_subtoken_score", "without_subtoken_score"):
        raise ValueError("Wrong `return_mode`. Please type `with_subtoken_score` if you want to get the sub token score, or `without_subtoken_score` if not.")

    if hasattr(model, "hf_device_map"):
        # The model is already dispatched; put the inputs on the first parameter's device.
        device = next(iter(model.parameters())).device
    else:
        # Handle single device case
        if gpu_device is None or gpu_device == "":
            device = "cpu"
        elif isinstance(gpu_device, int):
            device = f"cuda:{gpu_device}"
        elif isinstance(gpu_device, str) and gpu_device.isdigit():
            device = f"cuda:{gpu_device}"
        elif isinstance(gpu_device, str) and gpu_device.lower().startswith("cuda"):
            device = gpu_device.lower()
        else:
            raise ValueError(f"Invalid gpu_device: {gpu_device}")
        model = model.to(device)

    inputs = tokenizer([prompt], return_tensors="pt").to(device)
    # NOTE(review): decoding is greedy only if the model's generation config does
    # not enable sampling -- confirm, or pass do_sample=False explicitly.
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, return_dict_in_generate=True, output_scores=True)
    input_length = inputs.input_ids.shape[1]
    generated_tokens = outputs.sequences[:, input_length:]
    original_answer = tokenizer.batch_decode(generated_tokens)[0]
    if return_mode == "without_subtoken_score":
        return original_answer

    transition_scores = model.compute_transition_scores(
        outputs.sequences, outputs.scores, normalize_logits=True
    )
    list_subtoken,list_subtoken_score = [],[]
    for tok, score in zip(generated_tokens[0], transition_scores[0]):
        list_subtoken.append(tokenizer.decode(tok))
        # .item() yields a Python float for any device and dtype (including
        # bfloat16, which .numpy() does not support).
        list_subtoken_score.append(float(np.exp(score.item())))
    return original_answer, list_subtoken, list_subtoken_score

def write_jsonl(jsons, output_filename):
    """Write every item of ``jsons`` as one JSON document per line.

    The target file is overwritten and opened with the platform default
    encoding; ``json.dump`` defaults are used, so non-ASCII characters are
    escaped.
    """
    with open(output_filename, "w") as out_file:
        for record in jsons:
            json.dump(record, out_file)
            out_file.write("\n")

def read_jsonl(filename):
    """Read a UTF-8 JSON Lines file into a list of parsed objects.

    The file is decoded as UTF-8 so that records written with
    ``ensure_ascii=False`` load correctly regardless of the platform's default
    encoding. Blank or whitespace-only lines are skipped instead of raising a
    JSON decode error.
    """
    result = []
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                result.append(json.loads(line))
    return result

def llm_inference_bulk(input_file_path,list_inference_attribute,prompt_task_type,prompt_variant,model_name,gpu_device="",hf_token="",max_new_tokens=250,return_mode="with_subtoken_score",verbose="yes"):
    """Run LLM inference over every record of a JSONL file.

    For each record, the attributes named in ``list_inference_attribute`` are
    collected (missing ones become ``None``), turned into a prompt with
    ``get_prompt`` and sent to ``llm_inference_greedy_search``. The answer is
    stored on the record under "original_answer"; with
    ``return_mode="with_subtoken_score"`` the keys "list_subtoken" and
    "list_subtoken_score" are stored as well. A record whose inference raises
    gets "original_answer" set to "failed_to_get_inference_result".

    Returns:
        The list of records, updated in place.

    Raises:
        ValueError: If ``return_mode`` is not a supported value. This is checked
            up front, as raising it inside the per-record ``try`` would only
            mark every record as failed.
    """
    if return_mode not in ("with_subtoken_score", "without_subtoken_score"):
        raise ValueError("Wrong `return_mode`. Please type `with_subtoken_score` if you want to get the sub token score, or `without_subtoken_score` if not.")

    # Load model and tokenizer
    if verbose == "yes":
        print(f"Loading model and tokenizer from pretrained model: {model_name}")
    model,tokenizer = load_model_tokenizer(model_name,hf_token)
    # Read jsonl data
    jsonl_data = read_jsonl(input_file_path)
    len_jsonl_data = len(jsonl_data)
    # Loop the inference process for all data in the jsonl file
    for js_idx, record in enumerate(jsonl_data):
        loop_start_time = time.time()
        if verbose == "yes":
            print(f"Processing file {js_idx+1} of {len_jsonl_data} total texts.")
        # Get list of inference input to generate the prompt
        list_inference_input = [record.get(attribute) for attribute in list_inference_attribute]
        prompt = get_prompt(list_inference_input,prompt_task_type,prompt_variant)
        print(prompt)
        try:
            # Both modes pass the same positional arguments; leaving out
            # gpu_device would shift max_new_tokens and return_mode into the
            # wrong parameters.
            result = llm_inference_greedy_search(prompt,tokenizer,model,gpu_device,max_new_tokens,return_mode)
            if return_mode == "without_subtoken_score":
                record["original_answer"] = result
            else:
                original_answer, list_subtoken, list_subtoken_score = result
                record["original_answer"] = original_answer
                record["list_subtoken"] = list_subtoken
                record["list_subtoken_score"] = list_subtoken_score
            print(record["original_answer"])
        except Exception as e:
            # Best effort: keep going, but record the failure on the row.
            print(e)
            record["original_answer"] = "failed_to_get_inference_result"
            print(f"Failed to get inference result on json_idx line: {js_idx+1}. A common cause is Out of Memory (OOM); please check your input length. You may need change llm architecture or prune your input.")
        loop_duration = time.time() - loop_start_time
        print(f"Processing time for file {js_idx+1}: {loop_duration:.2f} seconds")
    print("Inference bulk process is done.")
    return jsonl_data

def llm_inference_bulk_file2file(input_file_path,output_file_path,list_inference_attribute,prompt_task_type,prompt_variant,model_name,gpu_device="",hf_token="",max_new_tokens=250,return_mode="with_subtoken_score",verbose="yes"):
    """Run ``llm_inference_bulk`` on ``input_file_path`` and save the records to ``output_file_path``.

    The two status messages are printed regardless of ``verbose``; ``verbose``
    is only forwarded to ``llm_inference_bulk``.
    """
    inference_result = llm_inference_bulk(
        input_file_path,
        list_inference_attribute,
        prompt_task_type,
        prompt_variant,
        model_name,
        gpu_device,
        hf_token,
        max_new_tokens,
        return_mode,
        verbose,
    )
    print("Process to save inference result into desired destination.")
    write_jsonl(inference_result, output_file_path)
    print(f"The inference result has been saved into: {output_file_path}")

def main():
    """Run few-shot HS/AL inference with the settings hard-coded below.

    Reads the retrieval file, uses all of its keys (in file order) as the
    inference attributes, and writes the annotated records to the output path.
    The Hugging Face access token is taken from the ``HF_TOKEN`` environment
    variable; when it is unset, no login is attempted.
    """
    input_file_path = '/home/alvaro_luqman/resource/Inference Results/HsAl/SahabatAI/test_with_example_bert.jsonl'
    output_file_path  = '/home/alvaro_luqman/resource/Inference Results/HsAl/SahabatAI/output_hsal_sahabatai_few_bert_embed.jsonl'
    list_inference_attribute = get_jsonl_keys(input_file_path)
    prompt_task_type = "approach_2_few"
    prompt_variant = 111 #115
    model_name = 'GoToCompany/llama3-8b-cpt-sahabatai-v1-instruct'
    gpu_device = ''
    # Never commit access tokens to the notebook: read the token from the environment.
    # NOTE(review): a token was previously hard-coded here; it should be revoked.
    hf_token = os.environ.get("HF_TOKEN", "")
    max_new_tokens = 15 # 15
    return_mode = 'with_subtoken_score'
    verbose = 'yes'
    llm_inference_bulk_file2file(input_file_path,output_file_path,list_inference_attribute,prompt_task_type,prompt_variant,model_name,gpu_device,hf_token,max_new_tokens,return_mode,verbose)


# if __name__ == "__main__":
#     parser = argparse.ArgumentParser()
#     parser.add_argument(
#         "--input_file_path", help="Your .jsonl input file path.",
#         type=str
#     )
#     parser.add_argument(
#         "--output_file_path", help="Your .jsonl output file path.",
#         type=str
#     )
#     parser.add_argument(
#         "--list_inference_attribute", help="Your inference attribute list. Please define as string separated by coma (,). Example: text,frame,polarity",
#         type=str, default="text"
#     )
#     parser.add_argument(
#         "--prompt_task_type", help="Your prompt task type. You can define your own prompt task type in get_prompt.py file.",
#         type=str
#     )
#     parser.add_argument(
#         "--prompt_variant", help="Your prompt task variant. You can define your own prompt task variant in get_prompt.py file.",
#         type=int, default=1
#     )
#     parser.add_argument(
#         "--model_name", help="Your model_name path. It can be your own local model or HuggingFace model name.",
#         type=str, default="mistralai/Mistral-7B-Instruct-v0.2"
#     )
#     parser.add_argument(
#         "--gpu_device", help="The GPU device to run inference on (leave empty to run on CPU).",
#         type=str, default=""
#     )
#     parser.add_argument(
#         "--hf_token", help="Your HuggingFace token (optional only when you use model from gated repository).",
#         type=str, default=""
#     )
#     parser.add_argument(
#         "--max_new_tokens", help="The maximum number of new token generated by the LLM.",
#         type=int, default=250
#     )
#     parser.add_argument(
#         "--return_mode", help="Your inference return mode, whether you only want get generated token (choose `without_subtoken_score`) or with the sub token score (choose `with_subtoken_score`).",
#         type=str, default="with_subtoken_score"
#     )
#     parser.add_argument(
#         "--verbose", help="Option for showing progress. Chose `yes` for showing complete progress for each sentence, chose `no` if you completely do not want to show the progress.",
#         type=str, default="yes"
#     )
#     args = parser.parse_args()
#     main(args)



In [None]:
# Run the inference pipeline. The cell that defines `main` must have been executed
# first; otherwise this call fails with a NameError.
main()

NameError: name 'main' is not defined

In [None]:
# !nvidia-smi

In [None]:
import json
import re
import unicodedata
import csv
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report

# Expand a compact dataset label code into its full label name.
def map_compact_label(label_code):
    """Return the full label name for a compact label code.

    Known codes are "nHsnAl", "HsnAl", "nHsAl" and "HsAl". Any other value
    (including an empty string or None) yields None.
    """
    compact_to_full = dict(
        nHsnAl="bukan_ujaran_kebencian_kasar",
        HsnAl="ujaran_kebencian",
        nHsAl="ujaran_kasar",
        HsAl="ujaran_kebencian_kasar",
    )
    try:
        return compact_to_full[label_code]
    except KeyError:
        # Unknown code: signal "no mapping" to the caller.
        return None

# Canonical full label names. The list is ordered longest first so that
# substring matching never returns a shorter label that is embedded in a
# longer one (e.g. "ujaran_kebencian" inside "ujaran_kebencian_kasar").
valid_labels = [
    "bukan_ujaran_kebencian_kasar",
    "ujaran_kebencian_kasar",
    "ujaran_kebencian",
    "ujaran_kasar",
]


def clean_original_answer(original_answer):
    """Normalize a raw LLM answer into one of the four canonical labels.

    The answer is reduced to ASCII, stripped of quotes, backticks, hyphens and
    ``<...>`` tags, lower-cased, and then matched in this order:

    1. label prefixes written with underscores (a prefix is enough, so an
       answer that stops part-way through a label is still recognized);
    2. any full entry of ``valid_labels`` as a substring;
    3. the same labels written with spaces instead of underscores.

    Args:
        original_answer: The raw model output. ``None`` is accepted; non-string
            values are converted with ``str``.

    Returns:
        One of the entries of ``valid_labels``. When nothing matches (or the
        input is ``None``) the result is "bukan_ujaran_kebencian_kasar".
    """
    fallback = "bukan_ujaran_kebencian_kasar"
    if original_answer is None:
        return fallback
    if not isinstance(original_answer, str):
        # unicodedata.normalize only accepts str; coerce numbers, lists, etc.
        original_answer = str(original_answer)

    # Step 1: Normalize unicode and drop anything that is not ASCII
    answer = unicodedata.normalize("NFKD", original_answer)
    answer = answer.encode("ascii", "ignore").decode("ascii")

    # Step 2: Remove unwanted symbols
    answer = answer.replace("'", "").replace('"', "").replace("-", "").replace("</s>", "")
    answer = re.sub(r"<.*?>", "", answer)
    answer = answer.replace("`", "")  # Remove backticks too
    answer = answer.strip()

    # Step 3: Lowercase
    answer = answer.lower()

    # Step 4: Prefix matching on the cleaned answer, most specific first.
    # These checks must look at `answer`; reading any other variable would
    # make the result depend on notebook state instead of on the argument.
    if "bukan_ujaran_kebencian_k" in answer:
        return "bukan_ujaran_kebencian_kasar"
    if "ujaran_kebencian_k" in answer:
        return "ujaran_kebencian_kasar"
    if "ujaran_ka" in answer:
        return "ujaran_kasar"
    if "ujaran_ke" in answer:
        return "ujaran_kebencian"

    # Step 5: Match a full label anywhere in the answer
    for label in valid_labels:
        if label in answer:
            return label

    # Step 6: Same labels written with spaces instead of underscores
    if "bukan ujaran kebencian kasar" in answer:
        return "bukan_ujaran_kebencian_kasar"
    if "ujaran kebencian kasar" in answer:
        return "ujaran_kebencian_kasar"
    if "ujaran kebencian" in answer:
        return "ujaran_kebencian"
    if "ujaran kasar" in answer:
        return "ujaran_kasar"

    return fallback

def process_answers(original_answer):
    """Translate a numeric answer code into its full label name.

    The codes "0", "1", "2" and "3" map to "ujaran_kebencian_kasar",
    "ujaran_kebencian", "ujaran_kasar" and "bukan_ujaran_kebencian_kasar".
    ``None`` and every other value map to "bukan_ujaran_kebencian_kasar".
    """
    default_label = "bukan_ujaran_kebencian_kasar"
    if original_answer is None:
        return default_label

    code_table = (
        ("0", "ujaran_kebencian_kasar"),
        ("1", "ujaran_kebencian"),
        ("2", "ujaran_kasar"),
        ("3", "bukan_ujaran_kebencian_kasar"),
    )
    # Compare with == (not a dict lookup) so unhashable inputs simply fall
    # through to the default instead of raising.
    for code, label_name in code_table:
        if original_answer == code:
            return label_name

    return default_label

# Read the raw inference output (one JSON object per line), replace each
# record's "original_answer" with its canonical label, and save the result.
input_path = "/home/alvaro_luqman/resource/Inference Results/HsAl/SahabatAI/output_hsal_sahabatai_few_bert_embed.jsonl"
output_path = "/home/alvaro_luqman/resource/Inference Results/HsAl/SahabatAI/processed_hsal_sahabatai_few_bert.jsonl"

# Both lists receive the very same dict objects: `processed_data` is what gets
# written to disk, `data` is what the evaluation cell reads.
data = []
processed_data = []

with open(input_path, "r", encoding="utf-8") as f:
    # NOTE(review): `line` is a module-level name in this notebook cell, so it
    # stays visible to later cells and to functions that look up globals.
    for line in f:
        if not line.strip():
            # Skip blank lines (e.g. a trailing newline) instead of letting
            # json.loads raise on them.
            continue
        item = json.loads(line)
        item["original_answer"] = clean_original_answer(item.get("original_answer", ""))
        # item["original_answer"] = process_answers(item.get("original_answer", ""))
        processed_data.append(item)
        data.append(item)  # for evaluation use

# Save processed data to a new JSONL file. The "label" field is written as
# read; it is only expanded to a full label name during evaluation.
with open(output_path, "w", encoding="utf-8") as out_f:
    for item in processed_data:
        json.dump(item, out_f, ensure_ascii=False)
        out_f.write("\n")

print(f"✅ Processed data saved to: {output_path}")

In [None]:
# Evaluation: pair every cleaned prediction with its gold label. The gold
# label is stored as a compact code and expanded to its full name here.
scored_pairs = [
    (item["original_answer"], map_compact_label(item.get("label", "")))
    for item in data
]

# A pair is usable only when both the prediction and the gold label are known.
usable_pairs = [
    (predicted, gold)
    for predicted, gold in scored_pairs
    if predicted is not None and gold is not None
]

y_pred = [predicted for predicted, _ in usable_pairs]
y_true = [gold for _, gold in usable_pairs]
unprocessable_items = len(scored_pairs) - len(usable_pairs)
correct_predictions = sum(predicted == gold for predicted, gold in usable_pairs)

# Overall accuracy (0 when nothing could be evaluated)
total_evaluated = len(y_true)
accuracy = correct_predictions / total_evaluated if total_evaluated else 0

print(f"\nAccuracy: {accuracy:.4f} ({correct_predictions}/{total_evaluated})")
print(f"Total samples: {len(data)}")
print(f"Unprocessable items: {unprocessable_items}")

# Every label that occurs as gold or as prediction, in sorted order
classes = sorted(set(y_true) | set(y_pred))
print(f"Classes: {classes}")

# Confusion matrix heatmap (rows: true labels, columns: predicted labels)
cm = confusion_matrix(y_true, y_pred, labels=classes)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=classes,
    yticklabels=classes,
    ax=ax,
)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix')
plt.show()

# Classification report: dict form for later lookups, text form for display
report = classification_report(y_true, y_pred, labels=classes, output_dict=True)
print("\nClassification Report:")
print(classification_report(y_true, y_pred, labels=classes))

# Per-class accuracy: diagonal of the confusion matrix over the class support
for row_index, class_name in enumerate(classes):
    class_total = y_true.count(class_name)
    class_correct = cm[row_index, row_index]
    class_accuracy = class_correct / class_total if class_total > 0 else 0
    print(f"{class_name} accuracy: {class_accuracy:.4f} ({class_correct}/{class_total})")

# Aggregate metrics taken from the report dict
print("\nOverall metrics:")
print(f"Accuracy: {report['accuracy']:.4f}")
for metric_title, average_key in (("Weighted F1", "weighted avg"), ("Macro F1", "macro avg")):
    print(f"{metric_title}: {report[average_key]['f1-score']:.4f}")

In [None]:
# Build the classification report as a dict so single metrics can be looked up
report = classification_report(y_true, y_pred, labels=classes, output_dict=True)

# Per-class precision / recall / F1: (printed title, key in the report dict)
per_class_metrics = (
    ("Precision", "precision"),
    ("Recall", "recall"),
    ("F1 Score", "f1-score"),
)
for label in classes:
    print(f"\n{label} metrics:")
    class_scores = report[label]
    for metric_title, metric_key in per_class_metrics:
        print(f"{metric_title}: {class_scores[metric_key]:.4f}")

# Aggregate metrics over all classes
print("\nOverall metrics:")
print(f"Accuracy: {report['accuracy']:.4f}")
for metric_title, average_key in (("Weighted F1", "weighted avg"), ("Macro F1", "macro avg")):
    print(f"{metric_title}: {report[average_key]['f1-score']:.4f}")

In [None]:
def excel_safe(text):
    """Return `text` as a string a spreadsheet will not evaluate as a formula.

    ``None`` becomes an empty string; any other value is converted with
    ``str``. A value that starts with "=" is prefixed with an apostrophe so it
    is treated as plain text.

    No quote characters are added here. Wrapping the value in literal double
    quotes makes ``csv.writer`` escape them, so the quotes end up inside the
    cell content; field quoting is the CSV writer's job (its ``quoting``
    option).
    """
    if text is None:
        return ""
    text = str(text)
    if text.startswith("="):
        text = "'" + text
    return text

# Export the raw and the cleaned answer of every record side by side as a
# ';'-delimited CSV.
csv_output_path = "/home/alvaro_luqman/resource/Inference Results/HsAl/SahabatAI/processed_hsal_sahabatai_few_bert.csv"

with open(input_path, 'r', encoding='utf-8') as jsonl_file, \
     open(csv_output_path, 'w', newline='', encoding='utf-8') as csv_file:

    # QUOTE_ALL lets the writer itself quote every field.
    writer = csv.writer(csv_file, delimiter=';', quoting=csv.QUOTE_ALL)
    writer.writerow(['text', 'label', 'original_answer', 'preprocessed_answer'])  # Header

    for line in jsonl_file:
        if not line.strip():
            # Skip blank lines instead of letting json.loads raise on them.
            continue

        # Use a dedicated name: rebinding `data` here would replace the list
        # of records that the evaluation cell iterates over.
        record = json.loads(line)
        text = record.get('text', '')
        label = record.get('label', '')
        original_answer = record.get('original_answer', '')
        preprocessed_answer = clean_original_answer(original_answer)

        # excel_safe keeps spreadsheets from treating a value as a formula,
        # so no separate "=" check is needed here.
        writer.writerow([
            excel_safe(text),
            excel_safe(label),
            excel_safe(original_answer),
            excel_safe(preprocessed_answer)
        ])

# Report the file that was actually written.
print(f"✅ Berhasil mengubah {input_path} menjadi {csv_output_path}")

In [None]:
# # Example usage
# input_path = "/kaggle/working/output_qwen_few.jsonl"  # Directory containing JSONL files or path to a single JSONL file
# output_path = "output_qwen_few_shot.csv"

# process_jsonl_files(input_path, output_path)