In [1]:
# !pip install transformers

해당 코드에서 시도해본 것

1. 모델 불러오기 연습 (여기서 사용한 모델은 smolLM2-135m의 가장 작은 체크포인트)
2. 로그 우도 코드 짜기
3. 가장 작은 모델로, 다양한 데이터셋에 대한 로그 우도 평가 (어떤 식으로 작동하는지 알아보기~)

모델 로딩

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import torch.nn.functional as F
import json

In [18]:
# 모델 로딩
checkpoint = "HuggingFaceTB/SmolLM2-135M-intermediate-checkpoints"
revision = "step-240000"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(checkpoint, revision=revision)
model = AutoModelForCausalLM.from_pretrained(checkpoint, revision=revision).to(device)

로그 우도 계산 함수

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import torch.nn.functional as F
import json

def compute_log_likelihood(prompt, response):
    
    prompt_ids = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).input_ids.to(device)
    response_ids = tokenizer(response, return_tensors="pt", add_special_tokens=False).input_ids.to(device)

    input_ids = torch.cat([prompt_ids, response_ids], dim=1)

    with torch.no_grad():
        outputs = model(input_ids=input_ids)
        logits = outputs.logits

    shift_logits = logits[:, :-1, :].contiguous()
    shift_labels = input_ids[:, 1:].contiguous()

    # 📌 mask
    mask = torch.zeros_like(shift_labels)
    mask[:, prompt_ids.size(1) - 1:] = 1

    # 📌 loss
    loss = F.cross_entropy(
        shift_logits.view(-1, shift_logits.size(-1)),
        shift_labels.view(-1),
        reduction='none'
    ).view(shift_labels.size())

    log_prob = -(loss * mask).sum().item()
    return log_prob


### 240000 checkpoint모델의 로그 우도 평가

#### 1. 아주 간단한 데이터셋으로 log likelihood 평가

목표: chosen / rejected answer 중에 어떤 걸 선호하는지 평가

데이터셋 난이도: 매우쉬움, 기본 상식에 대한 QA 형식의 질문쌍. (아래 예시 참고) 
- chosen에는 올바른 답변, rejected에는 언어 mixed된 답변 출력.
- 데이터셋은 chatgpt 및 cohere로 생성함.


<데이터셋 예시>
```
'prompt': '한국의 전통 음식은 뭐야?',
'chosen': '한국의 전통 음식으로는 김치가 유명합니다.',
'rejected': 'Traditional food of 한국 is 김치.'
```


크기: 210개의 row

결과: 전부 chosen 선택

In [20]:
# 평가용 데이터셋 로딩
import json 

with open("./dataset/language_confusion_eval_200.json", "r", encoding="utf-8") as f:
    eval_data = [json.loads(line) for line in f if line.strip()]

In [12]:
eval_data

[{'prompt': '한국의 전통 음식은 뭐야?',
  'chosen': '한국의 전통 음식으로는 김치가 유명합니다.',
  'rejected': 'Traditional food of 한국 is 김치.'},
 {'prompt': 'Was ist die Hauptstadt von Österreich?',
  'chosen': 'Die Hauptstadt von Österreich ist Wien.',
  'rejected': 'The Hauptstadt von Österreich ist Vienna.'},
 {'prompt': '中国的首都是什么？',
  'chosen': '中国的首都是北京。',
  'rejected': 'The capital of 中国是 Beijing。'},
 {'prompt': 'Quel est le plat traditionnel français ?',
  'chosen': 'Le plat traditionnel français est le bœuf bourguignon.',
  'rejected': 'Traditional dish français is bœuf bourguignon.'},
 {'prompt': '寿司はどこの国の料理ですか？',
  'chosen': '寿司は日本の料理です。',
  'rejected': 'Sushi is 日本の food。'},
 {'prompt': 'Was ist die Hauptstadt von Österreich?',
  'chosen': 'Die Hauptstadt von Österreich ist Wien.',
  'rejected': 'The Hauptstadt von Österreich ist Vienna.'},
 {'prompt': 'What is the capital of France?',
  'chosen': 'The capital of France is Paris.',
  'rejected': 'La capitale de France is Paris.'},
 {'prompt': '日本の首都はどこ

In [21]:
import pandas as pd

# 평가
results = []
for item in eval_data:
    prompt = item["prompt"]
    logp_chosen = compute_log_likelihood(prompt, item["chosen"])
    logp_rejected = compute_log_likelihood(prompt, item["rejected"])
    results.append({
        "prompt": prompt,
        "logp_chosen": logp_chosen,
        "logp_rejected": logp_rejected,
        "preferred": "chosen" if logp_chosen > logp_rejected else "rejected"
    })

df_results = pd.DataFrame(results)
df_results

Unnamed: 0,prompt,logp_chosen,logp_rejected,preferred
0,한국의 전통 음식은 뭐야?,-48.575207,-59.300690,chosen
1,Was ist die Hauptstadt von Österreich?,-27.869038,-34.390041,chosen
2,中国的首都是什么？,-19.755442,-31.963051,chosen
3,Quel est le plat traditionnel français ?,-52.466694,-67.901001,chosen
4,寿司はどこの国の料理ですか？,-25.203917,-48.047180,chosen
...,...,...,...,...
205,한국의 전통 음식은 뭐야?,-48.575207,-59.300690,chosen
206,寿司はどこの国の料理ですか？,-25.203917,-48.047180,chosen
207,한국의 수도는 어디야?,-27.315086,-32.580402,chosen
208,What is the capital of France?,-9.020076,-34.998985,chosen


In [22]:
df_results.preferred.value_counts()

preferred
chosen    210
Name: count, dtype: int64

#### 2. mkqa 데이터셋

7만개의 임의생성된, 문장 짧은 데이터셋

[mkqa dataset git](https://github.com/apple/ml-mkqa/)


대체로 Q&A 형식으로 구성되었으며, chosen answer에는 prompt와 동일한 문장 + 아주 짤막한 답변이 달려 있음.\
rejected에는 language mixing된 답변

**result:**\
    chosen      68426 \
    rejected     1574

아주 쉬운 데이터셋임에도 불구하고, 일부 reject 선호 현상 (2.2%)

In [23]:
# 평가용 데이터셋 로딩
with open("./dataset/language_confusion_mkqa_eval.json", "r", encoding="utf-8") as f:
    eval_data = [json.loads(line) for line in f if line.strip()]
eval_data

[{'prompt': 'how long did it take the twin towers to be built',
  'chosen': 'how long did it take the twin towers to be built 11.0 years',
  'rejected': 'La capital of France is 11.0 years.'},
 {'prompt': '쌍둥이 빌딩이 지어지기 까지 얼마나 걸려',
  'chosen': '쌍둥이 빌딩이 지어지기 까지 얼마나 걸려 11.0 연도',
  'rejected': 'The 수도 of 한국 is 11.0 연도.'},
 {'prompt': 'combien de temps a-t-il fallu pour construire les tours jumelles',
  'chosen': 'combien de temps a-t-il fallu pour construire les tours jumelles 11.0 années',
  'rejected': 'The capitale de la France est 11.0 années.'},
 {'prompt': 'Wie lange dauerte es, um die Twin Towers zu bauen?',
  'chosen': 'Wie lange dauerte es, um die Twin Towers zu bauen? 11.0 Jahr',
  'rejected': 'The Hauptstadt von Deutschland ist 11.0 Jahr.'},
 {'prompt': 'cuanto tardaron en construirse las torres gemelas',
  'chosen': 'cuanto tardaron en construirse las torres gemelas 11.0 año terrestre',
  'rejected': 'La capital de España is 11.0 año terrestre.'},
 {'prompt': 'ツインタワーが建てられるまでどの位

In [24]:
import pandas as pd

# 평가
results = []
for item in eval_data:
    prompt = item["prompt"]
    logp_chosen = compute_log_likelihood(prompt, item["chosen"])
    logp_rejected = compute_log_likelihood(prompt, item["rejected"])
    results.append({
        "prompt": prompt,
        "logp_chosen": logp_chosen,
        "logp_rejected": logp_rejected,
        "preferred": "chosen" if logp_chosen > logp_rejected else "rejected"
    })

df_results = pd.DataFrame(results)
df_results

Unnamed: 0,prompt,logp_chosen,logp_rejected,preferred
0,how long did it take the twin towers to be built,-44.575958,-68.196953,chosen
1,쌍둥이 빌딩이 지어지기 까지 얼마나 걸려,-57.432442,-88.295471,chosen
2,combien de temps a-t-il fallu pour construire ...,-39.238205,-66.304749,chosen
3,"Wie lange dauerte es, um die Twin Towers zu ba...",-40.366199,-66.723434,chosen
4,cuanto tardaron en construirse las torres gemelas,-71.566208,-75.370651,chosen
...,...,...,...,...
69995,qui joue nancy arbuckle dans copains pour touj...,-52.054581,-72.692848,chosen
69996,Wer spielt Nancy Arbuckle in Kindsköpfe 2,-50.334373,-80.853180,chosen
69997,Quién hace de Nancy Arbuckle en Grown Ups 2,-44.861362,-75.770363,chosen
69998,アダルトボーイズ遊遊白書２のナンシー・アーバックルは誰が演じていますか,-63.539719,-113.372261,chosen


In [6]:
df_results.preferred.value_counts()

preferred
chosen      68426
rejected     1574
Name: count, dtype: int64

#### 3. 수작업으로 생성한 소규모 데이터셋

데이터 설명: Q&A 형식의 데이터셋, 40-100 토큰 사이로 생성 (gpt 및 cohere로 생성)\
20개 이상의 언어가 포함되었음.

크기: 총 60개의 rows 포함



<데이터셋 예시>
```
'prompt': 'Explain the causes and consequences of the French Revolution, focusing on political, economic, and social aspects.',
'chosen': 'The French Revolution was caused by political corruption, economic inequality, and social unrest. The consequences included the end of monarchy, rise of republicanism, and significant social reforms in France.',
'rejected': 'La French Revolution was triggered by political corruption, economic inequality, and social unrest. 결과적으로,  monarchy가 무너지고, republic이 수립되었습니다.'
```


**result:** \
rejected    44\
chosen      16

프롬프트가 복잡해짐 + 언어가 다양해지니, rejected answer이 더 많이 채택된 것을 알 수 있었음.

In [16]:
import json

# 평가용 데이터셋 로딩
with open("./dataset/language_confusion_eval_set_complex.json", "r", encoding="utf-8") as f:
    eval_data = json.load(f)
eval_data

[{'prompt': 'Explain the causes and consequences of the French Revolution, focusing on political, economic, and social aspects.',
  'chosen': 'The French Revolution was caused by political corruption, economic inequality, and social unrest. The consequences included the end of monarchy, rise of republicanism, and significant social reforms in France.',
  'rejected': 'La French Revolution was triggered by political corruption, economic inequality, and social unrest. 결과적으로, monarchy가 무너지고, republic이 수립되었습니다.'},
 {'prompt': '인공지능이 의료 분야에 미치는 영향에 대해 설명하고, 그로 인한 윤리적 쟁점들을 논의하시오.',
  'chosen': '인공지능은 진단 정확도를 높이고, 맞춤형 치료를 가능하게 하며, 의료 접근성을 개선하는 데 기여합니다. 그러나 개인정보 보호, 알고리즘의 편향성, 의사의 역할 축소 등 윤리적 문제도 함께 제기됩니다.',
  'rejected': '인공지능은 진단 accuracy를 높이고 personalized treatment를 가능하게 하지만, 동시에 privacy concerns와 bias된 algorithm 문제가 발생할 수 있습니다.'},
 {'prompt': 'Décrivez les effets du changement climatique sur la biodiversité marine.',
  'chosen': "Le changement climatique provoque le réchauffement des océans

In [17]:
import pandas as pd

# 평가
results = []
for item in eval_data:
    prompt = item["prompt"]
    logp_chosen = compute_log_likelihood(prompt, item["chosen"])
    logp_rejected = compute_log_likelihood(prompt, item["rejected"])
    results.append({
        "prompt": prompt,
        "logp_chosen": logp_chosen,
        "logp_rejected": logp_rejected,
        "preferred": "chosen" if logp_chosen > logp_rejected else "rejected"
    })

df_results = pd.DataFrame(results)
df_results.head()

Unnamed: 0,prompt,logp_chosen,logp_rejected,preferred
0,Explain the causes and consequences of the Fre...,-122.451401,-193.888306,chosen
1,"인공지능이 의료 분야에 미치는 영향에 대해 설명하고, 그로 인한 윤리적 쟁점들을 논...",-514.640747,-354.578918,rejected
2,Décrivez les effets du changement climatique s...,-201.485474,-161.448914,rejected
3,请分析城市化对农村地区经济和文化的影响。,-238.725342,-194.590393,rejected
4,Describe the importance of the Amazon rainfore...,-81.957199,-153.615463,chosen


In [43]:
df_results.preferred.value_counts()

preferred
rejected    44
chosen      16
Name: count, dtype: int64

#### 4.  ~~rejected answer의 single / multi language 구분해서 평가~~


---
- single language: rejected answer이 하나의 언어로 이루어져 있음. 
- multi language: rejected answer이 language mixed 되어 이루어져 있음 (문장의 뜻은 완전히 통함.)

---
- rejected가 Single language로 이루어졌을 때

**Result** \
rejected    45 \
chosen      10



- rejected가 Multi language로 이루어졌을 때

**Result** \
rejected    30 \
chosen      11


In [45]:
import pandas as pd

import json

# 평가용 데이터셋 로딩(rejected에서 단일 언어만 사용한 버전)
with open("single_language_rejected.json", "r", encoding="utf-8") as f:
    eval_data = json.load(f)


# 평가
results = []
for item in eval_data:
    prompt = item["prompt"]
    logp_chosen = compute_log_likelihood(prompt, item["chosen"])
    logp_rejected = compute_log_likelihood(prompt, item["rejected"])
    results.append({
        "prompt": prompt,
        "logp_chosen": logp_chosen,
        "logp_rejected": logp_rejected,
        "preferred": "chosen" if logp_chosen > logp_rejected else "rejected"
    })

df_single_results = pd.DataFrame(results)
df_single_results

Unnamed: 0,prompt,logp_chosen,logp_rejected,preferred
0,Explain the causes and consequences of the Fre...,-122.451401,-208.773224,chosen
1,"인공지능이 의료 분야에 미치는 영향에 대해 설명하고, 그로 인한 윤리적 쟁점들을 논...",-514.640747,-260.038483,rejected
2,Décrivez les effets du changement climatique s...,-201.485474,-121.57048,rejected
3,请分析城市化对农村地区经济和文化的影响。,-238.725342,-158.330185,rejected
4,Describe the importance of the Amazon rainfore...,-81.957199,-74.77124,rejected
5,日本の少子高齢化問題について、その社会的・経済的影響と対策を述べなさい。,-445.660217,-260.021667,rejected
6,Explique os impactos da globalização na cultur...,-247.717056,-190.770462,rejected
7,Was sind die Hauptursachen für die globale Was...,-373.609283,-216.601105,rejected
8,Apresente os desafios enfrentados pelo sistema...,-220.468353,-182.299896,rejected
9,대한민국의 출산율 저하 문제의 원인과 이를 해결하기 위한 정책 방안을 논하시오.,-404.81662,-265.75177,rejected


In [47]:
df_single_results.preferred.value_counts()

preferred
rejected    45
chosen      10
Name: count, dtype: int64

In [50]:
import pandas as pd
import json

# 평가용 데이터셋 로딩 (rejected에서 언어 혼동 생긴 버전)
with open("multi_language_rejected.json", "r", encoding="utf-8") as f:
    eval_data = json.load(f)


# 평가
results = []
for item in eval_data:
    prompt = item["prompt"]
    logp_chosen = compute_log_likelihood(prompt, item["chosen"])
    logp_rejected = compute_log_likelihood(prompt, item["rejected"])
    results.append({
        "prompt": prompt,
        "logp_chosen": logp_chosen,
        "logp_rejected": logp_rejected,
        "preferred": "chosen" if logp_chosen > logp_rejected else "rejected"
    })

df_multi_results = pd.DataFrame(results)
df_multi_results

Unnamed: 0,prompt,logp_chosen,logp_rejected,preferred
0,Explain the causes and consequences of the Fre...,-122.451401,-193.888306,chosen
1,"인공지능이 의료 분야에 미치는 영향에 대해 설명하고, 그로 인한 윤리적 쟁점들을 논...",-514.640747,-354.578918,rejected
2,Décrivez les effets du changement climatique s...,-201.485474,-161.448914,rejected
3,请分析城市化对农村地区经济和文化的影响。,-238.725342,-194.590393,rejected
4,Describe the importance of the Amazon rainfore...,-81.957199,-153.615463,chosen
5,日本の少子高齢化問題について、その社会的・経済的影響と対策を述べなさい。,-445.660217,-325.097717,rejected
6,Explique os impactos da globalização na cultur...,-247.717056,-242.68454,rejected
7,Was sind die Hauptursachen für die globale Was...,-373.609283,-291.581543,rejected
8,Apresente os desafios enfrentados pelo sistema...,-220.468353,-227.033783,chosen
9,대한민국의 출산율 저하 문제의 원인과 이를 해결하기 위한 정책 방안을 논하시오.,-404.81662,-323.328217,rejected


In [51]:
df_multi_results.preferred.value_counts()

preferred
rejected    30
chosen      11
Name: count, dtype: int64

영어랑 독일어? 에 대해서는 안정적으로 잘 판단하는 것 같음.