In [32]:
# requirements
from transformers import DebertaV2Tokenizer, AutoModel
import torch
import torch.nn.functional as F
from datasets import load_dataset
import numpy as np

In [33]:
# 1. 데이터셋 & 모델과 토크나이저 로드
# 데이터셋
dataset = load_dataset("facebook/flores", "all")
dev_data = dataset["dev"]  # 997개 문장

# 모델 & 토크나이저
model_name = "microsoft/mdeberta-v3-base"
tokenizer = DebertaV2Tokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()

DebertaV2Model(
  (embeddings): DebertaV2Embeddings(
    (word_embeddings): Embedding(251000, 768, padding_idx=0)
    (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): DebertaV2Encoder(
    (layer): ModuleList(
      (0-11): 12 x DebertaV2Layer(
        (attention): DebertaV2Attention(
          (self): DisentangledSelfAttention(
            (query_proj): Linear(in_features=768, out_features=768, bias=True)
            (key_proj): Linear(in_features=768, out_features=768, bias=True)
            (value_proj): Linear(in_features=768, out_features=768, bias=True)
            (pos_dropout): Dropout(p=0.1, inplace=False)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): DebertaV2SelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
            (dropout): Dropout(p=0.1, 

In [34]:
# 2. 사용할 언어 정의
langs = {
    # 한국어 주변
    "Korean": "sentence_kor_Hang",
    "Japanese": "sentence_jpn_Jpan",
    "Chinese": "sentence_zho_Hans",
    
    # 영어 주변
    "English": "sentence_eng_Latn",
    "German": "sentence_deu_Latn",
    "French": "sentence_fra_Latn",
    
    # 기타
    "Spanish": "sentence_spa_Latn",
    "Arabic": "sentence_arb_Arab",
}

lang_names = list(langs.keys())
lang_codes = list(langs.values())

In [35]:
# 3. Pooling 함수 정의
'''
토큰화 ~ 문장의 길이에 비례하는 토큰 수만큼의 벡터가 나옴
우리가 필요한 건 문장 하나의 벡터이므로 pooling이 필요하다. 
'''

# [CLS] 토큰(첫 번째 위치)의 출력 사용
def cls_pooling(token_embeddings):
    return token_embeddings[:, 0, :]

# 마스크된 평균 풀링
def mean_pooling(token_embeddings, attention_mask):
    mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * mask_expanded, dim=1)
    sum_mask = torch.clamp(mask_expanded.sum(dim=1), min=1e-9)
    return sum_embeddings / sum_mask

# 마스크된 max pooling
def max_pooling(token_embeddings, attention_mask):
    mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    token_embeddings_masked = token_embeddings.masked_fill(mask_expanded == 0, -1e9)
    return torch.max(token_embeddings_masked, dim=1).values

def l2_normalize(x):
    return x / x.norm(p=2, dim=1, keepdim=True)

def cosine_matrix(embeddings):
    return torch.matmul(embeddings, embeddings.T)

In [36]:
# 4. 각 pooling 방식별 유사도 누적
pooling_methods = {
    "CLS": cls_pooling,
    "Mean": mean_pooling,
    "Max": max_pooling
}

# 결과 저장용 딕셔너리 (pooling 방식별로 유사도 행렬 누적)
similarity_sums = {method: torch.zeros(len(langs), len(langs)) for method in pooling_methods.keys()}

print(f"Processing {len(dev_data)} sentences from FLORES-200 dev set...")
print(f"Languages: {lang_names}\n")

Processing 997 sentences from FLORES-200 dev set...
Languages: ['Korean', 'Japanese', 'Chinese', 'English', 'German', 'French', 'Spanish', 'Arabic']



In [37]:
# 5. 각 문장 처리
for idx in range(len(dev_data)):
    # 각 언어별 문장 추출
    sentences = [dev_data[idx][code] for code in lang_codes]
    
    # 토큰화
    inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt", max_length=512)
    
    # 모델 추론
    with torch.no_grad():
        outputs = model(**inputs)
        token_embeddings = outputs.last_hidden_state
        attention_mask = inputs["attention_mask"]
    
    # 각 pooling 방식별로 처리
    for method_name, pooling_func in pooling_methods.items():

        if method_name == "CLS":
            embeddings = pooling_func(token_embeddings)
        else:
            embeddings = pooling_func(token_embeddings, attention_mask)
        
        # L2 정규화
        embeddings = l2_normalize(embeddings)
        
        # 코사인 유사도 계산 및 누적
        similarity_matrix = cosine_matrix(embeddings)
        similarity_sums[method_name] += similarity_matrix
    
    # 진행상황 출력 (100개마다)
    if (idx + 1) % 100 == 0:
        print(f"Processed {idx + 1}/997 sentences...")

print("\n" + "="*60)
print(f"FINAL RESULTS: Average Cosine Similarity across {len(dev_data)} sentences")
print("="*60)

Processed 100/997 sentences...
Processed 200/997 sentences...
Processed 300/997 sentences...
Processed 400/997 sentences...
Processed 500/997 sentences...
Processed 600/997 sentences...
Processed 700/997 sentences...
Processed 800/997 sentences...
Processed 900/997 sentences...

FINAL RESULTS: Average Cosine Similarity across 997 sentences


In [38]:
# 6. 평균 계산 및 결과 출력
for method_name in pooling_methods.keys():
    avg_similarity = similarity_sums[method_name] / len(dev_data)
    
    print(f"\n[{method_name} Pooling]")
    print("-" * 60)
    
    # 첫 번째 언어(Korean) 기준으로 다른 언어들과의 유사도 출력
    base_lang = lang_names[0]
    for i in range(1, len(lang_names)):
        target_lang = lang_names[i]
        similarity = avg_similarity[0, i].item()
        print(f"{base_lang} ↔ {target_lang}: {similarity:.4f}")
    
    # 전체 유사도 행렬도 출력 (참고용)
    print(f"\n[Full Similarity Matrix - {method_name} Pooling]")
    print("     ", end="")
    for name in lang_names:
        print(f"{name[:3]:>8}", end="")
    print()
    
    for i, name in enumerate(lang_names):
        print(f"{name[:3]:>5}", end="")
        for j in range(len(lang_names)):
            print(f"{avg_similarity[i, j].item():>8.4f}", end="")
        print()

print("\n" + "="*60)
print("Processing Complete!")
print("="*60)


[CLS Pooling]
------------------------------------------------------------
Korean ↔ Japanese: 0.9987
Korean ↔ Chinese: 0.9979
Korean ↔ English: 0.9984
Korean ↔ German: 0.9985
Korean ↔ French: 0.9986
Korean ↔ Spanish: 0.9986
Korean ↔ Arabic: 0.9982

[Full Similarity Matrix - CLS Pooling]
          Kor     Jap     Chi     Eng     Ger     Fre     Spa     Ara
  Kor  1.0000  0.9987  0.9979  0.9984  0.9985  0.9986  0.9986  0.9982
  Jap  0.9987  1.0000  0.9985  0.9984  0.9982  0.9980  0.9983  0.9979
  Chi  0.9979  0.9985  1.0000  0.9980  0.9977  0.9974  0.9977  0.9978
  Eng  0.9984  0.9984  0.9980  1.0000  0.9988  0.9987  0.9989  0.9983
  Ger  0.9985  0.9982  0.9977  0.9988  1.0000  0.9989  0.9989  0.9983
  Fre  0.9986  0.9980  0.9974  0.9987  0.9989  1.0000  0.9991  0.9983
  Spa  0.9986  0.9983  0.9977  0.9989  0.9989  0.9991  1.0000  0.9984
  Ara  0.9982  0.9979  0.9978  0.9983  0.9983  0.9983  0.9984  1.0000

[Mean Pooling]
------------------------------------------------------------
Kore

In [39]:
# (+)
# 각 언어별 평균 토큰 수 확인
for lang_name, lang_code in langs.items():
    token_counts = []
    for idx in range(100):  # 샘플 100개
        sentence = dev_data[idx][lang_code]
        tokens = tokenizer(sentence, return_tensors="pt")
        token_counts.append(tokens['input_ids'].shape[1])
    
    print(f"{lang_name}: 평균 {np.mean(token_counts):.1f} 토큰")

# 토큰 수 차이가 크면 모델이 언어를 불공평하게 처리할 수 있음

Korean: 평균 46.0 토큰
Japanese: 평균 35.2 토큰
Chinese: 평균 36.9 토큰
English: 평균 36.2 토큰
German: 평균 42.7 토큰
French: 평균 50.5 토큰
Spanish: 평균 47.6 토큰
Arabic: 평균 48.3 토큰
