In [37]:
# 라이브러리 import
import numpy as np
import random
from datasets import load_dataset
import pandas as pd

# Step 1: load the "train" split of the Social IQa dataset via `datasets.load_dataset`.
dataset = load_dataset("social_i_qa", split="train")

# Print the number of loaded examples.
print(f"전체 데이터 개수: {len(dataset)}")

# Print the dataset object itself (the recorded output shows its features and row count).
print(dataset)



전체 데이터 개수: 33410
Dataset({
    features: ['context', 'question', 'answerA', 'answerB', 'answerC', 'label'],
    num_rows: 33410
})


## 데이터 생성을 위한 분포 설정

In [38]:
# Possible number of questions per generated sample: 1 through 10.
# Each size n gets weight exp(0.15 * n), so larger sizes are drawn more often.
sentence_counts = np.arange(1, 11)
sentence_probs = np.exp(sentence_counts * 0.15)
sentence_probs = sentence_probs / sentence_probs.sum()  # normalise so the weights sum to 1

# Candidate shares of correctly answered questions (0%, 25%, 50%, 75%, 100%).
correct_ratios = [quarter / 4 for quarter in range(5)]

## 데이터 생성 함수 정의

In [48]:
def generate_social_iqa_sample(dataset, sentence_counts, sentence_probs, correct_ratios):
    """Build one synthetic sample of Social IQa questions with simulated user answers.

    A bundle size is drawn from ``sentence_counts`` (weighted by
    ``sentence_probs``) and a target share of correct answers is drawn
    uniformly from ``correct_ratios``. That many distinct rows are sampled
    from ``dataset``; the first ``round(size * ratio)`` of them are answered
    correctly, the rest are answered with one of the other two options, and
    the resulting question/answer records are shuffled.

    Args:
        dataset: Collection supporting ``len()`` and integer indexing whose
            rows are mappings with the keys ``context``, ``question``,
            ``answerA``, ``answerB``, ``answerC`` and ``label``. ``label``
            must convert via ``int()`` to 1, 2 or 3 (A, B, C respectively).
        sentence_counts: Candidate bundle sizes.
        sentence_probs: Probability of each entry in ``sentence_counts``
            (passed to ``np.random.choice`` as ``p``).
        correct_ratios: Candidate target shares of correct answers.

    Returns:
        dict with
        ``num_sentences`` (plain ``int``), ``correct_ratio`` (the drawn
        target; the realised share can differ because of rounding) and
        ``qa_pairs`` (list of dicts with the keys ``story``, ``question``,
        ``user_selected`` and ``correct_answer``).

    Raises:
        ValueError: From ``random.sample`` when the drawn bundle size exceeds
            ``len(dataset)``.
    """
    # np.random.choice returns a NumPy scalar; convert it to a built-in int so
    # the returned record contains only plain Python types (e.g. for json.dumps).
    num_sentences = int(np.random.choice(sentence_counts, p=sentence_probs))
    correct_ratio = random.choice(correct_ratios)

    # round() rounds exact halves to the nearest even integer (0.5 -> 0,
    # 1.5 -> 2, 2.5 -> 2), so the realised share of correct answers is only
    # approximately correct_ratio for small bundle sizes.
    num_correct = int(round(num_sentences * correct_ratio))

    # Distinct row indices; the first num_correct positions become correct answers.
    sampled_indices = random.sample(range(len(dataset)), num_sentences)

    label_to_answer = {1: 'answerA', 2: 'answerB', 3: 'answerC'}
    qa_pairs = []

    for position, row_index in enumerate(sampled_indices):
        sample = dataset[row_index]
        correct_key = label_to_answer[int(sample['label'])]
        correct_answer = sample[correct_key]

        if position < num_correct:
            user_selected = correct_answer
        else:
            other_keys = [key for key in label_to_answer.values() if key != correct_key]
            # Prefer an option whose text really differs from the correct
            # answer; otherwise a "wrong" pick with duplicate text would be
            # indistinguishable from a correct one when comparing strings.
            distinct_keys = [key for key in other_keys if sample[key] != correct_answer]
            user_selected = sample[random.choice(distinct_keys or other_keys)]

        qa_pairs.append({
            "story": sample["context"],
            "question": sample["question"],
            "user_selected": user_selected,
            "correct_answer": correct_answer
        })

    # Shuffle so correct and wrong answers are not grouped by position.
    random.shuffle(qa_pairs)

    return {
        "num_sentences": num_sentences,
        "correct_ratio": correct_ratio,
        "qa_pairs": qa_pairs
    }

## 데이터 생성 예시 및 결과 확인

In [49]:
num_samples_to_generate = 10000  # number of synthetic samples to generate

# generate_social_iqa_sample draws from both the stdlib `random` module and
# NumPy's global RNG; seed both here so re-running this cell reproduces the
# same samples.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Generated samples, one dict per call.
generated_samples = []

for _ in range(num_samples_to_generate):
    sample = generate_social_iqa_sample(
        dataset, sentence_counts, sentence_probs, correct_ratios
    )
    generated_samples.append(sample)

# Collect the samples into a DataFrame (one row per sample) for analysis.
df_samples = pd.DataFrame(generated_samples)

In [52]:
def overall_correctness_ratio(df_samples):
    """Return the share of all question/answer pairs that were answered correctly.

    A pair counts as correct when its ``user_selected`` value equals its
    ``correct_answer`` value. Pairs are pooled across every row of the
    ``qa_pairs`` column before dividing.
    """
    answered_right = 0
    answered_total = 0
    for pairs in df_samples['qa_pairs']:
        answered_total += len(pairs)
        answered_right += sum(
            1 for pair in pairs if pair["user_selected"] == pair["correct_answer"]
        )
    return answered_right / answered_total


def sentence_count_ratio(df_samples):
    """Return the fraction of samples for each ``num_sentences`` value, ordered by that value."""
    size_column = df_samples['num_sentences']
    size_shares = size_column.value_counts(normalize=True)
    return size_shares.sort_index()

def correctness_ratio_by_sentence_count(df_samples):
    """Return the share of correctly answered pairs for each ``num_sentences`` value.

    For every distinct ``num_sentences`` (in ascending order), the pairs of all
    rows with that value are pooled; a pair is correct when ``user_selected``
    equals ``correct_answer``. The result is a Series keyed by ``num_sentences``.
    """
    size_column = df_samples['num_sentences']
    ratio_by_size = {}
    for size in sorted(size_column.unique()):
        answered_right = 0
        answered_total = 0
        for pairs in df_samples.loc[size_column == size, 'qa_pairs']:
            answered_total += len(pairs)
            answered_right += sum(
                1 for pair in pairs if pair["user_selected"] == pair["correct_answer"]
            )
        ratio_by_size[size] = answered_right / answered_total
    return pd.Series(ratio_by_size)


def sample_count_by_correct_ratio(df_samples):
    """Return how many samples have each ``correct_ratio`` value, ordered by that value."""
    ratio_counts = df_samples['correct_ratio'].value_counts()
    return ratio_counts.sort_index()

# Run the analysis: overall share of correct answers, then the distribution of sample sizes.
print("📌 전체 정오답 비율:")
overall_ratio = overall_correctness_ratio(df_samples)
print(f"{overall_ratio:.2%}\n")

print("📌 데이터 갯수별 비율:")
size_shares = sentence_count_ratio(df_samples)
print(size_shares, "\n")


📌 전체 정오답 비율:
49.48%

📌 데이터 갯수별 비율:
num_sentences
1     0.0456
2     0.0525
3     0.0603
4     0.0743
5     0.0819
6     0.1008
7     0.1135
8     0.1390
9     0.1547
10    0.1774
Name: proportion, dtype: float64 



In [None]:
# Share of correct answers per sample size.
print("📌 데이터 갯수별 정오답 비율:")
ratio_by_size = correctness_ratio_by_sentence_count(df_samples)
print(ratio_by_size, "\n")

# Number of samples per drawn correct_ratio value.
print("📌 정오답 비율별 데이터 갯수:")
count_by_ratio = sample_count_by_correct_ratio(df_samples)
print(count_by_ratio)

📌 데이터 갯수별 정오답 비율:
1     0.400000
2     0.512846
3     0.526646
4     0.495370
5     0.485375
6     0.503860
7     0.502127
8     0.510324
9     0.485652
10    0.503034
dtype: float64 

📌 정오답 비율별 데이터 갯수:
correct_ratio
0.00    1993
0.25    2016
0.50    1977
0.75    2012
1.00    2002
Name: count, dtype: int64


In [51]:
# Show the question/answer records of the sample with index label 1.
print(df_samples.loc[1, "qa_pairs"])

[{'story': 'Believing in their lucky powers, Riley collected a ton of clovers with four leaves.', 'question': 'How would you describe Riley?', 'user_selected': 'lazy', 'correct_answer': 'energetic'}, {'story': "Kai was scared to sleep in the dark on their own, so Taylor told Kai's parents what was going on.", 'question': 'How would Taylor feel afterwards?', 'user_selected': 'like a person who was helpful', 'correct_answer': 'like a person who was helpful'}, {'story': 'Sasha started acting just like Kendall after realizing Kendall was getting asked out a lot.', 'question': 'What will happen to Kendall?', 'user_selected': 'help Sasha find a date', 'correct_answer': 'help Sasha find a date'}, {'story': 'Jesse learnt to play a musical instrument after realizing their favorite musician played the same thing.', 'question': 'Why did the musician do this?', 'user_selected': 'profit from their talent', 'correct_answer': 'profit from their talent'}, {'story': 'Skylar slapped Quinn in the face af