In [1]:
# ===== 셀 0: 경로/타깃/컬럼 설정 =====
from google.colab import drive
drive.mount('/content/drive')

# 파일 경로
INPUT_CSV = '/content/drive/MyDrive/404DNF/data/merged.csv'
OUTPUT_DIR = '/content/drive/MyDrive/404DNF/outputs0923'

Mounted at /content/drive


In [2]:
import os, pandas as pd, numpy as np
os.makedirs(OUTPUT_DIR, exist_ok=True)
df = pd.read_csv(INPUT_CSV)
print(df['Type'].value_counts())

Type
Not Dark Pattern    1178
Scarcity             419
Social Proof         316
Urgency              216
Misdirection         196
Name: count, dtype: int64


In [3]:
print("중복된 String 값 개수:", df['String'].duplicated().sum())

중복된 String 값 개수: 0


# 증강 코드

In [1]:
# ===== 셀 0: 공통 설정/유틸 =====
from google.colab import drive
drive.mount('/content/drive')

import os, time, pandas as pd, numpy as np, re
import psutil, platform

# 경로
INPUT_CSV = '/content/drive/MyDrive/404DNF/data/merged.csv'
OUTPUT_DIR = '/content/drive/MyDrive/404DNF/outputs0923'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 시드/컬럼
SEED = 42
rng = np.random.default_rng(SEED)
TEXT_COL  = 'String'
TYPE_COL  = 'Type'
BIN_COL   = 'label'

# 타깃
TARGETS = {
    'Not Dark Pattern': 1600,   # Over
    'Scarcity': 400,            # Under
    'Social Proof': 400,        # Over
    'Urgency': 400,             # Over
    'Misdirection': 400,        # Over,
}
labels_under = ['Scarcity']
labels_over  = ['Not Dark Pattern','Social Proof','Urgency','Misdirection']

def tic(msg=""):
    print(f"\n▶ {msg} ...", end=" ");
    return time.time()
def toc(t0):
    print(f"done in {time.time()-t0:.2f}s")

print("Python", platform.python_version(), "| pandas", pd.__version__, "| numpy", np.__version__)
print("OUTPUT_DIR:", OUTPUT_DIR)

Mounted at /content/drive
Python 3.12.11 | pandas 2.2.2 | numpy 2.0.2
OUTPUT_DIR: /content/drive/MyDrive/404DNF/outputs0923


In [2]:
# ===== 셀 1: 데이터 로드 & 필터 =====
t0 = tic("Read CSV")
df = pd.read_csv(INPUT_CSV)
toc(t0)

# 기본 검증
assert TEXT_COL in df.columns and TYPE_COL in df.columns, "필수 컬럼 누락"
df = df[df[TYPE_COL].isin(TARGETS.keys())].copy()

print("\n원본 분포:\n", df[TYPE_COL].value_counts().sort_index())
print("중복된 String 값 개수:", df[TEXT_COL].duplicated().sum())
print("NaN 체크:", df[TEXT_COL].isna().sum(), "texts NaN")

# 잠재 이슈 경고(희귀)
bad = df[df[TEXT_COL].astype(str).str.len()<3]
if len(bad):
    print(f"경고: 3자 미만 문장 {len(bad)}개 있음(MLM/패러프레이즈에 부적합)")


▶ Read CSV ... done in 0.21s

원본 분포:
 Type
Misdirection         196
Not Dark Pattern    1178
Scarcity             419
Social Proof         316
Urgency              216
Name: count, dtype: int64
중복된 String 값 개수: 0
NaN 체크: 0 texts NaN
경고: 3자 미만 문장 4개 있음(MLM/패러프레이즈에 부적합)


In [3]:
# ===== 셀 2: 언더샘플 베이스 =====
def undersample_group(g: pd.DataFrame, k: int) -> pd.DataFrame:
    return g.sample(n=min(len(g), k), random_state=SEED)

t0 = tic("Build undersample base")
parts = []
for lab in labels_under:
    sub = df[df[TYPE_COL]==lab]
    parts.append(undersample_group(sub, TARGETS[lab]))
for lab in labels_over:
    parts.append(df[df[TYPE_COL]==lab])   # 오버 대상은 원본 그대로
df_under_base = pd.concat(parts, ignore_index=True)
toc(t0)

print("\n언더샘플 베이스 분포:\n", df_under_base[TYPE_COL].value_counts().sort_index())

# (옵션) 베이스 저장
base_out = df_under_base.copy()
base_out[BIN_COL] = (base_out[TYPE_COL] != "Not Dark Pattern").astype(int)
base_path = os.path.join(OUTPUT_DIR, 'balanced_base_before_over.csv')
base_out[[TEXT_COL, TYPE_COL, BIN_COL]].to_csv(base_path, index=False, encoding='utf-8')
print("베이스 저장:", base_path, "| rows:", len(base_out))


▶ Build undersample base ... done in 0.01s

언더샘플 베이스 분포:
 Type
Misdirection         196
Not Dark Pattern    1178
Scarcity             400
Social Proof         316
Urgency              216
Name: count, dtype: int64
베이스 저장: /content/drive/MyDrive/404DNF/outputs0923/balanced_base_before_over.csv | rows: 2306


In [4]:
# ===== 셀 3: 유틸 (need 계산/최종 합치기) =====
def get_need_counts(df_base: pd.DataFrame, labels_for_over: list, targets: dict) -> dict:
    need = {}
    for lab in labels_for_over:
        cur = int((df_base[TYPE_COL]==lab).sum())
        need[lab] = max(0, targets[lab]-cur)
    return need

def finalize_with_aug(df_base: pd.DataFrame, aug_df: pd.DataFrame, targets: dict) -> pd.DataFrame:
    out = pd.concat([df_base, aug_df], ignore_index=True) if (aug_df is not None and len(aug_df)) else df_base.copy()
    # 상한 컷
    fixed = []
    for lab, t in targets.items():
        sub = out[out[TYPE_COL]==lab]
        if len(sub) > t:
            sub = sub.sample(n=t, random_state=SEED)
        fixed.append(sub)
    out2 = pd.concat(fixed, ignore_index=True)

    # BIN 라벨 보정
    out2.loc[out2[TYPE_COL]=="Not Dark Pattern", BIN_COL] = 0
    out2.loc[out2[TYPE_COL]!="Not Dark Pattern", BIN_COL] = 1
    out2[BIN_COL] = out2[BIN_COL].astype(int)

    # 셔플
    out2 = out2.sample(frac=1.0, random_state=SEED).reset_index(drop=True)
    return out2[[TEXT_COL, TYPE_COL, BIN_COL]]

# need 확인
need_counts = get_need_counts(df_under_base, labels_over, TARGETS)
print("오버샘플 필요량:", need_counts, "| 총합:", sum(need_counts.values()))

오버샘플 필요량: {'Not Dark Pattern': 422, 'Social Proof': 84, 'Urgency': 184, 'Misdirection': 204} | 총합: 894


In [10]:
# ===== 셀 4: 템플릿 증강 (확장, 진행바/로그 추가) =====
from tqdm.auto import tqdm
import time, re

# tic/toc 유틸
def tic(msg=""):
    t = time.time()
    if msg:
        print(f"▶ {msg} ...")
    return t

def toc(t):
    print(f"done in {time.time()-t:.2f}s")

# ---- 슬롯 정의 (기호 최소화 버전) ----
time_slots = [
    "5 minutes","10 minutes","15 minutes","20 minutes","30 minutes","45 minutes",
    "1 hour","2 hours","3 hours","6 hours","12 hours","24 hours",
    "today","tonight","tomorrow","this weekend","this week only","by midnight","next Monday"
]
deadline_slots = [
    "today","tonight","this evening","by midnight",
    "before it is gone","this week only","next week","end of month"
]
stock_slots = [str(x) for x in range(2,101)]
discount_slots = [str(x) for x in range(5,81,5)]
recent_slots = [
    "just now","in the last 10 minutes","in the last hour","in the past 3 hours",
    "today","this morning","this evening","this week","over the weekend"
]
fee_slots = [
    "service fee","processing fee","handling fee","membership fee",
    "activation fee","administration fee","transaction fee","booking fee"
]
bonus_slots = [
    "free gift","bonus points","extra coupon","exclusive access","welcome credit",
    "trial upgrade","cashback reward","discount voucher","loyalty bonus","VIP access"
]
channel_slots = [
    "homepage","checkout","banner","pop up","email link","mobile app","push notification",
    "landing page","product page"
]
cta_slots = [
    "Buy now","Get deal","Claim offer","Continue","Next","Apply coupon","Checkout",
    "Sign up","Join now","Reserve","Subscribe"
]

# 보조 슬롯
product_slots = [
    "sneakers","wireless earbuds","smartwatch","coffee maker","gaming mouse","standing desk",
    "luggage set","photo editor","cloud storage","VPN plan","meal kit","online course",
    "premium theme","fitness app","note taking app","AI credits","gift card","hotel deal",
    "flight ticket","web hosting","domain name","camera lens","portable SSD","power bank"
]
plan_slots = ["Basic","Standard","Plus","Pro","Premium","Family","Business","Enterprise","Annual","Monthly"]
addon_slots = ["priority support","extended warranty","ad free experience","cloud backup","data export","extra storage","gift wrap"]
badge_slots = ["Bestseller","Popular","Editors Pick","Trending","Top Choice","Hot","Limited"]
region_slots = ["in your area","near you","in your city","in your region","in Korea","in Asia"]
stars_slots = ["5 stars","4 stars","3 stars","4.5 stars","5 stars editor rated"]
price_slots = ["USD 9.99","USD 14.99","USD 19.99","USD 29.99","USD 49.99","USD 99.00","KRW 9900","KRW 19900","KRW 29900","KRW 49000"]
reason_slots = [
    "best value for money","most flexible option","easiest to set up",
    "recommended by experts","top rated for reliability","our most popular choice"
]

def S(x): return rng.choice(x)
def R(x): return rng.choice(x)

# ---- 템플릿 정의 (기호 최소화) ----
URGENCY_TEMPLATES = [
    "Only {time} left to claim your {discount} percent discount",
    "Hurry just {stock} items remaining Do not miss out",
    "Offer ends {deadline} Secure your deal now",
    "Final hours save {discount} percent if you order within {time}",
    "Act fast Only {stock} left in stock",
    "Limited time deal {discount} percent off ends {deadline}",
    "Countdown alert {time} remaining to save {discount} percent",
    "Sale ends {deadline} This is your last chance",
    "Flash sale {discount} percent off for the next {time}",
    "Going fast {stock} units left and counting",
    "Hurry Price increases after {deadline}",
    "This price will not last {discount} percent off through {deadline}",
    "Last chance apply coupon at {channel} before {deadline}",
    "Only {time} to lock in this offer {cta}",
    "Deal expires {deadline} remaining stock {stock}",
    "Book within {time} to receive an extra {discount} percent off",
    "Ending soon cart will reset after {time}",
    "Save {discount} percent if you complete checkout by {deadline}",
    "Limited window {time} left to qualify for free shipping",
    "Offer valid today only checkout within {time}",
    "Selling out quickly {stock} items left as of {recent}",
    "{discount} percent off ends {deadline} Do not miss it",
    "Move fast {time} left to secure promotional pricing",
    "Only {stock} remaining across all sizes",
    "Closes {deadline} {cta} to secure your discount",
    "Early bird pricing ends {deadline}",
    "Reserve your {product} now deal vanishes {deadline}",
    "Cart hold expires in {time} complete checkout to keep your {product}",
    "Limited drop next batch ships {deadline}",
    "Intro price ends {deadline} renewals at regular rate",
    "Preorder window closes {deadline} secure your spot",
    "Only {stock} codes left for {discount} percent off",
    "Last seats at this fare until {deadline}",
    "Price jumps after {deadline} lock {price} now",
    "This {product} is time sensitive {cta} before {deadline}",
    "Offer valid {recent} will auto expire {deadline}",
    "Countdown sale {discount} percent for {time} on {product}",
    "Redeem within {time} to keep the promotional bonus",
    "Deal refreshes {deadline} current perks disappear",
    "Limited sign up window enroll by {deadline}",
]

MISDIRECTION_TEMPLATES = [
    "Continue to claim your {bonus} auto applies at checkout",
    "Your {bonus} is ready press Continue to proceed",
    "Best value pre selected for you change anytime",
    "You are almost done Click Next to get started",
    "Recommended option selected you can modify later",
    "Processing will be faster with Express Checkout selected",
    "Your discount is applied Additional {fee} may appear at checkout",
    "Almost there confirm details to receive {bonus}",
    "Default plan set to {plan} for a better experience",
    "{bonus} unlocked selection is already chosen for you",
    "We have applied the optimal settings automatically",
    "Continue to keep your {bonus} going back may remove it",
    "The fastest route is pre checked for your convenience",
    "By proceeding you will keep the current promotional rate",
    "Best seller highlighted already added to your bundle",
    "Optional add on is selected to maximize your savings",
    "Express option chosen you can switch in settings later",
    "Savings applied at {channel} {cta} to finalize",
    "Agree and continue keeps your current benefits",
    "We have set recommended extras you can remove later",
    "Your trial auto renews press {cta} to keep access",
    "Keep {bonus} active by clicking {cta}",
    "Priority support is included pre selected",
    "Promo applied Taxes and {fee} may be shown at checkout",
    "To retain discounts proceed with the recommended bundle",
    "Smart picks enabled some options are selected for you",
    "Add insurance is selected by default recommended",
    "Continue to confirm your {plan} plan you can downgrade later",
    "Auto renew is enabled to avoid interruption toggle anytime in settings",
    "Unchecking extras may reduce your benefits",
    "Best experience {addon} is enabled for you",
    "By clicking {cta} you accept promotional communications",
    "It is faster with saved card preselected at checkout",
    "Your cart includes {addon} free to remove later",
    "The {plan} plan is our most popular pre chosen to save you time",
    "We will keep your preferences cookies required if you hit {cta}",
    "Free trial starts now billing begins automatically after {time}",
    "Cancel option is under Account Billing Manage Cancel",
    "No I do not want savings secondary option will remove benefits",
    "To keep {bonus} proceed with the recommended {plan} plan",
    "Advanced settings are available after purchase",
    "Decline protection small text or {cta} to continue",
    "Your {product} includes {addon} unless removed",
    "Apply best settings automatically recommended",
    "Pre checked newsletter keeps you updated opt out anytime",
    "We have selected the safest defaults adjust later",
    "Continue maintains your perks Back may reset choices",
    "To access the discount agree to terms on the next page",
    "Add {addon} for the full experience pre selected",
    "To finish sign up accept optional terms pre ticked",
    "Your {plan} discount applies if you proceed now",
    "Skip step appears after you click {cta}",
    "Cancellation path continues after several steps",
    "Important info in More details collapsed",
    "Select View offers to keep current {plan}",
    "Optional {addon} is included remove to proceed without it",
    "Continue with {plan} to keep {reason}",
]

SOCIAL_PROOF_TEMPLATES = [
    "Join over {stock} people who already purchased this",
    "Popular choice {stock} users selected this today",
    "Trending now {stock} customers are viewing this item",
    "Most picked in the last {time}",
    "Top rated thousands trust this option",
    "Currently {stock} people have this in their carts",
    "Hot right now purchased {recent}",
    "Bestseller in its category this week",
    "People like you are buying this do not miss out",
    "Frequently bought together by {stock} plus customers",
    "Loved by our community see why {stock} plus chose it",
    "Customer favorite add to cart before it is gone",
    "{stock} others checked out this option {recent}",
    "Top pick for value according to recent buyers",
    "Popular badge earned from {stock} plus purchases",
    "Often recommended customers keep coming back",
    "High demand {stock} views in {time}",
    "Selected by {stock} users in the last {time}",
    "Most added to cart this {recent}",
    "Currently trending on {channel}",
    "Ranked number one by shoppers this week",
    "Thousands switched to this make it yours",
    "Rated {stars} by {stock} plus customers",
    "Trusted by teams using the {plan} plan",
    "Friends near you bought this {recent}",
    "Seen on {channel} {stock} clicks today",
    "{stock} plus verified purchases {recent}",
    "Popular among {plan} users seeking {reason}",
    "Chosen by creators for {reason}",
    "Join {stock} plus subscribers enjoying {addon}",
    "Real time {stock} viewing this {product} now",
    "Your neighbors {region} just ordered this",
    "Most wished for on {channel} this week",
    "Endorsed as {badge} for {reason}",
    "Top {product} pick among students {region}",
    "This {product} is trending with {plan} members",
]

NDP_TEMPLATES = [
    "This page contains no special offers or time limits",
    "Browse freely without urgency or pre selected options",
    "No countdowns endorsements or nudges are presented",
    "Standard pricing applies with no extra {fee}",
    "No auto selection settings remain unchanged unless you choose",
    "No popular badge or social indicators are shown",
    "No default add ons you can manually opt in if desired",
    "No limited time messaging is displayed",
    "No recommended for you prompts appear here",
    "No automatic upsell only the base option is shown",
    "No scarcity claims or stock warnings on this page",
    "Neutral presentation you decide at your own pace",
    "No bundled extras each choice is explicit",
    "No urgency language offers are not time bound",
    "No social proof elements are surfaced",
    "All options are off by default enable only what you need",
    "You can review features without any pre applied discounts",
    "There are no time sensitive banners or pop ups",
    "No items are hidden or pre checked during checkout",
    "The {plan} plan is not preselected choose manually",
    "No cross sell cards only core information is shown",
    "No badges or ranking labels are applied to products",
    "No inventory messaging or popularity counters are displayed",
    "No suggested bundles each item is listed individually",
    "No email capture gates browsing is open",
    "Pricing is flat and does not change at checkout",
    "No countdown timers or expiring coupons",
    "No default newsletter opt ins subscriptions are optional",
    "No location based recommendations are shown",
    "No hover driven prompts interactions are explicit clicks",
    "No sticky bars page remains uncluttered",
    "Checkout steps are linear with clear labels",
    "No extra fees beyond taxes {fee} is not applied",
    "No third party endorsements or influencer quotes",
    "No behavioral nudges decisions are left to the user",
]

# 유니크 확보 세팅
POOL_MULTIPLIER = 10
MIN_EXTRA = 200

# (선택) 기호 제거기: 영어 대문자/소문자/숫자/공백만 남김
SANITIZE = True
def clean_text(t: str) -> str:
    t = re.sub(r"\s+", " ", t).strip()
    if SANITIZE:
        t = re.sub(r"[^A-Za-z0-9 ]+", " ", t)
        t = re.sub(r"\s+", " ", t).strip()
    return t

from tqdm.auto import tqdm

def generate_by_templates(label: str, n: int, verbose=True) -> pd.DataFrame:
    if n <= 0:
        return pd.DataFrame(columns=[TEXT_COL, TYPE_COL, BIN_COL])
    base = (URGENCY_TEMPLATES if label=='Urgency' else
            MISDIRECTION_TEMPLATES if label=='Misdirection' else
            SOCIAL_PROOF_TEMPLATES if label=='Social Proof' else
            NDP_TEMPLATES)

    pool_size = max(n*POOL_MULTIPLIER, n+MIN_EXTRA)
    if verbose:
        print(f"  - {label}: need {n}, pool {pool_size}")

    candidates = []
    for _ in tqdm(range(pool_size), desc=f"{label} pool", leave=False):
        t = (rng.choice(base).format(
            time=S(time_slots), deadline=S(deadline_slots),
            stock=S(stock_slots), recent=S(recent_slots),
            discount=S(discount_slots), fee=S(fee_slots),
            bonus=S(bonus_slots), channel=S(channel_slots),
            cta=S(cta_slots), product=R(product_slots),
            plan=R(plan_slots), addon=R(addon_slots),
            badge=R(badge_slots), region=R(region_slots),
            stars=R(stars_slots), price=R(price_slots),
            reason=R(reason_slots)
        ))
        candidates.append(clean_text(t))

    uniq = list(dict.fromkeys(candidates))

    if len(uniq) < n:
        for _ in tqdm(range(n-len(uniq)), desc=f"{label} top-up", leave=False):
            t = (rng.choice(base).format(
                time=S(time_slots), deadline=S(deadline_slots),
                stock=S(stock_slots), recent=S(recent_slots),
                discount=S(discount_slots), fee=S(fee_slots),
                bonus=S(bonus_slots), channel=S(channel_slots),
                cta=S(cta_slots), product=R(product_slots),
                plan=R(plan_slots), addon=R(addon_slots),
                badge=R(badge_slots), region=R(region_slots),
                stars=R(stars_slots), price=R(price_slots),
                reason=R(reason_slots)
            ))
            t = clean_text(t)
            if t not in uniq:
                uniq.append(t)

    rows = [{TEXT_COL: s, TYPE_COL: label, BIN_COL: (0 if label=="Not Dark Pattern" else 1)}
            for s in uniq[:n]]
    return pd.DataFrame(rows, columns=[TEXT_COL, TYPE_COL, BIN_COL])

# 실행
t0 = tic("Template augment")
need_counts = get_need_counts(df_under_base, labels_over, TARGETS)
print("need_counts:", need_counts)

aug_list = []
for lab, need in need_counts.items():
    if need <= 0:
        continue
    t_lab = time.time()
    print(f"[{lab}] start")
    df_tmp = generate_by_templates(lab, need, verbose=True)
    print(f"[{lab}] done in {time.time()-t_lab:.2f}s | rows={len(df_tmp)}")
    aug_list.append(df_tmp)

df_aug_template = pd.concat(aug_list, ignore_index=True) if aug_list else pd.DataFrame(columns=[TEXT_COL, TYPE_COL, BIN_COL])
print("df_aug_template shape:", df_aug_template.shape)

df_template_final = finalize_with_aug(df_under_base, df_aug_template, TARGETS)
toc(t0)

print("\n템플릿 균형 분포:\n", df_template_final[TYPE_COL].value_counts().sort_index())
print("샘플 3개:\n", df_template_final.head(3))

▶ Template augment ...
need_counts: {'Not Dark Pattern': 422, 'Social Proof': 84, 'Urgency': 184, 'Misdirection': 204}
[Not Dark Pattern] start
  - Not Dark Pattern: need 422, pool 4220


Not Dark Pattern pool:   0%|          | 0/4220 [00:00<?, ?it/s]

Not Dark Pattern top-up:   0%|          | 0/364 [00:00<?, ?it/s]

[Not Dark Pattern] done in 1.36s | rows=58
[Social Proof] start
  - Social Proof: need 84, pool 840


Social Proof pool:   0%|          | 0/840 [00:00<?, ?it/s]

[Social Proof] done in 0.26s | rows=84
[Urgency] start
  - Urgency: need 184, pool 1840


Urgency pool:   0%|          | 0/1840 [00:00<?, ?it/s]

[Urgency] done in 0.56s | rows=184
[Misdirection] start
  - Misdirection: need 204, pool 2040


Misdirection pool:   0%|          | 0/2040 [00:00<?, ?it/s]

[Misdirection] done in 0.62s | rows=204
df_aug_template shape: (530, 3)
done in 2.82s

템플릿 균형 분포:
 Type
Misdirection         400
Not Dark Pattern    1236
Scarcity             400
Social Proof         400
Urgency              400
Name: count, dtype: int64
샘플 3개:
                                               String              Type  label
0  Installation InstructionsPolicies & Terms of U...  Not Dark Pattern      0
1                                   LIMITED QUANTITY          Scarcity      1
2                              Only 11 Left In Stock          Scarcity      1


In [11]:
# ===== 셀 4: 템플릿 증강 (확장, 진행바/로그 추가, NDP 전용 슬롯 처리) =====
from tqdm.auto import tqdm
import time, re

# tic/toc 유틸
def tic(msg=""):
    t = time.time()
    if msg:
        print(f"▶ {msg} ...")
    return t

def toc(t):
    print(f"done in {time.time()-t:.2f}s")

# ---- 슬롯 정의 (기호 최소화 버전) ----
time_slots = ["5 minutes","10 minutes","15 minutes","20 minutes","30 minutes","45 minutes",
              "1 hour","2 hours","3 hours","6 hours","12 hours","24 hours",
              "today","tonight","tomorrow","this weekend","this week only","by midnight","next Monday"]
deadline_slots = ["today","tonight","this evening","by midnight","before it is gone","this week only","next week","end of month"]
stock_slots = [str(x) for x in range(2,101)]
discount_slots = [str(x) for x in range(5,81,5)]
recent_slots = ["just now","in the last 10 minutes","in the last hour","in the past 3 hours",
                "today","this morning","this evening","this week","over the weekend"]
fee_slots = ["service fee","processing fee","handling fee","membership fee",
             "activation fee","administration fee","transaction fee","booking fee"]
bonus_slots = ["free gift","bonus points","extra coupon","exclusive access","welcome credit",
               "trial upgrade","cashback reward","discount voucher","loyalty bonus","VIP access"]
channel_slots = ["homepage","checkout","banner","pop up","email link","mobile app","push notification",
                 "landing page","product page"]
cta_slots = ["Buy now","Get deal","Claim offer","Continue","Next","Apply coupon","Checkout",
             "Sign up","Join now","Reserve","Subscribe"]

# 보조 슬롯
product_slots = ["sneakers","wireless earbuds","smartwatch","coffee maker","gaming mouse","standing desk",
                 "luggage set","photo editor","cloud storage","VPN plan","meal kit","online course",
                 "premium theme","fitness app","note taking app","AI credits","gift card","hotel deal",
                 "flight ticket","web hosting","domain name","camera lens","portable SSD","power bank"]
plan_slots = ["Basic","Standard","Plus","Pro","Premium","Family","Business","Enterprise","Annual","Monthly"]
addon_slots = ["priority support","extended warranty","ad free experience","cloud backup","data export","extra storage","gift wrap"]
badge_slots = ["Bestseller","Popular","Editors Pick","Trending","Top Choice","Hot","Limited"]
region_slots = ["in your area","near you","in your city","in your region","in Korea","in Asia"]
stars_slots = ["5 stars","4 stars","3 stars","4.5 stars","5 stars editor rated"]
price_slots = ["USD 9.99","USD 14.99","USD 19.99","USD 29.99","USD 49.99","USD 99.00","KRW 9900","KRW 19900","KRW 29900","KRW 49000"]
reason_slots = ["best value for money","most flexible option","easiest to set up",
                "recommended by experts","top rated for reliability","our most popular choice"]

# --- NDP 전용 슬롯 ---
ndp_page_slots = ["this page","this screen","this view","this section","this step","this form",
                  "the checkout page","the product page","the pricing page","the settings page",
                  "the details page","the information page"]
ndp_action_slots = ["browse","review","compare","proceed","continue","navigate","scroll",
                    "check out","inspect","view","select","read"]
ndp_info_slots = ["no timers","no auto selection","no default add ons","no badges",
                  "no popularity counters","no hidden fees","no promotional banners",
                  "no pre applied discounts","no inventory messages","no endorsements",
                  "no pop ups","no prompts"]
ndp_syn_slots = ["there are","there is","we show","we present","you will see","you can proceed with",
                 "the page presents","the page provides","the page displays","the interface shows"]

def S(x): return rng.choice(x)
def R(x): return rng.choice(x)

# ---- 템플릿 정의 (URGENCY, MISDIRECTION, SOCIAL_PROOF, NDP) ----
URGENCY_TEMPLATES = [
    "Only {time} left to claim your {discount} percent discount",
    "Hurry just {stock} items remaining Do not miss out",
    "Offer ends {deadline} Secure your deal now",
    "Final hours save {discount} percent if you order within {time}",
    "Act fast Only {stock} left in stock",
    "Limited time deal {discount} percent off ends {deadline}",
    "Countdown alert {time} remaining to save {discount} percent",
    "Sale ends {deadline} This is your last chance",
    "Flash sale {discount} percent off for the next {time}",
    "Going fast {stock} units left and counting",
    "Hurry Price increases after {deadline}",
    "This price will not last {discount} percent off through {deadline}",
    "Last chance apply coupon at {channel} before {deadline}",
    "Only {time} to lock in this offer {cta}",
    "Deal expires {deadline} remaining stock {stock}",
    "Book within {time} to receive an extra {discount} percent off",
    "Ending soon cart will reset after {time}",
    "Save {discount} percent if you complete checkout by {deadline}",
    "Limited window {time} left to qualify for free shipping",
    "Offer valid today only checkout within {time}",
    "Selling out quickly {stock} items left as of {recent}",
    "{discount} percent off ends {deadline} Do not miss it",
    "Move fast {time} left to secure promotional pricing",
    "Only {stock} remaining across all sizes",
    "Closes {deadline} {cta} to secure your discount",
    "Early bird pricing ends {deadline}",
    "Reserve your {product} now deal vanishes {deadline}",
    "Cart hold expires in {time} complete checkout to keep your {product}",
    "Limited drop next batch ships {deadline}",
    "Intro price ends {deadline} renewals at regular rate",
    "Preorder window closes {deadline} secure your spot",
    "Only {stock} codes left for {discount} percent off",
    "Last seats at this fare until {deadline}",
    "Price jumps after {deadline} lock {price} now",
    "This {product} is time sensitive {cta} before {deadline}",
    "Offer valid {recent} will auto expire {deadline}",
    "Countdown sale {discount} percent for {time} on {product}",
    "Redeem within {time} to keep the promotional bonus",
    "Deal refreshes {deadline} current perks disappear",
    "Limited sign up window enroll by {deadline}",
]

MISDIRECTION_TEMPLATES = [
    "Continue to claim your {bonus} auto applies at checkout",
    "Your {bonus} is ready press Continue to proceed",
    "Best value pre selected for you change anytime",
    "You are almost done Click Next to get started",
    "Recommended option selected you can modify later",
    "Processing will be faster with Express Checkout selected",
    "Your discount is applied Additional {fee} may appear at checkout",
    "Almost there confirm details to receive {bonus}",
    "Default plan set to {plan} for a better experience",
    "{bonus} unlocked selection is already chosen for you",
    "We have applied the optimal settings automatically",
    "Continue to keep your {bonus} going back may remove it",
    "The fastest route is pre checked for your convenience",
    "By proceeding you will keep the current promotional rate",
    "Best seller highlighted already added to your bundle",
    "Optional add on is selected to maximize your savings",
    "Express option chosen you can switch in settings later",
    "Savings applied at {channel} {cta} to finalize",
    "Agree and continue keeps your current benefits",
    "We have set recommended extras you can remove later",
    "Your trial auto renews press {cta} to keep access",
    "Keep {bonus} active by clicking {cta}",
    "Priority support is included pre selected",
    "Promo applied Taxes and {fee} may be shown at checkout",
    "To retain discounts proceed with the recommended bundle",
    "Smart picks enabled some options are selected for you",
    "Add insurance is selected by default recommended",
    "Continue to confirm your {plan} plan you can downgrade later",
    "Auto renew is enabled to avoid interruption toggle anytime in settings",
    "Unchecking extras may reduce your benefits",
    "Best experience {addon} is enabled for you",
    "By clicking {cta} you accept promotional communications",
    "It is faster with saved card preselected at checkout",
    "Your cart includes {addon} free to remove later",
    "The {plan} plan is our most popular pre chosen to save you time",
    "We will keep your preferences cookies required if you hit {cta}",
    "Free trial starts now billing begins automatically after {time}",
    "Cancel option is under Account Billing Manage Cancel",
    "No I do not want savings secondary option will remove benefits",
    "To keep {bonus} proceed with the recommended {plan} plan",
    "Advanced settings are available after purchase",
    "Decline protection small text or {cta} to continue",
    "Your {product} includes {addon} unless removed",
    "Apply best settings automatically recommended",
    "Pre checked newsletter keeps you updated opt out anytime",
    "We have selected the safest defaults adjust later",
    "Continue maintains your perks Back may reset choices",
    "To access the discount agree to terms on the next page",
    "Add {addon} for the full experience pre selected",
    "To finish sign up accept optional terms pre ticked",
    "Your {plan} discount applies if you proceed now",
    "Skip step appears after you click {cta}",
    "Cancellation path continues after several steps",
    "Important info in More details collapsed",
    "Select View offers to keep current {plan}",
    "Optional {addon} is included remove to proceed without it",
    "Continue with {plan} to keep {reason}",
]

SOCIAL_PROOF_TEMPLATES = [
    "Join over {stock} people who already purchased this",
    "Popular choice {stock} users selected this today",
    "Trending now {stock} customers are viewing this item",
    "Most picked in the last {time}",
    "Top rated thousands trust this option",
    "Currently {stock} people have this in their carts",
    "Hot right now purchased {recent}",
    "Bestseller in its category this week",
    "People like you are buying this do not miss out",
    "Frequently bought together by {stock} plus customers",
    "Loved by our community see why {stock} plus chose it",
    "Customer favorite add to cart before it is gone",
    "{stock} others checked out this option {recent}",
    "Top pick for value according to recent buyers",
    "Popular badge earned from {stock} plus purchases",
    "Often recommended customers keep coming back",
    "High demand {stock} views in {time}",
    "Selected by {stock} users in the last {time}",
    "Most added to cart this {recent}",
    "Currently trending on {channel}",
    "Ranked number one by shoppers this week",
    "Thousands switched to this make it yours",
    "Rated {stars} by {stock} plus customers",
    "Trusted by teams using the {plan} plan",
    "Friends near you bought this {recent}",
    "Seen on {channel} {stock} clicks today",
    "{stock} plus verified purchases {recent}",
    "Popular among {plan} users seeking {reason}",
    "Chosen by creators for {reason}",
    "Join {stock} plus subscribers enjoying {addon}",
    "Real time {stock} viewing this {product} now",
    "Your neighbors {region} just ordered this",
    "Most wished for on {channel} this week",
    "Endorsed as {badge} for {reason}",
    "Top {product} pick among students {region}",
    "This {product} is trending with {plan} members",
]

NDP_TEMPLATES = [
    "This page contains no special offers or time limits",
    "Browse freely without urgency or pre selected options",
    "No countdowns endorsements or nudges are presented",
    "Standard pricing applies with no extra {fee}",
    "No auto selection settings remain unchanged unless you choose",
    "No popular badge or social indicators are shown",
    "No default add ons you can manually opt in if desired",
    "No limited time messaging is displayed",
    "No recommended for you prompts appear here",
    "No automatic upsell only the base option is shown",
    "No scarcity claims or stock warnings on this page",
    "Neutral presentation you decide at your own pace",
    "No bundled extras each choice is explicit",
    "No urgency language offers are not time bound",
    "No social proof elements are surfaced",
    # 가변 문장들
    "{page} shows {info} so you can {act} at your own pace",
    "{syn} {info} and you may {act} without prompts",
    "{page} provides a neutral layout with {info}",
    "You can {act} here with {info} and no badges",
    "A simple layout is provided with {info}",
    "{page} avoids urgency messages and keeps {info}",
    "You can {act} without pre applied discounts or badges",
    "{syn} clear choices only and {info}",
    "No hidden steps and {info} on {page}",
    "All choices are explicit and {info} are avoided on {page}"
]
# === 유니크 확보 세팅 ===
POOL_MULTIPLIER = 10
MIN_EXTRA = 200
SANITIZE = True

def clean_text(t: str) -> str:
    t = re.sub(r"\s+", " ", t).strip()
    if SANITIZE:
        t = re.sub(r"[^A-Za-z0-9 ]+", " ", t)
        t = re.sub(r"\s+", " ", t).strip()
    return t

# --- 수정된 generate_by_templates ---
def generate_by_templates(label: str, n: int, verbose=True) -> pd.DataFrame:
    if n <= 0:
        return pd.DataFrame(columns=[TEXT_COL, TYPE_COL, BIN_COL])

    base = (URGENCY_TEMPLATES if label=='Urgency' else
            MISDIRECTION_TEMPLATES if label=='Misdirection' else
            SOCIAL_PROOF_TEMPLATES if label=='Social Proof' else
            NDP_TEMPLATES)

    # NDP는 더 크게 풀 확보
    mult = POOL_MULTIPLIER*3 if label=="Not Dark Pattern" else POOL_MULTIPLIER
    extra = MIN_EXTRA*3 if label=="Not Dark Pattern" else MIN_EXTRA
    pool_size = max(n*mult, n+extra)

    if verbose:
        print(f"  - {label}: need {n}, pool {pool_size}")

    candidates = []
    for _ in tqdm(range(pool_size), desc=f"{label} pool", leave=False):
        if label == "Not Dark Pattern":
            t = rng.choice(base).format(
                fee=S(fee_slots),
                page=S(ndp_page_slots),
                act=S(ndp_action_slots),
                info=S(ndp_info_slots),
                syn=S(ndp_syn_slots)
            )
        else:
            t = rng.choice(base).format(
                time=S(time_slots), deadline=S(deadline_slots),
                stock=S(stock_slots), recent=S(recent_slots),
                discount=S(discount_slots), fee=S(fee_slots),
                bonus=S(bonus_slots), channel=S(channel_slots),
                cta=S(cta_slots), product=R(product_slots),
                plan=R(plan_slots), addon=R(addon_slots),
                badge=R(badge_slots), region=R(region_slots),
                stars=R(stars_slots), price=R(price_slots),
                reason=R(reason_slots)
            )
        candidates.append(clean_text(t))

    uniq = list(dict.fromkeys(candidates))

    # 부족하면 보충
    if len(uniq) < n:
        for _ in tqdm(range(n-len(uniq)), desc=f"{label} top-up", leave=False):
            if label == "Not Dark Pattern":
                t = rng.choice(base).format(
                    fee=S(fee_slots),
                    page=S(ndp_page_slots),
                    act=S(ndp_action_slots),
                    info=S(ndp_info_slots),
                    syn=S(ndp_syn_slots)
                )
            else:
                t = rng.choice(base).format(
                    time=S(time_slots), deadline=S(deadline_slots),
                    stock=S(stock_slots), recent=S(recent_slots),
                    discount=S(discount_slots), fee=S(fee_slots),
                    bonus=S(bonus_slots), channel=S(channel_slots),
                    cta=S(cta_slots), product=R(product_slots),
                    plan=R(plan_slots), addon=R(addon_slots),
                    badge=R(badge_slots), region=R(region_slots),
                    stars=R(stars_slots), price=R(price_slots),
                    reason=R(reason_slots)
                )
            t = clean_text(t)
            if t not in uniq:
                uniq.append(t)

    rows = [{TEXT_COL: s, TYPE_COL: label, BIN_COL: (0 if label=="Not Dark Pattern" else 1)}
            for s in uniq[:n]]
    return pd.DataFrame(rows, columns=[TEXT_COL, TYPE_COL, BIN_COL])

# 실행
t0 = tic("Template augment")
need_counts = get_need_counts(df_under_base, labels_over, TARGETS)
print("need_counts:", need_counts)

aug_list = []
for lab, need in need_counts.items():
    if need <= 0:
        continue
    t_lab = time.time()
    print(f"[{lab}] start")
    df_tmp = generate_by_templates(lab, need, verbose=True)
    print(f"[{lab}] done in {time.time()-t_lab:.2f}s | rows={len(df_tmp)}")
    aug_list.append(df_tmp)

df_aug_template = pd.concat(aug_list, ignore_index=True) if aug_list else pd.DataFrame(columns=[TEXT_COL, TYPE_COL, BIN_COL])
print("df_aug_template shape:", df_aug_template.shape)

df_template_final = finalize_with_aug(df_under_base, df_aug_template, TARGETS)
toc(t0)

print("\n템플릿 균형 분포:\n", df_template_final[TYPE_COL].value_counts().sort_index())
print("샘플 3개:\n", df_template_final.head(3))

▶ Template augment ...
need_counts: {'Not Dark Pattern': 422, 'Social Proof': 84, 'Urgency': 184, 'Misdirection': 204}
[Not Dark Pattern] start
  - Not Dark Pattern: need 422, pool 12660


Not Dark Pattern pool:   0%|          | 0/12660 [00:00<?, ?it/s]

[Not Dark Pattern] done in 1.40s | rows=422
[Social Proof] start
  - Social Proof: need 84, pool 840


Social Proof pool:   0%|          | 0/840 [00:00<?, ?it/s]

[Social Proof] done in 0.26s | rows=84
[Urgency] start
  - Urgency: need 184, pool 1840


Urgency pool:   0%|          | 0/1840 [00:00<?, ?it/s]

[Urgency] done in 0.56s | rows=184
[Misdirection] start
  - Misdirection: need 204, pool 2040


Misdirection pool:   0%|          | 0/2040 [00:00<?, ?it/s]

[Misdirection] done in 0.62s | rows=204
df_aug_template shape: (894, 3)
done in 2.86s

템플릿 균형 분포:
 Type
Misdirection         400
Not Dark Pattern    1600
Scarcity             400
Social Proof         400
Urgency              400
Name: count, dtype: int64
샘플 3개:
                                   String          Type  label
0   Seen on product page 79 clicks today  Social Proof      1
1          Flash Sale ends in SHOP NOW         Urgency      1
2  17 people have viewed this wine today  Social Proof      1


In [13]:
# ===== 셀 5: 패러프레이즈 증강 (배치 + FP16) =====
!pip -q install transformers sentencepiece accelerate
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch

para_model_name = "tuner007/pegasus_paraphrase"
para_tok = AutoTokenizer.from_pretrained(para_model_name)
para_model = AutoModelForSeq2SeqLM.from_pretrained(
    para_model_name,
    torch_dtype=torch.float16
).to("cuda")

paraphraser = pipeline(
    "text2text-generation",
    model=para_model,
    tokenizer=para_tok,
    device=0
)

PARA_BATCH = 32

def do_paraphrase_batch(texts):
    outs = paraphraser(
        texts, max_length=64,
        num_return_sequences=1,
        do_sample=True, top_p=0.95, temperature=0.9
    )
    return [o['generated_text'].strip() for o in outs]

def build_paraphrase_aug(df_base: pd.DataFrame, label: str, n: int) -> pd.DataFrame:
    if n <= 0: return pd.DataFrame(columns=[TEXT_COL, TYPE_COL, BIN_COL])
    src = df_base[df_base[TYPE_COL]==label][TEXT_COL].dropna().tolist()
    if not src: return pd.DataFrame(columns=[TEXT_COL, TYPE_COL, BIN_COL])
    picked = rng.choice(src, size=n, replace=True).tolist()
    out_texts = []
    for i in range(0, len(picked), PARA_BATCH):
        batch = picked[i:i+PARA_BATCH]
        try:
            out_texts.extend(do_paraphrase_batch(batch))
        except:
            out_texts.extend(batch)
    rows = [{TEXT_COL: t, TYPE_COL: label, BIN_COL: (0 if label=="Not Dark Pattern" else 1)} for t in out_texts[:n]]
    return pd.DataFrame(rows, columns=[TEXT_COL, TYPE_COL, BIN_COL])

t0 = tic("Paraphrase augment")
need_counts = get_need_counts(df_under_base, labels_over, TARGETS)
aug_list = [build_paraphrase_aug(df_under_base, lab, need) for lab, need in need_counts.items() if need>0]
df_aug_para = pd.concat(aug_list, ignore_index=True) if aug_list else pd.DataFrame(columns=[TEXT_COL, TYPE_COL, BIN_COL])
df_para_final = finalize_with_aug(df_under_base, df_aug_para, TARGETS)
toc(t0)

print("\n패러프레이즈 균형 분포:\n", df_para_final[TYPE_COL].value_counts().sort_index())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at tuner007/pegasus_paraphrase and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


▶ Paraphrase augment ...


Token indices sequence length is longer than the specified maximum sequence length for this model (129 > 60). Running this sequence through the model will result in indexing errors
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


done in 3.24s

패러프레이즈 균형 분포:
 Type
Misdirection         400
Not Dark Pattern    1600
Scarcity             400
Social Proof         400
Urgency              400
Name: count, dtype: int64


In [18]:
# ===== 셀 6: 컨텍스추얼(MLM) 증강 — n개 보장 버전 =====
from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline
import torch, time
from tqdm.auto import tqdm

def load_mlm_pipeline(model_name: str = "distilroberta-base"):
    print(f"[MLM] try: {model_name} on CUDA with bfloat16")
    tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    try:
        mdl = AutoModelForMaskedLM.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,   # A100 권장
            low_cpu_mem_usage=True
        ).to("cuda").eval()
        pipe = pipeline("fill-mask", model=mdl, tokenizer=tok, device=0)
        return pipe
    except Exception as e1:
        print(f"[MLM] bfloat16 init failed → {e1}\n[MLM] retry: float32 on CUDA")
        try:
            mdl = AutoModelForMaskedLM.from_pretrained(
                model_name,
                torch_dtype=torch.float32,
                low_cpu_mem_usage=True
            ).to("cuda").eval()
            pipe = pipeline("fill-mask", model=mdl, tokenizer=tok, device=0)
            return pipe
        except Exception as e2:
            print(f"[MLM] float32 CUDA init failed → {e2}\n[MLM] fallback: CPU")
            mdl = AutoModelForMaskedLM.from_pretrained(
                model_name,
                torch_dtype=torch.float32,
                low_cpu_mem_usage=True
            ).to("cpu").eval()
            pipe = pipeline("fill-mask", model=mdl, tokenizer=tok, device=-1)
            return pipe

# 1) 파이프라인 로드
mlm_model_name = "distilroberta-base"
fill_mask = load_mlm_pipeline(mlm_model_name)
MASK = fill_mask.tokenizer.mask_token

# 2) 하이퍼파라미터
MLM_BATCH = 32          # A100이면 32~64 가능
MAX_ROUNDS = 20         # n개 채울 때까지 라운드 반복 상한
VERBOSE = True

def ensure_mask(text: str) -> str:
    """항상 MASK가 들어가도록 보장.
    - 토큰이 3 미만이면 맨 끝에 MASK를 붙임
    - 충분히 길면 랜덤 위치 1곳을 MASK로 교체
    """
    toks = str(text).split()
    if len(toks) < 3:
        # 너무 짧은 문장: 끝에 MASK 추가
        return (str(text).strip() + " " + MASK).strip()
    idx = int(rng.integers(0, len(toks)))
    toks[idx] = MASK
    return " ".join(toks)

def fill_mask_batch(texts):
    outs = fill_mask(texts, top_k=1)
    # HF pipeline 출력 형식 보정
    results = []
    for o in outs:
        if isinstance(o, list) and o:
            results.append(o[0].get('token_str', '').strip())
        elif isinstance(o, dict):
            results.append(o.get('token_str', '').strip())
        else:
            results.append(None)
    return results

def build_mlm_aug(df_base: pd.DataFrame, label: str, n: int) -> pd.DataFrame:
    """라벨별로 정확히 n개 생성될 때까지 반복 수집."""
    if n <= 0:
        return pd.DataFrame(columns=[TEXT_COL, TYPE_COL, BIN_COL])

    src = df_base[df_base[TYPE_COL]==label][TEXT_COL].dropna().tolist()
    if not src:
        return pd.DataFrame(columns=[TEXT_COL, TYPE_COL, BIN_COL])

    collected = []
    rounds = 0
    if VERBOSE: print(f"  - [{label}] target {n}")

    while len(collected) < n and rounds < MAX_ROUNDS:
        rounds += 1
        need_now = n - len(collected)
        # 한 라운드에 약간 여유를 줘서 생성
        take = int(min(max(need_now * 1.2, MLM_BATCH), need_now + MLM_BATCH))
        picked = rng.choice(src, size=take, replace=True).tolist()

        masked_list = [ensure_mask(s) for s in picked]

        # 배치 처리
        batch_out = []
        for i in range(0, len(masked_list), MLM_BATCH):
            batch = masked_list[i:i+MLM_BATCH]
            try:
                toks = fill_mask_batch(batch)
                for s, tok in zip(batch, toks):
                    if (not tok) or (MASK not in s):
                        batch_out.append(s.replace("!", "."))
                    else:
                        batch_out.append(s.replace(MASK, tok, 1))
            except Exception as e:
                if VERBOSE: print(f"    [WARN][{label}] batch fail → {e}")
                batch_out.extend([s.replace("!", ".") for s in batch])

        collected.extend(batch_out)
        if VERBOSE: print(f"    round {rounds}: +{len(batch_out)} → {len(collected)}/{n}")

    # 잘라서 n개만 사용
    out_texts = collected[:n]
    rows = [{TEXT_COL: t, TYPE_COL: label, BIN_COL: (0 if label=="Not Dark Pattern" else 1)}
            for t in out_texts]
    return pd.DataFrame(rows, columns=[TEXT_COL, TYPE_COL, BIN_COL])

# 3) 실행
t0 = tic("Contextual augment")
need_counts = get_need_counts(df_under_base, labels_over, TARGETS)
print("need_counts:", need_counts)

aug_list = []
for lab, need in need_counts.items():
    if need <= 0:
        continue
    if VERBOSE: print(f"[{lab}] start")
    df_tmp = build_mlm_aug(df_under_base, lab, need)
    if VERBOSE: print(f"[{lab}] done | rows={len(df_tmp)}")
    aug_list.append(df_tmp)

df_aug_context = pd.concat(aug_list, ignore_index=True) if aug_list else pd.DataFrame(columns=[TEXT_COL, TYPE_COL, BIN_COL])
df_context_final = finalize_with_aug(df_under_base, df_aug_context, TARGETS)
toc(t0)

print("\n컨텍스추얼 균형 분포:\n", df_context_final[TYPE_COL].value_counts().sort_index())

[MLM] try: distilroberta-base on CUDA with bfloat16


Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceCla

[MLM] bfloat16 init failed → CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

[MLM] retry: float32 on CUDA
[MLM] float32 CUDA init failed → CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

[MLM] fallback: CPU
▶ Contextual augment ...
need_counts: {'Not Dark Pattern': 422, 'Social Proof': 84, 'Urgency': 184, 'Misdirection': 204}
[Not Dark Pattern] start
  - [Not Dark Pattern] target 422
    round 1: +454 → 454/422
[Not Dark Pattern] done | rows=422
[Social Proof] start
  - [Social Proof] target 84
    round 1: +100 → 100/84
[

In [19]:
# ===== 셀 7: 저장 =====
def save_single(df_final: pd.DataFrame, name_prefix: str):
    out_path = os.path.join(OUTPUT_DIR, f'{name_prefix}_full.csv')
    df_final.to_csv(out_path, index=False, encoding='utf-8')
    print(f"[{name_prefix}] 저장 완료 → {out_path}")
    print(df_final[TYPE_COL].value_counts().sort_index())

save_single(df_template_final,  "template")
save_single(df_para_final,      "paraphrase")
save_single(df_context_final,   "contextual")

[template] 저장 완료 → /content/drive/MyDrive/404DNF/outputs0923/template_full.csv
Type
Misdirection         400
Not Dark Pattern    1600
Scarcity             400
Social Proof         400
Urgency              400
Name: count, dtype: int64
[paraphrase] 저장 완료 → /content/drive/MyDrive/404DNF/outputs0923/paraphrase_full.csv
Type
Misdirection         400
Not Dark Pattern    1600
Scarcity             400
Social Proof         400
Urgency              400
Name: count, dtype: int64
[contextual] 저장 완료 → /content/drive/MyDrive/404DNF/outputs0923/contextual_full.csv
Type
Misdirection         400
Not Dark Pattern    1600
Scarcity             400
Social Proof         400
Urgency              400
Name: count, dtype: int64
