In [2]:
import os
# Set the WANDB_DISABLED environment variable before any training code runs.
# NOTE(review): presumably meant to stop the Hugging Face Trainer used later in
# this notebook from logging to Weights & Biases; newer transformers releases
# prefer `report_to="none"` on TrainingArguments — confirm for the pinned version.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Layout used to flatten one review row into a single block of text.
# Each {placeholder} is filled by keyword through str.format in
# _make_combined_text, which also strips the leading/trailing newlines.
template = """
Title: {title}
Description: {description}
Details: This is a {variety} wine from {winery}.
Origin: It is produced in {region_1}, {province}, {country}.
Specs: The wine is designated as {designation} and received {points} points reviewed by {taster_name}. Price is ${price}.
"""

def _remove_duplicates(df):
    """Drop rows whose ``description`` repeats an earlier row.

    The first occurrence of each description is kept; the input frame is
    left untouched and a filtered frame is returned.
    """
    is_repeat = df.duplicated(subset='description', keep='first')
    return df[~is_repeat]

def _remove_unnecessary_features(df):
    """Drop the unused ``taster_twitter_handle`` and ``region_2`` columns.

    Columns that are absent are ignored. When neither column exists the
    input frame itself is returned; otherwise a new frame without them.
    """
    present = [
        name
        for name in ('taster_twitter_handle', 'region_2')
        if name in df.columns
    ]
    if not present:
        return df
    return df.drop(columns=present)

def _clean_text(df):
    """Normalize whitespace in the text columns of ``df`` (modified in place).

    For every known text column that exists, runs of whitespace (spaces,
    tabs, newlines) are collapsed to a single space and leading/trailing
    whitespace is removed. Non-missing values are converted to ``str``.

    Missing values are preserved as missing: they are no longer turned into
    the literal strings ``"nan"`` / ``"None"``, so a later fill or a
    downstream consumer cannot mistake them for real text.

    Returns the same DataFrame object for chaining.
    """
    text_columns = ['description', 'designation', 'variety', 'country', 'province', 'region_1', 'winery', 'taster_name', 'title']

    for col in text_columns:
        if col not in df.columns:
            continue

        original = df[col]
        cleaned = original.astype(str).str.replace(r'\s+', ' ', regex=True).str.strip()
        # astype(str) stringifies missing markers; restore the original
        # missing values at those positions.
        df[col] = cleaned.where(original.notna(), original)

    return df

def _process_numeric_features(df, upper=1000):
    """Cap the ``price`` column at ``upper`` (modified in place).

    Only clipping is performed; scaling is intentionally left out (the
    original note says it is excluded for the BERT-style model). Frames
    without a ``price`` column are returned unchanged.
    """
    if 'price' not in df.columns:
        return df

    capped_price = df['price'].clip(upper=upper)
    df['price'] = capped_price
    return df

def _handle_missing_values(df):
    """Fill missing values column by column (modified in place).

    * ``price`` / ``points``: filled with the column median.
    * Categorical columns (country, designation, province, region_1,
      taster_name, variety, winery): filled with ``"Unknown"``.
    * ``description`` / ``title``: filled with an empty string.

    Columns that are absent or contain no missing values are skipped.
    Returns the same DataFrame object for chaining.
    """
    def _needs_fill(names):
        # Keep only the columns that exist and actually contain gaps.
        return [name for name in names if name in df.columns and df[name].isnull().any()]

    # Numeric features: median imputation.
    for name in _needs_fill(['price', 'points']):
        df[name] = df[name].fillna(df[name].median())

    # Categorical features: explicit "Unknown" label.
    for name in _needs_fill(['country', 'designation', 'province', 'region_1', 'taster_name', 'variety', 'winery']):
        df[name] = df[name].fillna('Unknown')

    # Free-text features: empty string.
    for name in _needs_fill(['description', 'title']):
        df[name] = df[name].fillna('')

    return df

def _make_combined_text(df):
    """Render each row through ``template`` into a ``combined_text`` column.

    The module-level ``template`` is filled by keyword from the row's
    title, description, variety, winery, region_1, province, country,
    designation, points, price and taster_name values; surrounding
    whitespace of the rendered text is stripped. ``df`` is modified in place
    and returned.

    Raises:
        KeyError: if any of the columns used by the template is missing.
    """
    fields = [
        'title', 'description', 'variety', 'winery', 'region_1', 'province',
        'country', 'designation', 'points', 'price', 'taster_name',
    ]

    # Iterate plain tuples instead of DataFrame.apply(axis=1): this avoids
    # building a Series per row (slow on large frames) and also works for an
    # empty frame, where a row-wise apply hands back a DataFrame rather than
    # a Series and the column assignment cannot succeed.
    rows = df[fields].itertuples(index=False, name=None)
    df['combined_text'] = [
        template.format(**dict(zip(fields, values))).strip()
        for values in rows
    ]

    return df

def preprocess_data(df):
    """Run the full preprocessing pipeline and return the resulting frame.

    Steps are applied in this order: duplicate removal, dropping unused
    columns, missing-value filling, text whitespace cleanup, price clipping
    (upper bound 1000) and finally building the ``combined_text`` column.
    """
    steps = (
        _remove_duplicates,
        _remove_unnecessary_features,
        _handle_missing_values,
        _clean_text,
        lambda frame: _process_numeric_features(frame, upper=1000),
        _make_combined_text,
    )
    for step in steps:
        df = step(df)
    return df

In [4]:
import pandas as pd
# Load the wine reviews CSV; its 'Unnamed: 0' column is used as the row index.
df = pd.read_csv("/kaggle/input/wine-reviews/winemag-data-130k-v2.csv", index_col='Unnamed: 0')
# Clean the frame and add the `combined_text` column consumed by the training cell.
df = preprocess_data(df)

In [None]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForMaskedLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer
)
from datasets import Dataset

# Hub identifier shared by the tokenizer and the model.
model_checkpoint = "sentence-transformers/all-mpnet-base-v2"

print(f"Loading {model_checkpoint}...")
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Load the checkpoint with a masked-language-modeling head.
# NOTE(review): this is a sentence-transformers checkpoint; confirm whether it
# ships MLM head weights — if not, the head starts from fresh initialization.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

# Wrap the rendered review texts in a datasets.Dataset with a single "text" column.
raw_text_data = df['combined_text'].tolist()
dataset = Dataset.from_dict({"text": raw_text_data})

# Upper bound on tokens per example; longer texts are truncated.
max_length = 256

def tokenize_function(examples):
    """Tokenize a batch of texts for MLM training.

    Args:
        examples: batch dict from ``Dataset.map(batched=True)`` holding the
            raw strings under ``"text"``.

    Returns:
        The tokenizer output, truncated to ``max_length`` tokens and
        including ``special_tokens_mask`` so the MLM collator can avoid
        masking special tokens.

    No padding is applied here: the ``DataCollatorForLanguageModeling`` used
    for training pads each batch to its longest sequence, so padding every
    example to ``max_length`` up front only adds wasted computation and
    storage.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=max_length,
        return_special_tokens_mask=True,
    )

# Tokenize in batches and drop the raw "text" column in the same pass, leaving
# only the model inputs.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)

# Collator that builds masked-language-modeling batches (mlm=True) using a
# masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, 
    mlm=True, 
    mlm_probability=0.15
)

# Training configuration: 3 epochs, batch size 8 per device, lr 2e-5 with
# weight decay 0.01; mixed precision (fp16) only when CUDA is available.
# NOTE(review): save_steps=100 checkpoints ten times as often as
# logging_steps=1000 logs; with save_total_limit=2 only the latest two are
# kept, but the write frequency looks high — confirm it is intended.
# NOTE(review): reporting integrations are not set here (no `report_to`);
# the notebook relies on the WANDB_DISABLED environment variable instead.
training_args = TrainingArguments(
    output_dir="./mpnet-mlm-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=100,
    save_total_limit=2,
    prediction_loss_only=True,
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),
    logging_steps=1000,
    push_to_hub=False,
)

# No eval_dataset is passed: the whole tokenized corpus is used for training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
)

# Run the fine-tuning loop configured above.
print("Starting training...")
trainer.train()

# Write the model and tokenizer side by side into ./final_mpnet_mlm so the
# directory can be loaded on its own (the fill-mask cell below reads it).
print("Saving model...")
model.save_pretrained("./final_mpnet_mlm")
tokenizer.save_pretrained("./final_mpnet_mlm")

In [None]:
# Smoke test: reload the saved model from disk and fill one masked token.
print("\n--- Testing the Fine-Tuned Model ---")
from transformers import pipeline

# Both model and tokenizer are read from the directory written after training.
fill_mask = pipeline(
    "fill-mask",
    model="./final_mpnet_mlm",
    tokenizer="./final_mpnet_mlm"
)

# Use the tokenizer's own mask token so the prompt matches the model's vocabulary.
# Relies on `tokenizer` still being defined from the training cell.
result = fill_mask(f"This wine has firm {tokenizer.mask_token} and a long finish.")
# Print only the first entry of the returned predictions.
print(result[0])