In [None]:
# โค้ดนี้จะช่วยในการเตรียมข้อมูลสำหรับการฝึกและทดสอบโมเดลการวิเคราะห์ข้อความ โดยการแยกข้อมูลตามข้อความ ความรู้สึก และหัวข้อ พร้อมทั้งบันทึกข้อมูลที่เตรียมไว้ให้สามารถใช้งานในภายหลังได้ โดยการใช้ pickle เก็บแผนที่และข้อมูลต่าง ๆ ทั้งหมดในไฟล์ .pkl ครับ
import os
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split

# --- Locations ---------------------------------------------------------------
input_excel = r"C:\othai\ML_BERT\ML_Excel\externn.xlsx"
output_dir = r"C:\othai\ML_BERT\Model\pk2"
os.makedirs(output_dir, exist_ok=True)

# --- Label vocabularies ------------------------------------------------------
# A topic's integer id is simply its position in this list.
topic_list = [
    "Activity",
    "After-Service",
    "Appreciation",
    "Chinese Investors",
    "Common Area - Facilities",
    "Construction Materials",
    "Design",
    "Engaging",
    "Financial & Branding",
    "Intention",
    "Location",
    "Pet",
    "Politics (delete after)",
    "Price & Promotion",
    "Quality",
    "Security",
    "Space",
]
topic_map = dict(zip(topic_list, range(len(topic_list))))
sentiment_map = {"Positive": 0, "Neutral": 1, "Negative": 2}

# --- Load and clean ----------------------------------------------------------
# Rows missing any of the three required columns are dropped up front.
df = pd.read_excel(input_excel).dropna(subset=["message", "sentiment", "topic"])

# Trim and lowercase the text *before* de-duplicating, so messages that differ
# only in case or surrounding whitespace collapse into a single row.
df["message"] = df["message"].astype(str).str.strip().str.lower()
df = df.drop_duplicates(subset=["message"], keep="first")

# --- Encode labels -----------------------------------------------------------
df["sentiment_label"] = df["sentiment"].map(sentiment_map)
df["topic_label"] = df["topic"].map(topic_map)

# A value absent from its map becomes NaN; stop here and show the offending
# rows rather than writing unusable labels to disk.
unmapped_sentiment = df["sentiment_label"].isnull()
if unmapped_sentiment.any():
    raise ValueError("พบ sentiment ที่ map ไม่ได้:", df[unmapped_sentiment])
unmapped_topic = df["topic_label"].isnull()
if unmapped_topic.any():
    raise ValueError("พบ topic ที่ map ไม่ได้:", df[unmapped_topic])

# --- Train/test split --------------------------------------------------------
# 80/20 split with a fixed seed, stratified on the sentiment label only.
split_parts = train_test_split(
    df["message"].tolist(),
    df["sentiment_label"].tolist(),
    df["topic_label"].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=df["sentiment_label"],
)
(
    train_texts, test_texts,
    train_sentiments, test_sentiments,
    train_topics, test_topics,
) = split_parts

# --- Persist the label maps for reuse ----------------------------------------
for map_filename, mapping in (
    ("topic_map.pkl", topic_map),
    ("sentiment_map.pkl", sentiment_map),
):
    with open(os.path.join(output_dir, map_filename), "wb") as f:
        pickle.dump(mapping, f)

# --- Persist the six split lists, one pickle file each -----------------------
data = {
    "train_texts": train_texts,
    "test_texts": test_texts,
    "train_sentiment": train_sentiments,
    "test_sentiment": test_sentiments,
    "train_topic": train_topics,
    "test_topic": test_topics,
}

paths = {
    "train_texts": os.path.join(output_dir, "train_texts.pkl"),
    "test_texts": os.path.join(output_dir, "test_texts.pkl"),
    "train_sentiment": os.path.join(output_dir, "train_sentiment.pkl"),
    "test_sentiment": os.path.join(output_dir, "test_sentiment.pkl"),
    "train_topic": os.path.join(output_dir, "train_topic.pkl"),
    "test_topic": os.path.join(output_dir, "test_topic.pkl"),
}

for name in data:
    with open(paths[name], "wb") as f:
        pickle.dump(data[name], f)
    print(f"Saved {name} → {paths[name]}")

print(f"Train samples: {len(train_texts)}, Test samples: {len(test_texts)}")


In [None]:
#โค้ดนี้ช่วยในการเตรียมข้อมูลสำหรับโมเดลภาษาโดยการแปลงข้อความเป็น tokens ซึ่งจะใช้ในกระบวนการฝึกและทดสอบโมเดล โดยทำการโหลดข้อมูลจากไฟล์ .pkl ที่มีข้อความแล้วใช้ AutoTokenizer แปลงข้อความเป็น PyTorch tensors ที่พร้อมใช้งานในขั้นตอนถัดไป จากนั้นบันทึกการเข้ารหัสเหล่านี้ไปยังไฟล์ใหม่ครับ
import argparse
import logging
import os
import pickle
from transformers import AutoTokenizer

# Timestamped, level-tagged log lines at INFO and above.
logging.basicConfig(
    format="%(asctime)s %(levelname)s: %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Default input/output locations; each file path can be overridden on the
# command line, see main().
DATA_DIR = r"C:\othai\ML_BERT\Model\pk2"
OUTPUT_DIR = os.path.join(DATA_DIR, "encodings")

# Inputs: the pickled text lists.
DEFAULT_TRAIN = os.path.join(DATA_DIR, "train_texts.pkl")
DEFAULT_TEST = os.path.join(DATA_DIR, "test_texts.pkl")

# Outputs: the pickled tokenizer encodings.
DEFAULT_TRAIN_OUT = os.path.join(OUTPUT_DIR, "train_encodings.pkl")
DEFAULT_TEST_OUT = os.path.join(OUTPUT_DIR, "test_encodings.pkl")


def load_tokenizer(model_name: str, spm_model_path: str = None):
    """
    Load the slow (``use_fast=False``) tokenizer for ``model_name``.

    Args:
        model_name: Model identifier handed to ``AutoTokenizer.from_pretrained``.
        spm_model_path: Optional path to a SentencePiece model file. When
            given (non-empty), it is forwarded as
            ``sp_model_kwargs={"model_file": spm_model_path}``.

    Returns:
        The object returned by ``AutoTokenizer.from_pretrained``.
    """
    logger.info(f"Loading tokenizer (slow) for: {model_name}")

    # Common case first: no custom SentencePiece file requested.
    if not spm_model_path:
        return AutoTokenizer.from_pretrained(model_name, use_fast=False)

    logger.info(f"→ using SentencePiece model file: {spm_model_path}")
    return AutoTokenizer.from_pretrained(
        model_name,
        use_fast=False,
        sp_model_kwargs={"model_file": spm_model_path},
    )


def tokenize_texts(tokenizer, texts: list[str], max_length: int):
    """
    Tokenize ``texts`` in a single call and return the tokenizer's output.

    Args:
        tokenizer: Callable tokenizer; invoked as ``tokenizer(texts, **options)``.
        texts: The strings to encode.
        max_length: Length every sequence is truncated / padded to.

    Returns:
        Whatever the tokenizer returns for ``return_tensors="pt"``.
    """
    logger.info(f"Tokenizing {len(texts)} texts (max_length={max_length})")

    # Fixed-length output: longer inputs are cut, shorter ones are padded.
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": max_length,
        "return_tensors": "pt",
    }
    return tokenizer(texts, **tokenizer_options)


def save_encodings(encodings, path: str):
    """
    Pickle ``encodings`` to ``path``, creating the parent directory if needed.

    Args:
        encodings: The object to serialize (the tokenizer output).
        path: Destination file. May be a bare filename, in which case the
            file is written to the current working directory.
    """
    parent_dir = os.path.dirname(path)
    # A bare filename has an empty directory part, and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(path, "wb") as f:
        pickle.dump(encodings, f)
    logger.info(f"Saved encodings → {path}")


def main():
    """
    Command-line entry point: tokenize the train and test texts and pickle
    the resulting encodings.

    Every path and the tokenizer settings can be overridden with flags; the
    defaults are the module-level ``DEFAULT_*`` constants. A split whose input
    file does not exist is logged and skipped, and the final log line reports
    whether any split was skipped.
    """
    parser = argparse.ArgumentParser(
        description="Step 2: Tokenize texts & save encodings"
    )
    parser.add_argument(
        "--model_name",
        default="airesearch/wangchanberta-base-att-spm-uncased",
        help="HuggingFace model name for tokenizer"
    )
    parser.add_argument(
        "--spm_model",
        default=None,
        help="Optional path to SentencePiece .model file"
    )
    parser.add_argument(
        "--max_length",
        type=int,
        default=128,
        help="Maximum token length"
    )
    parser.add_argument(
        "--train_texts",
        default=DEFAULT_TRAIN,
        help="Path to train_texts.pkl"
    )
    parser.add_argument(
        "--test_texts",
        default=DEFAULT_TEST,
        help="Path to test_texts.pkl"
    )
    parser.add_argument(
        "--train_output",
        default=DEFAULT_TRAIN_OUT,
        help="Where to save train encodings"
    )
    parser.add_argument(
        "--test_output",
        default=DEFAULT_TEST_OUT,
        help="Where to save test encodings"
    )

    # parse_known_args() tolerates unrecognised argv entries instead of
    # exiting; presumably this lets the cell run under a notebook kernel that
    # adds its own arguments.
    args, unknown = parser.parse_known_args()
    if unknown:
        logger.warning(f"Ignoring unknown args: {unknown}")

    tokenizer = load_tokenizer(args.model_name, args.spm_model)

    skipped = []
    for split in ("train", "test"):
        in_path  = getattr(args, f"{split}_texts")
        out_path = getattr(args, f"{split}_output")
        if not os.path.exists(in_path):
            logger.error(f"File not found: {in_path}")
            skipped.append(in_path)
            continue
        with open(in_path, "rb") as f:
            texts = pickle.load(f)
        enc = tokenize_texts(tokenizer, texts, args.max_length)
        save_encodings(enc, out_path)

    # Only claim success when every split was actually written.
    if skipped:
        logger.warning(f"Finished, but {len(skipped)} split(s) were skipped (input not found): {skipped}")
    else:
        logger.info("complete: encodings saved.")


if __name__ == "__main__":
    main()


In [None]:
#โค้ดนี้ใช้เพื่อเตรียมข้อมูลและโหลดข้อมูลที่เข้ารหัสแล้ว  tokenized text ของ sentiment สำหรับการใช้ใน PyTorch โมเดล มันจัดการการโหลดข้อมูลในรูปแบบ Dataset โดยสามารถใช้ GPU ได้ และยังรองรับการประมวลผลข้อมูลแบบขนานผ่าน multiprocessing เพื่อเพิ่มประสิทธิภาพในการโหลดข้อมูลครับ
import os
import pickle
import logging
import torch
from torch.utils.data import Dataset, DataLoader
import multiprocessing

# Logger shared by every function in this cell; INFO and above are shown.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def load_pickle(path: str):
    """
    Unpickle and return the object stored at ``path``.

    The number of items (``len`` of the loaded object) is logged at INFO.

    Raises:
        FileNotFoundError: If ``path`` does not exist; the path is also
            logged at ERROR level first.
    """
    if os.path.exists(path):
        # NOTE: unpickling can execute arbitrary code, so only point this at
        # files produced by this pipeline.
        with open(path, "rb") as handle:
            data = pickle.load(handle)
        logger.info(f"Loaded {len(data)} items from {path}")
        return data

    logger.error(f"File not found: {path}")
    raise FileNotFoundError(path)

class CommentDataset(Dataset):
    """
    Map-style dataset pairing pre-tokenized encodings with integer labels.

    ``encodings`` maps a field name to an indexable collection holding one
    entry per sample; ``labels`` holds one integer per sample. Indexing
    returns a dict with every encoding field plus a ``labels`` entry, all as
    tensors.
    """

    def __init__(self, encodings: dict, labels: list[int]):
        """
        Store the encodings and labels after checking that they line up.

        Raises:
            ValueError: If the number of labels differs from the length of
                the first encoding field.
        """
        n = len(next(iter(encodings.values())))
        if len(labels) != n:
            raise ValueError(f"Labels count ({len(labels)}) != encodings count ({n})")
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx: int):
        """Return sample ``idx`` as a dict of tensors, including ``labels``."""
        item = {k: self._as_tensor(v[idx]) for k, v in self.encodings.items()}
        item['labels'] = self._as_tensor(self.labels[idx])
        return item

    @staticmethod
    def _as_tensor(value):
        """Return ``value`` as a fresh tensor, copying it if it already is one."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(existing_tensor) emits a copy-construct UserWarning
            # on every call; clone().detach() is the recommended equivalent.
            return value.clone().detach()
        return torch.tensor(value)

def batch_to_device(batch: dict, device: torch.device):
    """
    Return a new dict with every value of ``batch`` moved to ``device``.

    Each value is moved with ``.to(device, non_blocking=True)``; the input
    dict itself is left untouched.
    """
    moved = {}
    for key, tensor in batch.items():
        moved[key] = tensor.to(device, non_blocking=True)
    return moved

def main():
    """
    Smoke-test the sentiment data pipeline.

    Loads the pickled train/test encodings and sentiment labels, wraps them
    in ``CommentDataset`` / ``DataLoader`` objects, logs the dataset sizes,
    then pulls one training batch and moves it to the selected device. A
    failure while fetching that batch is logged rather than raised.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.info(f"Using device: {device}")
    if device.type == "cuda":
        torch.backends.cudnn.benchmark = True

    ENCODINGS_DIR = r"C:\othai\ML_BERT\Model\pk2\encodings"
    DATA_DIR      = r"C:\othai\ML_BERT\Model\pk2"
    train_enc_path = os.path.join(ENCODINGS_DIR, "train_encodings.pkl")
    test_enc_path  = os.path.join(ENCODINGS_DIR, "test_encodings.pkl")
    train_lbl_path = os.path.join(DATA_DIR, "train_sentiment.pkl")
    test_lbl_path  = os.path.join(DATA_DIR, "test_sentiment.pkl")

    train_enc = load_pickle(train_enc_path)
    test_enc  = load_pickle(test_enc_path)
    train_lbl = load_pickle(train_lbl_path)
    test_lbl  = load_pickle(test_lbl_path)

    batch_size = 8
    pin_memory = device.type == "cuda"
    # No worker processes on Windows; elsewhere use half the CPUs. The
    # `or 2` covers os.cpu_count() returning None, which would otherwise
    # make the floor division raise TypeError.
    num_workers = 0 if os.name == 'nt' else max(1, (os.cpu_count() or 2) // 2)

    train_loader = DataLoader(
        CommentDataset(train_enc, train_lbl),
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        pin_memory=pin_memory
    )
    test_loader = DataLoader(
        CommentDataset(test_enc, test_lbl),
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=pin_memory
    )

    logger.info(f"Train set: {len(train_loader.dataset)} samples, {len(train_loader)} batches (workers={num_workers})")
    logger.info(f"Test  set: {len(test_loader.dataset)} samples, {len(test_loader)} batches")

    # Best-effort check that one batch can be produced and moved.
    try:
        batch = next(iter(train_loader))
        logger.info({k: v.shape for k, v in batch.items()})
        batch = batch_to_device(batch, device)
        # Build the dict first: doubled braces inside an f-string are literal
        # characters, so the comprehension must not be written inline there.
        devices = {k: v.device for k, v in batch.items()}
        logger.info(f"Batch moved to device: {devices}")
    except Exception as e:
        logger.error(f"Failed to load a batch: {e}")

if __name__ == "__main__":
    # Force the "spawn" start method before any DataLoader worker is created.
    multiprocessing.set_start_method("spawn", force=True)
    main()


In [None]:
#โค้ดนี้ใช้เพื่อเตรียมข้อมูลและโหลดข้อมูลที่เข้ารหัสแล้ว  tokenized text ของ topic สำหรับการใช้ใน PyTorch โมเดล มันจัดการการโหลดข้อมูลในรูปแบบ Dataset โดยสามารถใช้ GPU ได้ และยังรองรับการประมวลผลข้อมูลแบบขนานผ่าน multiprocessing เพื่อเพิ่มประสิทธิภาพในการโหลดข้อมูลครับ
import os
import pickle
import logging
import torch
from torch.utils.data import Dataset, DataLoader
import multiprocessing

# Logger shared by every function in this cell; INFO and above are shown.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def load_pickle(path: str):
    """
    Unpickle and return the object stored at ``path``.

    The number of items (``len`` of the loaded object) is logged at INFO.

    Raises:
        FileNotFoundError: If ``path`` does not exist; the path is also
            logged at ERROR level first.
    """
    if os.path.exists(path):
        # NOTE: unpickling can execute arbitrary code, so only point this at
        # files produced by this pipeline.
        with open(path, "rb") as handle:
            data = pickle.load(handle)
        logger.info(f"Loaded {len(data)} items from {path}")
        return data

    logger.error(f"File not found: {path}")
    raise FileNotFoundError(path)

class CommentDataset(Dataset):
    """
    Map-style dataset pairing pre-tokenized encodings with integer labels.

    ``encodings`` maps a field name to an indexable collection holding one
    entry per sample; ``labels`` holds one integer per sample. Indexing
    returns a dict with every encoding field plus a ``labels`` entry, all as
    tensors.
    """

    def __init__(self, encodings: dict, labels: list[int]):
        """
        Store the encodings and labels after checking that they line up.

        Raises:
            ValueError: If the number of labels differs from the length of
                the first encoding field.
        """
        n = len(next(iter(encodings.values())))
        if len(labels) != n:
            raise ValueError(f"Labels count ({len(labels)}) != encodings count ({n})")
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx: int):
        """Return sample ``idx`` as a dict of tensors, including ``labels``."""
        item = {k: self._as_tensor(v[idx]) for k, v in self.encodings.items()}
        item['labels'] = self._as_tensor(self.labels[idx])
        return item

    @staticmethod
    def _as_tensor(value):
        """Return ``value`` as a fresh tensor, copying it if it already is one."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(existing_tensor) emits a copy-construct UserWarning
            # on every call; clone().detach() is the recommended equivalent.
            return value.clone().detach()
        return torch.tensor(value)

def batch_to_device(batch: dict, device: torch.device):
    """
    Return a new dict with every value of ``batch`` moved to ``device``.

    Each value is moved with ``.to(device, non_blocking=True)``; the input
    dict itself is left untouched.
    """
    moved = {}
    for key, tensor in batch.items():
        moved[key] = tensor.to(device, non_blocking=True)
    return moved

def main():
    """
    Smoke-test the topic data pipeline.

    Loads the pickled train/test encodings and topic labels, wraps them in
    ``CommentDataset`` / ``DataLoader`` objects, logs the dataset sizes, then
    pulls one training batch and moves it to the selected device. A failure
    while fetching that batch is logged rather than raised.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.info(f"Using device: {device}")
    if device.type == "cuda":
        torch.backends.cudnn.benchmark = True

    ENCODINGS_DIR = r"C:\othai\ML_BERT\Model\pk2\encodings"
    DATA_DIR      = r"C:\othai\ML_BERT\Model\pk2"
    train_enc_path = os.path.join(ENCODINGS_DIR, "train_encodings.pkl")
    test_enc_path  = os.path.join(ENCODINGS_DIR, "test_encodings.pkl")
    train_lbl_path = os.path.join(DATA_DIR, "train_topic.pkl")
    test_lbl_path  = os.path.join(DATA_DIR, "test_topic.pkl")

    train_enc = load_pickle(train_enc_path)
    test_enc  = load_pickle(test_enc_path)
    train_lbl = load_pickle(train_lbl_path)
    test_lbl  = load_pickle(test_lbl_path)

    batch_size = 8
    pin_memory = device.type == "cuda"
    # No worker processes on Windows; elsewhere use half the CPUs. The
    # `or 2` covers os.cpu_count() returning None, which would otherwise
    # make the floor division raise TypeError.
    num_workers = 0 if os.name == 'nt' else max(1, (os.cpu_count() or 2) // 2)

    train_loader = DataLoader(
        CommentDataset(train_enc, train_lbl),
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        pin_memory=pin_memory
    )
    test_loader = DataLoader(
        CommentDataset(test_enc, test_lbl),
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=pin_memory
    )

    logger.info(f"Train set: {len(train_loader.dataset)} samples, {len(train_loader)} batches (workers={num_workers})")
    logger.info(f"Test  set: {len(test_loader.dataset)} samples, {len(test_loader)} batches")

    # Best-effort check that one batch can be produced and moved.
    try:
        batch = next(iter(train_loader))
        logger.info({k: v.shape for k, v in batch.items()})
        batch = batch_to_device(batch, device)
        # Build the dict first: doubled braces inside an f-string are literal
        # characters, so the comprehension must not be written inline there.
        devices = {k: v.device for k, v in batch.items()}
        logger.info(f"Batch moved to device: {devices}")
    except Exception as e:
        logger.error(f"Failed to load a batch: {e}")

if __name__ == "__main__":
    # Force the "spawn" start method before any DataLoader worker is created.
    multiprocessing.set_start_method("spawn", force=True)
    main()


In [None]:
#โค้ดนี้ใช้เพื่อฝึกโมเดลการจำแนกประเภทข้อความ โดยเป็นส่วนการฝึกโมเดลให้เรียนรู้ว่าเจอความคิดเห็นแบบไหนเป็นแง่บวก แง่ลบ หรือเป็นกลาง
import os
import pickle
import logging
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Logger shared by every function in this cell; INFO and above are shown.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def load_pickle(path: str):
    """
    Unpickle and return the object stored at ``path``.

    The number of items (``len`` of the loaded object) is logged at INFO.

    Raises:
        FileNotFoundError: If ``path`` does not exist; the path is also
            logged at ERROR level first.
    """
    if os.path.exists(path):
        # NOTE: unpickling can execute arbitrary code, so only point this at
        # files produced by this pipeline.
        with open(path, "rb") as handle:
            data = pickle.load(handle)
        logger.info(f"Loaded {len(data)} items from {path}")
        return data

    logger.error(f"File not found: {path}")
    raise FileNotFoundError(path)

class CommentDataset(Dataset):
    """
    Map-style dataset pairing pre-tokenized encodings with integer labels.

    ``encodings`` maps a field name to an indexable collection holding one
    entry per sample; ``labels`` holds one integer per sample. Indexing
    returns a dict with every encoding field plus a ``labels`` entry, all as
    tensors.
    """

    def __init__(self, encodings: dict, labels: list[int]):
        """
        Store the encodings and labels after checking that they line up.

        Raises:
            ValueError: If the number of labels differs from the length of
                the first encoding field.
        """
        n = len(next(iter(encodings.values())))
        if len(labels) != n:
            raise ValueError(f"Labels count ({len(labels)}) != encodings count ({n})")
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx: int):
        """Return sample ``idx`` as a dict of tensors, including ``labels``."""
        item = {k: self._as_tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = self._as_tensor(self.labels[idx])
        return item

    @staticmethod
    def _as_tensor(value):
        """Return ``value`` as a fresh tensor, copying it if it already is one."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(existing_tensor) emits a copy-construct UserWarning
            # on every call; clone().detach() is the recommended equivalent.
            return value.clone().detach()
        return torch.tensor(value)

def compute_metrics(pred):
    """
    Turn a prediction object into a dict of classification metrics.

    Reads ``pred.label_ids`` as the true classes and takes the arg-max over
    the last axis of ``pred.predictions`` as the predicted classes.

    Returns:
        A dict with ``accuracy`` plus support-weighted ``precision``,
        ``recall`` and ``f1`` (classes with no predictions score 0).
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    precision, recall, f_score, _support = precision_recall_fscore_support(
        y_true, y_pred, average="weighted", zero_division=0
    )
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision,
        "recall": recall,
        "f1": f_score,
    }

def main():
    """
    Fine-tune the pretrained checkpoint as a 3-class sentiment classifier.

    Loads the pickled encodings and sentiment labels, trains with evaluation
    and checkpointing every 200 steps plus early stopping, saves the model
    and tokenizer to ``output_dir``, and logs the final evaluation metrics.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.info(f"Using device: {device}")
    if device.type == "cuda":
        torch.backends.cudnn.benchmark = True

    base_dir   = r"C:\othai\ML_BERT\Model\pk2"
    enc_dir    = os.path.join(base_dir, "encodings")
    output_dir = r"C:\othai\ML_BERT\Model\model_sentiment"
    model_name = "airesearch/wangchanberta-base-att-spm-uncased"
    num_labels = 3

    train_enc = load_pickle(os.path.join(enc_dir, "train_encodings.pkl"))
    test_enc  = load_pickle(os.path.join(enc_dir, "test_encodings.pkl"))
    train_lbl = load_pickle(os.path.join(base_dir, "train_sentiment.pkl"))
    test_lbl  = load_pickle(os.path.join(base_dir, "test_sentiment.pkl"))

    train_dataset = CommentDataset(train_enc, train_lbl)
    eval_dataset  = CommentDataset(test_enc,  test_lbl)

    # NOTE(review): the tokenization step in this file builds the encodings
    # with use_fast=False, while the fast tokenizer is loaded here — confirm
    # that the tokenizer saved with the model matches the training encodings.
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=num_labels
    ).to(device)

    num_epochs = 6
    batch_size = 16
    # No worker processes on Windows; elsewhere use half the CPUs. The
    # `or 2` covers os.cpu_count() returning None, which would otherwise
    # make the floor division raise TypeError.
    loader_workers = 0 if os.name == 'nt' else max(1, (os.cpu_count() or 2) // 2)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=2e-5,
        weight_decay=0.02,
        # Warm up over 10% of (batches per epoch * epochs).
        warmup_steps=int((len(train_dataset) // batch_size) * num_epochs * 0.1),
        logging_dir=os.path.join(output_dir, "logs"),
        logging_strategy="steps",
        logging_steps=100,
        eval_strategy="steps",
        eval_steps=200,
        save_strategy="steps",
        save_steps=200,
        save_total_limit=3,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True,
        fp16=torch.cuda.is_available(),
        dataloader_num_workers=loader_workers,
        seed=42
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    )

    logger.info("Starting training...")
    trainer.train()
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)
    logger.info("Evaluating on test set...")
    metrics = trainer.evaluate()
    logger.info({k: f"{v:.4f}" for k, v in metrics.items()})

if __name__ == "__main__":
    main()


In [None]:
#โค้ดนี้ใช้เพื่อฝึกโมเดลการจำแนกประเภทข้อความ โดยเป็นส่วนการฝึกโมเดลให้เรียนรู้เพื่อจำแนก topic จากความคิดเห็นที่มีอยู่ โดยใช้โมเดล BERT ที่ถูกฝึกมาแล้ว
import os
import pickle
import logging
import torch
import numpy as np
from collections import Counter
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch import nn

# Logger shared by every function in this cell; INFO and above are shown.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def load_pickle(path: str):
    """
    Unpickle and return the object stored at ``path``.

    The number of items (``len`` of the loaded object) is logged at INFO.

    Raises:
        FileNotFoundError: If ``path`` does not exist; the path is also
            logged at ERROR level first.
    """
    if os.path.exists(path):
        # NOTE: unpickling can execute arbitrary code, so only point this at
        # files produced by this pipeline.
        with open(path, "rb") as handle:
            data = pickle.load(handle)
        logger.info(f"Loaded {len(data)} items from {path}")
        return data

    logger.error(f"File not found: {path}")
    raise FileNotFoundError(path)

class CommentDataset(Dataset):
    """
    Map-style dataset pairing pre-tokenized encodings with integer labels.

    ``encodings`` maps a field name to an indexable collection holding one
    entry per sample; ``labels`` holds one integer per sample. Indexing
    returns a dict with every encoding field plus a ``labels`` entry, all as
    tensors.
    """

    def __init__(self, encodings: dict, labels: list[int]):
        """
        Store the encodings and labels after checking that they line up.

        Raises:
            ValueError: If the number of labels differs from the length of
                the first encoding field.
        """
        n = len(next(iter(encodings.values())))
        if len(labels) != n:
            raise ValueError(f"Labels count ({len(labels)}) != encodings count ({n})")
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx: int):
        """Return sample ``idx`` as a dict of tensors, including ``labels``."""
        item = {k: self._as_tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = self._as_tensor(self.labels[idx])
        return item

    @staticmethod
    def _as_tensor(value):
        """Return ``value`` as a fresh tensor, copying it if it already is one."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(existing_tensor) emits a copy-construct UserWarning
            # on every call; clone().detach() is the recommended equivalent.
            return value.clone().detach()
        return torch.tensor(value)

def compute_metrics(pred):
    """
    Turn a prediction object into a dict of classification metrics.

    Reads ``pred.label_ids`` as the true classes and takes the arg-max over
    the last axis of ``pred.predictions`` as the predicted classes.

    Returns:
        A dict with ``accuracy`` plus support-weighted ``precision``,
        ``recall`` and ``f1`` (classes with no predictions score 0).
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    precision, recall, f_score, _support = precision_recall_fscore_support(
        y_true, y_pred, average="weighted", zero_division=0
    )
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision,
        "recall": recall,
        "f1": f_score,
    }

class WeightedTrainer(Trainer):
    """
    ``Trainer`` whose loss is cross-entropy with optional per-class weights.

    ``loss_weights`` is a 1-D tensor of class weights, or ``None`` for plain
    unweighted cross-entropy. All other arguments go to ``Trainer``.
    """

    def __init__(self, *args, loss_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_weights = loss_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """
        Run the model on ``inputs`` and return the (weighted) cross-entropy.

        Returns ``(loss, outputs)`` when ``return_outputs`` is true, otherwise
        just ``loss``. Extra keyword arguments are accepted and ignored.
        """
        targets = inputs.get("labels")
        outputs = model(**inputs)

        # The weights must live on the same device as the labels.
        class_weights = None
        if self.loss_weights is not None:
            class_weights = self.loss_weights.to(targets.device)

        criterion = nn.CrossEntropyLoss(weight=class_weights)
        loss = criterion(outputs.get("logits"), targets)

        if return_outputs:
            return loss, outputs
        return loss

def main():
    """
    Fine-tune the pretrained checkpoint as a 17-class topic classifier.

    Loads the pickled encodings and topic labels, derives inverse-frequency
    class weights from the training labels, trains with ``WeightedTrainer``
    (evaluation and checkpointing every 200 steps, early stopping), saves the
    model and tokenizer to ``output_dir``, and logs the final metrics.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.info(f"Using device: {device}")
    if device.type == "cuda":
        torch.backends.cudnn.benchmark = True

    base_dir   = r"C:\othai\ML_BERT\Model\pk2"
    enc_dir    = os.path.join(base_dir, "encodings")
    # NOTE(review): the inference cell in this file reads its topic checkpoint
    # from ...\Model\model_topic, not from this directory — confirm the path.
    output_dir = r"C:\othai\ML_BERT\Model\model1"
    model_name = "airesearch/wangchanberta-base-att-spm-uncased"
    num_labels = 17

    train_enc = load_pickle(os.path.join(enc_dir, "train_encodings.pkl"))
    test_enc  = load_pickle(os.path.join(enc_dir, "test_encodings.pkl"))
    train_lbl = load_pickle(os.path.join(base_dir, "train_topic.pkl"))
    test_lbl  = load_pickle(os.path.join(base_dir, "test_topic.pkl"))

    train_dataset = CommentDataset(train_enc, train_lbl)
    eval_dataset  = CommentDataset(test_enc,  test_lbl)

    # NOTE(review): the tokenization step in this file builds the encodings
    # with use_fast=False, while the fast tokenizer is loaded here — confirm
    # that the tokenizer saved with the model matches the training encodings.
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=num_labels
    ).to(device)

    # Weighted loss: each class weight is 1 / (its training count), with a
    # count of 1 substituted for classes absent from the training labels.
    # The weights are then rescaled so they sum to the number of samples.
    label_counts = Counter(train_lbl)
    weights = np.array(
        [1.0 / max(label_counts[i], 1) for i in range(num_labels)],
        dtype=np.float32,
    )
    weights = weights * (len(train_lbl) / np.sum(weights))
    loss_weights = torch.tensor(weights, dtype=torch.float32)
    logger.info(f"Topic class weights: {loss_weights}")

    # No worker processes on Windows; elsewhere use half the CPUs. The
    # `or 2` covers os.cpu_count() returning None, which would otherwise
    # make the floor division raise TypeError.
    loader_workers = 0 if os.name == 'nt' else max(1, (os.cpu_count() or 2) // 2)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=20,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        learning_rate=1e-5,
        weight_decay=0.02,
        # NOTE(review): the 16 and 10 below do not match the batch size (32)
        # and epoch count (20) above, although the product works out to about
        # the same number of steps — confirm this is the intended warm-up.
        warmup_steps=int((len(train_dataset)//16)*10*0.1),
        logging_dir=os.path.join(output_dir, "logs"),
        logging_strategy="steps",
        logging_steps=100,
        eval_strategy="steps",
        eval_steps=200,
        save_strategy="steps",
        save_steps=200,
        save_total_limit=3,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True,
        fp16=torch.cuda.is_available(),
        dataloader_num_workers=loader_workers,
        seed=42
    )

    trainer = WeightedTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
        loss_weights=loss_weights
    )

    logger.info("Starting training...")
    trainer.train()
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)
    logger.info("Evaluating on test set...")
    metrics = trainer.evaluate()
    logger.info({k: f"{v:.4f}" for k, v in metrics.items()})

if __name__ == "__main__":
    main()


In [None]:
#โค้ดนี้ใช้เพื่อประมวลผลข้อความจากไฟล์ Excel หลายแผ่น โดยการใช้โมเดล BERT ที่ถูกฝึกมาแล้วเพื่อจำแนกความคิดเห็นเป็นแง่บวก แง่ลบ หรือเป็นกลาง และจำแนกหัวข้อที่เกี่ยวข้องกับความคิดเห็นนั้น ๆ จากนั้นจะบันทึกผลลัพธ์ลงในไฟล์ Excel ใหม่เพื่อให้สามารถใช้งานต่อได้ในอนาคตครับ
import os
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

class TextDataset(Dataset):
    """Minimal map-style dataset that serves raw texts by position."""

    def __init__(self, texts):
        # Kept by reference; nothing is copied or tokenized here.
        self.texts = texts

    def __len__(self):
        """Return how many texts are held."""
        size = len(self.texts)
        return size

    def __getitem__(self, idx):
        """Return the text stored at position ``idx``, unchanged."""
        sample = self.texts[idx]
        return sample

def get_preds(model, tokenizer, texts, device, batch_size=32, max_length=128, label_map=None):
    """
    Predict one class per text with ``model``, in batches.

    Args:
        model: Classification model; called as ``model(**batch)`` and
            expected to expose ``.logits`` on its output.
        tokenizer: Callable that turns a list of strings into a dict of
            tensors.
        texts: The strings to classify.
        device: Device the tokenized batches are moved to.
        batch_size: Number of texts per batch.
        max_length: Length every sequence is truncated / padded to.
        label_map: Optional ``{class_id: label}`` mapping. Ids missing from
            it are returned as ``str(class_id)``.

    Returns:
        A list with one prediction per text, in input order: integer class
        ids, or mapped labels when ``label_map`` is given.
    """
    dataset = TextDataset(texts)

    def collate_fn(batch):
        # Tokenize at batch time so the dataset only has to hold raw strings.
        return tokenizer(
            batch,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=max_length
        )

    # os.cpu_count() may return None, which DataLoader rejects as a worker
    # count; fall back to loading in the main process in that case.
    num_workers = 0 if os.name == 'nt' else (os.cpu_count() or 0)
    loader = DataLoader(
        dataset, batch_size=batch_size, shuffle=False,
        num_workers=num_workers,
        pin_memory=(device.type == "cuda"), collate_fn=collate_fn
    )

    preds = []
    model.eval()
    # Inference only: disable gradient tracking once for the whole loop.
    with torch.no_grad():
        for batch in tqdm(loader, desc="Predicting"):
            batch = {k: v.to(device) for k, v in batch.items()}
            logits = model(**batch).logits
            pred = logits.argmax(dim=-1).cpu().tolist()
            if label_map is not None:
                pred = [label_map.get(p, str(p)) for p in pred]
            preds.extend(pred)
    return preds

def main():
    """
    Label every sheet of the input workbook with sentiment and topic.

    Loads the two fine-tuned checkpoints, reads all sheets of ``input_excel``
    and, for each sheet with a ``message`` column, adds ``sentiment_pred``
    and ``topic_pred`` columns. Sheets without that column are reported and
    left out. The labelled sheets are written to ``output_excel``; nothing is
    written when no sheet qualifies.
    """
    #  SET PATH
    sentiment_ckpt = r"C:\othai\ML_BERT\Model\model_sentiment\checkpoint-1000"
    topic_ckpt     = r"C:\othai\ML_BERT\Model\model_topic\checkpoint-1200"
    input_excel    = r"C:\othai\ML_BERT\ML_Excel\extern11.xlsx"
    output_excel   = r"C:\othai\ML_BERT\ML_Excel\result_dual1.xlsx"

    # Class id -> label name, used to decode the model outputs.
    # NOTE(review): these look like the inverses of the maps built in the
    # data-preparation cell — keep them in sync if the label lists change.
    sentiment_map = {0: "Positive", 1: "Neutral", 2: "Negative"}
    topic_map = {
        0:"Activity", 1:"After-Service", 2:"Appreciation", 3:"Chinese Investors",
        4:"Common Area - Facilities", 5:"Construction Materials", 6:"Design",
        7:"Engaging", 8:"Financial & Branding", 9:"Intention", 10:"Location",
        11:"Pet", 12:"Politics (delete after)", 13:"Price & Promotion", 14:"Quality",
        15:"Security", 16:"Space"
    }

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_ckpt)
    sentiment_model = AutoModelForSequenceClassification.from_pretrained(sentiment_ckpt).to(device)
    topic_tokenizer = AutoTokenizer.from_pretrained(topic_ckpt)
    topic_model = AutoModelForSequenceClassification.from_pretrained(topic_ckpt).to(device)

    # --- LOAD ALL SHEETS ---
    # sheet_name=None makes read_excel return {sheet name: DataFrame}.
    all_sheets = pd.read_excel(input_excel, sheet_name=None)
    output_sheets = {}

    for sheet_name, df in all_sheets.items():
        if "message" not in df.columns:
            print(f"⚠️  Sheet {sheet_name} ไม่มีคอลัมน์ 'message' ข้าม sheet นี้")
            continue
        # Empty cells become "" so every row still gets a prediction.
        df["message"] = df["message"].fillna("").astype(str)
        texts = df["message"].tolist()
        sent_preds = get_preds(sentiment_model, sentiment_tokenizer, texts, device, label_map=sentiment_map)
        topic_preds = get_preds(topic_model, topic_tokenizer, texts, device, label_map=topic_map)
        df["sentiment_pred"] = sent_preds
        df["topic_pred"] = topic_preds
        output_sheets[sheet_name] = df

    # A workbook needs at least one sheet; closing an ExcelWriter that wrote
    # none fails, so stop here with a clear message instead.
    if not output_sheets:
        print(f"No sheet in {input_excel} has a 'message' column; nothing exported.")
        return

    # --- EXPORT ---
    with pd.ExcelWriter(output_excel) as writer:
        for sheet_name, df in output_sheets.items():
            df.to_excel(writer, sheet_name=sheet_name, index=False)

    print(f"Exported all sheets to {output_excel}")

if __name__ == "__main__":
    main()
