In [33]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
plt.rc('font', family='Tahoma')  
plt.rcParams['axes.unicode_minus'] = False  

In [34]:
df = pd.read_csv('dataset_main_15000.csv', sep=',', engine='python', names=['Comment', 'Sentiment'], skiprows=1)


In [35]:
df_selected = df[['Comment','Sentiment']]

In [36]:
print(df_selected.head)

<bound method NDFrame.head of                                                  Comment Sentiment
0      --''  เธอดีนะแค่ 4 ปี ของเราแอบชอบรุ่นพี่ ห่าง...       pos
1       #ตุ๊กตาชานมไข่มุก #น่ารักน่ากินงั่มมม เรามาถึ...       pos
2      "ครูพลคงไม่ชินกับหน้าธรรมชาติของอินนะค่ะ อินต่...       pos
3      "รู้สึกภูมิใจกับสิ่งใหม่ที่ดีกว่าที่สุด" นี้แห...       pos
4      #2019 ดูไปร้อยรอบแล้วมั้งแต่ยังฟินอยู่เลยอ่าทำ...       pos
...                                                  ...       ...
14995  เฮียครับแฟนผมหัดถอยหลังเข้าซองบ่อยจะไปทำใบขับข...       neg
14996  แฮงค์หวะ บอกแล้วกินเบียร์ช้างบวกอากาศร้อน นี่จ...       neg
14997                  โฮกาเดนคนเดียวสี่ อิเหี้ยยยยยยยยย       neg
14998  ไฮเนเก้น ปรับราคาลงเหมือนบุหรี่บ้างดิวะ เอาราค...       neg
14999                          ไฮเนเก้นท์ ถูกกินแล้วขมคอ       neg

[15000 rows x 2 columns]>


In [37]:
df = df.dropna(subset=['Comment', 'Sentiment'])


In [38]:
df['Sentiment'] = df['Sentiment'].str.lower().str.strip()


In [39]:
sentiment_counts = df_selected['Sentiment'].value_counts()
print(df['Sentiment'].value_counts())

Sentiment
pos    5000
neu    5000
neg    5000
Name: count, dtype: int64


In [40]:
print(df_selected['Sentiment'].unique())


['pos' 'neu' 'neg']


In [41]:
df_selected = df[['Comment', 'Sentiment']].dropna()
df_selected['Sentiment'] = df_selected['Sentiment'].str.lower().str.strip()

label2id = {'neg': 0, 'neu': 1, 'pos': 2}
df_selected['label'] = df_selected['Sentiment'].map(label2id)
df_selected = df_selected[df_selected['label'].notnull()]


In [42]:
from sklearn.model_selection import train_test_split

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_selected['Comment'].tolist(),
    df_selected['label'].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=df_selected['label']
)


In [43]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("airesearch/wangchanberta-base-att-spm-uncased")

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

In [44]:
import torch

class ThaiDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()} | {'labels': torch.tensor(self.labels[idx])}

train_dataset = ThaiDataset(train_encodings, train_labels)
val_dataset = ThaiDataset(val_encodings, val_labels)

from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "airesearch/wangchanberta-base-att-spm-uncased",
    num_labels=3
)

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at airesearch/wangchanberta-base-att-spm-uncased and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [45]:
from transformers import Trainer,TrainingArguments
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc}

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=500,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train()


  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6319,0.361804,0.873333
2,0.2925,0.315043,0.880333
3,0.1608,0.508062,0.880667
4,0.0696,0.625407,0.876
5,0.0341,0.677707,0.878333


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


TrainOutput(global_step=3750, training_loss=0.22145108947753905, metrics={'train_runtime': 1479.6989, 'train_samples_per_second': 40.549, 'train_steps_per_second': 2.534, 'total_flos': 3946701265920000.0, 'train_loss': 0.22145108947753905, 'epoch': 5.0})

In [46]:
results = trainer.evaluate()
print("ผลลัพธ์จาก validation set:", results)

  return forward_call(*args, **kwargs)


ผลลัพธ์จาก validation set: {'eval_loss': 0.6777074337005615, 'eval_accuracy': 0.8783333333333333, 'eval_runtime': 19.8294, 'eval_samples_per_second': 151.29, 'eval_steps_per_second': 9.481, 'epoch': 5.0}


In [49]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# โหลด tokenizer และโมเดลจาก checkpoint ที่เทรนเสร็จ
checkpoint_path = "./results/checkpoint-3750"  # เปลี่ยนตรงนี้ถ้าอยากใช้ checkpoint อื่น
tokenizer = AutoTokenizer.from_pretrained("airesearch/wangchanberta-base-att-spm-uncased")
model = AutoModelForSequenceClassification.from_pretrained("./results/checkpoint-3750")

# ฟังก์ชันทำนาย
def predict_sentiment(text):
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()
    label_map = {0: 'neg', 1: 'neu', 2: 'pos'}
    return label_map[predicted_class]

# ใช้งาน
text = "บริการดีมากๆเลย"
print("ข้อความ:", text)
print("ผลการวิเคราะห์:", predict_sentiment(text))


ข้อความ: บริการดีมากๆเลย
ผลการวิเคราะห์: pos


  return forward_call(*args, **kwargs)
