# **Prepare Data**

In [None]:
import zipfile

import numpy as np
import pandas as pd
from datasets import (
    ClassLabel,
    Dataset,
    DatasetDict,
    Features,
    Sequence,
    Value,
)
from sklearn.model_selection import train_test_split
from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

In [2]:
# One-time setup: uncomment to unpack the downloaded archives into the
# "test/" and "train/" directories that the loading cell reads from.
# zipfile.ZipFile("test.tsv.zip").extractall("test")
# zipfile.ZipFile("train.tsv.zip").extractall("train")

In [3]:
# Train/test phrases are tab-separated; the sample submission is a regular CSV.
train_df = pd.read_csv("train/train.tsv", delimiter="\t")
test_df = pd.read_csv("test/test.tsv", delimiter="\t")
sub_df = pd.read_csv("sampleSubmission.csv")

# **Explore Data**

In [4]:
# Preview the first rows of the labelled training phrases.
train_df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [5]:
# Preview the first rows of the test phrases (no Sentiment column).
test_df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [6]:
# (rows, columns) of the training and test frames, in that order.
(train_df.shape, test_df.shape)

((156060, 4), (66292, 3))

In [7]:
# Column dtypes and non-null counts for the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   PhraseId    156060 non-null  int64 
 1   SentenceId  156060 non-null  int64 
 2   Phrase      156060 non-null  object
 3   Sentiment   156060 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 4.8+ MB


In [None]:
# Number of training phrases per distinct Sentiment value.
train_df['Sentiment'].value_counts()

5

In [None]:
# Replace missing phrases with empty strings so the tokenizer always receives
# text. Only the text column is filled (a blanket fill would also write ''
# into any other column that had gaps), and the result is rebound instead of
# mutating in place, so the cell stays idempotent on re-run.
test_df = test_df.fillna({"Phrase": ""})

# **Model**

In [None]:
# Checkpoint to fine-tune.
# NOTE(review): an earlier assignment to "microsoft/deberta-v3-base" was
# immediately overwritten by this one, so this is the checkpoint that was
# actually used. The phrases are English while this checkpoint's name suggests
# a Thai-language model -- confirm which one is intended.
model_name = "clicknext/phayathaibert"

# num_labels=5 sizes the classification head for the five Sentiment classes.
config = AutoConfig.from_pretrained(model_name, num_labels=5)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# AutoModelForSequenceClassification (not the bare AutoModel encoder) is
# required: it adds the classification head and returns the loss/logits that
# Trainer and compute_metrics rely on.
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/527 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.26M [00:00<?, ?B/s]

In [None]:
# Hold out 20% of the labelled rows as a dev set; the fixed random_state keeps
# the split reproducible across runs.
train_split, dev_split = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
)
print(f'Train set size: {len(train_split)}')
print(f'Dev set size: {len(dev_split)}')

In [None]:
# Schema for the labelled splits. The target column is exposed as "label":
# Trainer drops columns the model's forward() does not accept, and only a
# column named "label" (or "labels") is passed through as the training target.
# A column left as "Sentiment" would be dropped and no loss could be computed.
features = Features({
    "Phrase": Value("string"),
    "label": ClassLabel(num_classes=5),
})

# Mapping applied to the pandas frames before conversion.
label_column = {"Sentiment": "label"}

dataset = DatasetDict({
    'train': Dataset.from_pandas(
        train_split[['Phrase', 'Sentiment']].rename(columns=label_column),
        features=features,
        preserve_index=False,
    ),
    'dev': Dataset.from_pandas(
        dev_split[['Phrase', 'Sentiment']].rename(columns=label_column),
        features=features,
        preserve_index=False,
    ),
    # The test split has no labels; PhraseId is kept alongside the text.
    'test': Dataset.from_pandas(test_df[['PhraseId', 'Phrase']], preserve_index=False),
})
dataset

In [None]:
def tokenize(examples):
    """Tokenize a batch of examples.

    Reads the "Phrase" entries of ``examples`` and passes them to the
    notebook-level ``tokenizer``, truncating to at most 128 tokens.
    Returns whatever the tokenizer returns for that batch.
    """
    phrases = examples['Phrase']
    return tokenizer(phrases, truncation=True, max_length=128)


# batched=True hands tokenize() many rows at once instead of one at a time.
tokenized_dataset = dataset.map(tokenize, batched=True)
tokenized_dataset

In [None]:
# Pad each batch dynamically at collation time; padded lengths are rounded up
# to a multiple of 8. DataCollatorWithPadding is imported in the notebook's
# import cell (it was previously missing there, which raised NameError here).
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding=True,
    pad_to_multiple_of=8,
)
tokenized_dataset["train"].column_names

In [None]:
# Inspect the tokenized dev example at index 20.
tokenized_dataset['dev'][20]

In [None]:
# Training configuration.
# NOTE(review): the output_dir name looks like a leftover from another
# project -- confirm before sharing checkpoints.
args = TrainingArguments(
    output_dir='prachatai-headline-2',
    report_to="none",
    # Optimisation: 8 sequences per device per step, accumulated over 8 steps
    # before each optimizer update.
    num_train_epochs=2,
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=8,
    fp16=True,
    # Evaluation / checkpointing: both step-based so the best checkpoint
    # (by accuracy) can be restored at the end of training.
    eval_strategy='steps',
    eval_steps=500,
    save_strategy='steps',
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    logging_steps=50,
)

def compute_metrics(eval_preds):
    """Compute classification accuracy for an evaluation pass.

    Args:
        eval_preds: A ``(logits, labels)`` pair. ``logits`` is array-like with
            the class scores on the last axis; ``labels`` holds the reference
            class ids. If ``logits`` is a tuple, its first element is used.

    Returns:
        ``{"accuracy": float}`` -- the fraction of examples whose arg-max
        class equals the reference label.
    """
    logits, labels = eval_preds
    # Some models return a tuple of outputs; the class scores come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)
    # Computed directly with numpy rather than loading a metric object on
    # every evaluation call.
    accuracy = float(np.mean(predictions == np.asarray(labels)))
    return {"accuracy": accuracy}

# Wire the model, arguments, tokenized splits, collator and metric function
# into a Trainer, then run fine-tuning.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["dev"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()