In [None]:
!pip install transformers datasets
!pip install emot
!pip install pythainlp

In [None]:
import pandas as pd
import numpy as np
import re
from pythainlp.tokenize import word_tokenize
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from pythainlp.corpus.common import thai_stopwords
import torch
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import TrainingArguments, Trainer
import os
# Turn off pandas' chained-assignment check for the whole notebook
# (setting the option to None disables the warning).
pd.options.mode.chained_assignment = None

In [None]:
# Load the "wisesight_sentiment" dataset via `datasets.load_dataset`.
dt = load_dataset("wisesight_sentiment")

In [None]:
# Pull the three splits out of the loaded dataset.
train_dt, test_dt, val_dt = dt['train'], dt['test'], dt['validation']

# Turn every split into a pandas DataFrame.
train, test, val = (
    pd.DataFrame.from_dict(split) for split in (train_dt, test_dt, val_dt)
)

In [None]:
# Raw texts of each split as plain Python lists.
text1, text2, text3 = (
    frame['texts'].values.tolist() for frame in (train, test, val)
)

# One count per line: train, test, validation.
print(len(text1), len(text2), len(text3), sep='\n')

# Concatenate in train, test, validation order.
text_list = [*text1, *text2, *text3]
print('total lenght of list',len(text_list))

In [None]:
# Labels of each split as plain Python lists.
sentiment1, sentiment2, sentiment3 = (
    frame['category'].values.tolist() for frame in (train, test, val)
)

# One count per line: train, test, validation.
print(len(sentiment1), len(sentiment2), len(sentiment3), sep='\n')

# Concatenate in the same order as the texts so rows stay aligned.
sentiment_list = [*sentiment1, *sentiment2, *sentiment3]
print('total lenght of list',len(sentiment_list))

In [None]:
# Single frame holding every text next to its label.
data = pd.DataFrame({'texts': text_list, 'category': sentiment_list})

In [None]:
# Display the combined frame.
data

In [None]:
# Label encoding of the 'category' column:
#   0 = 'positive'
#   1 = 'neutral'
#   2 = 'negative'
#   3 = 'question'

# Number of rows per label before any filtering.
data['category'].value_counts()

In [None]:
# Discard the 'question' (3) and 'neutral' (1) rows; every other row is kept.
data = data[~data['category'].isin([1, 3])]

In [None]:
# Relabel 2 ('negative') as 1 so the remaining labels are 0 and 1.
data['category'] = data['category'].replace({2: 1})

In [None]:
# Number of rows per label after filtering and relabelling.
data['category'].value_counts()

In [None]:
# Count missing values in each column of the frame.
data.isnull().sum()

In [None]:
def remove_japanese(x):
    """Strip Japanese/CJK characters from every string in ``x``.

    Removes hiragana (U+3040-U+309F), katakana (U+30A0-U+30FF) and the
    U+4300-U+9FAF ideograph range.

    Args:
        x: pandas Series of strings. Its values are overwritten in place.

    Returns:
        The same Series object ``x``, after cleaning. When ``x`` is a column
        selected from a DataFrame, whether the in-place update is visible in
        that frame depends on pandas' copy-on-write mode, so callers that need
        a guarantee should assign the return value back to the column.
    """
    # One character class covering the three ranges: removing all of them in a
    # single pass gives the same text as three successive substitutions.
    japanese_chars = re.compile('[\u3040-\u309F\u30A0-\u30FF\u4300-\u9faf]')

    # Work on the Series that was passed in (not on a global frame) and write
    # the cleaned values back in one slice assignment instead of row by row.
    x[:] = [japanese_chars.sub('', text) for text in x]
    return x

def remove_emoji(string):
    """Return ``string`` with every run of characters in the ranges below deleted.

    Args:
        string: text to clean.

    Returns:
        A new string without the matched characters.
    """
    ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
        u"\U00002702-\U000027B0",
        # A single wide range from U+24C2 up to U+1F251: besides emoji it also
        # matches everything in between (e.g. kana and CJK ideographs).
        u"\U000024C2-\U0001F251",
    )
    pattern = re.compile("[" + "".join(ranges) + "]+", flags=re.UNICODE)
    return pattern.sub(r'', string)

def remove_stopwords(x, stopwords=None):
    """Drop stopword tokens from a whitespace-separated string.

    The text is split on whitespace and whole tokens are compared with the
    stopword collection. Iterating over the string directly would compare
    single characters, so multi-character stopwords would never be removed.

    Args:
        x: string of tokens separated by whitespace.
        stopwords: optional iterable of stopwords. Defaults to the collection
            returned by ``thai_stopwords()``.

    Returns:
        The remaining tokens joined by single spaces.
    """
    if stopwords is None:
        stopwords = thai_stopwords()
    # A frozenset gives constant-time membership tests whatever iterable was passed.
    stopword_set = frozenset(stopwords)
    return " ".join(token for token in x.split() if token not in stopword_set)

In [None]:
# --- Character-level cleaning ---------------------------------------------
# Every step assigns the whole column back to the frame; no row-wise chained
# `.iloc[i] = ...` writes and no `inplace=True` on a column selection.
data['tok_texts'] = data['texts'].apply(lambda text: re.sub(r'http\S+', '', text))        # URLs
data['tok_texts'] = data['tok_texts'].apply(lambda text: re.sub(r'[A-Za-z]+', '', text))  # Latin letters
data['tok_texts'] = data['tok_texts'].apply(lambda text: re.sub(r'[0-9]+', '', text))     # ASCII digits

# Punctuation and symbols. `\w` does not match combining marks, so a bare
# `[^\w\s]` would also delete Thai above/below vowels and tone marks
# (U+0E31, U+0E34-U+0E3A, U+0E47-U+0E4E); they are kept explicitly here.
data['tok_texts'] = data['tok_texts'].apply(
    lambda text: re.sub(r'[^\w\s\u0E31\u0E34-\u0E3A\u0E47-\u0E4E]', '', text)
)

# Japanese/CJK characters (the helper updates the column it is given).
remove_japanese(data['tok_texts'])

# Emoji.
data['tok_texts'] = data['tok_texts'].apply(remove_emoji)

# --- Drop rows left without any text ----------------------------------------
# Empty and whitespace-only strings become NaN so dropna() removes them.
data['tok_texts'] = data['tok_texts'].apply(lambda text: text if text.strip() else np.nan)
data = data.dropna()

# --- Tokenise, remove stopwords, normalise spacing ---------------------------
data['tok_texts'] = data['tok_texts'].apply(lambda text: ' '.join(word_tokenize(text)))
data['tok_texts'] = data['tok_texts'].apply(remove_stopwords)
data['tok_texts'] = data['tok_texts'].apply(lambda text: re.sub(" +", " ", text))

# The raw text column is no longer needed.
data = data.drop(columns=['texts'])

In [None]:
# Show the cleaned, tokenised text column.
data['tok_texts']

In [None]:
# Look at rows 50-99 (by position) of the cleaned frame.
data.iloc[50:100]

In [None]:
# Plain Python lists for the splitter and the tokenizer:
# `train` holds the cleaned texts, `answer` the matching labels.
train = data["tok_texts"].tolist()
answer = data["category"].tolist()

In [None]:
# Sanity check: texts and labels must have the same length.
print('Length of Data : ',len(train))
print('Length of Answer : ',len(answer))

In [None]:
# Stratified 80/20 split. A fixed random_state makes the split (and therefore
# the reported metrics) reproducible across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(
    train,
    answer,
    test_size=0.2,
    stratify=answer,
    random_state=42,
)

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer

# Local checkpoint directory shared by the tokenizer and the model.
bert_checkpoint = "/kaggle/input/bert-base-multilingual-cased/bert-base-multilingual-cased"

tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
# Two output labels, matching the 0/1 categories kept in the data.
model = BertForSequenceClassification.from_pretrained(bert_checkpoint, num_labels=2)

In [None]:
# Use the GPU when one is available, otherwise stay on the CPU instead of
# raising on a machine without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

In [None]:
# Tokenizer settings shared by both splits: truncate to at most 512 tokens
# and pad the encoded sequences.
encode_options = {"truncation": True, "padding": True, "max_length": 512}

train_encodings = tokenizer(X_train, **encode_options)
# Despite the name, these are the encodings of the validation texts (X_val).
test_encodings = tokenizer(X_val, **encode_options)

In [None]:
len(X_train),len(X_val)

In [None]:
X_train

In [None]:
# Create torch dataset

class Dataset(torch.utils.data.Dataset):
    """Map-style torch dataset over tokenizer encodings and optional labels.

    Args:
        encodings: mapping from field name (it must contain "input_ids") to a
            per-example sequence of values.
        labels: optional per-example labels (list, array or tensor). When it is
            omitted or empty, items carry no "labels" entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors."""
        item = {name: torch.tensor(values[idx]) for name, values in self.encodings.items()}
        # Explicit None/length check: plain truthiness raises for numpy arrays
        # and tensors holding more than one element.
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the "input_ids" field."""
        return len(self.encodings["input_ids"])

In [None]:
# Pair each set of encodings with its labels.
train_dataset = Dataset(encodings=train_encodings, labels=y_train)
# `test_encodings` holds the validation texts, hence the validation labels.
val_dataset = Dataset(encodings=test_encodings, labels=y_val)

In [None]:
# Inspect one encoded training example (a dict of tensors).
train_dataset[5]

In [None]:
def compute_metrics(p):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        p: object that unpacks into ``(predictions, labels)``; predictions are
            per-class scores with the classes along axis 1.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    pred, labels = p
    # The predicted class is the index of the highest score in each row.
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [None]:
# Training configuration and Trainer.
# NOTE(review): newer transformers releases name the `evaluation_strategy`
# argument `eval_strategy` - confirm against the installed version.
training_config = {
    "report_to": "none",              # no external experiment tracker
    "output_dir": "output",
    "num_train_epochs": 15,
    "per_device_train_batch_size": 16,
    "save_steps": 4000,
    "logging_steps": 1000,
    "evaluation_strategy": "epoch",   # evaluate at the end of every epoch
}
args = TrainingArguments(**training_config)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Set the WANDB_DISABLED environment variable for this process.
# NOTE(review): this cell runs after the Trainer has already been created with
# report_to="none" - confirm the variable is still needed.
os.environ["WANDB_DISABLED"] = "true"
# NOTE(review): this only binds a module-level variable; nothing in the visible
# notebook reads it - confirm whether it was meant to be passed as an argument.
no_deprecation_warning=True

In [None]:
# Fine-tune the model with the Trainer configured above.
trainer.train()

In [None]:
# Evaluate on the Trainer's eval dataset and display the returned metrics.
trainer.evaluate()

In [None]:
# Persist the model under the Kaggle working directory.
model.save_pretrained("/kaggle/working/sentiment_custom_model")