# Import libraries

In [None]:
!pip install simpletransformers
!pip freeze | grep simpletransformers
!git clone https://github.com/nguyenvulebinh/vietnamese-electra

Collecting simpletransformers
  Downloading simpletransformers-0.70.0-py3-none-any.whl (315 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/315.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m307.2/315.5 kB[0m [31m9.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.5/315.5 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Collecting datasets (from simpletransformers)
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorboardx (from simpl

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from simpletransformers.classification import ClassificationModel
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')

import string
import os
import torch
import transformers as ppb
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import ElectraTokenizerFast
import urllib.request
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Preparing data

In [None]:
import getpass
import json
import os

# Kaggle credentials must never be stored in the notebook. They are taken from
# the KAGGLE_USERNAME / KAGGLE_KEY environment variables when set, otherwise
# the user is prompted (the key is read without echo).
# NOTE: any API key that was ever committed in this cell should be revoked on
# kaggle.com and replaced with a fresh one.
api_token = {
    "username": os.environ.get("KAGGLE_USERNAME") or input("Kaggle username: "),
    "key": os.environ.get("KAGGLE_KEY") or getpass.getpass("Kaggle API key: "),
}

# Resolve the config directory from the home directory instead of hard-coding
# /root, and tolerate the directory already existing on a re-run.
kaggle_dir = os.path.expanduser("~/.kaggle")
os.makedirs(kaggle_dir, exist_ok=True)
kaggle_json_path = os.path.join(kaggle_dir, "kaggle.json")

with open(kaggle_json_path, 'w') as file:
    json.dump(api_token, file)

# Restrict the credentials file to the owner (read/write only).
os.chmod(kaggle_json_path, 0o600)

In [None]:
# Kaggle API usage reference:
# https://technowhisp.com/kaggle-api-python-documentation/
from kaggle.api.kaggle_api_extended import KaggleApi

# Dataset identifier passed to the Kaggle API.
kaggle_dataset = 'duyminhnguyentran/csc15105'

# Authenticate to Kaggle (presumably using the kaggle.json written by the
# previous cell).
api = KaggleApi()
api.authenticate()

# Download the dataset archive and unzip it.
api.dataset_download_files(kaggle_dataset, unzip=True)

In [None]:
# Dataset file, relative to the working directory.
file_path = "Project1_Data.json"

# Keep the parsed JSON in `data`: the preprocessing cell iterates over it again.
with open(file_path, "r", encoding="utf8") as json_file:
    data = json.loads(json_file.read())

# Tabular view of the raw records; preview the first rows.
df = pd.DataFrame(data)
df.head()

Unnamed: 0,id,question,title,text,label
0,u7-1570446247_1,Quang Hải giành được chức vô địch U21 quốc gia...,Nguyễn Quang Hải (sinh 1997),"Năm 2013 , Nguyễn Quang Hải giành chức vô địch...",True
1,u7-1570446247_2,Quang Hải giành được chức vô địch U21 quốc gia...,Nguyễn Quang Hải (sinh 1997),"Sau chức vô địch U-21 quốc gia 2013 , Nguyễn Q...",True
2,u7-1570446247_0,Quang Hải giành được chức vô địch U21 quốc gia...,Nguyễn Quang Hải (sinh 1997),Anh bắt đầu gia nhập lò đào tạo trẻ Hà Nội T&T...,False
3,u7-1570446247_3,Quang Hải giành được chức vô địch U21 quốc gia...,Nguyễn Quang Hải (sinh 1997),"Năm 2011 , Nguyễn Quang Hải mới 14 tuổi được g...",False
4,u7-1570445661_0,Mỗi hiệp bóng đá kéo dài bao lâu,Bóng đá,Một trận đấu bóng đá thông thường có hai hiệp ...,True


In [None]:
# Shuffle the rows with a fixed seed so that a re-run yields the same order.
# NOTE: `df` is rebuilt from the raw `data` records in the preprocessing cell
# further down, so this shuffled order does not reach the train/val/test split.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Remove stopwords

In [None]:
# Folder holding the NLTK stopwords corpus. Resolved against the current
# user's home directory rather than a hard-coded /root so the notebook also
# works outside a root Colab session.
# NOTE(review): assumes nltk.download() used its ~/nltk_data location - confirm
# if NLTK_DATA is customised.
nltk_stopwords_dir = os.path.join(os.path.expanduser("~"), "nltk_data", "corpora", "stopwords")
# Plain-text Vietnamese stop-word list, one entry per line.
stopwords_url = "https://raw.githubusercontent.com/stopwords/vietnamese-stopwords/master/vietnamese-stopwords.txt"


# Function to download file from URL
def download_file(url, save_path):
    """Download ``url`` and store it at ``save_path``.

    The download is best-effort: any failure is reported on stdout instead of
    being raised, so the notebook keeps running.

    Args:
        url: Address of the resource to fetch.
        save_path: Destination file path; its parent directory is created
            when it does not exist yet.

    Returns:
        True when the file was downloaded, False when an error occurred.
    """
    try:
        # urlretrieve does not create missing directories on its own.
        target_dir = os.path.dirname(save_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        urllib.request.urlretrieve(url, save_path)
    except Exception as e:
        print("Error downloading file:", e)
        return False
    print("File downloaded successfully.")
    return True

def convert_to_nltk_stopwords(txt_file_path, nltk_stopwords_path):
    """Rewrite a stop-word text file as a newline-separated word list.

    Every line of ``txt_file_path`` is stripped of surrounding whitespace,
    blank lines are discarded, and the remaining entries are written to
    ``nltk_stopwords_path`` joined by newlines (no trailing newline).
    Errors are printed rather than raised; nothing is returned.
    """
    try:
        with open(txt_file_path, 'r', encoding='utf-8') as source:
            stripped_lines = (raw_line.strip() for raw_line in source)
            entries = [entry for entry in stripped_lines if entry]

        with open(nltk_stopwords_path, 'w', encoding='utf-8') as target:
            target.write('\n'.join(entries))

        print("Stop words file converted and saved successfully.")
    except Exception as e:
        print("Error converting and saving stop words file:", e)

# Both files live in the NLTK stopwords folder: the raw download, and the
# cleaned copy named "vietnamese" that stopwords.words('vietnamese') reads
# in a later cell.
stopwords_file_path = os.path.join(nltk_stopwords_dir, "vietnamese-stopwords.txt")
nltk_stopwords_path = os.path.join(nltk_stopwords_dir, "vietnamese")

# Fetch the raw list, then write the cleaned copy.
download_file(url=stopwords_url, save_path=stopwords_file_path)
convert_to_nltk_stopwords(
    txt_file_path=stopwords_file_path,
    nltk_stopwords_path=nltk_stopwords_path,
)


File downloaded successfully.
Stop words file converted and saved successfully.


In [None]:
# Vietnamese stop words, read through NLTK's stopwords corpus reader.
stop_words = set(stopwords.words('vietnamese'))

# Individual punctuation characters, used to recognise punctuation-only tokens.
_PUNCTUATION_CHARS = frozenset(string.punctuation)


def preprocess_text(text):
    """Lower-case ``text`` and remove stop words and punctuation tokens.

    Args:
        text: Raw input string.

    Returns:
        The surviving tokens, lower-cased and joined by single spaces.
    """
    kept_tokens = []
    for token in nltk.word_tokenize(text):
        lowered = token.lower()
        # NOTE(review): matching is per token, so stop-word entries that span
        # several words would never match - confirm against the word list.
        if lowered in stop_words:
            continue
        # A token is punctuation when every character is a punctuation mark.
        # Testing `token in string.punctuation` would be a substring check and
        # would let multi-character tokens such as "..." or "--" through.
        if all(char in _PUNCTUATION_CHARS for char in token):
            continue
        kept_tokens.append(lowered)
    return ' '.join(kept_tokens)

# Columns that go through preprocess_text; 'id' and 'label' are copied as-is.
text_fields = ('question', 'title', 'text')

preprocessed_data = []
for entry in data:
    # Insertion order (id, text fields, label) fixes the DataFrame column order.
    preprocessed_entry = {'id': entry['id']}
    preprocessed_entry.update(
        (field, preprocess_text(entry[field])) for field in text_fields
    )
    preprocessed_entry['label'] = entry['label']
    preprocessed_data.append(preprocessed_entry)

# Rebuild `df` from the cleaned records and preview the first rows.
df = pd.DataFrame(preprocessed_data)

df.head()

Unnamed: 0,id,question,title,text,label
0,u7-1570446247_1,quang hải giành chức vô địch u21 quốc gia bao ...,nguyễn quang hải sinh 1997,2013 nguyễn quang hải giành chức vô địch u21 q...,True
1,u7-1570446247_2,quang hải giành chức vô địch u21 quốc gia bao ...,nguyễn quang hải sinh 1997,chức vô địch u-21 quốc gia 2013 nguyễn quang h...,True
2,u7-1570446247_0,quang hải giành chức vô địch u21 quốc gia bao ...,nguyễn quang hải sinh 1997,bắt đầu gia nhập lò đào trẻ hà nội t t 9 2006,False
3,u7-1570446247_3,quang hải giành chức vô địch u21 quốc gia bao ...,nguyễn quang hải sinh 1997,2011 nguyễn quang hải 14 gọi đội tuyển u-16 vi...,False
4,u7-1570445661_0,hiệp bóng đá kéo bao,bóng đá,trận đấu bóng đá thông hai hiệp hiệp 45 phút t...,True


# Train/validation/test split

In [None]:
# Target share of each split.
train_ratio, val_ratio, test_ratio = 0.7, 0.15, 0.15

# Two-stage split: carve off the training rows first, then divide the
# remaining rows between validation and test.
train, holdout = train_test_split(df, test_size=1 - train_ratio, random_state=42)
val, test = train_test_split(
    holdout,
    test_size=test_ratio / (test_ratio + val_ratio),
    random_state=42,
)

# Encode labels: the encoder is fitted on the training labels only and then
# applied unchanged to the other two splits.
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train['label'])
val_labels, test_labels = [
    label_encoder.transform(part['label']) for part in (val, test)
]


def _question_plus_text(part):
    """Return the 'question' and 'text' columns joined by a single space."""
    return part['question'] + " " + part['text']


def _as_model_frame(texts, labels):
    """Build the two-column frame (text first, label second) used for training."""
    return pd.DataFrame({'concatenated_data': texts, 'label': labels})


train_concatenated_data = _question_plus_text(train)
val_concatenated_data = _question_plus_text(val)
test_concatenated_data = _question_plus_text(test)

train_df = _as_model_frame(train_concatenated_data, train_labels)
val_df = _as_model_frame(val_concatenated_data, val_labels)
test_df = _as_model_frame(test_concatenated_data, test_labels)

# Classification using Electra

In [None]:
# Hyperparameters handed to simpletransformers' ClassificationModel.
train_args = {
    # Reproducibility: fix the seed so repeated runs are comparable.
    "manual_seed": 42,

    # Input handling / caching
    "reprocess_input_data": True,
    "use_cached_eval_features": False,
    "no_cache": False,
    "max_seq_length": 128,

    # Optimisation
    "fp16": False,
    "num_train_epochs": 10,
    "learning_rate": 2e-5,
    "train_batch_size": 16,
    "eval_batch_size": 32,
    "gradient_accumulation_steps": 4,
    "warmup_steps": 1000,
    "weight_decay": 0.01,
    "adam_epsilon": 1e-8,

    # Checkpointing / logging
    "overwrite_output_dir": True,
    "save_steps": 1000,
    "save_model_every_epoch": False,
    "logging_steps": 500,

    # Evaluation during training
    "evaluate_during_training": True,
    "evaluate_during_training_steps": 1000,
    "evaluate_during_training_verbose": True,
    "evaluate_each_epoch": True,

    # NOTE(review): these two settings only take effect when
    # "use_early_stopping" is enabled, which this dict does not do - confirm
    # whether early stopping was intended before turning it on.
    "early_stopping_patience": 3,
    "early_stopping_delta": 0.01,
}

# Class weights for the loss: each class is weighted by its inverse frequency
# in the training labels (normalised to sum to 1), so the rarer class counts
# for more. Weighting classes by their raw frequency would instead push the
# model further towards the majority class.
_class_frequency = np.bincount(train_labels) / len(train_labels)
_inverse_frequency = 1.0 / _class_frequency
class_weights = (_inverse_frequency / _inverse_frequency.sum()).tolist()

# Create a ClassificationModel from the TensorFlow discriminator checkpoint
# of the cloned vietnamese-electra repository.
model = ClassificationModel(
    "electra",
    "/content/vietnamese-electra/model_pretrained/dis",
    from_tf=True,
    args=train_args,
    # Fall back to CPU instead of failing when no GPU is attached.
    use_cuda=torch.cuda.is_available(),
    # NOTE(review): this tokenizer comes from Google's ELECTRA release while
    # the weights come from the Vietnamese checkpoint - confirm that the
    # vocabulary matches the one the checkpoint was pretrained with.
    tokenizer_name="google/electra-base-discriminator",
    weight=class_weights,
)

All TF 2.0 model weights were used when initializing ElectraForSequenceClassification.

All the weights of ElectraForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use ElectraForSequenceClassification for predictions without further training.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

In [None]:
# Fine-tune on the training frame; the validation frame is supplied for the
# evaluate-during-training settings in train_args.
model.train_model(train_df, show_running_loss=True, eval_df=val_df)

  0%|          | 0/25 [00:00<?, ?it/s]

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 1 of 10:   0%|          | 0/793 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/793 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/793 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/793 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/793 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/793 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/793 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/793 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/793 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 10 of 10:   0%|          | 0/793 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

(1980,
 defaultdict(list,
             {'global_step': [198,
               396,
               594,
               792,
               990,
               1000,
               1188,
               1386,
               1584,
               1782,
               1980],
              'train_loss': [0.6322136521339417,
               0.7511555552482605,
               0.5026071667671204,
               0.9250278472900391,
               0.19448396563529968,
               0.5091623067855835,
               0.1877100020647049,
               0.5663117170333862,
               0.3169073462486267,
               0.2506828010082245,
               0.39292919635772705],
              'mcc': [0.0,
               0.0,
               0.0,
               0.0,
               0.0,
               0.0,
               0.0,
               0.238647483562174,
               0.3255596044837877,
               0.31557871100943924,
               0.31684877444989346],
              'accuracy': [0.672312223858

In [None]:
# Evaluate on the held-out test frame and unpack the three returned values.
evaluation = model.eval_model(test_df, verbose=True, silent=True)
result, model_outputs, wrong_predictions = evaluation

# Metrics dictionary for the test split.
print(result)

  0%|          | 0/5 [00:00<?, ?it/s]

{'mcc': 0.3188702179271621, 'accuracy': 0.7394184762605815, 'f1_score': 0.4485981308411215, 'tp': 288, 'tn': 1721, 'fp': 168, 'fn': 540, 'auroc': 0.7240152753162857, 'auprc': 0.5456864416743579, 'eval_loss': 0.4243437539128696}
