<a href="https://colab.research.google.com/github/GabeRichmond/tagalog-bert-comparative-analysis/blob/main/Data_Pre_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Data Pre-processing Notebook**

*An Analysis of Quantized and Non-Quantized Tagalog BERT Model Performance using Benchmark Datasets*

Group NRS
*   NGO, Gabriel Richmond R.
*   REYES, Aramis Faye D.
*   SANTIAGO, Spencer Ivan S.

#### **Imports**

In [None]:
# Google Drive
from google.colab import drive
drive.mount('/content/drive/')

# Huggingface Datasets
!pip install datasets
import datasets
from datasets import Dataset, metric

# Huggingface Transformers
!pip install transformers
import transformers
from transformers import BertTokenizer

# Pandas
import pandas as pd

# Regular Expression
import re

Mounted at /content/drive/
Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggin

In [None]:
# Tokenizer
# Load the pretrained BertTokenizer published under the model ID below; its
# vocabulary and config files are downloaded on first use. The tokenizer is
# used later by preprocess_function to tokenize every split.
tokenizer = BertTokenizer.from_pretrained('jcblaise/bert-tagalog-base-uncased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/256k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

In [None]:
# Warnings (to disable)
import warnings
# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Turn off pandas' chained-assignment warning; the cleaning cells below assign
# columns on filtered frames, which may otherwise trigger it.
pd.options.mode.chained_assignment = None

# Only show error-level log messages from transformers.
transformers.logging.set_verbosity_error()

### **Select and Load Datasets**


*   Select exactly one dataset to pre-process at a time: only one of the flags below may be set to 'True'.
*   You may switch between datasets without restarting the runtime: once pre-processing of one dataset has finished, change these values and re-run all cells from the selection cell below onward.


In [None]:
hatespeech = True      # Set to 'True' for Hate Speech Dataset
dengue = False          # Set to 'True' for Dengue Dataset
newsph = False          # Set to 'True' for NewsPH-NLI Dataset

# Fail fast when more than one dataset is selected: the loading cell only
# honours the first 'True' flag (if/elif), while each cleaning cell checks its
# own flag, so several 'True' values would run one dataset's cleaning steps on
# another dataset's frames.
if sum(bool(flag) for flag in (hatespeech, dengue, newsph)) > 1:
  raise ValueError("Select only one dataset: at most one of hatespeech, dengue, newsph may be True")

In [None]:
def _read_raw_splits(train_path, valid_path, test_path, **read_options):
  """Read the raw train/valid/test CSV files, in that order.

  Every file is parsed with a newline line terminator; `read_options` are
  passed through to pd.read_csv unchanged (used for `index_col`).
  """
  return tuple(
      pd.read_csv(csv_path, lineterminator = '\n', **read_options)
      for csv_path in (train_path, valid_path, test_path)
  )

# Load the raw splits of whichever dataset is selected
if hatespeech == True:
  train_df_raw, val_df_raw, test_df_raw = _read_raw_splits(
      '/content/drive/MyDrive/BERT/hatespeech/datasets/raw/train.csv',   # Training (raw)
      '/content/drive/MyDrive/BERT/hatespeech/datasets/raw/valid.csv',   # Validation (raw)
      '/content/drive/MyDrive/BERT/hatespeech/datasets/raw/test.csv',    # Testing (raw)
  )

  print("Hate Speech Dataset Loaded Successfully")

elif dengue == True:
  train_df_raw, val_df_raw, test_df_raw = _read_raw_splits(
      '/content/drive/MyDrive/BERT/dengue/datasets/raw/train.csv',       # Training (raw)
      '/content/drive/MyDrive/BERT/dengue/datasets/raw/valid.csv',       # Validation (raw)
      '/content/drive/MyDrive/BERT/dengue/datasets/raw/test.csv',        # Testing (raw)
  )

  print("Dengue Dataset Loaded Successfully")

elif newsph == True:
  # For NewsPH-NLI the first CSV column is read as the DataFrame index
  train_df_raw, val_df_raw, test_df_raw = _read_raw_splits(
      '/content/drive/MyDrive/BERT/newsph/datasets/raw/train.csv',       # Training (raw)
      '/content/drive/MyDrive/BERT/newsph/datasets/raw/valid.csv',       # Validation (raw)
      '/content/drive/MyDrive/BERT/newsph/datasets/raw/test.csv',        # Testing (raw)
      index_col = 0,
  )

  print("NewsPH-NLI Dataset Loaded Successfully")

else:
  raise Exception("No Dataset Selected")

Hate Speech Dataset Loaded Successfully


In [None]:
# Display the raw training split as loaded, before any cleaning
train_df_raw

Unnamed: 0,text,label
0,GASTOS NI VP BINAY SA POLITICAL ADS HALOS P7-M...,0
1,Mar Roxas TANG INA TUWID NA DAAN DAW .. EH SYA...,1
2,Salamat sa walang sawang suporta ng mga taga m...,0
3,@rapplerdotcom putangina mo binay TAKBO PA,1
4,"Binay with selective amnesia, forgetting about...",0
...,...,...
9995,"Mar Roxas on the rise, w/ momentum, machinery,...",0
9996,Nog nog \rPandak\rLaki sa hirap \rYan si Binay...,1
9997,"Alan Cayetano 'confirms' Palace, Roxas, Poe be...",0
9998,Mas nakakainis ad ni Mar kaysa kay Binay.,1


### **Data Pre-processing**

In [None]:
# Specify Parts to Remove
# Each pattern is compiled once here and reused by the removal functions defined in this cell.
urls = re.compile(r"https?://\S+")                                # URLs: "http://" or "https://" followed by non-whitespace characters (e.g. https://google.com.ph/, https://bit.ly/, etc.)
retweets = re.compile(r'^RT[\s]+')                                # Retweet Indicators: an upper-case "RT" at the very start plus the whitespace after it
mentions = re.compile(r"^@\S+|\s@\S+")                            # Mentions/Handles at the start of the text or after a whitespace character, which is matched too (e.g. @MapuaUniv, @govph, etc.)
punctuations = re.compile(r'[^\w\s]')                             # Punctuations: any character that is neither a word character nor whitespace (e.g. #, ?, !, etc.)

# Removal Function for URLs
def match_expr(pattern, string):
  """Return True when `pattern` matches anywhere in `string`.

  Args:
    pattern: Compiled regular expression (any object exposing `.search`).
    string: Value to inspect. Values that are not text (e.g. None or the
      float NaN found in missing cells) are reported as not matching instead
      of raising TypeError inside `pattern.search`.

  Returns:
    bool: True if a match is found, False otherwise.
  """
  if not isinstance(string, (str, bytes)):
    return False
  return pattern.search(string) is not None

def get_data_wo_urls(dataset):
  """Drop every entry whose text contains a URL.

  Where the text is read from is decided by the type of `dataset`, not by the
  global `newsph` flag, so the function stays correct when the dataset
  selection changes without this cell being re-run.

  Args:
    dataset: Either a DataFrame with a `text` column (how the Hate Speech and
      Dengue cells call it) or a Series of texts (how the NewsPH-NLI cell
      calls it, one sentence column at a time).

  Returns:
    An object of the same kind as `dataset`, restricted to the rows/entries
    in which the `urls` pattern does not match.
  """
  texts = dataset.text if isinstance(dataset, pd.DataFrame) else dataset
  has_url = texts.apply(lambda value: match_expr(urls, value))

  # A boolean array is always treated as a row mask, including for empty
  # input (an empty list would select zero columns of a DataFrame instead).
  keep_mask = ~has_url.to_numpy(dtype = bool)
  return dataset[keep_mask]

# Removal Function for Retweet Indicators, Mentions/Handles, and Punctuations (including hashtag signs)
def process_text(text):
  """Strip retweet markers, mentions and punctuation, then trim and lowercase.

  Args:
    text: The text to clean. Values that are not strings (e.g. None or the
      float NaN found in missing cells) are returned unchanged so that the
      N/A removal step later in the notebook can drop them, instead of
      failing with TypeError inside `re.sub`.

  Returns:
    The cleaned, lower-cased string, or `text` itself when it is not a string.
  """
  if not isinstance(text, str):
    return text

  # Order matters: the retweet marker and mentions are removed while the '@'
  # sign is still present, before the punctuation pass strips it.
  for pattern in (retweets, mentions, punctuations):
    text = pattern.sub('', text)

  # Return as Uncased Text
  return text.strip().lower()

#### Hate Speech

In [None]:
# Removal of Unnecessary Text
# For each split: drop the rows containing a URL, then replace `text` with its cleaned form.
if hatespeech == True:
  # Training Dataset
  train_df_raw = get_data_wo_urls(train_df_raw).assign(
      text = lambda kept_rows: kept_rows.text.apply(process_text))

  # Validation Dataset
  val_df_raw = get_data_wo_urls(val_df_raw).assign(
      text = lambda kept_rows: kept_rows.text.apply(process_text))

  # Testing Dataset
  test_df_raw = get_data_wo_urls(test_df_raw).assign(
      text = lambda kept_rows: kept_rows.text.apply(process_text))

#### Dengue

In [None]:
# Removal of Unnecessary Text
# For each split: drop the rows containing a URL and clean the `text` column,
# then take an independent copy for QAT. The copy replaces a second URL-filter
# and cleaning pass over the already-cleaned frame, which guarantees that the
# QAT frame carries exactly the same text as the regular one.
if dengue == True:
  # Training Dataset
  train_df_raw = get_data_wo_urls(train_df_raw)
  train_df_raw.text = train_df_raw.text.apply(process_text)

  # for QAT
  train_df_raw_qat = train_df_raw.copy()


  # Validation Dataset
  val_df_raw = get_data_wo_urls(val_df_raw)
  val_df_raw.text = val_df_raw.text.apply(process_text)

  # for QAT
  val_df_raw_qat = val_df_raw.copy()


  # Testing Dataset
  test_df_raw = get_data_wo_urls(test_df_raw)
  test_df_raw.text = test_df_raw.text.apply(process_text)

  # for QAT
  test_df_raw_qat = test_df_raw.copy()

In [None]:
# Convert Label Datatypes to Float (for QAT)
if dengue == True:
  # Every label column is mapped to the float dtype
  convert_dict = dict.fromkeys(['absent', 'dengue', 'health', 'mosquito', 'sick'], float)

  train_df_raw_qat = train_df_raw_qat.astype(dtype = convert_dict)   # Training Dataset
  val_df_raw_qat = val_df_raw_qat.astype(dtype = convert_dict)       # Validation Dataset
  test_df_raw_qat = test_df_raw_qat.astype(dtype = convert_dict)     # Testing Dataset

In [None]:
# Convert Labels to List
if dengue == True:
  label_columns = ['absent', 'dengue', 'health', 'mosquito', 'sick']

  def _pack_labels(frame):
    """Store each row's five label values as one list in `labels`, then drop the five source columns."""
    frame['labels'] = frame[label_columns].values.tolist()
    return frame.drop(columns = label_columns)

  # Training Dataset (regular, then QAT)
  train_df_raw = _pack_labels(train_df_raw)
  train_df_raw_qat = _pack_labels(train_df_raw_qat)

  # Validation Dataset (regular, then QAT)
  val_df_raw = _pack_labels(val_df_raw)
  val_df_raw_qat = _pack_labels(val_df_raw_qat)

  # Testing Dataset (regular, then QAT)
  test_df_raw = _pack_labels(test_df_raw)
  test_df_raw_qat = _pack_labels(test_df_raw_qat)

#### NewsPH-NLI

In [None]:
# Removal of Unnecessary Text
if newsph == True:
  def _clean_sentence_pair(frame):
    """Clean the `s1` and `s2` columns of one split, one column after the other.

    Per column: the URL-free entries are assigned back to the column, rows
    that now hold a missing value are dropped, and the remaining texts are
    passed through process_text.
    """
    for column in ('s1', 's2'):
      frame[column] = get_data_wo_urls(frame[column])
      frame = frame.dropna()
      frame[column] = frame[column].apply(process_text)
    return frame

  train_df_raw = _clean_sentence_pair(train_df_raw)   # Training Dataset
  val_df_raw = _clean_sentence_pair(val_df_raw)       # Validation Dataset
  test_df_raw = _clean_sentence_pair(test_df_raw)     # Testing Dataset

### **Save Datasets**

In [None]:
# Removal of N/A Entries
# Each split loses its rows with missing values and gets a fresh 0..n-1 index.

# Training Dataset
train_df = train_df_raw.dropna().reset_index(drop = True)

# Validation Dataset
val_df = val_df_raw.dropna().reset_index(drop = True)

# Testing Dataset
test_df = test_df_raw.dropna().reset_index(drop = True)

# for Dengue (QAT): same treatment for the QAT frames
if dengue == True:
    train_df_qat = train_df_raw_qat.dropna().reset_index(drop = True)
    val_df_qat = val_df_raw_qat.dropna().reset_index(drop = True)
    test_df_qat = test_df_raw_qat.dropna().reset_index(drop = True)

In [None]:
# Display the cleaned training split (after URL filtering, text cleaning and N/A removal)
train_df

Unnamed: 0,text,label
0,mar roxas tang ina tuwid na daan daw eh sya n...,1
1,putangina mo binay takbo pa,1
2,binay with selective amnesia forgetting about ...,0
3,it doesnt matter whoever won between duterte a...,0
4,nognog pero nognog din ang nag malasakit wtf t...,1
...,...,...
7297,yesterday i saw a sunny day song ikenai borde...,0
7298,kaninang pa itong binay binay binaytch,1
7299,nog nog \rpandak\rlaki sa hirap \ryan si binay,1
7300,mas nakakainis ad ni mar kaysa kay binay,1


#### as Individual .csv Files

In [None]:
# Save Pre-processed Datasets
# Resolve the selected dataset to its Drive folder name and display name.
if hatespeech == True:
  csv_folder, csv_title = 'hatespeech', "Hate Speech"

elif dengue == True:
  # Only the regular frames are written as .csv; the QAT frames are not.
  csv_folder, csv_title = 'dengue', "Dengue"

elif newsph == True:
  csv_folder, csv_title = 'newsph', "NewsPH-NLI"

else:
  # Same error as the loading cell, instead of a bare, message-less Exception
  raise Exception("No Dataset Selected")

csv_dir = f'/content/drive/MyDrive/BERT/{csv_folder}/datasets'
for csv_name, csv_frame in (('train.csv', train_df), ('valid.csv', val_df), ('test.csv', test_df)):
  # to_csv is called without an `index` argument, exactly as before
  csv_frame.to_csv(f'{csv_dir}/{csv_name}')

print(f"Pre-processed {csv_title} Dataset Successfully Saved as Individual .csv Files")

Pre-processed Hate Speech Dataset Successfully Saved as Individual .csv Files


#### as a Dataset Dictionary

**NOTE:** The datasets are also tokenized during this process.

In [None]:
# Convert DataFrames to Huggingface Datasets
# Dengue is converted from its QAT frames; every other dataset from the regular frames.
if dengue == False:
  train_source, test_source, val_source = train_df, test_df, val_df
else:
  train_source, test_source, val_source = train_df_qat, test_df_qat, val_df_qat

train_ds = Dataset.from_pandas(train_source, split = "train")
test_ds = Dataset.from_pandas(test_source, split = "test")
val_ds = Dataset.from_pandas(val_source, split = "val")

# Bundle the three splits under the keys used for saving and tokenization
dataDict = datasets.DatasetDict({"train": train_ds, "test": test_ds, "val": val_ds})

In [None]:
# Tokenizer Function
# Every example is padded to `max_seq_length`, which is capped at 128 tokens
# or the tokenizer's own maximum, whichever is smaller.
padding = "max_length"
max_seq_length = min(128, tokenizer.model_max_length)

def preprocess_function(examples):
    """Tokenize one batch of examples.

    NewsPH-NLI batches are tokenized as (`s1`, `s2`) sentence pairs; all
    other datasets as single `text` inputs. Inputs are truncated and padded
    to `max_seq_length`.
    """
    if newsph is False:
        text_inputs = (examples["text"],)
    else:
        text_inputs = (examples["s1"], examples["s2"])

    return tokenizer(*text_inputs, padding = padding, max_length = max_seq_length, truncation = True)

In [None]:
# Tokenization
# Apply preprocess_function to every split in batches; the tokenizer outputs
# are added as new columns next to the existing ones (see the printout below).
dataDict = dataDict.map(preprocess_function, batched = True)

Map:   0%|          | 0/7302 [00:00<?, ? examples/s]

Map:   0%|          | 0/3058 [00:00<?, ? examples/s]

Map:   0%|          | 0/3056 [00:00<?, ? examples/s]

In [None]:
# Show the features and row count of each tokenized split
print(dataDict)

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7302
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3058
    })
    val: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3056
    })
})


In [None]:
# Save Pre-processed Datasets
# Resolve the selected dataset to its Drive folder name and display name.
if hatespeech == True:
  dict_folder, dict_title = 'hatespeech', "Hate Speech"

elif dengue == True:
  dict_folder, dict_title = 'dengue', "Dengue"

elif newsph == True:
  dict_folder, dict_title = 'newsph', "NewsPH-NLI"

else:
  # Same error as the loading cell, instead of a bare, message-less Exception
  raise Exception("No Dataset Selected")

# Write the tokenized DatasetDict (all three splits) into the dataset's folder
dataDict.save_to_disk(f'/content/drive/MyDrive/BERT/{dict_folder}/datasets/dataDict')
print(f"Pre-processed {dict_title} Dataset Successfully Saved as a Dataset Dictionary")

Saving the dataset (0/1 shards):   0%|          | 0/7302 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3058 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3056 [00:00<?, ? examples/s]

Pre-processed Hate Speech Dataset Successfully Saved as a Dataset Dictionary


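**Recorded split sizes per dataset** (the Hate Speech figures follow the train - test - val order of the `DatasetDict` printout above):

Hate Speech (HS): 7302 - 3058 - 3056

Dengue (D): 3100 - 337 - 422

NewsPH-NLI (NP): 7000 - 2000 - 1000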