Title - FAQ Categorizer

Text Classification by Fine-Tuning Language Models

Section - 1: Data Loading

In [None]:
# Attach Google Drive to the Colab runtime so files stored under MyDrive
# can be read through the local filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
!pip install opencv-python



In [None]:
!pip install simpletransformers
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from simpletransformers.classification import ClassificationModel, ClassificationArgs

data = pd.read_csv('/content/drive/MyDrive/NLP_MINI/nlp_faq_dataset_cleaned.csv')


Collecting simpletransformers
  Downloading simpletransformers-0.70.1-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting datasets (from simpletransformers)
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorboardx (from simpletransformers)
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting streamlit (from simpletransformers)
  Downloading streamlit-1.43.2-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets-

In [None]:
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
data.info()
print("\nClass Distribution:")
print(data['Labels'].value_counts())

# Hold out 20% of the rows for validation. Stratifying on the label keeps the
# class proportions of the full dataset in both splits; random_state fixes the
# shuffle so the split is reproducible.
train_data, val_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=data['Labels'],
)

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1024 entries, 0 to 1023
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Questions  1024 non-null   object
 1   Labels     1024 non-null   object
dtypes: object(2)
memory usage: 16.1+ KB
None

Class Distribution:
Labels
General Inquiry         128
Account Management      128
Payment Issues          128
Troubleshooting         128
Subscription Queries    128
Technical Support       128
Security & Privacy      128
Product Information     128
Name: count, dtype: int64


Section - 2: Text Processing

In [None]:
def clean_text(text):
    """Normalise a question string for the classifiers.

    The text is lower-cased, every character that is neither an ASCII letter
    nor whitespace is removed (digits and punctuation are dropped, not
    replaced by a space), and leading/trailing whitespace is trimmed.

    Args:
        text: The raw question string.

    Returns:
        The normalised string.
    """
    lowered = text.lower()
    letters_and_spaces = re.sub(r'[^a-zA-Z\s]', '', lowered)
    return letters_and_spaces.strip()

def _as_model_frame(split):
    """Build the two-column frame ('text', 'labels') from one split of the raw data.

    'text' holds the cleaned questions and 'labels' the untouched label column;
    the row index of the split is carried over unchanged.
    """
    cleaned_questions = split['Questions'].apply(clean_text)
    return pd.DataFrame({'text': cleaned_questions, 'labels': split['Labels']})


train_df = _as_model_frame(train_data)
val_df = _as_model_frame(val_data)

# Quick visual check of the processed training rows.
print("\nSample Processed Data:")
print(train_df.head())



Sample Processed Data:
                                                  text              labels
137                where can i find my account history  Account Management
377                  where can i find proof of payment      Payment Issues
388         why cant i log in with my correct password     Troubleshooting
824         how do you secure remote access to systems  Security & Privacy
767  how do i resolve problems with ssl certificate...   Technical Support


Section - 3: Text Embedding using BERT and RoBERTa

In [None]:
# The classification head needs one output per FAQ category, so the label count
# is taken from the dataset instead of being hard-coded.
n_faq_classes = data['Labels'].nunique()

# Pre-trained encoders with a freshly initialised classification head, on CPU.
# NOTE: both models are created again with explicit training arguments in the
# training section, which is where fine-tuning actually happens.
bert_model = ClassificationModel('bert', 'bert-base-uncased', num_labels=n_faq_classes, use_cuda=False)

roberta_model = ClassificationModel('roberta', 'roberta-base', num_labels=n_faq_classes, use_cuda=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Section - 4: Model Training with BERT and RoBERTa

In [None]:
# Remove a leftover "outputs" directory from a previous run, if any.
# ignore_errors=True keeps this from raising when the directory is absent.
import shutil
shutil.rmtree("outputs", ignore_errors=True)

In [None]:
# Shell equivalent of the shutil.rmtree("outputs", ...) call in the previous cell.
# NOTE(review): this looks redundant with that cell -- confirm and keep only one.
!rm -rf outputs/

In [None]:
# Fit the encoder on the original string labels of the whole dataset so that
# (a) every category gets an id even if a split happens to miss it, and
# (b) label_encoder.classes_ always holds the category names.
label_encoder = LabelEncoder()
label_encoder.fit(data['Labels'])

# Encode from the untouched splits rather than from train_df/val_df themselves:
# this keeps the cell idempotent (re-running it never re-encodes integer ids).
# train_df/val_df were built from these splits, so the row order matches.
train_df['labels'] = label_encoder.transform(train_data['Labels'])
val_df['labels'] = label_encoder.transform(val_data['Labels'])
num_labels = len(label_encoder.classes_)

# Baseline runs: library-default hyperparameters, each model writing to its own
# output directory.
bert_args = ClassificationArgs(
    overwrite_output_dir=True,
    output_dir="outputs_bert"
)

roberta_args = ClassificationArgs(
    overwrite_output_dir=True,
    output_dir="outputs_roberta"
)

bert_model = ClassificationModel('bert', 'bert-base-uncased', num_labels=num_labels, args=bert_args, use_cuda=False)
roberta_model = ClassificationModel('roberta', 'roberta-base', num_labels=num_labels, args=roberta_args, use_cuda=False)

# train_model returns a tuple; the last call's value is shown as the cell output.
bert_model.train_model(train_df)
roberta_model.train_model(train_df)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 1 of 1:   0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 1 of 1:   0%|          | 0/103 [00:00<?, ?it/s]

(103, 1.5923569867911849)

In [None]:
# Hyperparameters shared by both tuned runs; only the output directory differs
# between the two models.
hp_common = dict(
    num_train_epochs=3,
    train_batch_size=8,
    eval_batch_size=8,
    learning_rate=3e-5,
    max_seq_length=128,
    weight_decay=0.01,
    warmup_steps=0,
    logging_steps=50,
    save_steps=200,
    overwrite_output_dir=True,
)

bert_args_hp = ClassificationArgs(**hp_common, output_dir="outputs_bert_hp")
roberta_args_hp = ClassificationArgs(**hp_common, output_dir="outputs_roberta_hp")

# Fresh pre-trained models so the tuned runs do not start from the baseline weights.
bert_model_hp = ClassificationModel(
    'bert', 'bert-base-uncased',
    num_labels=num_labels, args=bert_args_hp, use_cuda=False,
)
roberta_model_hp = ClassificationModel(
    'roberta', 'roberta-base',
    num_labels=num_labels, args=roberta_args_hp, use_cuda=False,
)

# The value of the last train_model call is displayed as the cell output.
bert_model_hp.train_model(train_df)
roberta_model_hp.train_model(train_df)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/103 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/103 [00:00<?, ?it/s]

Running Epoch 3 of 3:   0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/103 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/103 [00:00<?, ?it/s]

Running Epoch 3 of 3:   0%|          | 0/103 [00:00<?, ?it/s]

(309, 0.8439472924592426)

Section - 5: Evaluation on Validation Set

In [None]:
# Each entry pairs the heading printed before the metrics with the model to
# evaluate; the order is the order in which results are reported.
eval_runs = [
    ("\nBERT Evaluation Results (Basic):", bert_model),
    ("\nRoBERTa Evaluation Results (Basic):", roberta_model),
    ("\nBERT Evaluation Results (Fine-Tuned):", bert_model_hp),
    ("\nRoBERTa Evaluation Results (Fine-Tuned):", roberta_model_hp),
]

collected_metrics = []
for heading, candidate in eval_runs:
    # eval_model returns three values; only the metrics dict is kept here.
    metrics, _, _ = candidate.eval_model(val_df)
    print(heading, metrics)
    collected_metrics.append(metrics)

# Keep the individual result names available to later cells.
result_bert, result_roberta, result_bert_hp, result_roberta_hp = collected_metrics

0it [00:00, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]


BERT Evaluation Results (Basic): {'mcc': np.float64(0.54425802359509), 'eval_loss': 1.2428195873896282}


0it [00:00, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]


RoBERTa Evaluation Results (Basic): {'mcc': np.float64(0.7507922673309897), 'eval_loss': 0.8442981441815695}


0it [00:00, ?it/s]

Running Evaluation:   0%|          | 0/26 [00:00<?, ?it/s]


BERT Evaluation Results (Fine-Tuned): {'mcc': np.float64(0.7376736041944651), 'eval_loss': 0.7109730398425689}


0it [00:00, ?it/s]

Running Evaluation:   0%|          | 0/26 [00:00<?, ?it/s]


RoBERTa Evaluation Results (Fine-Tuned): {'mcc': np.float64(0.8169562965145866), 'eval_loss': 0.5591316813459764}


Section - 6: Saving the models

In [None]:
# Persist each trained model (weights, tokenizer and args) to its own directory.
# In simpletransformers, save_model() only writes the weights when the `model`
# argument is given; called with just a path it merely creates the directory.
# The underlying network is therefore passed explicitly.
bert_model.save_model('bert_best_model', model=bert_model.model)

roberta_model.save_model('roberta_best_model', model=roberta_model.model)

bert_model_hp.save_model('bert_best_model_hp', model=bert_model_hp.model)

roberta_model_hp.save_model('roberta_best_model_hp', model=roberta_model_hp.model)

In [None]:
# Recursively list the four training output directories (human-readable sizes)
# to confirm that the model files were written.
!ls -lhR outputs_bert/
!ls -lhR outputs_roberta/
!ls -lhR outputs_bert_hp/
!ls -lhR outputs_roberta_hp/

outputs_bert/:
total 419M
drwxr-xr-x 2 root root 4.0K Mar 24 17:42 checkpoint-103-epoch-1
-rw-r--r-- 1 root root 1.1K Mar 24 17:42 config.json
-rw-r--r-- 1 root root   54 Mar 24 20:19 eval_results.txt
-rw-r--r-- 1 root root 2.7K Mar 24 17:43 model_args.json
-rw-r--r-- 1 root root 418M Mar 24 17:43 model.safetensors
-rw-r--r-- 1 root root  125 Mar 24 17:43 special_tokens_map.json
-rw-r--r-- 1 root root 1.2K Mar 24 17:43 tokenizer_config.json
-rw-r--r-- 1 root root 695K Mar 24 17:43 tokenizer.json
-rw-r--r-- 1 root root 3.7K Mar 24 17:43 training_args.bin
-rw-r--r-- 1 root root 227K Mar 24 17:43 vocab.txt

outputs_bert/checkpoint-103-epoch-1:
total 1.3G
-rw-r--r-- 1 root root 1.1K Mar 24 17:42 config.json
-rw-r--r-- 1 root root 2.7K Mar 24 17:42 model_args.json
-rw-r--r-- 1 root root 418M Mar 24 17:42 model.safetensors
-rw-r--r-- 1 root root 836M Mar 24 17:42 optimizer.pt
-rw-r--r-- 1 root root 1.1K Mar 24 17:42 scheduler.pt
-rw-r--r-- 1 root root  125 Mar 24 17:42 special_tokens_map.jso

Section - 7: Prediction on Real-World Input

In [None]:
from simpletransformers.classification import ClassificationModel
import os

# Directories written by the training runs above.
bert_basic_path = "outputs_bert"
roberta_basic_path = "outputs_roberta"
bert_finetuned_path = "outputs_bert_hp"
roberta_finetuned_path = "outputs_roberta_hp"


# Fail early with a clear message if any of the trained models is missing.
for model_path in [bert_basic_path, roberta_basic_path, bert_finetuned_path, roberta_finetuned_path]:
    if not os.path.exists(f"{model_path}/model.safetensors"):
        raise FileNotFoundError(f"Model missing in {model_path}! Train and save it first.")

bert_model_loaded = ClassificationModel("bert", bert_basic_path, use_safetensors=True, use_cuda=False)
roberta_model_loaded = ClassificationModel("roberta", roberta_basic_path, use_safetensors=True, use_cuda=False)

bert_model_loaded_hp = ClassificationModel("bert", bert_finetuned_path, use_safetensors=True, use_cuda=False)
roberta_model_loaded_hp = ClassificationModel("roberta", roberta_finetuned_path, use_safetensors=True, use_cuda=False)

real_world_text = ["How to reset password? I forgot it", "Can I know the support hours"]

# The models were trained on clean_text() output, so the inference inputs must
# go through the same normalisation to match what the models have seen.
model_inputs = [clean_text(question) for question in real_world_text]


def _label_names(class_ids):
    """Map encoded class ids back to category names via label_encoder."""
    return label_encoder.inverse_transform(class_ids).tolist()


# predictions_* keep the raw class ids; the printed output shows category names.
predictions_bert, _ = bert_model_loaded.predict(model_inputs)
print(f"\nBERT Predictions (Basic): {_label_names(predictions_bert)}")

predictions_roberta, _ = roberta_model_loaded.predict(model_inputs)
print(f"\nRoBERTa Predictions (Basic): {_label_names(predictions_roberta)}")

predictions_bert_hp, _ = bert_model_loaded_hp.predict(model_inputs)
print(f"\nBERT Predictions (Fine-Tuned): {_label_names(predictions_bert_hp)}")

predictions_roberta_hp, _ = roberta_model_loaded_hp.predict(model_inputs)
print(f"\nRoBERTa Predictions (Fine-Tuned): {_label_names(predictions_roberta_hp)}")

0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


BERT Predictions (Basic): [7, 2]


0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


RoBERTa Predictions (Basic): [0, 1]


0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


BERT Predictions (Fine-Tuned): [7, 1]


0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


RoBERTa Predictions (Fine-Tuned): [6, 1]
