In [2]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
#from datasets import Dataset, load_metric
from sklearn.utils import resample
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import evaluate

In [13]:
pip install datasets


Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd

In [5]:
import pandas as pd
import numpy as np
from sklearn.utils import resample  # Importing resample from sklearn.utils
from sklearn.model_selection import train_test_split
from datasets import Dataset  # Importing Dataset from datasets library

In [6]:
precision_metric = evaluate.load("precision")

In [7]:
accuracy = evaluate.load("accuracy")

In [8]:
f1_metric = evaluate.load("f1")

In [9]:
recall_metric = evaluate.load("recall")

In [10]:
df = pd.read_json("hf://datasets/databricks/databricks-dolly-15k/databricks-dolly-15k.jsonl", lines=True)

In [11]:
# Combine the instruction, context, and response columns into a single text column for BERT input
df['text'] = df['instruction'].fillna('') + ' ' + df['context'].fillna('') + ' ' + df['response'].fillna('')
df = df[['text', 'category']]

# Define retrieval and non-retrieval categories
retrieval_categories = ['open_qa', 'brainstorming', 'general_qa', 'creative writing']
non_retrieval_categories = ['closed_qa', 'classification', 'information_extraction', 'summarization']

# Assign binary labels (1 for retrieval, 0 for non-retrieval)
df['label'] = df['category'].apply(lambda x: 1 if x in retrieval_categories else 0)

# Handle class imbalance by oversampling the minority class
df_retrieval = df[df['label'] == 1]
df_non_retrieval = df[df['label'] == 0]

# Oversample the retrieval category to match the size of non-retrieval category
df_retrieval_oversampled = resample(df_retrieval, 
                                    replace=True,    # sample with replacement
                                    n_samples=len(df_non_retrieval),  # match number in majority class
                                    random_state=42)  # reproducible results

# Combine oversampled retrieval with non-retrieval
df_balanced = pd.concat([df_non_retrieval, df_retrieval_oversampled])

# Split the balanced dataset into training and validation sets
train_df, val_df = train_test_split(df_balanced, test_size=0.2, random_state=42)

# Convert the DataFrame to Hugging Face Dataset objects
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

In [12]:
# Load the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Set the format of the datasets
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

Map: 100%|██████████| 11699/11699 [00:20<00:00, 584.87 examples/s]
Map: 100%|██████████| 2925/2925 [00:04<00:00, 588.40 examples/s]


In [14]:
import numpy as np
import evaluate

# Load metrics
accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")

# Define the compute_metrics function
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)  # Get the predicted class labels
    acc = accuracy_metric.compute(predictions=preds, references=p.label_ids)
    prec = precision_metric.compute(predictions=preds, references=p.label_ids, average='binary')
    rec = recall_metric.compute(predictions=preds, references=p.label_ids, average='binary')
    f1 = f1_metric.compute(predictions=preds, references=p.label_ids, average='binary')
    
    return {
        'accuracy': acc['accuracy'],
        'precision': prec['precision'],
        'recall': rec['recall'],
        'f1': f1['f1']
    }


In [15]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Save the model
model.save_pretrained('./trained_model')
tokenizer.save_pretrained('./trained_model')



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2643,0.187985,0.946667,0.936601,0.960456,0.948379
2,0.1737,0.182561,0.95453,0.936977,0.976542,0.956351
3,0.115,0.181834,0.953504,0.933504,0.978552,0.955497


('./trained_model/tokenizer_config.json',
 './trained_model/special_tokens_map.json',
 './trained_model/vocab.txt',
 './trained_model/added_tokens.json')