In [1]:
pip install datasets

Collecting datasets
  Downloading datasets-2.17.1-py3-none-any.whl (536 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.17.1 dill-0.3.8 multiprocess-0.70.16


In [2]:
pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.27.2


In [3]:
import pandas as pd
import transformers
import torch
from datasets import load_dataset
from torch.utils.data import Dataset
from transformers import BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score,confusion_matrix

### BERT Empathetic

In [4]:
# Load pre-trained BERT model and tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained('bert-base-uncased')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [29]:
dataset = load_dataset("empathetic_dialogues")


In [30]:
train = pd.DataFrame(dataset['train'])
validation = pd.DataFrame(dataset['validation'])
test = pd.DataFrame(dataset['test'])

# if you want to train faster with less data, if you do this, then change the num_labels=2
# train = train[train['context'].isin(['sentimental','surprised'])].sample(100).reset_index(drop = True)
# validation = validation[validation['context'].isin(['sentimental','surprised'])].sample(20).reset_index(drop = True)
# test = test[test['context'].isin(['sentimental','surprised'])].sample(20).reset_index(drop = True)

train.shape,validation.shape,test.shape

((76673, 8), (12030, 8), (10943, 8))

In [31]:
train['context'].nunique(),validation['context'].nunique(), test['context'].nunique()

(32, 32, 32)

In [32]:
train["text"] = train['prompt']+" [SEP] "+ train['utterance']
validation["text"] = validation['prompt']+" [SEP] "+ validation['utterance']
test["text"] = test['prompt']+" [SEP] "+ test['utterance']

In [33]:
# Create a label (category) encoder object
le = LabelEncoder()

le.fit(train["context"])

train_labels = le.transform(train["context"])
validation_labels = le.transform(validation["context"])
test_labels = le.transform(test["context"])

In [34]:
train["context"].unique()

array(['sentimental', 'afraid', 'proud', 'faithful', 'terrified',
       'joyful', 'angry', 'sad', 'jealous', 'grateful', 'prepared',
       'embarrassed', 'excited', 'annoyed', 'lonely', 'ashamed', 'guilty',
       'surprised', 'nostalgic', 'confident', 'furious', 'disappointed',
       'caring', 'trusting', 'disgusted', 'anticipating', 'anxious',
       'hopeful', 'content', 'impressed', 'apprehensive', 'devastated'],
      dtype=object)

In [35]:
class EMDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels.long()  # Convert labels to Long

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

# Tokenize train data
train_encodings = tokenizer(train["text"].tolist(), truncation=True, padding=True, max_length=512, return_tensors="pt")
train_dataset = EMDataset(train_encodings, torch.tensor(train_labels).long())

# Tokenize validation data
validation_encodings = tokenizer(validation["text"].tolist(), truncation=True, padding=True, max_length=512, return_tensors="pt")
validation_dataset = EMDataset(validation_encodings, torch.tensor(validation_labels).long())

# Tokenize test data
test_encodings = tokenizer(test["text"].tolist(), truncation=True, padding=True, max_length=512, return_tensors="pt")
test_dataset = EMDataset(test_encodings, torch.tensor(test_labels).long())


In [36]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=32)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
training_args = TrainingArguments(
    output_dir='./BERT_empathetic_dialogues',
    num_train_epochs=2,#3,
    per_device_train_batch_size=2,#16,
    warmup_steps=10, #500,
    weight_decay= 0.1, #0.01,
    logging_dir='./logs',
    # push_to_hub=True,
    evaluation_strategy="steps" # or "epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset  # Add the validation dataset here
)

trainer.train()

Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [15]:
# Save the trained model
# trainer.save_model("./custom_mbert")

In [16]:
from sklearn.metrics import f1_score,confusion_matrix

# Make predictions on the test set
model.eval()
predictions = []
for item in test_dataset:
    # Move input data to the same device as the model
    input_ids = item['input_ids'].to(device).unsqueeze(0)
    attention_mask = item['attention_mask'].to(device).unsqueeze(0)

    with torch.no_grad():
        outputs = model(input_ids, token_type_ids=None, attention_mask=attention_mask)
    logits = outputs.logits
    predictions.append(logits.argmax(-1).item())

# Assuming test_labels is a numpy array or a list. If it's a tensor, it needs to be on CPU and converted to numpy.
if isinstance(test_labels, torch.Tensor):
    test_labels = test_labels.cpu().numpy()

# Calculate F1 score
f1 = f1_score(test_labels, predictions, average='weighted')


In [17]:
f1

0.832258064516129

In [18]:
confusion_matrix(test_labels, predictions)

array([[ 3,  3],
       [ 0, 14]])

### BERT CONVAI2

In [19]:
# Load pre-trained BERT model and tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained('bert-base-uncased')
dataset = load_dataset("conv_ai_2")
df = pd.DataFrame(dataset['train'])

Downloading data:   0%|          | 0.00/1.54M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3495 [00:00<?, ? examples/s]

In [20]:
def preprocess_dialog(dialog):
    try:
        dialog_list = dialog
        dialog_text = " ".join([d['text'] for d in dialog_list])
        return dialog_text
    except:
        return ""

def preprocess_profile(profile):
    try:
        profile_list = profile
        profile_text = ' '.join([''.join(char_list) for char_list in profile_list])
        return profile_text
    except:
        return ""

# Apply preprocessing
df['dialog_text'] = df['dialog'].apply(preprocess_dialog)
df['bot_profile_text'] = df['bot_profile'].apply(preprocess_profile)
df['user_profile_text'] = df['user_profile'].apply(preprocess_profile)

# Combine dialog and profiles
df['combined_text'] = df['bot_profile_text'] + " [SEP] " + df['user_profile_text'] + " [SEP] " + df['dialog_text']

# Display the processed data
df[['combined_text', 'profile_match']].head()

Unnamed: 0,combined_text,profile_match
0,i have amazing children and grandchildren. i c...,0
1,my father was a door to door salesman. i've th...,1
2,i am a gold medalist olympian. i love italian ...,1
3,i fantasize about taking over the world. i'm a...,1
4,i am 40 years old. i work as a car salesman. m...,1


In [21]:
# Splitting the data
train_texts, test_texts, train_labels, test_labels = train_test_split(df['combined_text'], df['profile_match'], test_size=0.2)
train_labels = train_labels.replace(-1, 2)
test_labels = test_labels.replace(-1, 2)

In [22]:
class ConvAI2Dataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

In [23]:
# Tokenize train data
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=512, return_tensors="pt")
train_dataset = ConvAI2Dataset(train_encodings, torch.tensor(train_labels.values))

# Tokenize test data
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True, max_length=512, return_tensors="pt")
test_dataset = ConvAI2Dataset(test_encodings, torch.tensor(test_labels.values))

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    # push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

trainer.train()

Step,Training Loss
500,0.8402


TrainOutput(global_step=525, training_loss=0.8416426849365234, metrics={'train_runtime': 762.836, 'train_samples_per_second': 10.996, 'train_steps_per_second': 0.688, 'total_flos': 2206995347902464.0, 'train_loss': 0.8416426849365234, 'epoch': 3.0})

In [25]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [26]:
# Save the trained model
# trainer.save_model("./custom_mbert")

In [27]:
# Make predictions on the test set
model.eval()
predictions = []
for item in test_dataset:
    # Move input data to the same device as the model
    input_ids = item['input_ids'].to(device).unsqueeze(0)
    attention_mask = item['attention_mask'].to(device).unsqueeze(0)

    with torch.no_grad():
        outputs = model(input_ids, token_type_ids=None, attention_mask=attention_mask)
    logits = outputs.logits
    predictions.append(logits.argmax(-1).item())

# Assuming test_labels is a numpy array or a list. If it's a tensor, it needs to be on CPU and converted to numpy.
if isinstance(test_labels, torch.Tensor):
    test_labels = test_labels.cpu().numpy()

# Calculate F1 score
f1 = f1_score(test_labels, predictions, average='weighted')
print(f1)
print(confusion_matrix(test_labels, predictions))

0.5698011087919965
[[143 103   2]
 [146 240   3]
 [ 27  19  16]]
