In [18]:
pip install datasets



In [19]:
import pandas as pd
import transformers
import torch
from datasets import load_dataset
import transformers
from transformers import AutoFeatureExtractor, Wav2Vec2BertModel
from torch.utils.data import Dataset, DataLoader
from transformers import BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

In [20]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [21]:
# Load pre-trained BERT model and tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained('bert-base-uncased')


In [22]:
dataset = load_dataset("conv_ai_2")


In [23]:
df = pd.DataFrame(dataset['train'])

In [24]:
def preprocess_dialog(dialog):
    try:
        dialog_list = dialog
        dialog_text = " ".join([d['text'] for d in dialog_list])
        return dialog_text
    except:
        return ""

def preprocess_profile(profile):
    try:
        profile_list = profile
        profile_text = ' '.join([''.join(char_list) for char_list in profile_list])
        return profile_text
    except:
        return ""

# Apply preprocessing
df['dialog_text'] = df['dialog'].apply(preprocess_dialog)
df['bot_profile_text'] = df['bot_profile'].apply(preprocess_profile)
df['user_profile_text'] = df['user_profile'].apply(preprocess_profile)

# Combine dialog and profiles
df['combined_text'] = df['bot_profile_text'] + " [SEP] " + df['user_profile_text'] + " [SEP] " + df['dialog_text']

# Display the processed data
df[['combined_text', 'profile_match']].head()

Unnamed: 0,combined_text,profile_match
0,i have amazing children and grandchildren. i c...,0
1,my father was a door to door salesman. i've th...,1
2,i am a gold medalist olympian. i love italian ...,1
3,i fantasize about taking over the world. i'm a...,1
4,i am 40 years old. i work as a car salesman. m...,1


In [25]:
# Splitting the data
train_texts, test_texts, train_labels, test_labels = train_test_split(df['combined_text'], df['profile_match'], test_size=0.2)
train_labels = train_labels.replace(-1, 2)
test_labels = test_labels.replace(-1, 2)

In [26]:
class ConvAI2Dataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

In [27]:
# Tokenize train data
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=512, return_tensors="pt")
train_dataset = ConvAI2Dataset(train_encodings, torch.tensor(train_labels.values))

# Tokenize test data
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True, max_length=512, return_tensors="pt")
test_dataset = ConvAI2Dataset(test_encodings, torch.tensor(test_labels.values))


In [28]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

trainer.train()

Step,Training Loss
500,0.8493


Checkpoint destination directory ./results/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=525, training_loss=0.8454052007765998, metrics={'train_runtime': 772.4727, 'train_samples_per_second': 10.859, 'train_steps_per_second': 0.68, 'total_flos': 2206995347902464.0, 'train_loss': 0.8454052007765998, 'epoch': 3.0})

In [30]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# Save the trained model
trainer.save_model("./custom_mbert")

In [31]:
from sklearn.metrics import f1_score,confusion_matrix

# Make predictions on the test set
model.eval()
predictions = []
for item in test_dataset:
    # Move input data to the same device as the model
    input_ids = item['input_ids'].to(device).unsqueeze(0)
    attention_mask = item['attention_mask'].to(device).unsqueeze(0)

    with torch.no_grad():
        outputs = model(input_ids, token_type_ids=None, attention_mask=attention_mask)
    logits = outputs.logits
    predictions.append(logits.argmax(-1).item())

# Assuming test_labels is a numpy array or a list. If it's a tensor, it needs to be on CPU and converted to numpy.
if isinstance(test_labels, torch.Tensor):
    test_labels = test_labels.cpu().numpy()

# Calculate F1 score
f1 = f1_score(test_labels, predictions, average='weighted')
print(f1)
print(confusion_matrix(test_labels, predictions))

0.5572169528778569
[[120 128   0]
 [131 258   0]
 [ 24  23  15]]
