In [1]:
!pip install datasets transformers[torch] accelerate -U



In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer, AutoTokenizer
from tqdm.auto import tqdm

In [11]:
# Load the dataset
dataset = load_dataset("Amod/mental_health_counseling_conversations")
train_dataset = dataset['train']
dataset

Downloading readme:   0%|          | 0.00/2.82k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.79M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3512 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Context', 'Response'],
        num_rows: 3512
    })
})

In [14]:
# Use a pre-trained tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
# Tokenize the inputs and labels
def preprocess_function(examples):
    inputs = [q + " " + c for q, c in zip(examples['question'], examples['context'])]
    targets = [ans['text'][0] for ans in examples['answers']]  # Assuming single answer

    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')
    labels = tokenizer(targets, max_length=128, truncation=True, padding='max_length')

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_dataset = train_dataset.map(preprocess_function, batched=True)
validation_dataset = validation_dataset.map(preprocess_function, batched=True)

train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
validation_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


Map:   0%|          | 0/3512 [00:00<?, ? examples/s]

TypeError: string indices must be integers

In [5]:
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

Embedding(50257, 768)

In [6]:
from torch.utils.data import DataLoader
from transformers import AdamW
import torch

# DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
validation_dataloader = DataLoader(validation_dataset, batch_size=8)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Move model to GPU if available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)




GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [10]:
from sklearn.metrics import accuracy_score, f1_score
from tqdm.auto import tqdm
import torch

# Training parameters
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# Progress bar for training
progress_bar = tqdm(range(num_training_steps))

# Training loop
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

# Validation loop
model.eval()
total_loss = 0
all_predictions = []
all_labels = []

for batch in validation_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])

    # Compute validation loss
    loss = outputs.loss
    total_loss += loss.item()

    # Extract predictions and labels
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

    # Flatten the predictions and labels
    flat_predictions = predictions.view(-1).cpu().numpy()
    flat_labels = batch['labels'].view(-1).cpu().numpy()

    # Mask out -100 values in labels
    mask = flat_labels != -100
    filtered_predictions = flat_predictions[mask]
    filtered_labels = flat_labels[mask]

    all_predictions.extend(filtered_predictions)
    all_labels.extend(filtered_labels)

# Compute metrics
average_loss = total_loss / len(validation_dataloader)
accuracy = accuracy_score(all_labels, all_predictions)
f1 = f1_score(all_labels, all_predictions, average='weighted')

print(f"Validation Loss: {average_loss}")
print(f"Validation Accuracy: {accuracy}")
print(f"Validation F1 Score: {f1}")


  0%|          | 0/261 [00:00<?, ?it/s]

Validation Loss: 3.8495463620532644
Validation Accuracy: 0.08441155407018382
Validation F1 Score: 0.02084575957791366


In [9]:
# Function to generate answer
def generate_answer(model, tokenizer, question, context):
    input_text = question + " " + context
    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

    # Generate the output
    with torch.no_grad():
        output_ids = model.generate(input_ids, max_length=512, num_return_sequences=1)

    # Decode the output
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Post-process to extract the answer (after the input text)
    answer_start = len(tokenizer.decode(input_ids[0], skip_special_tokens=True))
    answer = output_text[answer_start:].strip()

    return answer

# Example question and context in Arabic
question = "- من هو جمال أحمد حمزة خاشقجي؟"
context = "جمال أحمد حمزة خاشقجي (13 أكتوبر 1958، المدينة المنورة - 2 أكتوبر 2018)، صحفي وإعلامي سعودي، رأس عدّة مناصب لعدد من الصحف في السعودية، وتقلّد منصب مستشار، كما أنّه مدير عام قناة العرب الإخبارية سابقًا."

# Generate the answer
generated_answer = generate_answer(model, tokenizer, question, context)

# Print the UTF-8 encoded answer
print("Generated Answer:", generated_answer)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Answer: ي����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������
