In [10]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline
from datasets import Dataset
import pandas as pd

# Load the question/answer pairs from the CSV file.
data = pd.read_csv('datasets/mental_health.csv')

# Drop rows with a missing question or answer. Formatting a missing value
# would otherwise put the literal text "nan" into the training examples.
data = data.dropna(subset=['question', 'answer']).reset_index(drop=True)

# Combine questions and answers into a single text field. Column-wise string
# concatenation also works on an empty frame, where a row-wise
# apply(axis=1) would return a DataFrame instead of a Series.
data['text'] = "Q: " + data['question'].astype(str) + " A: " + data['answer'].astype(str)

# Convert to a Hugging Face dataset. Only the 'text' column is carried over,
# and the pandas index is explicitly not stored as an extra column.
dataset = Dataset.from_pandas(data[['text']], preserve_index=False)

# Load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Set the pad token: reuse the end-of-sequence token so that batches can be
# padded.
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize one batch of examples.

    Args:
        examples: Batch mapping with a 'text' entry; `dataset.map` with
            `batched=True` passes the column values for a slice of rows.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens
        per example and left unpadded.
    """
    # No padding here: padding every row to 512 tokens would make every
    # training batch full-length regardless of how short the texts are.
    # The language-modeling data collator pads each batch to its longest
    # member at training time instead.
    return tokenizer(examples['text'], truncation=True, max_length=512)

# The raw 'text' column is dropped so only the tokenizer outputs remain.
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])




Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [11]:
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Load the GPT-2 model
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Data collator: mlm=False selects causal language modeling (no token masking).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training arguments
# Checkpoints are written at the end of every epoch. A step-based interval of
# 10_000 is longer than the whole run (the training log for this notebook
# reports 750 optimization steps), so it would never trigger and an
# interrupted run would leave nothing to resume from.
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_strategy="epoch",
    save_total_limit=2,  # keep only the two most recent checkpoints
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets,
)

# Fine-tune the model
trainer.train()


  return torch.load(checkpoint_file, map_location="cpu")
***** Running training *****
  Num examples = 1000
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 750
  Number of trainable parameters = 124439808


  0%|          | 0/750 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Write the model weights/config and the tokenizer files into the same
# directory so both can later be reloaded from one path.
model.save_pretrained(save_directory='./fine-tuned-gpt2')
tokenizer.save_pretrained(save_directory='./fine-tuned-gpt2')


In [None]:
from transformers import pipeline

# Reload the fine-tuned model and tokenizer from the directory they were saved to.
model = GPT2LMHeadModel.from_pretrained('./fine-tuned-gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('./fine-tuned-gpt2')

# Use the model for text generation
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# The fine-tuning text is laid out as "Q: <question> A: <answer>", so the
# prompt uses the same layout and ends right after "A:" for the model to
# continue with an answer. No trailing space: the training text puts the
# space at the start of the answer, not at the end of the "A:" marker.
prompt = "Q: I feel anxious A:"
response = generator(prompt, max_length=50, num_return_sequences=1)

# generated_text contains the prompt followed by the model's continuation.
print(response[0]['generated_text'])
