In [13]:
from transformers import (TextDataset, DataCollatorForLanguageModeling,GPT2Tokenizer,
                          GPT2LMHeadModel,Trainer, TrainingArguments)
import pandas as pd
import matplotlib.pyplot as plt
import wandb
wandb.init(mode="disabled")
import warnings
warnings.filterwarnings('ignore')

In [14]:
data=pd.read_csv('../data/sample_q_and_a/response_2.csv')
data.sample(5)

Unnamed: 0,Question,Response
6248,\nI feel like I was born in the wrong body I f...,Hi. Do you have any opportunity to work with a...
3438,My friend is abusing her prescription medicine...,Your friend needs to admit they have a problem...
6476,\nI have so many issues to address. I have a h...,Hello! You may have heard the saying that coun...
6556,\nHow does a counselor decide when to end coun...,"For a therapist, deciding to end counseling se..."
5862,\nI love my girlfriend so much. I get an erect...,I'm sorry to hear of your problem.First step a...


In [15]:
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7115 entries, 0 to 7114
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Question  7115 non-null   object
 1   Response  7111 non-null   object
dtypes: object(2)
memory usage: 111.3+ KB


In [16]:
data.to_csv('../data/sample_q_and_a/train_response2.csv')


In [17]:
def load_dataset(file_path, tokenizer, block_size = 1024):
    dataset_train = TextDataset(
        tokenizer = tokenizer,
        file_path = file_path,
        block_size = block_size,
    )
    return dataset_train

In [18]:
def load_data_collator(tokenizer, mlm = False):
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, 
        mlm=mlm,
    )
    return data_collator

In [19]:
def train(train_file_path, model_name, output_dir, overwrite_output_dir,
          per_device_train_batch_size, num_train_epochs):
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    # Load datasets
    train_dataset = load_dataset(train_file_path, tokenizer)

    # Load data collator
    data_collator = load_data_collator(tokenizer)

    # Save tokenizer
    tokenizer.save_pretrained(output_dir)

    # Load or initialize model
    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Save model
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=4,
        logging_dir="../data/sample_q_and_a/logs",
        logging_steps=100,  # Log every 100 steps
        save_steps=500,  # Save checkpoint every 500 steps
        logging_first_step=True,
        save_total_limit=2,
        learning_rate=.0001,
        fp16 = True
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )
    hist = trainer.train()
    trainer.save_model()
    return trainer,hist

In [20]:
train_file_path = "../data/sample_q_and_a/train_response2.csv"
model_name = 'distilgpt2'
output_dir = '../data/sample_q_and_a/custom_model'
overwrite_output_dir = True
per_device_train_batch_size = 2
num_train_epochs = 10

In [21]:
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer

GPT2Tokenizer(name_or_path='distilgpt2', vocab_size=50257, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}

In [22]:
model = GPT2LMHeadModel.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [23]:
model.base_model


GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-5): 6 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)

In [24]:
# Train
hist=train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
) 

Step,Training Loss
1,3.4985
100,3.2216
200,3.0241
300,2.9144
400,2.8012
500,2.7433
600,2.6247
700,2.5658
800,2.5603
900,2.4616


In [25]:
print("Global Step:", hist[1].global_step)
print("Epoch:", hist[1].metrics['epoch'])
print("Train Runtime:", hist[1].metrics['train_runtime'])
print("Train Samples Per Second:", hist[1].metrics['train_samples_per_second'])
print("Train Steps Per Second:", hist[1].metrics['train_steps_per_second'])
print("Total FLOPS:", hist[1].metrics['total_flos'])
print("Train Loss:", hist[1].metrics['train_loss'])

Global Step: 2690
Epoch: 9.99
Train Runtime: 6893.7156
Train Samples Per Second: 3.123
Train Steps Per Second: 0.39
Total FLOPS: 5620754401984512.0
Train Loss: 2.352705027002384


Generate answer using model

In [42]:
def load_model(model_path):
    model = GPT2LMHeadModel.from_pretrained(model_path)
    return model


def load_tokenizer(tokenizer_path):
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
    return tokenizer

def generate_text(model_path, sequence, max_length):
    
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)
    ids = tokenizer.encode(f'{sequence}', return_tensors='pt')
    final_outputs = model.generate(
        ids,
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    return tokenizer.decode(final_outputs[0], skip_special_tokens=True)

In [43]:
model_path = "../data/sample_q_and_a/custom_model"
sequence = data['Question'].iloc[100]
max_len = 100
print("Q : ",data['Question'].iloc[100])
print()
print("A : ",data['Response'].iloc[100])
print()
print("G : ",generate_text(model_path, sequence, max_len)) 

Q :  I self-harm, and I stop for awhile. Then when I see something sad or depressing, I automatically want to self-harm.

A :  Self-harm has a way of becoming a go-to method of coping. When we get stuck using self-harm as a way to deal and manage emotions, when something serious happens it totally makes sense that that will be one the first things to go through the mind. It is its own kind of addiction. There is a therapy, DBT or, Dialectical Behavior Therapy, which focusing on giving you new tools to get through hard times, understand and manage your emotions, to stay present and to deal with relationships. This one of the most effective interventions there is for self-harm. If you are interested in DBT, reach a local therapist in your area and ask if they do DBT or can recommend you to a DBT program. This can help immensely. Self-harm recovery is totally possible, but it is definitely hard work!

G :  I self-harm, and I stop for awhile. Then when I see something sad or depressing, I 

In [44]:
sequence = data['Question'].iloc[50]
max_len = 200
print("Q : ",data['Question'].iloc[50])
print()
print("A : ",data['Response'].iloc[50])
print()
print("G : ",generate_text(model_path, sequence, max_len)) 

Q :  I have so many issues to address. I have a history of sexual abuse, I’m a breast cancer survivor and I am a lifetime insomniac.    I have a long history of depression and I’m beginning to have anxiety. I have low self esteem but I’ve been happily married for almost 35 years.
   I’ve never had counseling about any of this. Do I have too many issues to address in counseling?

A :  There are never "too many issues" to be addressed in therapy.  Most people come in with multiple issues they want to address.  The wonderful thing about therapy, is that often, as one or more significant issues begin to change and improve-   the lead naturally without much effort to improvements in the other areas.  (For example, as you begin to address trauma and betrayal from you past, you may find that the insomnia improves). Your therapist, with you input and direction, can help you to prioritize which problem areas to target first.

G :  I have so many issues to address. I have a history of sexual abu

In [45]:
sequence = data['Question'].iloc[150]
max_len = 200
print("Q : ",data['Question'].iloc[150])
print()
print("A : ",data['Response'].iloc[150])
print()
print("G : ",generate_text(model_path, sequence, max_len)) 

Q :  There are many people willing to lovingly provide me with a home. I have food, clothes, and a university education, but I never feel like I belong. Even when I have a good time with people who are supposed to be close, I feel like I'm just out with friends and I never go home.

A :  From the little bit you wrote, my guess is you haven't figured out where and how to lay down your own set of roots.Usually people who have a university level eduction are old enough to work in order to support themselves.Unless you have a medical condition which limits or prevents you from working a full-time job, would you guess that the effort and thought involved in deciding in what professional area and geographic location to look for work, would offer you a way to establish your identity?My suggestion is to pay more attention to your own likes, dislikes, and interests.   To know these areas more deeply would define the type of people with whom you identify, have a common interest and with whom you

In [50]:
#take input from user for sequence 
sequence = input("Enter the question : ")
max_len = 200
print("Q : ",sequence)
print()
#print only answer not the sequence
print("G : ",generate_text(model_path, sequence, max_len)) 

Q :  what is love

G :  what is love and kindness which should be respected if you want to be with your boyfriend."
1639,"It was very unsettling at the time. I know I had to leave my mom to travel with me and live with my grandparents. I feel like I need to just lay waste to this.","I'm sorry you were short.  However, it can be helpful to know that at the time of giving up your ability to travel with your parents (or staying in the US) and having friends (people) on your own could have led to depression and anxiety.  It doesn't sound like you have been able to work through this at the time and your depression comes from having this relationship.  As far as your family's reaction to losing your mother, either way, I would ask you to be patient and understand that losing your mother is a loss for you and for your kids.  Your support system seems to be a great support system in your family.  I'm sure if
