In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/arogoai-llm/LLM_test.csv
/kaggle/input/arogoai-llm/LLM_data.csv
/kaggle/input/arogo_epoch1_checkpoints/transformers/default/1/config.json
/kaggle/input/arogo_epoch1_checkpoints/transformers/default/1/trainer_state.json
/kaggle/input/arogo_epoch1_checkpoints/transformers/default/1/training_args.bin
/kaggle/input/arogo_epoch1_checkpoints/transformers/default/1/scheduler.pt
/kaggle/input/arogo_epoch1_checkpoints/transformers/default/1/model.safetensors
/kaggle/input/arogo_epoch1_checkpoints/transformers/default/1/optimizer.pt
/kaggle/input/arogo_epoch1_checkpoints/transformers/default/1/rng_state.pth
/kaggle/input/arogo_epoch1_checkpoints/transformers/default/1/generation_config.json


In [2]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [3]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=9a7145f85ff5243fc33026856707de90c9cfabfab70ac85bee0e028fb8d01c20
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [4]:
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
import torch
from torch.utils.data import Dataset

In [5]:
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
import logging

# Set up logging
logging.basicConfig(
    filename="training_log.log",
    level=logging.INFO,
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger()

In [6]:
os.environ["WANDB_DISABLED"] = "true"

In [7]:
class EntityValueDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels["input_ids"][idx])
        return item

    def __len__(self):
        return len(self.labels["input_ids"])

class EntityValueDatasetTest(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(next(iter(self.encodings.values())))

def prepare_data(df):
    inputs = []
    targets = []
    for _, row in df.iterrows():
        input_text = f"Behave like an experienced psychiatrist and answer: {row['Context']}"
        inputs.append(f"Behave like an experienced psychiatrist and answer: {row['Context']}")
        targets.append(str(row["Response"]))
    return inputs, targets

In [8]:
def batch_predict(model, tokenizer, dataset, batch_size=16):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    predictions = []
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False)
    with torch.no_grad():
        for batch in dataloader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
            outputs = model.generate(**inputs, max_length=75, do_sample=False)
            predictions.extend([tokenizer.decode(output, skip_special_tokens=True) for output in outputs])
    return predictions


train_df = pd.read_csv("/kaggle/input/arogoai-llm/LLM_data.csv")
test_df = pd.read_csv("/kaggle/input/arogoai-llm/LLM_test.csv")[0:30000]



In [9]:
train_df.head(3)

Unnamed: 0.1,Unnamed: 0,Context,Response
0,802866,"It makes me doubt myself, Alex. I start questi...","I can understand why you feel that way, Charli..."
1,693266,"I've been feeling a mix of emotions, Alex. I'v...",It's completely understandable to have such a ...
2,670022,"Well, I've been neglecting certain aspects of ...","It takes courage to acknowledge that, Charlie...."


In [10]:
train_df = train_df.drop(columns = ['Unnamed: 0'])

In [11]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   Context   299945 non-null  object
 1   Response  299983 non-null  object
dtypes: object(2)
memory usage: 4.6+ MB


In [12]:
train_df.dropna(inplace=True)

In [13]:
train_df.duplicated().sum()

4271

In [14]:
train_df.drop_duplicates(inplace=True)

In [15]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  30000 non-null  int64 
 1   Context     29991 non-null  object
 2   Response    29999 non-null  object
dtypes: int64(1), object(2)
memory usage: 703.2+ KB


In [16]:
test_df = test_df.drop(columns = ['Unnamed: 0'])

In [17]:
test_df.dropna(inplace=True)

In [18]:
test_df.duplicated().sum()

0

In [19]:
train_inputs, train_targets = prepare_data(train_df)
test_inputs, test_targets = prepare_data(test_df)

train_inputs, eval_inputs, train_targets, eval_targets = train_test_split(train_inputs, train_targets, test_size=0.1, random_state=42)

In [20]:
train_inputs[8]

'Behave like an experienced psychiatrist and answer: I think this is a good starting point for now. I appreciate your guidance and support, Alex. I feel a bit more motivated to have that conversation with my partner and focus on building a fulfilling life for myself. Thank you.'

In [21]:
train_targets[8]

"You're very welcome, Charlie. It's been a pleasure supporting you. Remember, you have the power to create positive changes in your personal relationships and your life as a whole. Feel free to reach out whenever you feel the need for further guidance or simply to share your progress. Sending you strength and motivation in your journey ahead."

In [22]:
model_name = "google-t5/t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize data
train_encodings = tokenizer(train_inputs, truncation=True, padding=True, max_length=300)
train_target_encodings = tokenizer(train_targets, truncation=True, padding=True, max_length=75)

eval_encodings = tokenizer(eval_inputs, truncation=True, padding=True, max_length=300)
eval_target_encodings = tokenizer(eval_targets, truncation=True, padding=True, max_length=75)


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [23]:
train_encodings[10]

Encoding(num_tokens=300, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [24]:
test_encodings = tokenizer(test_inputs, truncation=True, padding=True, max_length=300)

In [25]:
train_dataset = EntityValueDataset(train_encodings, train_target_encodings)
eval_dataset = EntityValueDataset(eval_encodings, eval_target_encodings)
test_dataset = EntityValueDatasetTest(test_encodings)

In [26]:
train_dataset

<__main__.EntityValueDataset at 0x79dd3804c550>

In [27]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [28]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    learning_rate=1e-3,

)

# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

trainer.train()

model.save_pretrained("./fine_tuned_t5")
tokenizer.save_pretrained("./fine_tuned_t5")

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,1.6398,1.431756


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


('./fine_tuned_t5/tokenizer_config.json',
 './fine_tuned_t5/special_tokens_map.json',
 './fine_tuned_t5/spiece.model',
 './fine_tuned_t5/added_tokens.json',
 './fine_tuned_t5/tokenizer.json')

In [30]:
model = AutoModelForSeq2SeqLM.from_pretrained("./fine_tuned_t5")
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [27]:
import evaluate

rouge_metric = evaluate.load("rouge")

def evaluate_responses(true_responses: list, predicted_responses: list) -> dict:
 
    results = rouge_metric.compute(
        predictions=predicted_responses,
        references=true_responses,
        use_stemmer=True
    )
    return results



Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [30]:
from tqdm import tqdm
from torch.utils.data import DataLoader

def batch_predict( batch_question):
    input_texts = [
        f"Question: {Context}"
        for Context in zip(batch_question)
    ]
    inputs = tokenizer(input_texts, return_tensors="pt", max_length=512, truncation=True, padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=75, num_return_sequences=1, do_sample=False)
    return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

class TestDataset(Dataset):
    def __init__(self, df):
        self.question = df['Context'].tolist()

    def __len__(self):
        return len(self.question)

    def __getitem__(self, idx):
        return  self.question[idx]

test_dataset = TestDataset(test_df)
test_loader = DataLoader(test_dataset, batch_size=40, shuffle=False)

predicted_values = []
print("Starting prediction...")
for batch in tqdm(test_loader, desc="Predicting", unit="batch"):
    batch_question = batch
    batch_predictions = batch_predict(batch_question)
    predicted_values.extend(batch_predictions)



Starting prediction...


Predicting: 100%|██████████| 5/5 [00:06<00:00,  1.31s/batch]


In [33]:
test_df.sample(2)

Unnamed: 0,Context,Response
95,"Thank you, Alex. Your words of encouragement m...","You're very welcome, Charlie. It's been a plea..."
15,"I hope they will understand, even if it takes ...",It's hopeful that you're willing to believe in...


In [34]:
test_df['predicted_value'] = predicted_values
true_responses = test_df['Response'].tolist()
predicted_responses = test_df['predicted_value'].tolist()

# Evaluate the predictions
scores = evaluate_responses(true_responses, predicted_responses)
print(f"Scores on Test Data: {scores}")

Scores on Test Data: {'rouge1': 0.21725038521003956, 'rouge2': 0.053391473196168465, 'rougeL': 0.1598250419117268, 'rougeLsum': 0.16003325130502022}


In [20]:
import torch

torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats() 
torch.cuda.reset_accumulated_memory_stats() 

In [37]:
import shutil
model_dir = "/kaggle/working/results/checkpoint-11088"
shutil.make_archive("fine_tuned_t5_checkpoints_1", 'zip', model_dir)

from IPython.display import FileLink
FileLink("fine_tuned_t5_checkpoints_1.zip")

In [38]:
import shutil
model_dir = "/kaggle/working/fine_tuned_t5"

shutil.make_archive("fine_tuned_t5_epoch_1", 'zip', model_dir)

from IPython.display import FileLink
FileLink("fine_tuned_t5_epoch_1.zip")

In [None]:
# Here, we load our checkpoints from epoch 1 to continue training for epoch 2.

In [28]:
model_dir = "/kaggle/input/arogo_epoch1_checkpoints/transformers/default/1"
output_dir = "/kaggle/working/"

if os.path.exists(model_dir):
    model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
    print("Resuming training from saved model.")
else:
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    print("Initializing new model.")


Resuming training from saved model.


In [29]:
from safetensors.torch import load_file
from transformers import AutoModelForSeq2SeqLM

In [30]:


writable_model_dir = "/kaggle/working/arogo_epoch1_checkpoint"

os.makedirs(writable_model_dir, exist_ok=True)

safetensor_path = os.path.join(model_dir, "model.safetensors")
model_state_dict = load_file(safetensor_path)

torch.save(model_state_dict, os.path.join(writable_model_dir, "pytorch_model.bin"))

import shutil

required_files = ["config.json", "trainer_state.json", "training_args.bin", "scheduler.pt", "optimizer.pt", "rng_state.pth", "generation_config.json"]

for file_name in required_files:
    src = os.path.join(model_dir, file_name)
    dst = os.path.join(writable_model_dir, file_name)
    if os.path.exists(src):
        shutil.copy(src, dst)

model = AutoModelForSeq2SeqLM.from_pretrained(writable_model_dir)
print("Resuming training from saved model.")

output_dir = "/kaggle/working/"

Resuming training from saved model.


In [31]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    num_train_epochs=2,  # Additional epoch but it resume training for 2nd epoch
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    learning_rate=1e-3,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

model_dir = '/kaggle/working/arogo_epoch1_checkpoint'
trainer.train(resume_from_checkpoint=model_dir)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
  checkpoint_rng_state = torch.load(rng_file)
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
2,0.8142,0.788302


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=22176, training_loss=0.4283540972211488, metrics={'train_runtime': 12625.1927, 'train_samples_per_second': 42.153, 'train_steps_per_second': 1.756, 'total_flos': 1.89889245720576e+17, 'train_loss': 0.4283540972211488, 'epoch': 2.0})

In [32]:
model.save_pretrained("./fine_tuned_t5_v2")
tokenizer.save_pretrained("./fine_tuned_t5_v2")

('./fine_tuned_t5_v2/tokenizer_config.json',
 './fine_tuned_t5_v2/special_tokens_map.json',
 './fine_tuned_t5_v2/spiece.model',
 './fine_tuned_t5_v2/added_tokens.json',
 './fine_tuned_t5_v2/tokenizer.json')

In [39]:
model = AutoModelForSeq2SeqLM.from_pretrained("./fine_tuned_t5_v2")
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [40]:
from tqdm import tqdm
from torch.utils.data import DataLoader

def batch_predict( batch_question):
    input_texts = [
        f"Question: {Context}"
        for Context in zip(batch_question)
    ]
    inputs = tokenizer(input_texts, return_tensors="pt", max_length=512, truncation=True, padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=75, num_return_sequences=1, do_sample=False)
    return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

class TestDataset(Dataset):
    def __init__(self, df):
        self.question = df['Context'].tolist()

    def __len__(self):
        return len(self.question)

    def __getitem__(self, idx):
        return  self.question[idx]

test_dataset = TestDataset(test_df)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

predicted_values = []
print("Starting prediction...")
for batch in tqdm(test_loader, desc="Predicting", unit="batch"):
    batch_question = batch
    batch_predictions = batch_predict(batch_question)
    predicted_values.extend(batch_predictions)



Starting prediction...


Predicting: 100%|██████████| 938/938 [20:05<00:00,  1.29s/batch]


In [41]:
test_df['predicted_value'] = predicted_values
true_responses = test_df['Response'].tolist()
predicted_responses = test_df['predicted_value'].tolist()

scores = evaluate_responses(true_responses, predicted_responses)
print(f"Scores on Test Data: {scores}")

Scores on Test Data: {'rouge1': 0.402221357225255, 'rouge2': 0.15462024487508397, 'rougeL': 0.2864835837035713, 'rougeLsum': 0.28646795927183055}


In [None]:
# 1st epoch values
#Scores on Test Data: {'rouge1': 0.41203100050344876, 'rouge2': 0.16337853205553043, 'rougeL': 0.29616798143440815, 'rougeLsum': 0.29625279155455775}

In [35]:
import shutil
model_dir = "/kaggle/working/fine_tuned_t5_v2"

shutil.make_archive("fine_tuned_t5_epoch_2", 'zip', model_dir)

from IPython.display import FileLink
FileLink("fine_tuned_t5_epoch_2.zip")

In [36]:
import shutil
model_dir = "/kaggle/working/checkpoint-22176"

shutil.make_archive("fine_tuned_t5_checkpoints_2", 'zip', model_dir)

from IPython.display import FileLink
FileLink("fine_tuned_t5_checkpoints_2.zip")

In [37]:
test_df['predicted_value'].iloc[80]

"(Curiously) Your commitment to finding a balance and fostering a cooperative environment is truly inspiring, Charlie. Remember, change takes time, and it's essential to be patient with yourself and your family members throughout this process. Is there anything else you'd like to discuss or any other concerns you have?"

In [38]:
test_df['Response'].iloc[80]

"It's fantastic to see your determination and willingness to take the lead in initiating a positive change within your family. Remember, change takes time and effort, so be patient with yourself and your family members as you navigate this process. Is there anything else you'd like to discuss or any additional goals you would like to set for yourself?"