In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
import torch
import pandas as pd
import torch.optim as optim
import torch.nn.functional as F
from transformers import GPT2Tokenizer, GPT2Model, pipeline,GPT2Config
from scipy.spatial.distance import jensenshannon
import numpy as np
from torch.utils.data import DataLoader, Dataset
from scipy.stats import entropy
from sklearn.feature_extraction.text import CountVectorizer
# Colab-style setup: install Stanza, fetch its English models and build a
# tokenize + NER pipeline (remove_stop_words uses it to extract entities).
!pip install stanza
import stanza
stanza.download('en')
# use_gpu=True asks Stanza to run on the GPU; both batch sizes are passed straight through to it.
nlp = stanza.Pipeline('en', processors='tokenize,ner',use_gpu=True,batch_size=500,tokenize_batch_size=500)
# spaCy is used by remove_stop_words for stop-word filtering.
# NOTE(review): nothing in this cell downloads 'en_core_web_sm'; it presumably has to be
# present in the environment already — confirm.
!pip install spacy
import spacy
nlp_spacy = spacy.load('en_core_web_sm')

Collecting stanza
  Downloading stanza-1.9.2-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Downloading stanza-1.9.2-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading emoji-2.14.0-py3-none-any.whl (586 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m586.9/586.9 kB[0m [31m38.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji, stanza
Successfully installed emoji-2.14.0 stanza-1.9.2


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: en (English) ...


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.9.0/models/default.zip:   0%|          | 0…

INFO:stanza:Downloaded file to /root/stanza_resources/en/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: en (English):
| Processor | Package                   |
-----------------------------------------
| tokenize  | combined                  |
| mwt       | combined                  |
| ner       | ontonotes-ww-multi_charlm |

INFO:stanza:Using device: cuda
INFO:stanza:Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Loading: mwt
  checkpoint = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Loading: ner
  checkpoint = torch.load(filename, lambda storage, loc: storage)
  data = torch.load(self.filename, lambda storage, loc: storage)
  state = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Done loading processors!




In [None]:
def get_entity_vector(global_vocab,text):
    """Count how often each vocabulary word occurs in ``text``.

    Args:
        global_vocab: Iterable of vocabulary words; it is converted to a list
            and handed to ``CountVectorizer`` as its fixed vocabulary.
        text: The text to vectorise; it is lower-cased before counting.

    Returns:
        A 1-D array of counts for the single input document.
    """
    vocabulary = list(global_vocab)
    counter = CountVectorizer(vocabulary=vocabulary)
    # A one-document corpus: the first (only) row of the dense matrix is the result.
    count_matrix = counter.fit_transform([text.lower()])
    return count_matrix.toarray()[0]

def remove_stop_words(_string):
    """Strip stop words from a string and list the named entities found in it.

    The module-level Stanza pipeline ``nlp`` supplies the entities and the
    module-level spaCy pipeline ``nlp_spacy`` supplies the stop-word flags.

    Args:
        _string: The raw text to process.

    Returns:
        A ``(filtered_text, entities)`` tuple: the lower-cased non-stop-word
        tokens joined by single spaces, and the lower-cased entity texts.
    """
    stanza_doc = nlp(_string)
    spacy_doc = nlp_spacy(_string)
    entities = [entity.text.lower() for entity in stanza_doc.ents]
    kept_tokens = [tok.text.lower() for tok in spacy_doc if not tok.is_stop]
    return ' '.join(kept_tokens), entities

def get_global_vocab(t1,t2):
    """Collect every whitespace-separated word appearing in either text.

    Args:
        t1: Iterable of strings (the lines of the first text).
        t2: Iterable of strings (the lines of the second text).

    Returns:
        A set containing each distinct word from ``t1`` and ``t2``.
    """
    vocab = set()
    for lines in (t1, t2):
        for line in lines:
            # str.split() with no argument splits on any run of whitespace.
            vocab.update(line.split())
    return vocab

def get_kl_div(p,q):
    """Return the Jensen-Shannon divergence between two count vectors.

    Despite the name, the value returned is the Jensen-Shannon divergence
    ``0.5 * KL(p || m) + 0.5 * KL(q || m)`` with ``m = 0.5 * (p + q)``,
    not the plain KL divergence.

    Args:
        p: Array-like of non-negative counts/weights.
        q: Array-like of non-negative counts/weights, same length as ``p``.

    Returns:
        The Jensen-Shannon divergence as a float (``scipy.stats.entropy`` is
        called without a ``base`` argument).
    """
    # Coerce to float arrays so that plain lists are smoothed as well:
    # comparing a Python list with 0 yields a single False, not a mask.
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    # Replace exact zeros with a tiny mass before normalising.
    p = np.where(p == 0, 1e-10, p)
    q = np.where(q == 0, 1e-10, q)
    p = p / np.sum(p)
    q = q / np.sum(q)
    m = 0.5 * (p + q)
    # entropy(a, b) with two arguments is the relative entropy KL(a || b).
    return 0.5 * (entropy(p, m) + entropy(q, m))

def get_common_entity_kldiv(text1,text2,data,global_vocab,common_entity):
    """Compute one divergence value per entity shared by both texts.

    For every entity, the lines of each text that mention it are joined into
    one passage, both passages are vectorised over ``global_vocab`` and the
    two vectors are compared with ``get_kl_div``.

    Args:
        text1: Indexable collection of lines of the first text.
        text2: Indexable collection of lines of the second text.
        data: Mapping with keys ``'t1'`` and ``'t2'``; each maps an entity to
            the line numbers where it occurs in the corresponding text.
        global_vocab: Vocabulary passed to ``get_entity_vector``.
        common_entity: Iterable of entities present in both texts.

    Returns:
        A list with one ``get_kl_div`` result per processed entity.
    """
    def _gather(lines, line_numbers):
        # Join the selected lines into a single space-separated passage.
        return ' '.join(lines[idx] for idx in line_numbers)

    # Entities consisting of a lone comma or double-quote character are skipped.
    skipped = (",", "\"")
    divergences = []
    for entity in common_entity:
        if entity in skipped:
            continue
        numbers_t1 = data['t1'][entity]
        numbers_t2 = data['t2'][entity]
        passage1 = _gather(text1, numbers_t1)
        passage2 = _gather(text2, numbers_t2)
        vector1 = get_entity_vector(global_vocab, passage1)
        vector2 = get_entity_vector(global_vocab, passage2)
        divergences.append(get_kl_div(vector1, vector2))
    return divergences

def preprocess(t1,t2):
    """Split two texts into lines, strip stop words and index their entities.

    Args:
        t1: The first text as a single string.
        t2: The second text as a single string.

    Returns:
        A 7-tuple ``(data, global_vocab, common_ent, extra_ent, missing_ent,
        t1, t2)`` where

        * ``data`` maps ``'t1'``/``'t2'`` to ``{entity: set of line numbers}``,
        * ``global_vocab`` is the set of words in the filtered lines of both texts,
        * ``common_ent`` holds entities found in both texts,
        * ``extra_ent`` holds entities found only in ``t2``,
        * ``missing_ent`` holds entities found only in ``t1``,
        * ``t1`` / ``t2`` are the lists of stop-word-filtered lines.
    """
    def _filter_and_index(lines, index):
        # Overwrite each line with its filtered form and record where every entity occurs.
        for line_no, raw_line in enumerate(lines):
            lines[line_no], found = remove_stop_words(raw_line)
            for name in found:
                index.setdefault(name, set()).add(line_no)

    data = {"t1": {}, "t2": {}}
    t1 = t1.splitlines()
    t2 = t2.splitlines()
    _filter_and_index(t1, data['t1'])
    _filter_and_index(t2, data['t2'])

    global_vocab = get_global_vocab(t1, t2)
    names_t1 = set(data['t1'])
    names_t2 = set(data['t2'])
    common_ent = names_t1 & names_t2
    missing_ent = names_t1 - names_t2
    extra_ent = names_t2 - names_t1
    return data, global_vocab, common_ent, extra_ent, missing_ent, t1, t2

In [None]:
def compute_loss(text1, text2):
    """Score how much the entities of ``text2`` diverge from those of ``text1``.

    The score is the mean per-entity divergence over entities shared by both
    texts, plus the squared number of entities found only in ``text2`` and
    the squared number of entities found only in ``text1``.

    Args:
        text1: The first text as a single string.
        text2: The second text as a single string.

    Returns:
        The combined score as a float.
    """
    data,global_vocab,common_entity,extra_entity,missing_entity,text1,text2=preprocess(text1,text2)
    kl_div=get_common_entity_kldiv(text1,text2,data,global_vocab,common_entity)
    # np.mean([]) is NaN (with a RuntimeWarning), which would turn the whole
    # loss into NaN whenever no shared entity was scored. With nothing to
    # compare, the divergence term contributes zero instead.
    divergence = np.mean(kl_div) if len(kl_div) > 0 else np.float64(0.0)
    loss = divergence + len(extra_entity)**2 + len(missing_entity)**2
    return loss


In [None]:
# Model / tokenizer setup shared by the cells below.
model_name = "gpt2"
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Start from the stock 'gpt2' configuration, overriding the context length to 4096 positions.
config = GPT2Config.from_pretrained('gpt2', n_positions=4096, n_ctx=4096)
# NOTE(review): the model is built from the config alone (no from_pretrained call here),
# so it presumably starts from freshly initialised weights rather than the pretrained
# GPT-2 checkpoint — confirm this is intended.
model = GPT2LMHeadModel(config).to(device)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# Keep the tokenizer's length limit in step with the 4096-position config above.
tokenizer.model_max_length = 4096


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



In [None]:
# Define DPO Trainer class
class DPOTrainer:
    """Trains a model on text pairs with a similarity loss plus an entity score.

    For each ``(t1, t2)`` pair the loss is
    ``compute_loss(t1, t2) + similarity_weight * (1 - cosine_similarity)``,
    where the cosine similarity is taken between the token-averaged model
    outputs of the two texts.

    Args:
        model: The model to optimise; called as ``model(input_ids)``.
        tokenizer: Callable tokenizer returning a mapping with ``'input_ids'``.
        learning_rate: Learning rate for the Adam optimiser.
        max_length: Maximum number of tokens kept per text; longer inputs are
            truncated to this length.
        similarity_weight: Multiplier applied to the similarity loss before
            it is added to the entity score.
    """

    def __init__(self, model, tokenizer, learning_rate=1e-4, max_length=4096, similarity_weight=100):
        self.model = model
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.similarity_weight = similarity_weight
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)

    @property
    def device(self):
        """Device of the model's parameters; inputs are moved here before the forward pass."""
        return next(self.model.parameters()).device

    def encode(self, texts):
        """Tokenize ``texts`` and return the ``input_ids`` tensor on the model's device.

        ``truncation=True`` is required for ``max_length`` to actually shorten
        over-long inputs; without it the limit is not enforced.
        """
        inputs = self.tokenizer(
            texts,
            return_tensors='pt',
            truncation=True,
            max_length=self.max_length,
        )
        return inputs['input_ids'].to(self.device)

    def compute_loss(self, t1, t2):
        """Return the entity-based score for a pair of texts.

        Delegates to the module-level ``compute_loss`` function, which works
        on the raw strings.
        """
        entity_loss = compute_loss(t1, t2)
        return entity_loss

    def train_step(self, t1,t2):
        """Run one optimisation step on a single text pair and return the loss as a float."""
        input_ids_t1 = self.encode(t1)
        input_ids_t2 = self.encode(t2)

        # Take the first element of the model output and average it over dim=1.
        # NOTE(review): for a language-model head this is presumably the logits with
        # dim=1 as the token axis, i.e. not hidden-state embeddings — confirm.
        output_t1 = self.model(input_ids_t1)[0].mean(dim=1)
        output_t2 = self.model(input_ids_t2)[0].mean(dim=1)

        # 1 - cosine similarity, so that more similar outputs give a smaller loss.
        cosine_sim = F.cosine_similarity(output_t1, output_t2).mean()
        similarity_loss = 1 - cosine_sim

        # The entity score is computed from the raw strings only, so it shifts the
        # reported loss but no gradient flows through it.
        entity_loss = self.compute_loss(t1, t2)

        total_loss = entity_loss + similarity_loss * self.similarity_weight

        self.optimizer.zero_grad()  # Clear gradients from the previous step
        total_loss.backward()
        self.optimizer.step()

        return total_loss.item()

    def train(self, dataset, epochs=10):
        """Train on every ``(t1, t2)`` pair in ``dataset`` for ``epochs`` passes.

        Each element is converted with ``str`` before use. The loss of every
        pair is printed, followed by the summed loss once per epoch.
        """
        for epoch in range(epochs):
            total_epoch_loss = 0
            for t1,t2 in dataset:
                loss = self.train_step(str(t1),str(t2))
                print(loss)
                total_epoch_loss += loss
            print(f'Epoch {epoch + 1}, Loss: {total_epoch_loss:.4f}')


In [None]:
def getpairs(csv_path='Gemma_9b.csv', accepted_index=3, n_rejected=5):
    """Build ``(accepted, rejected)`` text pairs from a CSV file.

    For every row, the value in the accepted column is paired with the value
    in each of the trailing rejected columns, giving ``n_rejected`` pairs
    per row.

    Args:
        csv_path: Path of the CSV file to read.
        accepted_index: Positional index of the column holding the accepted text.
        n_rejected: Number of trailing columns holding the rejected texts.

    Returns:
        A list of ``(accepted_value, rejected_value)`` tuples, row by row.

    Raises:
        IndexError: If ``accepted_index`` is out of range for the file's columns.
    """
    df = pd.read_csv(csv_path)
    cols = df.columns.tolist()
    # cols[-0:] would select every column, so zero/negative counts mean "none".
    rejected = cols[-n_rejected:] if n_rejected > 0 else []
    acc = cols[accepted_index]
    pairs = []
    for i in range(len(df)):
        row = df.iloc[i]
        pairs.extend((row[acc], row[c]) for c in rejected)
    return pairs



In [None]:
# Main training function
def train_model():
    """Build a trainer around the notebook's ``model``/``tokenizer`` and train it.

    The training pairs come from ``getpairs()``; the trainer's default number
    of epochs is used.
    """
    pair_trainer = DPOTrainer(model, tokenizer)
    pair_trainer.train(getpairs())




In [None]:
# Run the training process.
# The loss of every pair is printed as it is processed, followed by the
# summed loss at the end of each epoch.
train_model()

1040.44970703125
1220.449462890625
1220.3408203125
1237.4493408203125
1274.4351806640625
17789.62890625
18056.6484375
18037.646484375
18020.640625
18037.59765625
9472.58984375
9665.6123046875
9540.609375
9850.58203125
9634.5986328125
2500.4775390625
2690.427490234375
2690.463134765625
2628.531494140625
2529.498779296875
21194.65625
21106.654296875
21352.611328125
20905.65234375
21437.59375
32425.455078125
32404.466796875
32416.451171875
32436.458984375
32409.49609375


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


nan
nan
nan
nan
nan
9965.564453125
9850.607421875
10045.58203125
10180.58203125
10004.55859375
9901.4677734375
nan
9773.658203125
nan
nan
21089.52734375
21800.583984375
20880.568359375
22160.599609375
21437.564453125
9386.5234375
9250.47265625
9657.4677734375
9281.5146484375
9577.50390625
6250.626953125
nan
6242.60400390625
6245.607421875
nan
nan
nan
nan
nan
nan
5393.525390625
5393.4599609375
5265.55078125
5440.56103515625
5328.5869140625
nan
5492.51513671875
5512.55322265625
nan
5512.5771484375
33538.47265625
33525.44140625
33498.46875
33892.34765625
33872.421875
9698.599609375
9665.5634765625
9745.55859375
9770.650390625
9770.5849609375
657.6095581054688
653.6061401367188
605.6215209960938
680.6340942382812
nan
1921.24951171875
1924.3035888671875
1889.3021240234375
1625.3433837890625
1810.2730712890625
15572.6162109375
15457.6416015625
15520.634765625
15476.638671875
15425.642578125
205.51963806152344
245.51077270507812
212.51913452148438
205.59364318847656
212.522216796875
2329.6042