In [1]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, OPTForCausalLM, RobertaTokenizer, RobertaForMaskedLM
from datasets import load_dataset
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import json

In [3]:
# Function to compute the KL divergence
def kl_divergence(p, q):
    """Compute KL Divergence between two probability distributions."""
    return torch.sum(p * (p / q).log(), dim=-1)

In [4]:
def find_accur(sorted_labels, label_num):
    total_labels = len(sorted_labels)
    five_perc = int(total_labels * 0.05)
    granular_range = int(five_perc * 0.01)  # For the granular plot
    one_percent_range = int(total_labels * 0.01)  # For the table, 1% of total labels
    perc_accuracy = []
    table_data = []  # Data for table using 1% increments
    plot_data = []  # Data for plot using granular range

    # # Loop for the granular plot data
    # for i in range(0, five_perc, granular_range):
    #     segment = sorted_labels[i:i + granular_range]
    #     correct_predictions = sum(1 for label in segment if label == label_num)
    #     accuracy = correct_predictions / len(segment) if segment else 0
    #     perc_accuracy.append(accuracy)
    #     plot_percentile = (i / five_perc) * 5
    #     plot_data.append([plot_percentile, accuracy])

    # Loop for the table data using 1% increments
    for i in range(0, five_perc, one_percent_range):
        segment = sorted_labels[i:i + one_percent_range]
        correct_predictions = sum(1 for label in segment if label == label_num)
        accuracy = correct_predictions / len(segment) if segment else 0
        table_percentile = ((i + one_percent_range) / total_labels) * 100
        table_data.append([table_percentile, accuracy])
    
    # # Plotting using granular range data
    # plt.figure(figsize=(10, 6))
    # sns.lineplot(x=[x[0] for x in plot_data], y=perc_accuracy)
    # plt.title("Model Accuracy in the First 5%")
    # plt.xlabel("Percentile (up to 5%)")
    # plt.ylabel("Accuracy")
    # plt.show()

    # Creating and returning the DataFrame using 1% increment data
    df = pd.DataFrame(table_data, columns=["Percentile", "Accuracy"])
    return df

In [5]:
# return a prob for the masked position given the text parsed in (parse text contains mask)
def masked_logits(text):

    # Tokenize the input text
    text_tokens = [tokenizer.cls_token] + tokenizer.tokenize(text) + [tokenizer.sep_token]

    # Convert tokens to input tensor format
    input_ids = tokenizer.convert_tokens_to_ids(text_tokens)
    input_tensor = torch.tensor([input_ids]).to(device)
    
    # Get the output logits from the model
    # Gradients not needed since this is an inference not a training
    with torch.no_grad():
        outputs = model(input_tensor)
        predictions = outputs.logits

    # Extract the logits for the masked position
    masked_position = input_ids.index(tokenizer.mask_token_id)
    masked_logits = predictions[0, masked_position, :]

    # Calculate probabilities of masked_logits
    probabilities = F.softmax(masked_logits, dim=-1)

    return probabilities

In [7]:
# Open the JSON file for reading
with open('SimSeed/data/agnews/seedwords.json', 'r') as file:
    # Parse the JSON file
    data = json.load(file)

# Initialize tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

# Access data
print("Seed word below:")
print(data)
print()

class_tokens = {category: [] for category in data}
class_tensors = {category: torch.zeros((1, 1, 50265)) for category in data}

# Tokenize seed words and assign probabilities
for category, words in data.items():
    for word in words:
        class_tokens[category].extend(tokenizer.tokenize(word))
        class_tokens[category].extend(tokenizer.tokenize(' ' + word))   
    word_ids = tokenizer.convert_tokens_to_ids(class_tokens[category])
    prob = 1 / len(class_tokens[category])
    for wid in word_ids:
        class_tensors[category][0, 0, wid] = prob

# # word_ids = tokenizer.convert_tokens_to_ids(words)
print(class_tokens)
# # print(words)
# print(word_ids)
# print(class_tensors['technology'])

# # To find the indices of non-zero elements in the 'technology' tensor
# non_zero_indices = torch.nonzero(class_tensors['technology'], as_tuple=True)

# # Printing the indices of non-zero terms
# non_zero_indices

Seed word below:
{'politics': ['government', 'military', 'war', 'iraq', 'palestinian'], 'sports': ['basketball', 'football', 'athletes', 'championship', 'yankees'], 'business': ['stocks', 'markets', 'industries', 'oil', 'sales'], 'technology': ['computer', 'telescope', 'software', 'microsoft', 'space']}

{'politics': ['government', 'Ġgovernment', 'military', 'Ġmilitary', 'war', 'Ġwar', 'ira', 'q', 'Ġir', 'aq', 'pal', 'est', 'inian', 'Ġpal', 'est', 'inian'], 'sports': ['basketball', 'Ġbasketball', 'football', 'Ġfootball', 'ath', 'letes', 'Ġathletes', 'ch', 'ampions', 'hip', 'Ġchampionship', 'yan', 'kees', 'Ġy', 'an', 'kees'], 'business': ['stocks', 'Ġstocks', 'markets', 'Ġmarkets', 'indust', 'ries', 'Ġindustries', 'oil', 'Ġoil', 's', 'ales', 'Ġsales'], 'technology': ['computer', 'Ġcomputer', 'tel', 'esc', 'ope', 'Ġtelescope', 'software', 'Ġsoftware', 'microsoft', 'Ġmicro', 'soft', 'space', 'Ġspace']}


In [6]:
tokenizer.tokenize(" palestian")

['Ġpal', 'est', 'ian']

In [8]:
# Load dataset
dataset_test = load_dataset("ag_news", split="test") #

# # Shuffle the dataset
# shuffled_train_dataset = dataset_test.shuffle(seed=42)

# # Select the first 30%
# dataset_test = shuffled_train_dataset.select(range(int(len(shuffled_train_dataset) * 0.3)))

# Allocate GPU
device = torch.device("cuda:4") #

# Initialize the tokenizer and model
opt_model_size = ['1.3b'] # Testing
# opt_model_size = ['125m', '350m', '1.3b', '2.7b', '6.7b'] # '13b'
class_name = ['sports', 'politics', 'technology', 'business']
class_label = [1, 0, 3, 2]
results = []

for name, label in zip(class_name, class_label):
    print(name + ' with label ' + str(label))    
    for size in opt_model_size: 
        print('Size of model = ' + size)
        data = {'Model Size': [], 'Class Name': [], 'Tokenized Word': [], 'Original Text': [], 'KL Divergence': [], 'Label': []}
        
        # # Initialize the tokenizer and model
        # tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
        # model = RobertaForMaskedLM.from_pretrained('roberta-large').to(device) 
        
        tokenizer_1 = AutoTokenizer.from_pretrained('facebook/opt-' + size)
        model_1 = OPTForCausalLM.from_pretrained('facebook/opt-' + size).to(device)
    
        # # Compute probability at the end of text_p
        # text_p = name + " term:" + tokenizer.mask_token + "." # Hyperparameter
        probs_p = class_tensors[name].to(device)
        
        kls = []
        labels = []
        
        with torch.no_grad():
            for example in tqdm(dataset_test):
                text_q = example['text']
                labels.append(example['label'])
                inputs_q = tokenizer_1(text_q, truncation=True, padding=True, return_tensors="pt").to(device)
                logits_q = model_1(**inputs_q).logits
                probs_q = F.softmax(logits_q, dim=-1)
                probs_q = probs_q[..., :probs_p.shape[-1]]
                kl_div = kl_divergence(probs_p, probs_q)
                tokenized_word = tokenizer_1.decode(inputs_q.input_ids[0, torch.argmin(kl_div, -1).item()])
                label_name = class_name[class_label.index(example['label'])]
                # print(tokenizer.decode(inputs_q.input_ids[0, :torch.argmin(kl_div, -1).item()]) + "["
                #      + tokenizer.decode(inputs_q.input_ids[0, torch.argmin(kl_div, -1).item()]) + "]"
                #      + tokenizer.decode(inputs_q.input_ids[0, torch.argmin(kl_div, -1).item()+1:]))
                # print(tokenized_word, label_name)
                kls.append(kl_div.amin().item()) 

                data['Class Name'].append(name)
                data['Model Size'].append(size)
                data['Tokenized Word'].append(tokenized_word)
                data['Original Text'].append(text_q)
                data['KL Divergence'].append(kls[-1])
                data['Label'].append(label_name)

        df = pd.DataFrame(data)
        sorted_df = df.sort_values(by='KL Divergence')
        top_100_df = sorted_df.head(100)
        
        results.append(top_100_df)
        
# Combine all results into a single DataFrame
final_df = pd.concat(results)
final_df.to_csv("output_opt_1.csv", index=False)

sports with label 1
Size of model = 1.3b


  0%|          | 0/7600 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


politics with label 0
Size of model = 1.3b


  0%|          | 0/7600 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


technology with label 3
Size of model = 1.3b


  0%|          | 0/7600 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


business with label 2
Size of model = 1.3b


  0%|          | 0/7600 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [6]:
# logits = model(**tokenizer("I love basketball", truncation=True, padding=True, return_tensors="pt").to(device)).logits

In [7]:
# logits.softmax(-1).shape

In [8]:
# tokenizer_opt = AutoTokenizer.from_pretrained('facebook/opt-350m')

In [9]:
# Precompute logits for the baseline text (text_p)
# text_p = "Sports term:"
# logits_p = model(**tokenizer(text_p, truncation=True, padding=True, return_tensors="pt").to(device)).logits
# logits_p = logits_p[0, -1, :]
# probs_p = F.softmax(logits_p, dim=-1) 
# probs_p
# logits_p = masked_logits(text_p).unsqueeze(0)

In [10]:
# Compute KL divergence for every text in the dataset
# kls = []
# labels = []

# with torch.no_grad():
#     for example in tqdm(dataset_test):
#         text_q = example['text']
#         labels.append(example['label'])
#         logits_q = model(**tokenizer(text_q, truncation=True, padding=True, return_tensors="pt").to(device)).logits
#         probs_q = F.softmax(logits_q, dim=-1)
#         kls.append(kl_divergence(probs_p, probs_q).amin().item()) 

In [11]:
# kl_divergence(probs_p, probs_q).amin()

In [12]:
# logits_p.argsort(-1).flip(-1)

In [13]:
# vocab = tokenizer.get_vocab()
# vocab = {vocab[v]:v for v in vocab}
# for idx in logits_p.argsort(-1).flip(-1)[0, :20]:
#     print(vocab[idx.item()])

In [14]:
# np.argsort(kls)

In [15]:
# sorted_labels = [labels[idx] for idx in np.argsort(kls)]
# sorted_labels

In [16]:
# find_accur(1)

In [17]:
# # Precompute logits for the baseline text (World)
# text_p = "Political term:" + tokenizer.mask_token + "."
# logits_p = masked_logits(text_p).unsqueeze(0)

In [18]:
# # Compute KL divergence for every text in the dataset
# kls = []
# labels = []

# with torch.no_grad():
#     for example in tqdm(dataset_test):
#         text_q = example['text']
#         labels.append(example['label'])
#         logits_q = get_masked_logits(text_q, batch_size=512)
#         kls.append(kl_divergence(logits_p, logits_q).amin(0).item()) 

In [19]:
# sorted_labels = [labels[idx] for idx in np.argsort(kls)]
# sorted_labels

In [20]:
# find_accur(0)

In [21]:
# # Precompute logits for the baseline text (Business)
# text_p = "Business term:" + tokenizer.mask_token + "."
# logits_p = masked_logits(text_p).unsqueeze(0)

In [22]:
# # Compute KL divergence for every text in the dataset
# kls = []
# labels = []

# with torch.no_grad():
#     for example in tqdm(dataset_test):
#         text_q = example['text']
#         labels.append(example['label'])
#         logits_q = get_masked_logits(text_q, batch_size=512)
#         kls.append(kl_divergence(logits_p, logits_q).amin(0).item()) 

In [23]:
# sorted_labels = [labels[idx] for idx in np.argsort(kls)]
# sorted_labels

In [24]:
# find_accur(2)

In [25]:
# # Precompute logits for the baseline text (Sci/Tech)
# text_p = "Technology term:" + tokenizer.mask_token + "."
# logits_p = masked_logits(text_p).unsqueeze(0)

In [26]:
# # Compute KL divergence for every text in the dataset
# kls = []
# labels = []

# with torch.no_grad():
#     for example in tqdm(dataset_test):
#         text_q = example['text']
#         labels.append(example['label'])
#         logits_q = get_masked_logits(text_q, batch_size=512)
#         kls.append(kl_divergence(logits_p, logits_q).amin(0).item()) 

In [27]:
# sorted_labels = [labels[idx] for idx in np.argsort(kls)]
# sorted_labels

In [28]:
# find_accur(3)

In [29]:
# kls

In [30]:
# lengths = [len(tokenizer.tokenize(data["text"])) for data in dataset_test]

In [31]:
# lengths

In [32]:
# plt.scatter(lengths, kls)