In [1]:
%pip install transformers


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import json
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertModel, BertTokenizer
import torch
import os
import csv

# Load the BERT model and tokenizer
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [24]:
# Function to get embeddings
def get_embeddings(text):
    input_ids = tokenizer.encode(text, add_special_tokens=True)
    input_ids = torch.tensor(input_ids).unsqueeze(0)
    with torch.no_grad():
        outputs = model(input_ids)
    # Use the mean of the last hidden layer's output as the embedding
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

# Function to load data from json file and append to a list
def load_data(filename):
    # print('load_data was called')
    with open(filename, 'r') as f:
        data = json.load(f)
    # Return only the first 3 verses
    # return data["resultset"]["row"][:100]
    return data["resultset"]["row"]
    
# Load data from each file
files = ['./json/t_asv.json', './json/t_bbe.json', './json/t_kjv.json', './json/t_web.json', './json/t_ylt.json']
data = {}
for file in files:
    version_code = os.path.splitext(os.path.basename(file))[0]
    print(version_code)
    data[version_code] = load_data(file)
    
print(data[version_code][0:1])
# Initialize list for storing verse objects
set_of_verse_versions = []

# Build verse objects
for uid in data['t_asv']:
    verse_uid = uid["field"][0]
    book = uid["field"][1]
    chapter = uid["field"][2]
    verse_number = uid["field"][3]
    # Skip verses greater than 20
    # if verse_number > 100:
    #     continue
    for version_code, verses in data.items():
        for verse in verses:
            if verse["field"][0] == verse_uid:
                # print(verse_uid, version_code)
                verse_text = verse["field"][4]
                set_of_verse_versions.append({"verse_uid": verse_uid, "book": book, "chapter": chapter, "verse_number": verse_number, "verse_text": verse_text, "version_code": version_code})




# Calculate cosine similarity
resultsFilename = 'cosine_similarity_results.tsv'
logFilename = 'processed_verses.log'
header_needed = not (os.path.isfile(resultsFilename) and os.path.getsize(resultsFilename) > 0)

# Check if the log file exists, if not create it
if not os.path.exists(logFilename):
    with open(logFilename, 'w') as log_file:
        pass

with open(resultsFilename, 'a') as file, open(logFilename, 'r') as log_file:
    writer = csv.writer(file, delimiter='\t')
    
    if header_needed:
        writer.writerow(["verse_uid", "version_code_1", "version_code_2", "cosine_similarity"]) # writing headers

    processed_verses = log_file.read().splitlines() # Get all processed verses

    cosine_sim_results = []
    for i in range(len(set_of_verse_versions)):
        for j in range(i+1, len(set_of_verse_versions)):
            if set_of_verse_versions[i]['verse_uid'] == set_of_verse_versions[j]['verse_uid'] and str(set_of_verse_versions[i]['verse_uid']) not in processed_verses:
                emb_i = get_embeddings(set_of_verse_versions[i]['verse_text'])
                emb_j = get_embeddings(set_of_verse_versions[j]['verse_text'])
                sim = cosine_similarity(emb_i, emb_j)
                cosine_sim_results.append({"verse_uid": set_of_verse_versions[i]['verse_uid'], "version_code_1": set_of_verse_versions[i]['version_code'], "version_code_2": set_of_verse_versions[j]['version_code'], "cosine_similarity": sim[0][0]})

                if len(cosine_sim_results) >= 10: # Write every 10 results
                    for result in cosine_sim_results:
                        writer.writerow([result["verse_uid"], result["version_code_1"], result["version_code_2"], result["cosine_similarity"]])
                    cosine_sim_results = [] # Reset the results

                    # Save the processed verse_uid
                    with open(logFilename, 'a') as log_file:
                        log_file.write(str(set_of_verse_versions[i]['verse_uid']) + '\n')



# Write remaining results to file
if cosine_sim_results: # Check if there are any remaining results that haven't been written to file
    with open(resultsFilename, 'a') as file:
        writer = csv.writer(file, delimiter='\t')
        writer.writerows(cosine_sim_results)



t_asv
t_bbe
t_kjv
t_web
t_ylt
[{'field': [1001001, 1, 1, 1, "In the beginning of God's preparing the heavens and the earth --"]}]


In [25]:
# If log file is corrupted, this script can be used to generate a log file of verses with exactly 10 instances

import pandas as pd

# Read your tsv file into a pandas dataframe
df = pd.read_csv('cosine_similarity_results.tsv', sep='\t')

# Get the counts of each unique 'verse_uid'
counts = df['verse_uid'].value_counts()

# Filter those 'verse_uid' with exactly 10 instances
required_uids = counts[counts == 10].index

# Sort 'verse_uid' in descending order
sorted_uids = sorted(required_uids, reverse=False)

# Save these 'verse_uid' to a log file
with open('processed_verses.log', 'w') as f:
    for uid in sorted_uids:
        f.write(f'{uid}\n')


  df = pd.read_csv('cosine_similarity_results.tsv', sep='\t')


TypeError: '<' not supported between instances of 'str' and 'int'