In [10]:
import os
import json
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')
import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
from google.oauth2 import service_account # based on google-auth library

[nltk_data] Downloading package punkt_tab to /home/jupyter-
[nltk_data]     vojta/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [12]:
# Authenticate against Google Sheets with a service-account key and open the
# target spreadsheet. The setup stays best-effort (the rest of the notebook can
# run without it), but a failure is now reported instead of silently swallowed.
try:
    # (1) read the service-account key file; `with` closes the handle promptly
    with open(os.path.expanduser("~/ServiceAccountsKey.json")) as key_file:
        file_data = json.load(key_file)
    # (2) transform the content into a credentials object
    credentials = service_account.Credentials.from_service_account_info(file_data)
    # (3) specify your usage of the credentials
    scoped_credentials = credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive'])
    # (4) use the constrained credentials for authentication of gspread package
    gc = gspread.Client(auth=scoped_credentials)
    grela_gs = gc.open_by_url("https://docs.google.com/spreadsheets/d/1QroTEQ9gQf9cLO9mvolp7fELNbYYjgvGd48yAiTj03w/edit?usp=sharing")
except Exception as exc:
    # Catch Exception (not a bare except) so KeyboardInterrupt/SystemExit still
    # propagate, and tell the reader why the spreadsheet handle was not created.
    print(f"Google Sheets setup failed ({type(exc).__name__}: {exc}); `grela_gs` was not opened.")

In [3]:

# Load the register-match table from parquet; the path is relative to the
# notebook's working directory.
df = pd.read_parquet('../data/register_matches_v2.parquet')
# Bare last expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,sentence_id,subwork_id,text,vulgate_text,score,citation,vulgate_sentence_id
366,cc_10265_366,23,Quare faciem tuam avertis oblivisceris inopiae...,quare faciem tuam avertis oblivisceris inopiae...,0.968129,Psalms 43.24,vulgate_tlg0527.tlg027.obi-lat:43.24
2113,cc_10265_2113,155,Posui te hodie super gentes et regna ut evella...,ecce constitui te hodie super gentes et super ...,0.966330,Jeremiah 1.10,vulgate_tlg0527.tlg049.obi-lat:1.10
38,cc_10265_38,3,Denique timor et tremor venerunt super me et c...,timor et tremor venit super me et contexit me ...,0.926833,Psalms 54.6,vulgate_tlg0527.tlg027.obi-lat:54.6
3312,cc_10265_3312,234,Et illud Tanquam prodigium factus sum multis e...,tamquam prodigium factus sum multis et tu adiu...,0.907632,Psalms 70.7,vulgate_tlg0527.tlg027.obi-lat:70.7
363,cc_10265_363,23,Non enim sunt condignae passiones hujus tempor...,existimo enim quod non sunt condignae passione...,0.902710,Romans 8.18,vulgate_tlg0031.tlg006.obi-lat:8.18
...,...,...,...,...,...,...,...
648,cc_10265_648,45,Admonemus igitur et ex parte apostolorum Petri...,ut haec tibi verbis illius diceremus obsecro u...,0.600114,Genesis 50.17,vulgate_tlg0527.tlg001.obi-lat:50.17
5031,cc_10265_5031,343,Quocirca dilectionem vestram monentes ex parte...,gaudium enim magnum habui et consolationem in ...,0.600100,Philemon 1.7,vulgate_tlg0031.tlg018.obi-lat:1.7
4694,cc_10265_4694,325,et quod maximum est in Christiana religione qu...,qui enim bene ministraverint gradum sibi bonum...,0.600097,1 Timothy 3.13,vulgate_tlg0031.tlg015.obi-lat:3.13
4291,cc_10265_4291,304,Si igitur illum resipuisse et ad catholicam fi...,quod si Christus non resurrexit vana est fides...,0.600036,1 Corinthians 15.17,vulgate_tlg0031.tlg007.obi-lat:15.17


In [5]:
# Jaccard similarity between the token sets of `text` and `vulgate_text`

def jaccard_similarity(set1, set2):
    """Return the Jaccard index |set1 ∩ set2| / |set1 ∪ set2|.

    Args:
        set1: A set of items.
        set2: The collection to compare against ``set1``.

    Returns:
        The size of the intersection divided by the size of the union,
        or ``0.0`` when the union is empty.
    """
    shared = set1.intersection(set2)
    combined = set1.union(set2)
    if len(combined) > 0:
        return len(shared) / len(combined)
    # Empty union: define the similarity as zero rather than dividing by zero.
    return 0.0

def rowwise_jaccard(list1, list2):
    """Compute the Jaccard similarity for each aligned pair of strings.

    Each string is lower-cased and tokenized with ``word_tokenize``; the
    resulting token sets are compared with ``jaccard_similarity``.

    Args:
        list1: Iterable of strings.
        list2: Iterable of strings, paired positionally with ``list1``
            (pairing stops at the shorter of the two, as with ``zip``).

    Returns:
        A list with one similarity value per pair.
    """
    def _token_set(sentence):
        # Lower-case first so the comparison is case-insensitive.
        return set(word_tokenize(sentence.lower()))

    return [
        jaccard_similarity(_token_set(left), _token_set(right))
        for left, right in zip(list1, list2)
    ]

# Add a per-row Jaccard similarity column comparing `text` with `vulgate_text`.
df['jaccard_sim'] = rowwise_jaccard(df['text'],df['vulgate_text'])

In [6]:
def levenshtein_distance(s1, s2):
    """Return the Levenshtein (edit) distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn ``s1`` into ``s2``.

    Args:
        s1: First sequence (supports ``len`` and integer indexing).
        s2: Second sequence (supports ``len`` and integer indexing).

    Returns:
        The edit distance as an integer.
    """
    rows = len(s1) + 1
    cols = len(s2) + 1

    # dist[r][c] is the edit distance between s1[:r] and s2[:c].
    dist = [[0] * cols for _ in range(rows)]

    # Turning a prefix into the empty sequence (or back) costs its length.
    for r in range(rows):
        dist[r][0] = r
    for c in range(cols):
        dist[0][c] = c

    for r in range(1, rows):
        for c in range(1, cols):
            if s1[r - 1] == s2[c - 1]:
                # Matching elements add no cost.
                dist[r][c] = dist[r - 1][c - 1]
                continue
            cheapest_neighbour = min(
                dist[r - 1][c],      # deletion
                dist[r][c - 1],      # insertion
                dist[r - 1][c - 1],  # substitution
            )
            dist[r][c] = cheapest_neighbour + 1

    return dist[rows - 1][cols - 1]

def rowwise_levenshtein_similarity(list1, list2):
    """Compute a normalised Levenshtein similarity for each aligned pair.

    The similarity of a pair is ``1 - distance / max(len(s1), len(s2))``,
    where ``distance`` comes from ``levenshtein_distance``.

    Args:
        list1: Iterable of strings.
        list2: Iterable of strings, paired positionally with ``list1``
            (pairing stops at the shorter of the two, as with ``zip``).

    Returns:
        A list with one similarity value per pair. A pair in which either
        value is falsy (e.g. an empty string or ``None``) scores ``0.0``.
    """
    similarities = []
    for s1, s2 in zip(list1, list2):
        # Missing or empty values cannot be compared; score them as 0.0.
        # This guard also guarantees a non-zero denominator below, so no
        # separate "both empty" branch is needed (it was unreachable).
        if not s1 or not s2:
            similarities.append(0.0)
            continue
        longest = max(len(s1), len(s2))
        distance = levenshtein_distance(s1, s2)
        similarities.append(1 - (distance / longest))
    return similarities

# Add a per-row normalised Levenshtein similarity column comparing `text`
# with `vulgate_text` (character-level, case-sensitive: no lower-casing here).
df['levenshtein_sim'] = rowwise_levenshtein_similarity(df['text'], df['vulgate_text'])

In [7]:
# Preview the first five rows, including the two new similarity columns.
df.head(5)

Unnamed: 0,sentence_id,subwork_id,text,vulgate_text,score,citation,vulgate_sentence_id,jaccard_sim,levenshtein_sim
366,cc_10265_366,23,Quare faciem tuam avertis oblivisceris inopiae...,quare faciem tuam avertis oblivisceris inopiae...,0.968129,Psalms 43.24,vulgate_tlg0527.tlg027.obi-lat:43.24,1.0,0.987342
2113,cc_10265_2113,155,Posui te hodie super gentes et regna ut evella...,ecce constitui te hodie super gentes et super ...,0.96633,Jeremiah 1.10,vulgate_tlg0527.tlg049.obi-lat:1.10,0.8125,0.869919
38,cc_10265_38,3,Denique timor et tremor venerunt super me et c...,timor et tremor venit super me et contexit me ...,0.926833,Psalms 54.6,vulgate_tlg0527.tlg027.obi-lat:54.6,0.416667,0.75
3312,cc_10265_3312,234,Et illud Tanquam prodigium factus sum multis e...,tamquam prodigium factus sum multis et tu adiu...,0.907632,Psalms 70.7,vulgate_tlg0527.tlg027.obi-lat:70.7,0.583333,0.830769
363,cc_10265_363,23,Non enim sunt condignae passiones hujus tempor...,existimo enim quod non sunt condignae passione...,0.90271,Romans 8.18,vulgate_tlg0031.tlg006.obi-lat:8.18,0.666667,0.766355


In [13]:
# Upload the frame to the "grela_register_matches_v2" worksheet.
# Reuse the worksheet when it already exists so that re-running this cell does
# not fail on a duplicate title; create it only when it is missing.
worksheet_title = "grela_register_matches_v2"
try:
    worksheet = grela_gs.worksheet(worksheet_title)
except gspread.exceptions.WorksheetNotFound:
    worksheet = grela_gs.add_worksheet(worksheet_title, 1, 1)
# resize=True fits the sheet to the frame, so no stale cells from a previous
# (larger) upload are left behind.
set_with_dataframe(worksheet, df, resize=True)