In [10]:
from qudsim import preprocess_docs
from qudsim import generate_quds
from qudsim import align_documents
from qudsim import reduce_df
from qudsim import id_documents
from qudsim import build_pairs_metadata
import pandas as pd
import config
from tools import openai

In [11]:
# Load the example query set from its JSON file into a DataFrame.
# NOTE(review): `example_query_set` is not referenced again in the visible notebook; the
# setup cells below read the same file again (via `query_file_path`) into `query_df`.
# Confirm whether this cell is still needed.
example_query_set = pd.read_json("example_query.json")

# Setup (models, configuration, etc)

To generate QUDs at different levels of abstraction, set ```level``` to ```0``` for specific QUDs or to ```1``` for abstract (generic) QUDs. The default, defined in ```config.py```, is ```1```.

Queries in the dataset can either be treated independently (```with_replacement=False```), or repeated instances of the same document can be treated as a single document (```with_replacement=True```).

In [12]:
# --- Input data -------------------------------------------------------------
# Path to the query file; the loading cell tries JSON first and then CSV.
query_file_path = "./example_query.json"

# True: repeated instances of a document are considered as one.
# False: queries are treated independently.
with_replacement = True

# --- QUD settings -----------------------------------------------------------
# Abstraction level taken from config.py: 0 = specific QUDs, 1 = abstract QUDs.
level = config.LEVEL

# --- Models -----------------------------------------------------------------
# The same configured GPT model name is used for question generation (qg)
# and for question answering (qa); change either one to mix models.
qa_gpt_model_name = config.GPT_MODEL
qg_gpt_model_name = config.GPT_MODEL

In [13]:
def _instantiate_gpt(model_name):
    """Create an `openai.GPT` client for `model_name`.

    On failure the underlying error and the offending model name are printed,
    and the exception is re-raised so that later cells never run against an
    undefined (or stale, from a previous kernel run) model object.
    """
    try:
        return openai.GPT(model_name)
    except Exception as e:
        print(e)
        # Use the % operator so the model name is actually interpolated;
        # passing it as a second print() argument leaves a literal "%s" in the output.
        print("could not instantiate GPT client with provided model name: %s" % model_name)
        raise


# Fail fast on an unsupported abstraction level instead of continuing with it.
if level not in (0, 1):
    raise ValueError("Levels 0 and 1 are supported, 0 being specific and 1 being abstract. Specified value is currently unsupported.")

# Load the queries: try JSON first and fall back to CSV only when the content
# is not valid JSON (pandas signals that with ValueError). Other errors, such
# as a missing file, propagate unchanged rather than being mislabeled as a
# format problem.
try:
    query_df = pd.read_json(query_file_path)
except ValueError:
    try:
        query_df = pd.read_csv(query_file_path)
    except Exception as e:
        # Neither parser accepted the file; keep the original cause attached.
        raise ValueError("Please provide input dataframe file in a json or csv format.") from e

# One client for QUD generation (qg) and one for question answering (qa).
qg_gpt_model = _instantiate_gpt(qg_gpt_model_name)
qa_gpt_model = _instantiate_gpt(qa_gpt_model_name)

# Preprocessing

In [14]:
# Build the per-document table from the loaded queries, using the replacement
# policy and abstraction level chosen in the setup cell.
# NOTE(review): presumably this is where each document receives its `ID`
# (see the `ID` column in the table displayed below) — confirm against qudsim.id_documents.
df = id_documents(with_replacement, level, query_df)

### Find and ID the documents

In [15]:
# Preprocess every document; `df` is rebound to the result, so re-running this
# cell operates on the already-preprocessed table.
# NOTE(review): presumably this adds the `Numbered Text` and `Sentence_Number_Dict`
# columns shown in the table below — confirm against qudsim.preprocess_docs.
df = preprocess_docs(df)

Preprocessing: 100%|███████████████████████████████████████████████████████| 8/8 [00:00<00:00, 1356.39it/s]


In [16]:
# Display the document table after preprocessing.
df

Unnamed: 0,ID,Level,Model,Text,Numbered Text,Sentence_Number_Dict
0,28240,1,claude-3-5-sonnet-20241022,"David Lynch, the surrealist filmmaker and arti...","[1] David Lynch, the surrealist filmmaker and ...","{1: 'David Lynch, the surrealist filmmaker and..."
1,76831,1,gemini-1.5-flash-002,## David Lynch (1946-2025): A Visionary's Drea...,[1] ## David Lynch (1946-2025): A Visionary's ...,{1: '## David Lynch (1946-2025): A Visionary's...
2,43796,1,claude-3-5-sonnet-20241022,I stared at the black card in my trembling han...,[1] I stared at the black card in my trembling...,{1: 'I stared at the black card in my tremblin...
3,88150,1,gpt-4o-2024-08-06,After years of relentless searching and countl...,[1] After years of relentless searching and co...,{1: 'After years of relentless searching and c...
4,68659,1,gpt-4o-2024-08-06,"In the heart of ancient Macedonia, Philip II a...","[1] In the heart of ancient Macedonia, Philip ...","{1: 'In the heart of ancient Macedonia, Philip..."
5,22968,1,gemini-1.5-flash-002,The sun beat down on the rough-hewn hills of a...,[1] The sun beat down on the rough-hewn hills ...,{1: 'The sun beat down on the rough-hewn hills...
6,79226,1,Human,`` Disengaging from primary module . Do you co...,[1] `` Disengaging from primary module .[2] Do...,"{1: '`` Disengaging from primary module .', 2:..."
7,93275,1,gpt-4o-2024-08-06,Sammy always felt more at home when he gazed u...,[1] Sammy always felt more at home when he gaz...,{1: 'Sammy always felt more at home when he ga...


# Generate QUDs

This encompasses the process of segmenting, optionally abstracting entities, and finally generating QUDs for each (abstracted) segment.

In [17]:
# Generate QUDs for every document with the question-generation model.
# `df` is rebound to the result, which is displayed in the next cell.
df = generate_quds(qg_gpt_model, df)

# Saving is best-effort: the results stay available in `df` even if the write
# fails. Catch Exception (not a bare except, which would also swallow
# KeyboardInterrupt) and report the reason so the failure can be diagnosed.
try:
    df.to_json('output/document_data.json')
except Exception as e:
    print("Could not save document dataframe")
    print(e)

QUD Generation: 100%|████████████████████████████████████████████████████████| 8/8 [01:50<00:00, 13.86s/it]


In [18]:
# Display the document table after QUD generation.
df

Unnamed: 0,ID,Level,Model,Text,Numbered Text,Sentence_Number_Dict,Segments,Abstracted Entities,QUDs,QUD to Segment Dict,Segment to QUD Dict
0,28240,1,claude-3-5-sonnet-20241022,"David Lynch, the surrealist filmmaker and arti...","[1] David Lynch, the surrealist filmmaker and ...","{1: 'David Lynch, the surrealist filmmaker and...","{""segmentation"":[{""sentences"":[1,2,3]},{""sente...","{""decontextualized_paragraphs"":[{""para_num"":0,...","[{""num_quds"":1,""quds"":[{""qud"":""What happened t...","{0: 0, 1: 1, 2: 2, 3: 3, 4: 3, 5: 4}","{0: [0], 1: [1], 2: [2], 3: [3, 4], 4: [5]}"
1,76831,1,gemini-1.5-flash-002,## David Lynch (1946-2025): A Visionary's Drea...,[1] ## David Lynch (1946-2025): A Visionary's ...,{1: '## David Lynch (1946-2025): A Visionary's...,"{""segmentation"":[{""sentences"":[1,2]},{""sentenc...","{""decontextualized_paragraphs"":[{""para_num"":0,...","[{""num_quds"":1,""quds"":[{""qud"":""What is the sig...","{0: 0, 1: 1, 2: 2, 3: 2, 4: 3, 5: 4, 6: 5, 7: ...","{0: [0], 1: [1], 2: [2, 3], 3: [4], 4: [5], 5:..."
2,43796,1,claude-3-5-sonnet-20241022,I stared at the black card in my trembling han...,[1] I stared at the black card in my trembling...,{1: 'I stared at the black card in my tremblin...,"{""segmentation"":[{""sentences"":[1,2]},{""sentenc...","{""decontextualized_paragraphs"":[{""para_num"":0,...","[{""num_quds"":2,""quds"":[{""qud"":""What is the nar...","{0: 0, 1: 0, 2: 1, 3: 1, 4: 2, 5: 3, 6: 4}","{0: [0, 1], 1: [2, 3], 2: [4], 3: [5], 4: [6]}"
3,88150,1,gpt-4o-2024-08-06,After years of relentless searching and countl...,[1] After years of relentless searching and co...,{1: 'After years of relentless searching and c...,"{""segmentation"":[{""sentences"":[1,2,3]},{""sente...","{""decontextualized_paragraphs"":[{""para_num"":0,...","[{""num_quds"":1,""quds"":[{""qud"":""What motivates ...","{0: 0, 1: 1, 2: 2, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6}","{0: [0], 1: [1], 2: [2, 3], 3: [4], 4: [5], 5:..."
4,68659,1,gpt-4o-2024-08-06,"In the heart of ancient Macedonia, Philip II a...","[1] In the heart of ancient Macedonia, Philip ...","{1: 'In the heart of ancient Macedonia, Philip...","{""segmentation"":[{""sentences"":[1,2,3]},{""sente...","{""decontextualized_paragraphs"":[{""para_num"":0,...","[{""num_quds"":2,""quds"":[{""qud"":""Who rises to po...","{0: 0, 1: 0, 2: 1, 3: 2, 4: 2, 5: 3, 6: 4, 7: ...","{0: [0, 1], 1: [2], 2: [3, 4], 3: [5], 4: [6, ..."
5,22968,1,gemini-1.5-flash-002,The sun beat down on the rough-hewn hills of a...,[1] The sun beat down on the rough-hewn hills ...,{1: 'The sun beat down on the rough-hewn hills...,"{""segmentation"":[{""sentences"":[1,2,3,4,5]},{""s...","{""decontextualized_paragraphs"":[{""para_num"":0,...","[{""num_quds"":2,""quds"":[{""qud"":""How did the lea...","{0: 0, 1: 0, 2: 1, 3: 1, 4: 2, 5: 2, 6: 3, 7: ...","{0: [0, 1], 1: [2, 3], 2: [4, 5], 3: [6, 7], 4..."
6,79226,1,Human,`` Disengaging from primary module . Do you co...,[1] `` Disengaging from primary module .[2] Do...,"{1: '`` Disengaging from primary module .', 2:...","{""segmentation"":[{""sentences"":[1,2,3,4,5,6,7,8...","{""decontextualized_paragraphs"":[{""para_num"":0,...","[{""num_quds"":1,""quds"":[{""qud"":""What actions ar...","{0: 0, 1: 1, 2: 1, 3: 2, 4: 3, 5: 3, 6: 4, 7: ...","{0: [0], 1: [1, 2], 2: [3], 3: [4, 5], 4: [6, ..."
7,93275,1,gpt-4o-2024-08-06,Sammy always felt more at home when he gazed u...,[1] Sammy always felt more at home when he gaz...,{1: 'Sammy always felt more at home when he ga...,"{""segmentation"":[{""sentences"":[1,2,3]},{""sente...","{""decontextualized_paragraphs"":[{""para_num"":0,...","[{""num_quds"":1,""quds"":[{""qud"":""How does the ch...","{0: 0, 1: 1, 2: 1, 3: 2, 4: 3, 5: 3, 6: 4, 7: 5}","{0: [0], 1: [1, 2], 2: [3], 3: [4, 5], 4: [6],..."


# Document Alignment Using QUDs

Using the QUDs generated in the previous step, we can now compare documents and align their segments through answerability. We calculate a similarity score for each pair of segments and then apply the ```THRESHOLD``` value set in ```config.py``` to find alignments.

In [19]:
# Combine the original queries (`query_df`) with the per-document results (`df`)
# into one table of document pairs.
# NOTE(review): presumably each row carries the Doc1/Doc2 versions of the per-document
# columns, as in the `pairs_df` table displayed further below — confirm against
# qudsim.build_pairs_metadata.
pairs_df = build_pairs_metadata(query_df, df)

In [21]:
# Align each document pair with the question-answering model.
# `pairs_df` is rebound to the result, which is displayed in the next cell.
pairs_df = align_documents(qa_gpt_model, pairs_df)

# Saving is best-effort: the results stay available in `pairs_df` even if the
# write fails. Catch Exception (not a bare except, which would also swallow
# KeyboardInterrupt), name the right dataframe, and report the reason.
try:
    pairs_df.to_json('output/qudsim.json')
except Exception as e:
    print("Could not save aligned pairs dataframe")
    print(e)

Document Alignment: 100%|████████████████████████████████████████████████████| 4/4 [01:50<00:00, 27.66s/it]


In [22]:
# Display the document-pair table after alignment.
pairs_df

Unnamed: 0,Doc1 ID,Doc2 ID,Level,Model 1,Model 2,Document 1,Document 2,Doc1 Numbered Text,Doc2 Numbered Text,Doc1 Sentence_Number_Dict,...,Doc2 QUDs,Doc1 QUD to Segment Dict,Doc2 QUD to Segment Dict,Doc1 Segment to QUD Dict,Doc2 Segment to QUD Dict,D1 to D2 QUD Answers,D2 to D1 QUD Answers,Harmonic QUDsim Score,QUDsim Aligned Segment Indices,QUDsim Aligned Segments
0,28240,76831,1,claude-3-5-sonnet-20241022,gemini-1.5-flash-002,"David Lynch, the surrealist filmmaker and arti...",## David Lynch (1946-2025): A Visionary's Drea...,"[1] David Lynch, the surrealist filmmaker and ...",[1] ## David Lynch (1946-2025): A Visionary's ...,"{1: 'David Lynch, the surrealist filmmaker and...",...,"[{""num_quds"":1,""quds"":[{""qud"":""What is the sig...","{0: 0, 1: 1, 2: 2, 3: 3, 4: 3, 5: 4}","{0: 0, 1: 1, 2: 2, 3: 2, 4: 3, 5: 4, 6: 5, 7: ...","{0: [0], 1: [1], 2: [2], 3: [3, 4], 4: [5]}","{0: [0], 1: [1], 2: [2, 3], 3: [4], 4: [5], 5:...",{'excerpts': [{'question': 'What happened to t...,{'excerpts': [{'question': 'What is the signif...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0....","[[0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0],...",[(Lynch's singular vision challenged Hollywood...
1,43796,88150,1,claude-3-5-sonnet-20241022,gpt-4o-2024-08-06,I stared at the black card in my trembling han...,After years of relentless searching and countl...,[1] I stared at the black card in my trembling...,[1] After years of relentless searching and co...,{1: 'I stared at the black card in my tremblin...,...,"[{""num_quds"":1,""quds"":[{""qud"":""What motivates ...","{0: 0, 1: 0, 2: 1, 3: 1, 4: 2, 5: 3, 6: 4}","{0: 0, 1: 1, 2: 2, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6}","{0: [0, 1], 1: [2, 3], 2: [4], 3: [5], 4: [6]}","{0: [0], 1: [1], 2: [2, 3], 3: [4], 4: [5], 5:...",{'excerpts': [{'question': 'What is the narrat...,{'excerpts': [{'question': 'What motivates the...,"[[0.0, 0.0, 0.31578947368421056, 0.0, 0.0, 0.0...","[[0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0],...",[(I stared at the black card in my trembling h...
2,68659,22968,1,gpt-4o-2024-08-06,gemini-1.5-flash-002,"In the heart of ancient Macedonia, Philip II a...",The sun beat down on the rough-hewn hills of a...,"[1] In the heart of ancient Macedonia, Philip ...",[1] The sun beat down on the rough-hewn hills ...,"{1: 'In the heart of ancient Macedonia, Philip...",...,"[{""num_quds"":2,""quds"":[{""qud"":""How did the lea...","{0: 0, 1: 0, 2: 1, 3: 2, 4: 2, 5: 3, 6: 4, 7: ...","{0: 0, 1: 0, 2: 1, 3: 1, 4: 2, 5: 2, 6: 3, 7: ...","{0: [0, 1], 1: [2], 2: [3, 4], 3: [5], 4: [6, ...","{0: [0, 1], 1: [2, 3], 2: [4, 5], 3: [6, 7], 4...",{'excerpts': [{'question': 'Who rises to power...,{'excerpts': [{'question': 'How did the leader...,"[[0.4444444444444445, 0.0, 0.0, 0.0, 0.0, 0.0]...","[[1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0...","[(In the heart of ancient Macedonia, Philip II..."
3,79226,93275,1,Human,gpt-4o-2024-08-06,`` Disengaging from primary module . Do you co...,Sammy always felt more at home when he gazed u...,[1] `` Disengaging from primary module .[2] Do...,[1] Sammy always felt more at home when he gaz...,"{1: '`` Disengaging from primary module .', 2:...",...,"[{""num_quds"":1,""quds"":[{""qud"":""How does the ch...","{0: 0, 1: 1, 2: 1, 3: 2, 4: 3, 5: 3, 6: 4, 7: ...","{0: 0, 1: 1, 2: 1, 3: 2, 4: 3, 5: 3, 6: 4, 7: 5}","{0: [0], 1: [1, 2], 2: [3], 3: [4, 5], 4: [6, ...","{0: [0], 1: [1, 2], 2: [3], 3: [4, 5], 4: [6],...",{'excerpts': [{'question': 'What actions are b...,{'excerpts': [{'question': 'How does the child...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0....","[[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0...",[]


A reduced dataframe that omits most of the metadata stored in ```pairs_df```.

In [23]:
# Build the reduced view of the aligned pairs.
reduced_df = reduce_df(pairs_df)

# Saving is best-effort: the results stay available in `reduced_df` even if the
# write fails. Catch Exception (not a bare except, which would also swallow
# KeyboardInterrupt), name the right dataframe, and report the reason.
try:
    reduced_df.to_json('output/reduced_qudsim.json')
except Exception as e:
    print("Could not save reduced dataframe")
    print(e)

In [24]:
# Display the reduced document-pair table.
reduced_df

Unnamed: 0,Document 1,Document 2,Level,Model 1,Model 2,Doc1 QUDs,Doc2 QUDs,Harmonic QUDsim Score,QUDsim Aligned Segment Indices,QUDsim Aligned Segments
0,"David Lynch, the surrealist filmmaker and arti...",## David Lynch (1946-2025): A Visionary's Drea...,1,claude-3-5-sonnet-20241022,gemini-1.5-flash-002,"[{""num_quds"":1,""quds"":[{""qud"":""What happened t...","[{""num_quds"":1,""quds"":[{""qud"":""What is the sig...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0....","[[0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0],...",[(Lynch's singular vision challenged Hollywood...
1,I stared at the black card in my trembling han...,After years of relentless searching and countl...,1,claude-3-5-sonnet-20241022,gpt-4o-2024-08-06,"[{""num_quds"":2,""quds"":[{""qud"":""What is the nar...","[{""num_quds"":1,""quds"":[{""qud"":""What motivates ...","[[0.0, 0.0, 0.31578947368421056, 0.0, 0.0, 0.0...","[[0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0],...",[(I stared at the black card in my trembling h...
2,"In the heart of ancient Macedonia, Philip II a...",The sun beat down on the rough-hewn hills of a...,1,gpt-4o-2024-08-06,gemini-1.5-flash-002,"[{""num_quds"":2,""quds"":[{""qud"":""Who rises to po...","[{""num_quds"":2,""quds"":[{""qud"":""How did the lea...","[[0.4444444444444445, 0.0, 0.0, 0.0, 0.0, 0.0]...","[[1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0...","[(In the heart of ancient Macedonia, Philip II..."
3,`` Disengaging from primary module . Do you co...,Sammy always felt more at home when he gazed u...,1,Human,gpt-4o-2024-08-06,"[{""num_quds"":1,""quds"":[{""qud"":""What actions ar...","[{""num_quds"":1,""quds"":[{""qud"":""How does the ch...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0....","[[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0...",[]


# Examples

Here, we show three examples of aligned text in different domains.

### Obituary Domain
Documents generated by: ```claude-3.5-sonnet``` and ```gemini-1.5-flash```

In [25]:
# Row 0 of `reduced_df`: print every aligned segment pair, Document 1's segment
# first and Document 2's second, with a blank line after each pair.
obituary_alignments = reduced_df['QUDsim Aligned Segments'][0]
for segment_pair in obituary_alignments:
    print("Document 1: ", segment_pair[0])
    print("Document 2: ", segment_pair[1])
    print()

Document 1:  Lynch's singular vision challenged Hollywood conventions and created a new visual language in film, television, and art. His dreamlike narratives and dark exploration of suburban America's underbelly earned him both devoted followers and critical acclaim throughout his five-decade career.
Document 2:  From the stark landscapes and fractured narratives of *Eraserhead* to the noir-tinged mystery of *Blue Velvet*, the surrealist fever dream of *Mulholland Drive*, and the darkly comedic television masterpiece *Twin Peaks*, Lynch’s work consistently defied expectations and pushed the boundaries of visual storytelling. He cultivated a distinct aesthetic, a signature style characterized by jarring juxtapositions, unsettling sound design, and a pervasive sense of unease that often lurked beneath the surface of seemingly ordinary events. His work explored the darker corners of the human psyche, delving into themes of obsession, violence, and the unsettling ambiguity of reality itse

### Creative Writing Domain
Documents generated by: ```claude-3.5-sonnet``` and ```gpt-4o```

In [26]:
# Row 1 of `reduced_df`: print every aligned segment pair, Document 1's segment
# first and Document 2's second, with a blank line after each pair.
creative_writing_alignments = reduced_df['QUDsim Aligned Segments'][1]
for segment_pair in creative_writing_alignments:
    print("Document 1: ", segment_pair[0])
    print("Document 2: ", segment_pair[1])
    print()

Document 1:  I stared at the black card in my trembling hands, the silver numbers seeming to shimmer in the dim light of my laboratory. The hooded figure had vanished as quickly as it had appeared, leaving behind only the faint scent of autumn leaves and earth.
Document 2:  Just as wonder embraced me, a shadow moved swiftly across the room, coalescing into a figure draped in timeless black robes. I blinked, expecting hallucination, but it stood firm in the dim light of my cluttered study. "Congratulations," the figure said in a voice that was remarkably mundane for someone—or something—so terrifying. It pulled a small, black card from inside its robe, flicking it into the air toward me. The card spun like a leaf before landing gently in my hand. "When you realize living forever sucks, call this number. I've got a job offer for you," Death continued, its voice as casual as a friend's. It nodded toward the card, as if urging me to read it. The card was plain, save for an elegantly emboss

### Suri
Documents generated by: ```gpt-4o``` and ```gemini-1.5-flash```

In [27]:
# Row 2 of `reduced_df`: print every aligned segment pair, Document 1's segment
# first and Document 2's second, with a blank line after each pair.
suri_alignments = reduced_df['QUDsim Aligned Segments'][2]
for segment_pair in suri_alignments:
    print("Document 1: ", segment_pair[0])
    print("Document 2: ", segment_pair[1])
    print()

Document 1:  In the heart of ancient Macedonia, Philip II ascended to the throne in 359 BC, marking the dawn of a transformative era. The kingdom he inherited was fractured and vulnerable, yet Philip, a keen strategist and astute leader, harbored aspirations that stretched far beyond its beleaguered borders. His vision centered on unification and supremacy, beginning a campaign of consolidation that would set the stage for Macedonia's rise.
Document 2:  The sun beat down on the rough-hewn hills of ancient Macedonia, a kingdom previously relegated to the fringes of the Greek world, a land of shepherds and warriors, overshadowed by the glittering city-states to the south. But in the late 4th century BCE, this perception shifted dramatically, thanks to the ambition and military genius of Philip II (382-336 BCE). Philip inherited a fragmented kingdom, beset by internal strife and external threats. His genius lay not just in his ambition, but in his understanding of the power of innovation.