In [15]:
!pip install datasketch --quiet
!pip install tqdm --quiet
!pip install pyvis --quiet
!pip install scipy --quiet
!pip install numpy --quiet

[33mDEPRECATION: Loading egg at /home/anguelos/.local/lib/python3.11/site-packages/torchvision-0.15.2-py3.11-linux-x86_64.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /home/anguelos/.local/lib/python3.11/site-packages/fonttools-4.42.1-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /home/anguelos/anaconda3/envs/kraken_2/lib/python3.11/site-packages/werkzeug-3.0.1-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: L

In [86]:
!pwd

/home/anguelos/tmp/didip_ss/DATA


In [None]:
# Mount Google Drive at /content/drive/ so that the data folder stored there
# can be reached from this runtime.
# NOTE(review): `google.colab` is presumably only available on Google Colab;
# the cells that use /home/... paths suggest this cell is skipped on local runs.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
%cd /content/drive/MyDrive/didip_ss/DATA

/content/drive/MyDrive/didip_ss/DATA


In [3]:
%cd ../DATA

/home/anguelos/tmp/didip_ss/DATA


# Text reuse analytics with MinHash

This notebook finds similar text passages shared between two datasets. It is a simplified part of TRACE (https://github.com/kreeedit/trace) and can be useful for plagiarism detection, document similarity analysis, or identifying common themes across different text corpora.

**Preprocessing**

This function standardizes the text for consistent processing.
- Removes extra whitespace
- Converts text to lowercase
- Removes punctuation


**Shingle generation**

Shingling is a technique used to break text into smaller, overlapping pieces for comparison.

- Splits the text into overlapping k-word shingles
- Default shingle size is 4 words


```
Original text:
The quick brown fox jumps over the lazy dog

Shingles (k=4):
1. the quick brown fox
2. quick brown fox jumps
3. brown fox jumps over
4. fox jumps over the
5. jumps over the lazy
6. over the lazy dog

```


**DataFrame processing**

This function prepares the data for similarity comparison.
- Iterates through each row in the DataFrame
- Preprocesses the text
- Creates shingles
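- Generates MinHash signatures (currently disabled in `process_dataframe`, which stores only the preprocessed text and its shingles)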
- Stores the processed data in a dictionary

**Creating MinHash signatures**

MinHash is a probabilistic data structure used for efficient similarity estimation.

- Creates a MinHash object with 128 permutations by default
  - It creates a fixed-size signature regardless of input size.
- Updates the MinHash with each shingle
  - It preserves similarity relationships between sets.
- Returns the finished MinHash signature
  - It allows for quick similarity estimation without needing to store or compare entire sets of data.

**Comparing signatures**

This function performs the actual similarity comparison.

- Compares shingles between texts from two DataFrames
- Finds and stores shared shingles between texts



## Comparison of different text similarity methods

| Method | Pros | Cons |
|--------|------|------|
| Exact string matching | - Simple<br>- Accurate for identical texts | - Inflexible<br>- Sensitive to minor changes |
| Edit distance (e.g., Levenshtein) | - Captures character-level differences | - Computationally expensive for large texts<br>- Doesn't capture semantic similarity |
| TF-IDF with cosine similarity | - Captures word importance<br>- Works well for document-level similarity | - Doesn't capture word order<br>- Can be slow for large corpora |
| Word embeddings (e.g., Word2Vec, GloVe) | - Captures semantic similarity<br>- Works well for short texts | - Loses some context<br>- Can be computationally expensive |
| MinHash with shingling | - Scalable<br>- Preserves local structure<br>- Flexible similarity threshold<br>- Language-agnostic<br>- Memory-efficient | - Probabilistic (may miss some matches)<br>- Requires careful parameter tuning |





In [73]:
import json
from datasketch import MinHash
from tqdm import tqdm
import pandas as pd
import string
import re

def preprocess_text(text):
    """Normalise raw text before it is split into shingles.

    The steps are applied in this order:
      1. every run of whitespace is collapsed into a single space and the
         result is trimmed at both ends,
      2. the text is lowercased,
      3. every ASCII punctuation character (``string.punctuation``) is deleted.

    Punctuation is deleted rather than replaced by a space, so the result may
    contain double spaces (``"a - b"`` -> ``"a  b"``) and words separated only
    by punctuation are merged (``"a,b"`` -> ``"ab"``).
    """
    collapsed = re.sub(r'\s+', ' ', text).strip()
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return collapsed.lower().translate(strip_punctuation)

def shingle_text(text, k=4):
    """Split ``text`` into overlapping ``k``-word shingles.

    Args:
        text: Text whose words are separated by whitespace.
        k: Number of words per shingle; must be at least 1.

    Returns:
        The shingles in order of appearance, each one being ``k`` consecutive
        words joined by single spaces. The list is empty when ``text`` has
        fewer than ``k`` words.

    Raises:
        ValueError: If ``k`` is smaller than 1. Without this check ``k == 0``
            would produce empty-string shingles, which would make every pair
            of texts look like a match.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")
    words = text.split()
    return [" ".join(words[start:start + k]) for start in range(len(words) - k + 1)]

def process_dataframe(df, text_column, id_column, k=4):
    """Preprocess and shingle the text of every row of ``df``.

    Returns a dict keyed by each row's ``id_column`` value. Every entry holds
    the preprocessed ``"text"`` and the list of its ``k``-word ``"shingles"``.
    No MinHash signature is computed here. Rows that share an id overwrite
    one another, so the last such row wins.
    """
    processed = {}
    rows = tqdm(df.iterrows(), desc=f"Processing {text_column}", total=len(df), unit="row")
    for _, record in rows:
        cleaned = preprocess_text(record[text_column])
        processed[record[id_column]] = {
            "text": cleaned,
            "shingles": shingle_text(cleaned, k),
        }
    return processed

def create_minhash(shingles, num_perm=128):
    """Build a MinHash signature from an iterable of shingle strings.

    Each shingle is UTF-8 encoded and fed, in order, to a ``MinHash`` object
    created with ``num_perm`` permutations; that object is returned.
    """
    signature = MinHash(num_perm=num_perm)
    for encoded in (shingle.encode("utf-8") for shingle in shingles):
        signature.update(encoded)
    return signature

def compare_signatures(signatures1, signatures2, df1_name, df2_name, id1_name, id2_name):
    """Find every pair of texts (one per collection) that share at least one shingle.

    Despite the name, no MinHash signatures are involved: the exact shingle
    sets of all pairs are intersected.

    Args:
        signatures1: Mapping ``id -> {"shingles": [...]}`` for the first collection.
        signatures2: Same structure for the second collection.
        df1_name: Label of the first collection (used in result keys and progress bar).
        df2_name: Label of the second collection.
        id1_name: Name of the id field of the first collection.
        id2_name: Name of the id field of the second collection.

    Returns:
        A list with one dict per matching pair, ordered by the iteration order
        of ``signatures1`` and then ``signatures2``. Each dict has the keys
        ``f"{df1_name}_{id1_name}"``, ``f"{df2_name}_{id2_name}"`` and
        ``"shared_shingles"`` (the common shingles, sorted so that the output
        is reproducible between runs).
    """
    key1 = f"{df1_name}_{id1_name}"
    key2 = f"{df2_name}_{id2_name}"

    # Build every shingle set exactly once instead of once per compared pair.
    shingle_sets2 = [(id2, set(data2["shingles"])) for id2, data2 in signatures2.items()]

    similar_texts = []
    for id1, data1 in tqdm(signatures1.items(), desc=f"Comparing {df1_name} with {df2_name}", unit="text"):
        shingle_set1 = set(data1["shingles"])
        if not shingle_set1:
            # A text without shingles cannot share any.
            continue
        for id2, shingle_set2 in shingle_sets2:
            shared_shingles = shingle_set1 & shingle_set2
            if shared_shingles:
                similar_texts.append(
                    {
                        key1: id1,
                        key2: id2,
                        "shared_shingles": sorted(shared_shingles),
                    }
                )
    return similar_texts


# Load the two corpora from tab-separated files.
# The paths are relative to the current working directory.
df1 = pd.read_csv('mom_1000_sample.tsv', sep='\t')
df2 = pd.read_csv('vulgate_sample_1000.tsv', sep='\t')

# Configuration: a short label per corpus plus the names of its text and ID columns.
# The label and the ID column name are combined into the keys of each result
# record (here "mom_atom_id" and "vulgate_id").
df1_name = 'mom'
df2_name = 'vulgate'
df1_text_column = 'text'
df2_text_column = 'text'
df1_id_column = 'atom_id'
df2_id_column = 'id'

# Preprocess and shingle every row (4-word shingles).
# Each result maps a row ID to {"text": ..., "shingles": [...]}.
signatures1 = process_dataframe(df1, df1_text_column, df1_id_column, k=4)
signatures2 = process_dataframe(df2, df2_text_column, df2_id_column, k=4)

# Exhaustive pairwise comparison: every text of df1 against every text of df2.
results = compare_signatures(signatures1, signatures2, df1_name, df2_name, df1_id_column, df2_id_column)

# Save results to JSON; the network-visualisation cell reads this file back.
with open("similarity_results.json", "w") as f:
    json.dump(results, f, indent=2)

# Display the first five matching pairs as a quick sanity check.
print(json.dumps(results[:5], indent=2))

Processing text: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 1786.19row/s]
Processing text: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1189/1189 [00:00<00:00, 1413.22row/s]
Comparing mom with vulgate: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:46<00:00, 21.51text/s]

[
  {
    "mom_atom_id": "OOEUB/0853_I_18",
    "vulgate_id": "Genesis_chapter_25",
    "shared_shingles": [
      "et post obitum illius"
    ]
  },
  {
    "mom_atom_id": "AT-StiASei/SeitenstettenOSB/1240_III_16",
    "vulgate_id": "Matthew_chapter_28",
    "shared_shingles": [
      "filii et spiritus sancti",
      "nomine patris et filii",
      "patris et filii et",
      "et filii et spiritus",
      "in nomine patris et"
    ]
  },
  {
    "mom_atom_id": "AT-StiAG/GoettweigOSB/1327_IV_24",
    "vulgate_id": "Joshua_chapter_22",
    "shared_shingles": [
      "inter nos et vos"
    ]
  },
  {
    "mom_atom_id": "AT-StiAG/GoettweigOSB/1327_IV_24",
    "vulgate_id": "Luke_chapter_16",
    "shared_shingles": [
      "inter nos et vos"
    ]
  },
  {
    "mom_atom_id": "AT-StiAKr/KremsmuensterOSB/1247_I_21",
    "vulgate_id": "John_chapter_10",
    "shared_shingles": [
      "de manibus eorum et"
    ]
  }
]





270


In [83]:
import numpy as np
from scipy.sparse import coo_matrix

def load_signatures(signatures):
    """Flatten a ``{name: {"shingles": [...]}}`` mapping into parallel NumPy arrays.

    Returns a 4-tuple ``(names, name_idx, hashes, flat_shingles)``:
      * ``names``: array of the mapping's keys, in iteration order;
      * ``name_idx``: int32 array with one entry per shingle, giving the
        position in ``names`` of the text the shingle belongs to;
      * ``hashes``: int64 array with the built-in ``hash()`` of every shingle
        (for strings these values are only comparable within one Python process);
      * ``flat_shingles``: all shingle lists concatenated with ``np.concatenate``.
    """
    keys = list(signatures)
    per_doc_shingles = [signatures[key]["shingles"] for key in keys]
    per_doc_hashes = [
        np.array([hash(shingle) for shingle in doc], dtype=np.int64)
        for doc in per_doc_shingles
    ]
    # One owner index per shingle: document `pos` repeated len(doc) times.
    owner_idx = np.concatenate(
        [np.full(hashes.shape, pos, dtype=hashes.dtype) for pos, hashes in enumerate(per_doc_hashes)]
    ).astype(np.int32)
    return (
        np.array(keys),
        owner_idx,
        np.concatenate(per_doc_hashes, axis=0),
        np.concatenate(per_doc_shingles, axis=0),
    )

def common_shingles_list(shingle_list1, shingle_list2):
    """Return the shingles that occur in both lists, sorted, as strings.

    A shingle that is already a string is returned unchanged; joining it with
    ``" "`` would insert a space between every character. A shingle given as
    a sequence of words (e.g. a tuple) is joined with single spaces.
    """
    common = sorted(set(shingle_list1).intersection(shingle_list2))
    return [shingle if isinstance(shingle, str) else " ".join(shingle) for shingle in common]


def compare_signatures_fast(signatures1, signatures2, df1_name, df2_name, id1_name, id2_name):
    """Find all text pairs sharing at least one shingle using a sparse matrix product.

    Every shingle is replaced by an integer id, each collection becomes a
    sparse document/shingle incidence matrix, and the product of the two
    matrices counts co-occurring shingles for every pair of texts. Only the
    non-zero entries of that product are visited, so the full
    ``len(signatures1) x len(signatures2)`` table is never materialised densely.

    Args:
        signatures1: Mapping ``id -> {"shingles": [...]}`` for the first collection.
        signatures2: Same structure for the second collection.
        df1_name: Label of the first collection (used in result keys).
        df2_name: Label of the second collection.
        id1_name: Name of the id field of the first collection.
        id2_name: Name of the id field of the second collection.

    Returns:
        A list with one dict per matching pair, ordered by position in
        ``signatures1`` and then in ``signatures2``. Each dict has the keys
        ``f"{df1_name}_{id1_name}"``, ``f"{df2_name}_{id2_name}"`` (the
        original mapping keys, not NumPy scalars) and ``"shared_shingles"``.
        An empty list is returned when either mapping is empty.
    """
    # Nothing to compare; this also avoids concatenating an empty list of arrays.
    if not signatures1 or not signatures2:
        return []

    key1 = f"{df1_name}_{id1_name}"
    key2 = f"{df2_name}_{id2_name}"

    # Keep the original keys, in the order load_signatures enumerates them, so
    # ids are never coerced by a round trip through a NumPy array.
    ids1, ids2 = list(signatures1), list(signatures2)

    # packaging data to numpy
    _, name_idx1, shingles1, _ = load_signatures(signatures1)
    _, name_idx2, shingles2, _ = load_signatures(signatures2)

    # compressing hash namespace to continuous integers (0 .. n_unique - 1)
    unique_shingles, dense = np.unique(
        np.concatenate([shingles1, shingles2], axis=0), return_inverse=True
    )
    dense = dense.ravel()
    dense1, dense2 = dense[:shingles1.size], dense[shingles1.size:]

    # creating sparse doc x shingle-id and shingle-id x doc matrices
    matrix1 = coo_matrix(
        (np.ones(dense1.size, dtype=np.int32), (name_idx1, dense1)),
        shape=(len(ids1), unique_shingles.size),
    )
    matrix2 = coo_matrix(
        (np.ones(dense2.size, dtype=np.int32), (dense2, name_idx2)),
        shape=(unique_shingles.size, len(ids2)),
    )

    # finding co-occurrences (this is the reuse metric); the product stays sparse
    cooccurrence = matrix1.dot(matrix2).tocoo()
    hit = cooccurrence.data != 0
    rows, cols = cooccurrence.row[hit], cooccurrence.col[hit]
    # Row-major order, i.e. the order a dense np.nonzero() would produce.
    order = np.lexsort((cols, rows))

    # extracting captions of common shingles for visualisation
    similar_texts = []
    pairs = zip(rows[order].tolist(), cols[order].tolist())
    for idx1, idx2 in tqdm(pairs, total=int(hit.sum())):
        id1, id2 = ids1[idx1], ids2[idx2]
        common_shingles = common_shingles_list(
            signatures1[id1]["shingles"], signatures2[id2]["shingles"]
        )
        similar_texts.append({key1: id1, key2: id2, "shared_shingles": common_shingles})
    return similar_texts


# Re-run the comparison with the sparse-matrix implementation.
# NOTE: this rebinds `results`, which previously held the output of
# compare_signatures; the JSON file written earlier is not refreshed here.
results = compare_signatures_fast(signatures1, signatures2, df1_name, df2_name, df1_id_column, df2_id_column)
# Number of matching text pairs found.
print(len(results))

(872283,)
(807554,)
1
int32
int32
(1000, 1189) 43


270it [00:00, 18170.56it/s]


270


In [84]:
import json
import pyvis
from pyvis.network import Network
import networkx as nx

# Load the matching pairs saved by the comparison cell.
with open('similarity_results.json', 'r') as f:
    data = json.load(f)

# Build an undirected graph: one node per text, one edge per matching pair.
G = nx.Graph()

# Add nodes and edges.
# The keys 'mom_atom_id' and 'vulgate_id' are hard-coded here; they are the
# "<corpus label>_<id column>" keys found in the saved records and must be
# changed if the corpus labels or ID column names change.
for item in data:
    mom_id = item['mom_atom_id']
    vulgate_id = item['vulgate_id']
    # Edge weight = number of shingles the two texts have in common.
    weight = len(item['shared_shingles'])

    # A text that matches several partners is passed to add_node repeatedly;
    # the colour separates the two corpora, `title` is descriptive text for the node.
    G.add_node(mom_id, color='lightblue', title=f"Manuscript: {mom_id}")
    G.add_node(vulgate_id, color='lightgreen', title=f"Vulgate: {vulgate_id}")
    G.add_edge(mom_id, vulgate_id, weight=weight, title=f"Shared shingles: {weight}")

# Create a Pyvis network from the NetworkX graph.
# notebook=False: the graph is written to an HTML file below rather than
# rendered directly by pyvis.
net = Network(notebook=False, width="100%", height="600px", bgcolor="#ffffff", font_color="black")
net.from_nx(G)

# Set options for a more spread out graph: font size, straight edges that
# inherit their colour, and the forceAtlas2Based physics solver with a
# bounded number of stabilisation iterations.
net.set_options("""
var options = {
  "nodes": {
    "font": {
      "size": 12
    }
  },
  "edges": {
    "color": {
      "inherit": true
    },
    "smooth": false
  },
  "physics": {
    "forceAtlas2Based": {
      "gravitationalConstant": -50,
      "centralGravity": 0.01,
      "springLength": 100,
      "springConstant": 0.08
    },
    "maxVelocity": 50,
    "solver": "forceAtlas2Based",
    "timestep": 0.35,
    "stabilization": {
      "enabled": true,
      "iterations": 1000,
      "updateInterval": 25
    }
  }
}
""")

# Save the graph as an HTML file (written to the current working directory).
net.save_graph("similarity_network.html")
print("Graph saved as 'similarity_network.html'")

Graph saved as 'similarity_network.html'


## Network visualization


In [85]:
from IPython.display import HTML, display
# Read the HTML file produced by the network cell and embed its content
# directly in the notebook output.
with open('similarity_network.html', 'r') as f:
    html_content = f.read()
display(HTML(html_content))