<a href="https://colab.research.google.com/github/17harshitm-star/CLSA-Plato-Upanishads/blob/clsa/CLSA_Plato_Upanishads.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

def clean_text(text):
    # Lowercase
    text = text.lower()
    # Remove special characters and punctuation (keep letters, numbers, spaces)
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Remove multiple spaces/newlines
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Load and clean Plato text
with open('plato_republic.txt', 'r', encoding='utf-8') as f:
    raw_plato = f.read()
clean_plato = clean_text(raw_plato)

# Load and clean Upanishad text
with open('isha_upanishad.txt', 'r', encoding='utf-8') as f:
    raw_upanishad = f.read()
clean_upanishad = clean_text(raw_upanishad)

# Tokenize into sentences on raw text for semantic similarity (keep punctuation for sentence splitting)
plato_sentences = sent_tokenize(raw_plato)
upanishad_sentences = sent_tokenize(raw_upanishad)

# Optional: For word-level operations or further cleaning
stop_words = set(stopwords.words('english'))
def remove_stopwords(sentence):
    tokens = word_tokenize(sentence.lower())
    filtered = [w for w in tokens if w.isalpha() and w not in stop_words]
    return " ".join(filtered)

# Example usage: cleaned sentences without stopwords
clean_plato_sentences = [remove_stopwords(sent) for sent in plato_sentences]
clean_upanishad_sentences = [remove_stopwords(sent) for sent in upanishad_sentences]

# Now encode and compare with SentenceTransformer as before
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# You can encode either the original sentence lists or the cleaned ones, depending on preferred accuracy/speed tradeoff.
plato_embeddings = model.encode(plato_sentences, convert_to_tensor=True)
upanishad_embeddings = model.encode(upanishad_sentences, convert_to_tensor=True)

cosine_scores = util.pytorch_cos_sim(plato_embeddings, upanishad_embeddings)


from sentence_transformers import SentenceTransformer, util
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

# Load texts from files
with open('plato_republic.txt', 'r', encoding='utf-8') as f:
    plato_text = f.read()

with open('isha_upanishad.txt', 'r', encoding='utf-8') as f:
    upanishad_text = f.read()

# Split texts into sentences
plato_sentences = sent_tokenize(plato_text)
upanishad_sentences = sent_tokenize(upanishad_text)

# Load the pretrained model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Encode sentences
plato_embeddings = model.encode(plato_sentences, convert_to_tensor=True)
upanishad_embeddings = model.encode(upanishad_sentences, convert_to_tensor=True)

# Aggregate to get document embedding vectors by averaging
plato_doc_embedding = plato_embeddings.mean(dim=0)
upanishad_doc_embedding = upanishad_embeddings.mean(dim=0)

# Calculate overall cosine similarity between whole texts
overall_similarity = util.pytorch_cos_sim(plato_doc_embedding, upanishad_doc_embedding).item()

print(f"Overall Semantic Similarity between the texts: {overall_similarity:.4f}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Overall Semantic Similarity between the texts: 0.6530


In [1]:
import collections
if not hasattr(collections, 'Iterable'):
    collections.Iterable = collections.abc.Iterable

%pip install --force-reinstall inltk nest_asyncio
import nest_asyncio
nest_asyncio.apply()

import asyncio

# Patch the event loop's close method to prevent errors if it's already running
try:
    loop = asyncio.get_running_loop()
    if loop:
        _original_loop_close = loop.close
        loop.close = lambda: None # Replace close with a no-op
except RuntimeError: # No running loop, so no need to patch
    loop = None

from inltk.inltk import setup, tokenize
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Setup Sanskrit tokenizer (run once)
setup('sa')

# Restore the original close method if it was patched
if loop:
    loop.close = _original_loop_close

# Load texts: Plato in English, Upanishads in Sanskrit (in Devanagari or standard script)
with open('plato_republic_english.txt', 'r', encoding='utf-8') as f:
    plato_text = f.read()

with open('upanishads_sanskrit.txt', 'r', encoding='utf-8') as f:
    upanishad_text = f.read()

# Split texts into paragraphs (basic heuristic: double newline)
plato_paragraphs = [p.strip() for p in plato_text.split('\n\n') if len(p.strip()) > 20]
upanishad_paragraphs_raw = [p.strip() for p in upanishad_text.split('\n\n') if len(p.strip()) > 20]

# Sanskrit tokenization using inltk
# Tokenize each Sanskrit paragraph; join tokens back to string for embedding
upanishad_paragraphs = [' '.join(tokenize(p, 'sa')) for p in upanishad_paragraphs_raw]

# English paragraphs remain unchanged
tokenized_plato = plato_paragraphs

# Load a multilingual embedding model supporting English and Sanskrit (e.g., XLM-R)
model = SentenceTransformer('sentence-transformers/xlm-r-bert-base-nli-stsb-mean-tokens')

# Get sentence embeddings (tensor)
plato_embeddings = model.encode(tokenized_plato, convert_to_tensor=True)
upanishad_embeddings = model.encode(upanishad_paragraphs, convert_to_tensor=True)

# Compute max similarity directional measures
# Plato paragraphs to Upanishads
plato_to_upanishad_sim = util.pytorch_cos_sim(plato_embeddings, upanishad_embeddings).max(dim=1).values.cpu().numpy()

# Upanishads paragraphs to Plato
upanishad_to_plato_sim = util.pytorch_cos_sim(upanishad_embeddings, plato_embeddings).max(dim=1).values.cpu().numpy()

# Calculate average directional semantic dependency
plato_depends_on_upanishad = np.mean(plato_to_upanishad_sim)
upanishad_depends_on_plato = np.mean(upanishad_to_plato_sim)

print(f"Average semantic similarity (Plato \u2192 Upanishads): {plato_depends_on_upanishad:.4f}")
print(f"Average semantic similarity (Upanishads \u2192 Plato): {upanishad_depends_on_plato:.4f}")

if plato_depends_on_upanishad > upanishad_depends_on_plato:
    print("Plato's Republic shows stronger semantic dependency on the Upanishads.")
else:
    print("Upanishads show stronger semantic dependency on Plato's Republic or influence is unclear.")

Collecting inltk
  Using cached inltk-0.9-py3-none-any.whl.metadata (40 kB)
Collecting nest_asyncio
  Using cached nest_asyncio-1.6.0-py3-none-any.whl.metadata (2.8 kB)
Collecting aiohttp>=3.5.4 (from inltk)
  Using cached aiohttp-3.13.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (8.1 kB)
Collecting async-timeout>=3.0.1 (from inltk)
  Using cached async_timeout-5.0.1-py3-none-any.whl.metadata (5.1 kB)
Collecting Pillow (from inltk)
  Using cached pillow-12.0.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (8.8 kB)
Collecting beautifulsoup4 (from inltk)
  Using cached beautifulsoup4-4.14.2-py3-none-any.whl.metadata (3.8 kB)
Collecting bottleneck (from inltk)
  Using cached bottleneck-1.6.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl.metadata (8.2 kB)
Collecting fastprogress>=0.1.19 (from inltk)
  Using cached fastprogress-1.0.3-py3-none-any.whl.metadata (5.6 kB)
Collecting matplotlib (

  import pkg_resources


Done!


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/722 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/518 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Average semantic similarity (Plato → Upanishads): 0.6577
Average semantic similarity (Upanishads → Plato): 0.6099
Plato's Republic shows stronger semantic dependency on the Upanishads.


In [3]:
from sentence_transformers import SentenceTransformer, util

# Assuming plato_paragraphs and upanishad_paragraphs are your preprocessed text chunks
# And plato_embeddings, upanishad_embeddings are computed embeddings from your model

min_words = 10  # minimum number of words required in each passage

# Compute full cosine similarity matrix
cosine_scores = util.pytorch_cos_sim(plato_embeddings, upanishad_embeddings).cpu().numpy()

# Find the highest similarity pairs:
max_sim_indices = cosine_scores.argmax(axis=1)
max_sim_values = cosine_scores.max(axis=1)

# Filter pairs by minimum number of words in both passages
filtered_pairs = []
for i, idx in enumerate(max_sim_indices):
    plato_text = plato_paragraphs[i]
    upanishad_text = upanishad_paragraphs[idx]
    if len(plato_text.split()) >= min_words and len(upanishad_text.split()) >= min_words:
        filtered_pairs.append((i, idx, max_sim_values[i], plato_text, upanishad_text))

# Sort filtered pairs by similarity descending
filtered_pairs.sort(key=lambda x: x[2], reverse=True)

print(f"Top 5 High Similarity Pairs (min {min_words} words each) (Plato → Upanishads):")
for i, (pl_idx, up_idx, sim, pl_text, up_text) in enumerate(filtered_pairs[:5]):
    print(f"Similarity: {sim:.4f}")
    print(f"Plato paragraph: {pl_text}")
    print(f"Upanishad paragraph: {up_text}")
    print()

# Similarly, for low similarity pairs, filtering can be applied if desired
min_sim_values = cosine_scores.min(axis=1)
min_sim_indices = cosine_scores.argmin(axis=1)

filtered_low_pairs = []
for i, idx in enumerate(min_sim_indices):
    plato_text = plato_paragraphs[i]
    upanishad_text = upanishad_paragraphs[idx]
    if len(plato_text.split()) >= min_words and len(upanishad_text.split()) >= min_words:
        filtered_low_pairs.append((i, idx, min_sim_values[i], plato_text, upanishad_text))

filtered_low_pairs.sort(key=lambda x: x[2])

print(f"Top 5 Low Similarity Pairs (min {min_words} words each) (Plato → Upanishads):")
for i, (pl_idx, up_idx, sim, pl_text, up_text) in enumerate(filtered_low_pairs[:5]):
    print(f"Similarity: {sim:.4f}")
    print(f"Plato paragraph: {pl_text}")
    print(f"Upanishad paragraph: {up_text}")
    print()


Top 5 High Similarity Pairs (min 10 words each) (Plato → Upanishads):
Similarity: 0.8489
Plato paragraph: Thus, Glaucon, the tale has been saved, and will be our salvation, if
we believe that the soul is immortal, and hold fast to the heavenly way
of Justice and Knowledge. So shall we pass undefiled over the river of
Forgetfulness, and be dear to ourselves and to the Gods, and have a
crown of reward and happiness both in this world and also in the
millennial pilgrimage of the other.
Upanishad paragraph: ▁ब्रह्म सुख स्वरूपं ▁ विर ाद यति - - ▁स्वस्थ मिति ▁। ▁ स्ते ▁ महिम्न ि ▁स्वयं ▁तिष्ठ तीति ▁स्वस्थ ं ▁“ स्व े ▁मह िन् नि ▁समास त ▁ ‡ ▁इति ▁श्रुत े ः ▁। ▁सर्व ान धौ प शान्ति सि द्र त्वात् ▁शान्त म् ▁1 ▁निर् वृत िः ▁मुक्तिः ▁तद ू प ेण ▁सह ▁ वर्तन ात् ▁स निर्वाण म् ▁। ▁इदम ि त्थ मि ्या - ▁ख्यात ुम दा क्य त्वात् ▁अक थ्य ं ▁वै ष य िक सुख टु ः खा प ह व सिद्ध ं ॑ पार म ाधिक सुख म् ▁।

Similarity: 0.8443
Plato paragraph: The idea of good is one of those sacred words or forms of thought,
which we