In [1]:
# Pinned to the version this notebook was last run with, and installed via
# %pip so the package lands in the running kernel's environment.
# NOTE(review): no cell visible here imports PyMuPDF (`fitz`); confirm this
# install is still needed.
%pip install PyMuPDF==1.24.10

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.10-cp310-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting PyMuPDFb==1.24.10 (from PyMuPDF)
  Downloading PyMuPDFb-1.24.10-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.4 kB)
Downloading PyMuPDF-1.24.10-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyMuPDFb-1.24.10-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m52.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.24.10 PyMuPDFb-1.24.10


In [2]:
# Pinned to the version this notebook was last run with, and installed via
# %pip so the package lands in the running kernel's environment.
%pip install PyPDF2==3.0.1

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [3]:
import PyPDF2

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        One string in which each page's extracted text is followed by a
        single space (so a document with at least one page ends with a
        trailing space, and a document with no pages yields '').
    """
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # The separator is attached to every page, including the last one.
        page_chunks = [page.extract_text() + ' ' for page in pdf_reader.pages]
    return ''.join(page_chunks)


In [4]:
import re
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK 'stopwords' corpus that stopwords.words('english') reads from.
nltk.download('stopwords')

def identify_ayurvedic_terms(text):
    """Collect the distinct candidate terms that occur in ``text``.

    A candidate is any run of three or more ASCII letters delimited by word
    boundaries. Candidates are lower-cased, and those listed in NLTK's
    English stopword list are discarded. No domain vocabulary is consulted:
    every remaining word is returned as a term.

    Args:
        text: The string to scan.

    Returns:
        A set of lower-cased words.
    """
    pattern = r'\b[a-zA-Z]{3,}\b'  # Match words with at least three letters

    # Load the stopword list once and keep it as a set: calling
    # stopwords.words() per candidate re-reads the corpus and scans a list
    # for every single word of the document.
    english_stopwords = set(stopwords.words('english'))

    lowered_words = (word.lower() for word in re.findall(pattern, text))
    return {word for word in lowered_words if word not in english_stopwords}


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
from collections import defaultdict

def define_relationships(sentences, ayurvedic_terms):
    """Link every pair of terms that occur together in the same sentence.

    Each sentence is lower-cased and searched for the given terms. A term
    counts as present only when it appears as a whole word (or whole phrase),
    so "ease" is not reported for a sentence that merely contains "disease".
    Terms are compared as given, without lower-casing them, so a term with
    upper-case letters never matches. Empty terms are ignored.

    Args:
        sentences: Iterable of sentence strings.
        ayurvedic_terms: Iterable of term strings to look for.

    Returns:
        A ``defaultdict(set)`` mapping each term to the set of other terms it
        shared at least one sentence with. The relation is symmetric, and a
        term that never co-occurs with another term gets no key.
    """
    import re  # local import so this definition does not rely on another cell

    # Single-word terms can be matched by set intersection with the
    # sentence's word tokens; anything else (e.g. multi-word phrases) needs
    # its own boundary-anchored pattern.
    word_terms = set()
    phrase_patterns = {}
    for term in ayurvedic_terms:
        if not term:
            # '' is a substring of every sentence; never treat it as a term.
            continue
        if re.fullmatch(r'\w+', term):
            word_terms.add(term)
        else:
            phrase_patterns[term] = re.compile(
                r'(?<!\w)' + re.escape(term) + r'(?!\w)'
            )

    relationships = defaultdict(set)

    for sentence in sentences:
        lowered = sentence.lower()
        found_terms = word_terms.intersection(re.findall(r'\w+', lowered))
        found_terms.update(
            term
            for term, term_pattern in phrase_patterns.items()
            if term_pattern.search(lowered)
        )

        # A lone term has nothing to relate to; skipping also avoids creating
        # an empty entry for it in the defaultdict.
        if len(found_terms) < 2:
            continue

        for term in found_terms:
            relationships[term].update(found_terms - {term})

    return relationships


In [6]:
# Pinned to the version this notebook was last run with, and installed via
# %pip so the package lands in the running kernel's environment.
%pip install rdflib==7.0.0

Collecting rdflib
  Downloading rdflib-7.0.0-py3-none-any.whl.metadata (11 kB)
Collecting isodate<0.7.0,>=0.6.0 (from rdflib)
  Downloading isodate-0.6.1-py2.py3-none-any.whl.metadata (9.6 kB)
Downloading rdflib-7.0.0-py3-none-any.whl (531 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m531.9/531.9 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: isodate, rdflib
Successfully installed isodate-0.6.1 rdflib-7.0.0


In [7]:
from rdflib import Graph, URIRef, Literal, Namespace

def create_knowledge_graph(relationships):
    """Turn a term -> related-terms mapping into an rdflib graph.

    All URIs are minted under ``http://example.org/ayurveda/``. For every key
    of ``relationships`` the graph receives:

    * one ``type`` triple whose object is the literal ``'AyurvedicTerm'``
      (``type`` here is a predicate in the same namespace), and
    * one ``relatedTo`` triple per related term.

    Args:
        relationships: Mapping from a term to an iterable of related terms.

    Returns:
        The populated ``Graph``.
    """
    graph = Graph()
    ayurveda_ns = Namespace("http://example.org/ayurveda/")

    # Both predicates are the same for every term, so build them once.
    type_predicate = URIRef(ayurveda_ns['type'])
    related_predicate = URIRef(ayurveda_ns['relatedTo'])

    for subject_term, object_terms in relationships.items():
        subject_uri = URIRef(ayurveda_ns[subject_term])
        graph.add((subject_uri, type_predicate, Literal('AyurvedicTerm')))

        for object_term in object_terms:
            object_uri = URIRef(ayurveda_ns[object_term])
            graph.add((subject_uri, related_predicate, object_uri))

    return graph


In [8]:
def save_knowledge_graph(graph, filename, rdf_format='turtle'):
    """Serialize ``graph`` to ``filename``.

    Args:
        graph: Object exposing ``serialize(destination=..., format=...)``.
        filename: Destination passed through to ``graph.serialize``.
        rdf_format: Serialization format name handed to ``graph.serialize``.
            Defaults to ``'turtle'``, the format this function always used
            before the parameter existed.

    Returns:
        None.
    """
    graph.serialize(destination=filename, format=rdf_format)


In [11]:
# %pip installs into the running kernel's environment. Left unpinned because
# no installed version is recorded in this notebook's output.
# NOTE(review): nltk is already imported by an earlier cell, so this install
# only matters when that cell has not run; consider moving it to the top.
%pip install nltk
import nltk

# nltk.sent_tokenize needs the Punkt sentence models. NLTK 3.8.2 and later
# load them from the 'punkt_tab' resource instead of 'punkt', so fetch both
# to keep the notebook working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')

def main(pdf_path, graph_filename):
    """Run the whole pipeline, from a PDF file to a saved knowledge graph.

    Steps, in order: extract the PDF's text, derive the term set from it,
    split the same text into sentences with ``nltk.sent_tokenize``, relate
    terms that share a sentence, build the graph and write it out. A
    confirmation line is printed at the end.

    Args:
        pdf_path: Path of the PDF to read.
        graph_filename: Path the graph is saved to.

    Returns:
        None.
    """
    document_text = extract_text_from_pdf(pdf_path)

    # Terms and sentences are both derived from the same extracted text.
    term_set = identify_ayurvedic_terms(document_text)
    sentence_list = nltk.sent_tokenize(document_text)

    term_links = define_relationships(sentence_list, term_set)
    save_knowledge_graph(create_knowledge_graph(term_links), graph_filename)

    print("Knowledge graph created and saved to", graph_filename)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [12]:
# Input PDF to mine for terms; replace with your own PDF file path.
pdf_path = 'Disease explanation in charaka samhita made easy - Ebook-7-42.pdf'
# File the resulting knowledge graph is written to.
graph_filename = 'ayurvedic_terms_knowledge_graph.ttl'
main(pdf_path=pdf_path, graph_filename=graph_filename)

Knowledge graph created and saved to ayurvedic_terms_knowledge_graph.ttl
