In [1]:
!pip install pymupdf



In [2]:
import csv
import re

import fitz  # PyMuPDF

In [3]:
# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Location of the PDF; handed unchanged to ``fitz.open``.

    Returns:
        str: The text of all pages joined together with no extra separator
        (an empty string when the document has no pages).
    """
    # Opening through a context manager guarantees the document is closed
    # (releasing the underlying file handle) even if a page fails to extract.
    with fitz.open(pdf_path) as pdf_document:
        # A single join avoids rebuilding the growing string once per page.
        return "".join(page.get_text() for page in pdf_document)

In [4]:
# Ayurveda terms categorized into various classes
# Maps a classification label to the lowercase terms searched for under that label.
# Each term is listed at most once per category; a repeated entry would make the
# search emit the same (term, sentence, category) row twice.
# Some terms intentionally appear under more than one category (e.g. 'triphala').
ayurveda_terms = {
    'ingredients': [
        'turmeric', 'ashwagandha', 'ginger', 'amla', 'ghee', 'honey',
        'triphala', 'tulsi', 'brahmi', 'neem', 'shatavari', 'arjuna',
        'licorice', 'gokshura', 'haritaki', 'pippali', 'cumin', 'coriander',
        'rakta', 'musta', 'guduchi', 'keshar', 'satavari', 'yastimadhu', 'tankan'
    ],
    'symptoms': [
        'fever', 'cough', 'pain', 'headache', 'nausea', 'vomiting',
        'diarrhea', 'constipation', 'fatigue', 'dizziness', 'indigestion',
        'swelling', 'itching', 'rash', 'sweating', 'tremors', 'shortness of breath',
        'pitta', 'vata', 'kapha', 'burning sensation', 'sore throat', 'loss of appetite',
        'joint pain', 'numbness', 'dryness'
    ],
    'treatments': [
        'panchakarma', 'abhyanga', 'basti', 'nasya', 'shirodhara',
        'virechana', 'vaman', 'rakta moksha', 'kati basti', 'udvartana',
        'swedana', 'shirolepa', 'dhanyamla dhara', 'anjanam', 'lepana',
        'yoga', 'meditation', 'triphala', 'svedana', 'netra tarpana',
        'nadi shodhana', 'talam', 'marma therapy'
    ],
    'diseases': [
        'arthritis', 'diabetes', 'asthma', 'eczema', 'psoriasis', 'obesity',
        'hypertension', 'ulcers', 'anemia', 'bronchitis', 'sinusitis',
        'acne', 'gout', 'indigestion', 'dyspepsia', 'gastritis',
        'insomnia', 'migraine', 'constipation', 'jwara', 'jaundice', 'piles',
        'urinary tract infections', 'hemorrhoids', 'depression', 'anxiety',
        'paralysis', 'leprosy', 'malarial fever'
    ],
    'medical_plants': [
        'tulsi', 'neem', 'brahmi', 'ashwagandha', 'amla', 'shatavari',
        'giloy', 'moringa', 'arjuna', 'triphala', 'vidanga', 'guggulu',
        'kalmegh', 'bhumi amla', 'jatamansi', 'manjistha', 'punarnava',
        'bakuchi', 'kesar', 'kapikacchu', 'shankhpushpi', 'bilva', 'vidari',
        'kushta', 'patala', 'sariva', 'shigru', 'guduchi'
    ]
}

In [5]:
# Function to find sentences with Ayurveda terms
def find_sentences(text, terms_dict, whole_word=False):
    """Find the sentences of ``text`` that mention any of the given terms.

    ``text`` is split into sentences on ``'.'``. Matching is case-insensitive
    and ignores differences in whitespace, so a multi-word term such as
    ``'burning sensation'`` is still found when a line break in the extracted
    text falls between its words.

    Args:
        text: The text to search.
        terms_dict: Mapping of category name to an iterable of terms.
        whole_word: When ``False`` (the default) a term matches anywhere
            inside the sentence, including inside longer words (``'rash'``
            matches ``'crash'``). When ``True`` the term must not be directly
            preceded or followed by a word character.

    Returns:
        list: One ``[term, sentence, category]`` list per match, ordered by
        sentence, then by category, then by term. ``sentence`` is the original
        sentence with leading/trailing whitespace stripped. A term listed
        under several categories yields one entry per category.
    """
    # Normalise every term once up front instead of once per sentence.
    prepared_terms = []
    for category, terms in terms_dict.items():
        for term in terms:
            needle = ' '.join(term.lower().split())
            if not needle:
                # A blank term would "match" every sentence; ignore it.
                continue
            pattern = None
            if whole_word:
                pattern = re.compile(r'(?<!\w)' + re.escape(needle) + r'(?!\w)')
            prepared_terms.append((term, category, needle, pattern))

    term_sentences = []
    for sentence in text.split('.'):
        # Lower-case and collapse whitespace runs (including newlines) once
        # per sentence so that line-wrapped terms can match.
        haystack = ' '.join(sentence.lower().split())
        if not haystack:
            continue
        for term, category, needle, pattern in prepared_terms:
            if pattern is not None:
                found = pattern.search(haystack) is not None
            else:
                found = needle in haystack
            if found:
                term_sentences.append([term, sentence.strip(), category])

    return term_sentences

In [6]:
# Function to write data to CSV with correct SL No.
def write_to_csv(rows, output_csv):
    """Write the matched rows to a CSV file, numbering them from 1.

    Args:
        rows: Sequence of lists; each list supplies the cells that follow the
            serial number (term, sentence, classification).
        output_csv: Path of the CSV file to create or overwrite.
    """
    header_row = ['SL No.', 'Ayurveda Term', 'Sentence', 'Classification']

    # newline='' lets the csv module control line endings itself.
    with open(output_csv, 'w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(header_row)
        # Prefix each record with its 1-based serial number as it is written.
        csv_writer.writerows(
            [serial_no] + record
            for serial_no, record in enumerate(rows, start=1)
        )


In [7]:
# Function to extract and process terms while ensuring uniqueness
def process_pdf_to_csv(pdf_path, output_csv):
    """Extract Ayurveda terms from a PDF and save one CSV row per distinct term.

    The PDF text is searched with ``find_sentences`` using the module-level
    ``ayurveda_terms`` mapping. For each term only its first match (in the
    order returned by ``find_sentences``) is kept, so the sentence and
    classification written are those of that first match.

    Args:
        pdf_path: Path of the PDF to read.
        output_csv: Path of the CSV file to write.
    """
    pdf_text = extract_text_from_pdf(pdf_path)
    all_matches = find_sentences(pdf_text, ayurveda_terms)

    # setdefault stores only the first match seen for each term, and the dict
    # keeps terms in the order they were first encountered.
    first_match_by_term = {}
    for match in all_matches:
        first_match_by_term.setdefault(match[0], match)

    write_to_csv(list(first_match_by_term.values()), output_csv)

In [8]:

# Input PDF and output CSV for this run; both are relative to the notebook's working directory.
# The path literal uses plain ASCII double quotes: typographic quotes are a syntax error in Python.
pdf_path = "Disease explanation in charaka samhita made easy - Ebook.pdf"
output_csv = "distinct_ayurveda_terms_classification.csv"
process_pdf_to_csv(pdf_path, output_csv)
print(f"Extraction and classification completed. Data saved to {output_csv}")

Extraction and classification completed. Data saved to distinct_ayurveda_terms_classification.csv
