In [1]:
import PyPDF2

def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        full_text = ''
        for page_num, page in enumerate(reader.pages):
            text = page.extract_text()
            if text:
                full_text += f"\n\n--- PAGE {page_num + 1} ---\n\n"
                full_text += text
        return full_text

books = [
    {"pdf": "Lippincott_Illustrated_Reviews_Pharmacology_7th.pdf", "txt": "data/lippincott_extracted.txt"},
    {"pdf": "New-Vital-First-Aid-First-Aid-Book-112019.pdf", "txt": "data/first_aid_extracted.txt"},
    {"pdf": "pain_wise_a_patients_guide_to_pain_management_1nbsped_1578264081.pdf", "txt": "data/pain_management_extracted.txt"}
]

for book in books:
    try:
        print(f"جارٍ استخراج النص من {book['pdf']} ...")
        pdf_text = extract_text_from_pdf(book['pdf'])
        with open(book['txt'], 'w', encoding='utf-8') as f:
            f.write(pdf_text)
        print(f"تم الانتهاء من {book['txt']}")
    except Exception as e:
        print(f"حدث خطأ أثناء معالجة {book['pdf']}: {e}")

print("✅ تم استخراج جميع النصوص.")


جارٍ استخراج النص من Lippincott_Illustrated_Reviews_Pharmacology_7th.pdf ...
حدث خطأ أثناء معالجة Lippincott_Illustrated_Reviews_Pharmacology_7th.pdf: [Errno 2] No such file or directory: 'Lippincott_Illustrated_Reviews_Pharmacology_7th.pdf'
جارٍ استخراج النص من New-Vital-First-Aid-First-Aid-Book-112019.pdf ...
حدث خطأ أثناء معالجة New-Vital-First-Aid-First-Aid-Book-112019.pdf: [Errno 2] No such file or directory: 'New-Vital-First-Aid-First-Aid-Book-112019.pdf'
جارٍ استخراج النص من pain_wise_a_patients_guide_to_pain_management_1nbsped_1578264081.pdf ...
حدث خطأ أثناء معالجة pain_wise_a_patients_guide_to_pain_management_1nbsped_1578264081.pdf: [Errno 2] No such file or directory: 'pain_wise_a_patients_guide_to_pain_management_1nbsped_1578264081.pdf'
✅ تم استخراج جميع النصوص.


In [2]:
import re
import json

def split_chapters(text, keyword="CHAPTER"):
    pattern = rf'({keyword} \d+[\s\S]*?)(?={keyword} \d+|$)'
    chapters_raw = re.findall(pattern, text, re.IGNORECASE)

    chapters = []
    for i, ch in enumerate(chapters_raw):
        title_match = re.search(rf'{keyword} \d+[^\\n]*', ch, re.IGNORECASE)
        title = title_match.group(0) if title_match else f"{keyword} {i + 1}"
        summary = ' '.join(ch.split()[:50]) + "..."

        chapters.append({
            "chapter_number": i + 1,
            "title": title.strip(),
            "summary": summary.strip(),
            "content": ch.strip(),
            "keywords": [],
            "image": ""
        })

    return chapters

with open('data/lippincott_extracted.txt', 'r', encoding='utf-8') as f:
    full_text = f.read()

chapters = split_chapters(full_text, keyword="CHAPTER")

with open('data/lippincott_chapters.json', 'w', encoding='utf-8') as f:
    json.dump(chapters, f, ensure_ascii=False, indent=4)

print("✅ تم تقسيم كتاب الدوائية.")


✅ تم تقسيم كتاب الدوائية.


In [3]:
import re
import json

def split_topics(text, keyword="Lesson"):
    pattern = rf'({keyword} \d+[\s\S]*?)(?={keyword} \d+|$)'
    topics_raw = re.findall(pattern, text, re.IGNORECASE)

    topics = []
    for i, tp in enumerate(topics_raw):
        title_match = re.search(rf'{keyword} \d+[^\\n]*', tp, re.IGNORECASE)
        title = title_match.group(0) if title_match else f"{keyword} {i + 1}"
        summary = ' '.join(tp.split()[:50]) + "..."

        topics.append({
            "topic_number": i + 1,
            "title": title.strip(),
            "summary": summary.strip(),
            "content": tp.strip(),
            "keywords": [],
            "image": ""
        })

    return topics

with open('data/first_aid_extracted.txt', 'r', encoding='utf-8') as f:
    full_text = f.read()

topics = split_topics(full_text, keyword="Lesson")

with open('data/first_aid_topics.json', 'w', encoding='utf-8') as f:
    json.dump(topics, f, ensure_ascii=False, indent=4)

print("✅ تم تقسيم كتاب الإسعاف الأولي.")


✅ تم تقسيم كتاب الإسعاف الأولي.


In [4]:
import re
import json

def split_topics(text, keyword="Lesson"):
    pattern = rf'({keyword} \d+[\s\S]*?)(?={keyword} \d+|$)'
    topics_raw = re.findall(pattern, text, re.IGNORECASE)

    topics = []
    for i, tp in enumerate(topics_raw):
        title_match = re.search(rf'{keyword} \d+[^\\n]*', tp, re.IGNORECASE)
        title = title_match.group(0) if title_match else f"{keyword} {i + 1}"
        summary = ' '.join(tp.split()[:50]) + "..."

        topics.append({
            "topic_number": i + 1,
            "title": title.strip(),
            "summary": summary.strip(),
            "content": tp.strip(),
            "keywords": [],
            "image": ""
        })

    return topics

with open('data/pain_management_extracted.txt', 'r', encoding='utf-8') as f:
    full_text = f.read()

topics = split_topics(full_text, keyword="Lesson")

with open('data/pain_management_chapters.json', 'w', encoding='utf-8') as f:
    json.dump(topics, f, ensure_ascii=False, indent=4)

print("✅ تم تقسيم .")


✅ تم تقسيم .


In [5]:
import json

def load_json_file(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

lippincott_data = load_json_file('data/lippincott_chapters.json')
first_aid_data = load_json_file('data/first_aid_topics.json')
pain_mgmt_data = load_json_file('data/pain_management_chapters.json')

for item in lippincott_data:
    item['category'] = 'pharmacology'
    item['language'] = 'en'

for item in first_aid_data:
    item['category'] = 'first_aid'
    item['language'] = 'en'

for item in pain_mgmt_data:
    item['category'] = 'pain_management'
    item['language'] = 'en'

merged_data = lippincott_data + first_aid_data + pain_mgmt_data

with open('data/medical_knowledge.json', 'w', encoding='utf-8') as f:
    json.dump(merged_data, f, ensure_ascii=False, indent=4)

print("✅ تم دمج جميع الكتب.")


✅ تم دمج جميع الكتب.


In [6]:
import json
import spacy
import re

nlp = spacy.load("en_core_web_sm")

def clean_text(text):
    return re.sub(r'\d+|\W+', ' ', text.lower()).strip()

def extract_keywords(text, limit=5):
    doc = nlp(clean_text(text))
    keywords = [token.text for token in doc if token.pos_ in ['NOUN', 'PROPN', 'ADJ', 'VERB'] and len(token.text) > 3]
    return list(set(keywords))[:limit]

with open('data/medical_knowledge.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

for item in data:
    combined = item.get('title', '') + ' ' + item.get('summary', '') + ' ' + item.get('content', '')
    item['keywords'] = extract_keywords(combined)

with open('data/medical_knowledge_with_keywords.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

print("✅ تم إضافة الكلمات المفتاحية.")


✅ تم إضافة الكلمات المفتاحية.


In [7]:
import json
import re
from difflib import get_close_matches

with open('data/medical_knowledge_with_keywords.json', 'r', encoding='utf-8') as f:
    knowledge_base = json.load(f)

def clean_text(text):
    return re.sub(r'[^\w\s]', '', text.lower())

def find_best_match(question):
    question = clean_text(question)
    matches = []

    for item in knowledge_base:
        title = clean_text(item['title'])
        summary = clean_text(item.get('summary', ''))
        content = clean_text(item.get('content', ''))

        if question in title or question in summary or question in content:
            matches.append(item)

    if not matches:
        all_titles = [clean_text(item['title']) for item in knowledge_base]
        close_matches = get_close_matches(question, all_titles, n=1, cutoff=0.5)
        if close_matches:
            best_title = close_matches[0]
            for item in knowledge_base:
                if clean_text(item['title']) == best_title:
                    return item

    return matches[0] if matches else None


In [8]:
import json
import re
from difflib import get_close_matches
import spacy
import streamlit as st

# تحميل النموذج اللغوي
nlp = spacy.load("en_core_web_sm")

# تحميل قاعدة البيانات
with open('data/medical_knowledge_with_keywords.json', 'r', encoding='utf-8') as f:
    knowledge_base = json.load(f)

def clean_text(text):
    return re.sub(r'[^\w\s]', '', text.lower())

def find_best_match(question):
    question = clean_text(question)
    matches = []

    for item in knowledge_base:
        title = clean_text(item['title'])
        summary = clean_text(item.get('summary', ''))
        content = clean_text(item.get('content', ''))

        if question in title or question in summary or question in content:
            matches.append(item)

    if not matches:
        all_titles = [clean_text(item['title']) for item in knowledge_base]
        close_matches = get_close_matches(question, all_titles, n=1, cutoff=0.5)
        if close_matches:
            best_title = close_matches[0]
            for item in knowledge_base:
                if clean_text(item['title']) == best_title:
                    return item

    return matches[0] if matches else None

# واجهة المستخدم
st.title("🤖 الشات بوت الطبي")
st.markdown("اكتب سؤالك الطبي وسيقوم البوت بإيجاد المعلومات المناسبة من الكتب.")

user_input = st.text_input("اكتب سؤالك هنا...")

if user_input:
    result = find_best_match(user_input)
    if result:
        st.subheader("🔍 تم العثور على نتيجة:")
        st.markdown(f"**العنوان:** {result['title']}")
        st.markdown(f"**الملخص:** {result.get('summary', 'لا يوجد ملخص')}")
        st.markdown(f"**المصدر:** {result.get('category', '-')}")
    else:
        st.warning("عذرًا، لا توجد معلومات متاحة لهذا السؤال.")

2025-05-10 04:24:50.910 
  command:

    streamlit run C:\Users\acer\anaconda3\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
2025-05-10 04:24:50.915 Session state does not function when running a script without `streamlit run`
