# Step 00: download libs and backages

## Java for Standford POS tagger

In [12]:
import os
import sys
import subprocess

def is_running_in_colab():
    """
    Check if the code is running in Google Colab environment
    
    Returns:
        bool: True if running in Colab, False otherwise
    """
    try:
        import google.colab
        return True
    except ImportError:
        return False

def is_running_in_jupyter():
    """
    Check if the code is running in Jupyter notebook
    
    Returns:
        bool: True if running in Jupyter, False otherwise
    """
    try:
        from IPython import get_ipython
        if 'IPKernelApp' in get_ipython().config:
            return True
        else:
            return False
    except:
        return False

def setup_java_environment():
    """
    Setup Java environment automatically based on the platform
    
    Raises:
        EnvironmentError: If Java setup fails locally
    """
    
    if is_running_in_colab():
        print("Detected Google Colab environment - Setting up Java...")
        
        # Install JDK in Colab
        try:
            print("Installing OpenJDK 8...")
            subprocess.run([
                'apt-get', 'update', 
                '&&', 'apt-get', 'install', '-y', 'openjdk-8-jdk'
            ], check=True, shell=True)
            
            # Set environment variables
            java_home = '/usr/lib/jvm/java-8-openjdk-amd64'
            os.environ['JAVA_HOME'] = java_home
            os.environ['PATH'] = f"{java_home}/bin:{os.environ['PATH']}"
            
            print("Java setup completed successfully in Colab")
            
        except subprocess.CalledProcessError as e:
            print(f"Failed to install Java in Colab: {e}")
            raise EnvironmentError("Java installation failed in Colab")
        
        # Check if Java is already installed locally
        try:
            result = subprocess.run(
                ['java', '-version'], 
                capture_output=True, 
                text=True, 
                timeout=10
            )
            if result.returncode == 0:
                print("Java is already installed locally")
                # Extract and set JAVA_HOME if not set
                if not os.environ.get('JAVA_HOME'):
                    print("JAVA_HOME is not set. Please set it manually.")
            else:
                raise EnvironmentError("Java is not installed locally")
                
        except (FileNotFoundError, subprocess.TimeoutExpired):
            # Running locally or in other environment
            print("Java is not installed. ")
            print("Please install Java manually:")
            print("   - Download JDK 8 from: https://adoptopenjdk.net/")
            print("   - Set JAVA_HOME environment variable")
            print("   - Add Java to your PATH")
            raise EnvironmentError(
                "Please install JDK 8 manually for local execution."
            )


def check_java_installation():
    """
    Verify Java installation and version
    
    Returns:
        dict: Installation status and version info
    """
    try:
        result = subprocess.run(
            ['java', '-version'], 
            capture_output=True, 
            text=True, 
            timeout=10
        )
        
        if result.returncode == 0:
            version_info = result.stderr.split('\n')[0]
            return {
                'installed': True,
                'version': version_info,
                'environment': 'Colab' if is_running_in_colab() else 'Local'
            }
        else:
            return {
                'installed': False,
                'error': 'Java command failed',
                'environment': 'Colab' if is_running_in_colab() else 'Local'
            }
            
    except FileNotFoundError:
        return {
            'installed': False,
            'error': 'Java not found in PATH',
            'environment': 'Colab' if is_running_in_colab() else 'Local'
        }
    except Exception as e:
        return {
            'installed': False,
            'error': str(e),
            'environment': 'Colab' if is_running_in_colab() else 'Local'
        }
    
def check_java_installation():
    """Check if Java is installed and get version"""
    try:
        result = subprocess.run(['java', '-version'], 
                              capture_output=True, 
                              text=True, 
                              timeout=10)
        
        if result.returncode == 0:
            version_line = result.stderr.split('\n')[0]
            return {'status': 'installed', 'version': version_line}
        else:
            return {'status': 'not installed'}
            
    except FileNotFoundError as e:
        return {'status': 'not installed', 'error': str(e)}
    except Exception as e:
        return {'status': 'not installed', 'error': str(e)}

def check_java_home():
    """Check JAVA_HOME environment variable"""
    java_home = os.environ.get('JAVA_HOME')
    if java_home:
        print(f"JAVA_HOME is set: {java_home}")
        
        # Check if java exists in JAVA_HOME
        java_path = os.path.join(java_home, 'bin', 'java')
        if os.path.exists(java_path):
            print("Java executable found in JAVA_HOME")
            return True
        else:
            print("Java executable NOT found in JAVA_HOME")
            return False
    else:
        print("JAVA_HOME is not set")
        return False

print("Detecting environment...")

# Check environment
if is_running_in_colab():
    print("Running in Google Colab")
else:
    print("Running locally")

# Setup Java environment
try:
    setup_java_environment()
    
    # Verify installation
    java_status = check_java_installation()
    if java_status['status']:
        print(f"Java is ready: {java_status['version']}")
    else:
        print(f"Java check failed: {java_status['error']}")
        
except EnvironmentError as e:
    print(f"Environment setup failed: {e}")
    print("Please setup Java manually and try again")
    raise Exception(e)

# Continue with your Stanford POS Tagger code
print("Proceeding with Stanford POS Tagger setup...")

Detecting environment...
Running locally
Java is ready: java version "1.8.0_202"
Proceeding with Stanford POS Tagger setup...


In [3]:
%pip install -r requirements.txt

# nltk punkt_lab
import nltk
nltk.download('punkt_tab')

# Standord POS
import os
import zipfile
import requests

standford_postagger_path = './content/drive/MyDrive/stanford-postagger-full'

if not os.path.exists(standford_postagger_path):
    print("Downloading Stanford POS Tagger...")
    url = "https://nlp.stanford.edu/software/stanford-postagger-full-2018-10-16.zip"
    response = requests.get(url)

    with open('./content/stanford-postagger-full.zip', 'wb') as f:
        f.write(response.content)

    with zipfile.ZipFile('./content/stanford-postagger-full.zip', 'r') as zip_ref:
        zip_ref.extractall('./content/drive/MyDrive/')

    os.rename('./content/drive/MyDrive/stanford-postagger-full-2018-10-16', standford_postagger_path)
    print("Done!")
else:
    print(f"Standford postagger already downloaed at {standford_postagger_path},\nTo download again, delete the download folder and run the code again.")

Note: you may need to restart the kernel to use updated packages.
Standford postagger already downloaed at ./content/drive/MyDrive/stanford-postagger-full, to download again, delete the download folder and run the code again.


You should consider upgrading via the 'c:\pyvm\anlp-py310\Scripts\python.exe -m pip install --upgrade pip' command.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\MK1349\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# Step 01: Dataset

## download raw corpus

In [6]:
# DO NOT RUN, data is already provided.
from src.data_collection import DataCollector
from config.settings import DATA_COLLECTION_CONFIG

collector = DataCollector(base_dir=DATA_COLLECTION_CONFIG['base_dir'])

# print("start links collecting..")
# news_links = collector.collect_links_from_altibbi(
#     site_type="news",
#     max_pages=695, 
#     links_file="medical_news_links.txt"
# )

# article_links = collector.collect_links_from_altibbi(
#     site_type="articles",
#     max_pages=803, 
#     links_file="medical_articles_links.txt"
# )

print("start_loading...")
corpus = collector.download_articles_content(
    links_file=None,
    corpus_file="medical_corpus.csv"
)

stats = collector.get_corpus_stats("medical_corpus.csv")
print("statistics:")
for key, value in stats.items():
    print(f"  {key}: {value}")

start_loading...


2025-11-01 13:22:22,346 - ERROR - Error reading links file: [Errno 2] No such file or directory: 'data/raw/None'


statistics:
  total_articles: 6572
  articles_with_headline: 6498
  articles_with_content: 6498
  total_words: 2742273.0
  average_article_length: 2522.6912896275776


## preprocess corpus

In [7]:
import pandas as pd

df = pd.read_csv("./data/raw/medical_corpus.csv")

from src.text_processor import *

# Apply processing to the dataframe
pdf = df.copy()

print("Original data shape:", pdf.shape)
print("Before cleaning - Sample texts:")
print(df['articleBody'].head())

# Apply cleaning using the pipeline function (more efficient)
pdf['articleBody'] = pdf['articleBody'].apply(clean_text_pipeline)

print("\nAfter cleaning - Sample texts:")
pd.set_option('display.max_colwidth', None)
print(pdf['articleBody'].head())

# Display statistics
print(f"\nCleaning Statistics:")
print(f"Total articles: {len(pdf)}")
print(f"Articles with content: {pdf['articleBody'].str.len().gt(0).sum()}")
print(f"Average article length: {pdf['articleBody'].str.len().mean():.0f} characters")

# Show some examples of before/after cleaning
print("\nBefore/After Comparison:")
for i in range(min(3, len(df))):
    print(f"\n--- Example {i+1} ---")
    print("BEFORE:", df['articleBody'].iloc[i][:200] + "..." if len(str(df['articleBody'].iloc[i])) > 200 else df['articleBody'].iloc[i])
    print("AFTER: ", pdf['articleBody'].iloc[i][:200] + "..." if len(str(pdf['articleBody'].iloc[i])) > 200 else pdf['articleBody'].iloc[i])
    print("-" * 50)

Original data shape: (6572, 3)
Before cleaning - Sample texts:
0    كشفت نتائج تجربة سريرية جديدة أن دواء جلوفادال...
1    كشفت أدلة جديدة، مستندة إلى بيانات أكثر من 15,...
2    دراسة رائدة كشفت أن مرضى السرطان الذين تلقوا ل...
3    في إنجاز طبي بارز، كشفت دراسة سريرية واسعة قاد...
4    حذّرت دراسة بريطانية واسعة من أن المشروبات الت...
Name: articleBody, dtype: object

After cleaning - Sample texts:
0                                                                      كشفت نتائج تجربة سريرية جديدة أن دواء جلوفادالين بالإنجليزية التجريبي من شركة أظهر فعالية وأماناً كبيرين لمرضى باركنسون المتقدم. والأهم من ذلك أن الدواء نجح في تحسين الأعراض الحركية دون التسبب في الآثار الشائعة مثل الهلوسة وهبوط الضغط التي تُصاحب العلاجات الحالية. فعالية الدواء الجديد للباركنسون أظهرت نتائج المرحلة الثانية من تجربة التي شملت أكثر من 200 مريض يعانون من تذبذبات في الأعراض الحركية بشكل يومي أن المرضى الذين تلقوا جلوفادالين عن طريق الفم بالإضافة إلى جانب العلاج التقليدي لمدة 10 أسابيع شهدوا انخفاضاً أكبر في 

In [None]:
import pandas as pd
import os

os.makedirs("data/processed", exist_ok=True)

try:
    pdf.to_json("data/processed/medical_corpus_processed.json", orient='records', force_ascii=False, indent=2)
    print("Data saved at: data/processed/medical_corpus_processed.json")
    
    
    print(f"{len(pdf)} were saved!")
    
except Exception as e:
    print(f"Error: {e}")

Data saved at: data/processed/medical_corpus_processed.json
6572 were saved!


# Step 02: Relations and Templates

In [None]:
import pandas as pd
import re
import os
import json
from typing import List, Dict, Tuple, Any
from nltk.tag import StanfordPOSTagger
from nltk.tokenize import word_tokenize
import nltk

nltk.download('punkt_tab')

class RelationExtractor:
    def __init__(self, base_dir: str = "data"):
        self.base_dir = base_dir
        self.setup_directories()
        self.tagger = ArabicPOSTagger()
        self.templates = self.load_templates()
        
    def setup_directories(self):
        """إنشاء المجلدات المطلوبة"""
        os.makedirs(f"{self.base_dir}/results", exist_ok=True)
        os.makedirs(f"{self.base_dir}/processed", exist_ok=True)
        
    def load_templates(self) -> Dict[str, List[str]]:
        
        templates = {
            "يعالج": [
                r"يستخدم\s+(\S+)\s+في\s+علاج\s+(\S+)",
                r"يعالج\s+(\S+)\s+مرض\s+(\S+)",
                r"يشفي\s+(\S+)\s+من\s+(\S+)",
                r"يداوي\s+(\S+)\s+حالة\s+(\S+)",
                r"يعالج\s+(\S+)\s+الاصابة\s+ب\s+(\S+)",
                r"يشفي\s+(\S+)\s+من\s+داء\s+(\S+)",
                r"يقضي\s+(\S+)\s+على\s+(\S+)",
                r"يزيل\s+(\S+)\s+اعراض\s+(\S+)",
                r"يخلص\s+(\S+)\s+من\s+(\S+)",
                r"يحارب\s+(\S+)\s+مرض\s+(\S+)",
                r"يواجه\s+(\S+)\s+المرض\s+(\S+)",
                r"يسيطر\s+(\S+)\s+على\s+(\S+)",
                r"يحد\s+(\S+)\s+من\s+انتشار\s+(\S+)",
                r"يوقف\s+(\S+)\s+تطور\s+(\S+)",
                r"يمنع\s+(\S+)\s+تفاقم\s+(\S+)",
                r"يخفف\s+(\S+)\s+من\s+(\S+)",
                r"يقلل\s+(\S+)\s+حدة\s+(\S+)",
                r"يحتوي\s+(\S+)\s+على\s+(\S+)",
                r"يستهدف\s+(\S+)\s+مرض\s+(\S+)",
                r"يعمل\s+(\S+)\s+ضد\s+(\S+)",
                r"(\S+)\s+دواء\s+ل\s+(\S+)",
                r"(\S+)\s+علاج\s+فعال\s+ل\s+(\S+)",
                r"(\S+)\s+من\s+ادوية\s+(\S+)",
                r"علاج\s+(\S+)\s+يكون\s+ب\s+(\S+)",
                r"(\S+)\s+من\s+العلاجات\s+الناجحة\s+ل\s+(\S+)",
                r"الدواء\s+(\S+)\s+مخصص\s+ل\s+(\S+)",
                r"العقار\s+(\S+)\s+مصمم\s+ل\s+(\S+)",
                r"العلاج\s+(\S+)\s+موجه\s+ل\s+(\S+)",
                r"(\S+)\s+من\s+الادوية\s+المضادة\s+ل\s+(\S+)",
                r"(\S+)\s+من\s+العلاجات\s+المتعارفة\s+ل\s+(\S+)",
                r"العلاج\s+ب\s+(\S+)\s+ل\s+(\S+)",
                r"استخدام\s+(\S+)\s+في\s+(\S+)",
                r"تناول\s+(\S+)\s+ل\s+(\S+)",
                r"وصفة\s+(\S+)\s+لمرضى\s+(\S+)",
                r"جرعة\s+(\S+)\s+لعلاج\s+(\S+)",
                r"مصل\s+(\S+)\s+ضد\s+(\S+)",
                r"مضاد\s+(\S+)\s+ل\s+(\S+)",
                r"حبوب\s+(\S+)\s+ل\s+(\S+)",
                r"كبسولات\s+(\S+)\s+ل\s+(\S+)",
                r"حقن\s+(\S+)\s+ل\s+(\S+)",
                r"(\S+)\s+فعال\s+ضد\s+(\S+)",
                r"(\S+)\s+ناجع\s+في\s+معالجة\s+(\S+)",
                r"(\S+)\s+مفيد\s+لمرضى\s+(\S+)",
                r"(\S+)\s+يحدث\s+تحسنا\s+في\s+(\S+)",
                r"(\S+)\s+يساعد\s+في\s+شفاء\s+(\S+)",
                r"(\S+)\s+يساهم\s+في\s+علاج\s+(\S+)",
                r"(\S+)\s+يعين\s+في\s+التغلب\s+على\s+(\S+)",
                r"(\S+)\s+يدعم\s+علاج\s+(\S+)",
                r"(\S+)\s+يشارك\s+في\s+معالجة\s+(\S+)",
                r"(\S+)\s+يسرع\s+شفاء\s+(\S+)",
                r"(\S+)\s+يحسن\s+حالة\s+(\S+)",
                r"(\S+)\s+ينعش\s+مرضى\s+(\S+)",
                r"(\S+)\s+ينشط\s+علاج\s+(\S+)",
                r"(\S+)\s+يفيد\s+في\s+(\S+)",
                r"(\S+)\s+يجدي\s+نفعا\s+في\s+(\S+)",
                r"(\S+)\s+يمنح\s+الشفاء\s+من\s+(\S+)",
                r"(\S+)\s+يضمن\s+علاج\s+(\S+)",
                r"(\S+)\s+يؤمن\s+الشفاء\s+من\s+(\S+)",
                r"(\S+)\s+يوفر\s+علاجا\s+ل\s+(\S+)",
                r"(\S+)\s+يحتوي\s+علاج\s+(\S+)",
                r"باستخدام\s+(\S+)\s+يمكن\s+الشفاء\s+من\s+(\S+)",
                r"بعد\s+تناول\s+(\S+)\s+تتحسن\s+حالة\s+(\S+)",
                r"يعطى\s+(\S+)\s+لمرضى\s+(\S+)",
                r"يوصف\s+(\S+)\s+لحالات\s+(\S+)",
                r"يؤخذ\s+(\S+)\s+لعلاج\s+(\S+)",
                r"يتم\s+وصف\s+(\S+)\s+ل\s+(\S+)",
                r"يقدم\s+(\S+)\s+للمصابين\s+ب\s+(\S+)",
                r"يجرى\s+علاج\s+(\S+)\s+ب\s+(\S+)",
                r"تحت\s+اسم\s+(\S+)\s+يعالج\s+(\S+)",
                r"ضمن\s+بروتوكول\s+(\S+)\s+لعلاج\s+(\S+)",
                r"في\s+اطار\s+علاج\s+(\S+)\s+يستخدم\s+(\S+)",
                r"خلال\s+معالجة\s+(\S+)\s+يعطى\s+(\S+)",
                r"عند\s+الاصابة\s+ب\s+(\S+)\s+يوصى\s+ب\s+(\S+)",
                r"مع\s+ظهور\s+اعراض\s+(\S+)\s+يعالج\s+ب\s+(\S+)",
                r"اثناء\s+علاج\s+(\S+)\s+يضاف\s+(\S+)",
                r"بعد\s+تشخيص\s+(\S+)\s+يعالج\s+ب\s+(\S+)",
                r"قبل\s+العمليات\s+يعطى\s+(\S+)\s+ل\s+(\S+)",
                r"بعد\s+الجراحة\s+يستخدم\s+(\S+)\s+ل\s+(\S+)",
                r"في\s+المستشفيات\s+يعالج\s+(\S+)\s+ب\s+(\S+)",
                r"في\s+العيادات\s+يوصف\s+(\S+)\s+ل\s+(\S+)",
                r"اذا\s+كنت\s+تعاني\s+من\s+(\S+)\s+فاستخدم\s+(\S+)",
                r"لمرضى\s+(\S+)\s+ينصح\s+ب\s+(\S+)",
                r"لعلاج\s+(\S+)\s+يستخدم\s+(\S+)",
                r"في\s+حالات\s+(\S+)\s+يوصى\s+ب\s+(\S+)",
                r"عندما\s+يصاب\s+المرء\s+ب\s+(\S+)\s+يعالج\s+ب\s+(\S+)",
                r"اذا\s+ظهر\s+(\S+)\s+فالعلاج\s+ب\s+(\S+)",
                r"في\s+حالة\s+(\S+)\s+العلاج\s+(\S+)",
                r"عند\s+وجود\s+(\S+)\s+يجب\s+استخدام\s+(\S+)",
                r"مع\s+تشخيص\s+(\S+)\s+يبدا\s+علاج\s+(\S+)",
                r"بمجرد\s+الاصابة\s+ب\s+(\S+)\s+يعطى\s+(\S+)",
                r"اذا\s+اشتبه\s+ب\s+(\S+)\s+يعالج\s+ب\s+(\S+)",
                r"عند\s+الشك\s+في\s+(\S+)\s+يوصف\s+(\S+)",
                r"في\s+الاشتباه\s+ب\s+(\S+)\s+يعالج\s+ب\s+(\S+)",
                r"اذا\s+كان\s+التشخيص\s+(\S+)\s+فالعلاج\s+(\S+)",
                r"عند\s+تاكيد\s+(\S+)\s+يبدا\s+(\S+)",
                r"بعد\s+تاكيد\s+الاصابة\s+ب\s+(\S+)\s+يعالج\s+ب\s+(\S+)",
                r"مع\s+ظهور\s+نتائج\s+(\S+)\s+يوصف\s+(\S+)",
                r"اذا\s+استمر\s+(\S+)\s+فالعلاج\s+(\S+)",
                r"عند\s+تفاقم\s+(\S+)\s+يستخدم\s+(\S+)",
                r"في\s+الحالات\s+المستعصية\s+من\s+(\S+)\s+يعالج\s+ب\s+(\S+)"
            ],

            "يتسبب": [
                r"يؤدي\s+(\S+)\s+الى\s+(\S+)",
                r"يتسبب\s+(\S+)\s+في\s+(\S+)",
                r"(\S+)\s+يؤدي\s+الى\s+حدوث\s+(\S+)",
                r"من\s+اثار\s+(\S+)\s+الاصابة\s+ب\s+(\S+)",
                r"يحدث\s+(\S+)\s+مرض\s+(\S+)",
                r"ينتج\s+عن\s+(\S+)\s+ظهور\s+(\S+)",
                r"(\S+)\s+من\s+مسببات\s+(\S+)",
                r"يعرض\s+(\S+)\s+للاصابة\s+ب\s+(\S+)",
                r"يحفز\s+(\S+)\s+ظهور\s+(\S+)",
                r"يولد\s+(\S+)\s+حالة\s+(\S+)",
                r"بسبب\s+(\S+)\s+يحدث\s+(\S+)",
                r"(\S+)\s+يؤدي\s+لتفاقم\s+(\S+)",
                r"يتسبب\s+(\S+)\s+في\s+تفشي\s+(\S+)",
                r"ينشا\s+(\S+)\s+عن\s+(\S+)",
                r"(\S+)\s+مصدر\s+ل\s+(\S+)",
                r"يورث\s+(\S+)\s+مرض\s+(\S+)",
                r"ينتج\s+(\S+)\s+عن\s+(\S+)",
                r"يسبب\s+(\S+)\s+مضاعفات\s+(\S+)",
                r"يعمل\s+(\S+)\s+على\s+احداث\s+(\S+)",
                r"يساهم\s+(\S+)\s+في\s+ظهور\s+(\S+)",
            ],

            "يقي": [
                r"يقي\s+(\S+)\s+من\s+(\S+)",
                r"يحمي\s+(\S+)\s+من\s+الاصابة\s+ب\s+(\S+)",
                r"يمنع\s+(\S+)\s+حدوث\s+(\S+)",
                r"يحصن\s+(\S+)\s+ضد\s+(\S+)",
                r"يقي\s+(\S+)\s+من\s+خطر\s+(\S+)",
                r"يبعد\s+(\S+)\s+شبح\s+(\S+)",
                r"يمنع\s+(\S+)\s+تطور\s+(\S+)",
                r"يحول\s+(\S+)\s+دون\s+(\S+)",
                r"يقي\s+(\S+)\s+من\s+مضاعفات\s+(\S+)",
                r"يقلل\s+(\S+)\s+من\s+احتمالية\s+(\S+)",
                r"يجنب\s+(\S+)\s+الاصابة\s+ب\s+(\S+)",
                r"يمنع\s+(\S+)\s+انتشار\s+(\S+)",
                r"يحافظ\s+(\S+)\s+على\s+الوقاية\s+من\s+(\S+)",
                r"يسهم\s+(\S+)\s+في\s+منع\s+(\S+)",
                r"يضاد\s+(\S+)\s+حدوث\s+(\S+)",
                r"يعيق\s+(\S+)\s+تقدم\s+(\S+)",
                r"يحبط\s+(\S+)\s+ظهور\s+(\S+)",
                r"يقاوم\s+(\S+)\s+نشوء\s+(\S+)",
                r"يشكل\s+(\S+)\s+حاجزا\s+ضد\s+(\S+)",
                r"يساعد\s+(\S+)\s+في\s+تجنب\s+(\S+)",
            ],

            "يزيد": [
                r"يزيد\s+(\S+)\s+من\s+(\S+)",
                r"يرفع\s+(\S+)\s+مستوى\s+(\S+)",
                r"يعزز\s+(\S+)\s+من\s+(\S+)",
                r"يضاعف\s+(\S+)\s+خطر\s+(\S+)",
                r"يكثف\s+(\S+)\s+من\s+(\S+)",
                r"يعظم\s+(\S+)\s+من\s+(\S+)",
                r"يضخم\s+(\S+)\s+تاثير\s+(\S+)",
                r"يسرع\s+(\S+)\s+من\s+(\S+)",
                r"يحفز\s+(\S+)\s+زيادة\s+(\S+)",
                r"يشجع\s+(\S+)\s+على\s+(\S+)",
                r"يعلي\s+(\S+)\s+من\s+(\S+)",
                r"يقوي\s+(\S+)\s+من\s+(\S+)",
                r"ينمي\s+(\S+)\s+(\S+)",
                r"يضاعف\s+(\S+)\s+فرص\s+(\S+)",
                r"يعزز\s+(\S+)\s+ظهور\s+(\S+)",
                r"يحرض\s+(\S+)\s+على\s+(\S+)",
                r"يسرع\s+(\S+)\s+وتيرة\s+(\S+)",
                r"يعظم\s+(\S+)\s+حجم\s+(\S+)",
                r"يكسب\s+(\S+)\s+قوة\s+(\S+)",
                r"يضيف\s+(\S+)\s+الى\s+(\S+)",
            ],

            "يقلل": [
                r"يقلل\s+(\S+)\s+من\s+(\S+)",
                r"يخفض\s+(\S+)\s+مستوى\s+(\S+)",
                r"يحد\s+(\S+)\s+من\s+(\S+)",
                r"يضعف\s+(\S+)\s+(\S+)",
                r"يخمد\s+(\S+)\s+(\S+)",
                r"يثبط\s+(\S+)\s+من\s+(\S+)",
                r"يقلص\s+(\S+)\s+حجم\s+(\S+)",
                r"يخفض\s+(\S+)\s+نسبة\s+(\S+)",
                r"يعيق\s+(\S+)\s+(\S+)",
                r"يبطئ\s+(\S+)\s+من\s+(\S+)",
                r"يخفف\s+(\S+)\s+من\s+(\S+)",
                r"يحد\s+(\S+)\s+من\s+انتشار\s+(\S+)",
                r"يضاد\s+(\S+)\s+تاثير\s+(\S+)",
                r"يخمد\s+(\S+)\s+اعراض\s+(\S+)",
                r"يقلل\s+(\S+)\s+حدة\s+(\S+)",
                r"يخفض\s+(\S+)\s+معدل\s+(\S+)",
                r"يضعف\s+(\S+)\s+شدة\s+(\S+)",
                r"يثبط\s+(\S+)\s+تقدم\s+(\S+)",
                r"يحجم\s+(\S+)\s+نطاق\s+(\S+)",
                r"يخمد\s+(\S+)\s+تفاقم\s+(\S+)",
            ],

            "يتفاعل": [
                r"يتفاعل\s+(\S+)\s+مع\s+(\S+)",
                r"يتاثر\s+(\S+)\s+و\s+(\S+)",
                r"يتداخل\s+(\S+)\s+مع\s+(\S+)",
                r"يؤثر\s+(\S+)\s+على\s+(\S+)",
                r"يتعارض\s+(\S+)\s+مع\s+(\S+)",
                r"يتكامل\s+(\S+)\s+مع\s+(\S+)",
                r"يتضاد\s+(\S+)\s+مع\s+(\S+)",
                r"يتناغم\s+(\S+)\s+مع\s+(\S+)",
                r"يتعارك\s+(\S+)\s+مع\s+(\S+)",
                r"يتلاقى\s+(\S+)\s+مع\s+(\S+)",
                r"يتاثر\s+(\S+)\s+مع\s+(\S+)",
                r"يتنافر\s+(\S+)\s+مع\s+(\S+)",
                r"يتزامن\s+(\S+)\s+مع\s+(\S+)",
                r"يتقاطع\s+(\S+)\s+مع\s+(\S+)",
                r"يتحد\s+(\S+)\s+مع\s+(\S+)",
                r"يندمج\s+(\S+)\s+مع\s+(\S+)",
                r"يتعارض\s+(\S+)\s+مع\s+مفعول\s+(\S+)",
                r"يتعاضد\s+(\S+)\s+مع\s+(\S+)",
                r"يتنافس\s+(\S+)\s+مع\s+(\S+)",
                r"يتالف\s+(\S+)\s+مع\s+(\S+)",
            ]
        }
    
        return templates
    
    def extract_noun_phrases(self, tagged_sentence: List[Tuple[str, str]]) -> List[str]:
        noun_phrases = []
        current_phrase = []
        
        for word, tag in tagged_sentence:
            if tag.startswith('NN'):
                current_phrase.append(word)
            elif tag.startswith('JJ'):
                current_phrase.append(word)
            elif tag.startswith('DT'):
                current_phrase.append(word)
            else:
                if current_phrase:
                    noun_phrases.append(' '.join(current_phrase))
                    current_phrase = []
        
        if current_phrase:
            noun_phrases.append(' '.join(current_phrase))
            
        return noun_phrases
    
    def is_medical_entity(self, entity: str, tagged_sentence: List[Tuple[str, str]]) -> bool:
        drug_indicators = ['دواء', 'عقار', 'علاج', 'مضاد', 'حبوب', 'كبسولة', 'حقنة']
        
        disease_indicators = ['مرض', 'داء', 'عدوى', 'التهاب', 'سرطان', 'سكري', 'ضغط']
        
        entity_lower = entity.lower()
        
        for indicator in drug_indicators + disease_indicators:
            if indicator in entity_lower:
                return True
        
        for word, tag in tagged_sentence:
            if word in entity:
                if tag in ['NN', 'NNP']:
                    return True
        
        return False
    
    def extract_relations_from_text(self, text: str) -> List[Dict[str, Any]]:
        relations = []
        
        try:
            tagged = self.tagger.tag_text(text)
            if isinstance(tagged, str):
                return relations
            
            noun_phrases = self.extract_noun_phrases(tagged)
            
            for relation_type, patterns in self.templates.items():
                for pattern in patterns:
                    matches = re.finditer(pattern, text)
                    
                    for match in matches:
                        if len(match.groups()) >= 2:
                            x_entity = match.group(1).strip()
                            y_entity = match.group(2).strip()
                            
                            if (self.is_medical_entity(x_entity, tagged) and 
                                self.is_medical_entity(y_entity, tagged)):
                                
                                relation = {
                                    'relation_type': relation_type,
                                    'x_entity': x_entity,
                                    'y_entity': y_entity,
                                    'pattern': pattern,
                                    'source_text': text[:100] + "..." if len(text) > 100 else text,
                                    'noun_phrases': noun_phrases,
                                }
                                
                                relations.append(relation)
        
        except Exception as e:
            print(f"ُError in relation extraction: {e}")
            
        return relations
    
    def process_dataframe(self, df: pd.DataFrame, text_column: str = 'articleBody') -> pd.DataFrame:
        all_relations = []
        
        print(f"Start extracting realtions from {len(df)} articles...")
        
        for idx, row in df.iterrows():
            text = row[text_column]
            
            if pd.notna(text) and text.strip():
                relations = self.extract_relations_from_text(str(text))
                
                for relation in relations:
                    relation['article_id'] = idx
                    relation['headline'] = row.get('headline', '')
                    relation['link'] = row.get('link', '')
                    all_relations.append(relation)
            
            if (idx + 1) % 100 == 0:
                print(f"article {idx + 1} processed...")
        
        results_df = pd.DataFrame(all_relations)
        
        print(f"{len(results_df)} relation extracted from articles")
        
        return results_df
    
    def save_results(self, results_df: pd.DataFrame, filename: str = "extracted_relations"):
        results_path = f"{self.base_dir}/results/{filename}.csv"
        results_df.to_csv(results_path, index=False, encoding='utf-8')
        print(f"results saved in: {results_path}")
        
        json_path = f"{self.base_dir}/results/{filename}.json"
        results_df.to_json(json_path, orient='records', force_ascii=False, indent=2)
        print(f"Results json copy {json_path}")
        
        stats = self.calculate_statistics(results_df)
        stats_path = f"{self.base_dir}/results/{filename}_stats.json"
        with open(stats_path, 'w', encoding='utf-8') as f:
            json.dump(stats, f, ensure_ascii=False, indent=2)
        print(f"Saving statistics {stats_path}")
        
        return results_path
    
    def calculate_statistics(self, results_df: pd.DataFrame) -> Dict[str, Any]:
        if results_df.empty:
            return {}
            
        stats = {
            'total_relations': len(results_df),
            'relations_by_type': results_df['relation_type'].value_counts().to_dict(),
            'average_confidence': results_df['confidence'].mean(),
            'articles_with_relations': results_df['article_id'].nunique(),
            'top_x_entities': results_df['x_entity'].value_counts().head(10).to_dict(),
            'top_y_entities': results_df['y_entity'].value_counts().head(10).to_dict(),
            'most_common_patterns': results_df['pattern'].value_counts().head(5).to_dict()
        }
        
        return stats

class ArabicPOSTagger:
    def __init__(self):
        self.model = './content/drive/MyDrive/stanford-postagger-full/models/arabic.tagger'
        self.jar = './content/drive/MyDrive/stanford-postagger-full/stanford-postagger.jar'
        self.tagger = None
        self.initialize_tagger()
    
    def initialize_tagger(self):
        try:
            self.tagger = StanfordPOSTagger(
                model_filename=self.model,
                path_to_jar=self.jar,
                encoding='utf-8'
            )
            print("Stanford POS Tagger initialized successfully")
        except Exception as e:
            print(f"Failed to initialize Stanford POS Tagger: {e}")
            self.tagger = None
    
    def tag_text(self, text: str):
        if self.tagger is None:
            return f"Error: Tagger not initialized"
        
        try:
            words = word_tokenize(text)
            
            tagged_words = self.tagger.tag(words)
            
            return tagged_words
            
        except Exception as e:
            return f"Error in POS tagging: {e}"

def main():
    try:
        processed_data_path = "./data/processed/medical_corpus_processed.json"
        df = pd.read_json(processed_data_path)
        print(f"{len(df)} articles were loaded.")
    except FileNotFoundError:
        print("Dataset not found!")
        return
    
    extractor = RelationExtractor()
    
    results_df = extractor.process_dataframe(df)
    
    if not results_df.empty:
        extractor.save_results(results_df)
        
        print("Sample of the extracted relations:")
        print(results_df[['relation_type', 'x_entity', 'y_entity', 'confidence']].head(10))
        
        stats = extractor.calculate_statistics(results_df)
        print(f"Statistics")
        print(f"   Total relations: {stats['total_relations']}")
        print(f"   # articls with relations: {stats['articles_with_relations']}")
        print(f"   relations distribution: {stats['relations_by_type']}")
        
    else:
        print("No relation were extracted")

if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\MK1349\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


6572 articles were loaded.
Stanford POS Tagger initialized successfully
Start extracting realtions from 6572 articles...
article 100 processed...
article 200 processed...
article 300 processed...
