In [1]:
import re
from bs4 import BeautifulSoup
from lxml import etree
import json
from collections import defaultdict
import os
import requests






In [2]:
# Test the function
def clean_text(text):
    """
    Clean text by removing special characters, extra spaces, and normalizing content

    Args:
        text (str): Input text to clean

    Returns:
        str: Cleaned text
    """
    if not isinstance(text, str):
        return ""

    # Remove HTML tags if present
    text = BeautifulSoup(text, "xml").get_text()

    # Replace newlines, tabs, and multiple spaces
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[\n\t\r]", " ", text)

    # Remove special characters but keep important punctuation
    text = re.sub(r"[^a-zA-Z0-9\s\.\,\?\!\-\']", "", text)

    # Remove extra spaces around punctuation
    text = re.sub(r"\s*([\.!?,])\s*", r"\1 ", text)

    # Remove multiple spaces and strip
    text = " ".join(text.split())

    return text.strip()



## New methods:

In [4]:
def extract_num_dot_examples2(text):
    """
    Extract numbered examples from text with robust pattern matching and error handling.
    
    Args:
        text (str): Input text containing numbered examples
        
    Returns:
        dict: Dictionary of examples with headings as keys and content as values,
              or None if no examples found or error occurs
    """
    if not isinstance(text, str) or not text.strip():
        return None
        
    def is_numbered_heading(text):
        """Check if text is a numbered heading."""
        patterns = [
            r'^\d+\.\s+',  # Standard numbers (1., 2., etc)
            r'^Example\s+\d+[\.:]\s+',  # Example prefixed numbers
            r'^\[\d+\]\s+',  # Bracketed numbers
            r'^[A-Z]?\d+[A-Z]?\.\s+'  # Alphanumeric (A1., 1a., etc)
        ]
        return any(re.match(pattern, text.strip()) for pattern in patterns)
    
    try:
        soup = BeautifulSoup(text, "xml")
        examples = {}
        current_heading = None
        current_text = []
        
        # Find all potential content elements
        elements = soup.find_all(["heading", "p", "text", "desc"])
        
        for element in elements:
            try:
                # Clean and normalize text
                text = clean_text(element.get_text())
                if not text:
                    continue
                
                if is_numbered_heading(text):
                    # Save previous section if exists
                    if current_heading and current_text:
                        content = " ".join(current_text)
                        if len(content) >= 10:  # Minimum content length
                            examples[current_heading] = content
                    
                    # Extract number from heading
                    number = re.search(r'\d+', text).group()
                    # Standardize heading format
                    current_heading = f"Example {number}: {text}"
                    current_text = []
                    
                elif current_heading:
                    # Filter out common unwanted content
                    if not any(skip in text.lower() for skip in ["figure", "table", "copyright"]):
                        current_text.append(text)
            
            except AttributeError:
                continue
        
        # Add final section if exists
        if current_heading and current_text:
            content = " ".join(current_text)
            if len(content) >= 10:
                examples[current_heading] = content
        
        # Post-process examples
        processed_examples = {}
        for heading, content in examples.items():
            # Remove duplicate content
            if content not in processed_examples.values():
                # Clean heading
                heading = re.sub(r'\s+', ' ', heading)
                heading = re.sub(r'Example\s+Example', 'Example', heading)
                processed_examples[heading] = content
        
        return processed_examples if processed_examples else None
        
    except Exception as e:
        print(f"Error extracting examples: {str(e)}")
        return None

def extract_examples_start_w_word2(siblings):
    """
    Extracts examples/experiments/tests sections that start with specific words from XML siblings.
    
    Args:
        siblings (list): List of sibling tags to process.
        
    Returns:
        list: List of dictionaries containing the extracted examples.
    """
    examples = []
    current_example = None
    in_example = False
    keywords = ('example', 'experiment', 'test', 'experimental')
    
    def clean_text(text):
        """Clean and normalize text content."""
        if not text:
            return ""
        return " ".join(text.strip().split())
    
    def is_example_heading(text):
        """Check if heading contains example keywords."""
        text = clean_text(text).lower()
        return any(text.startswith(keyword) for keyword in keywords)
    
    try:
        for idx, tag in enumerate(siblings):
            if not hasattr(tag, 'name'):  # Skip non-tag elements
                continue
                
            if tag.name == "heading":
                text = clean_text(tag.get_text())
                
                if is_example_heading(text):
                    in_example = True
                    
                    # Look ahead for title, but handle edge cases
                    title = ""
                    for i in range(idx + 1, min(idx + 3, len(siblings))):
                        next_tag = siblings[i]
                        if hasattr(next_tag, 'name'):
                            if next_tag.name == "heading":
                                title = clean_text(next_tag.get_text())
                                break
                            elif next_tag.name == "p":
                                title = clean_text(next_tag.get_text())[:100]  # Limit title length
                                break
                    
                    current_example = {
                        "number": text,
                        "title": title,
                        "content": [],
                        "raw_text": "",  # Store complete raw text
                    }
                    examples.append(current_example)
                
                elif in_example and not any(k in text.lower() for k in keywords):
                    # Only end example if we hit a non-example heading
                    in_example = False
            
            elif in_example and current_example is not None:
                # Collect content from various tag types
                if tag.name in ["p", "text", "desc"]:
                    content = clean_text(tag.get_text())
                    if content:  # Only add non-empty content
                        current_example["content"].append(content)
                        current_example["raw_text"] += f" {content}"
                
                # Handle nested content
                if hasattr(tag, 'children'):
                    for child in tag.children:
                        if hasattr(child, 'get_text'):
                            content = clean_text(child.get_text())
                            if content and content not in current_example["content"]:
                                current_example["content"].append(content)
        
        # Post-process examples
        for example in examples:
            # Remove duplicate content
            example["content"] = list(dict.fromkeys(example["content"]))
            # Clean up raw text
            example["raw_text"] = clean_text(example["raw_text"])
            # Validate content
            if not example["content"]:
                example["content"] = [example["raw_text"]]
        
        return [ex for ex in examples if ex["content"]]  # Return only examples with content
        
    except Exception as e:
        print(f"Error processing XML: {str(e)}")
        return []
    
def extract_examples_w_word2(text):
    """
    Extract examples/experiments/tests sections from patent text using keyword detection.
    
    Args:
        text (str): The patent text to extract examples from.
        
    Returns:
        list: List of dictionaries containing the extracted examples, or None if no examples found.
    """
    if not text or not isinstance(text, str):
        return None
        
    # Keywords to identify example sections
    KEYWORDS = ["example", "experiment", "test", "experimental"]
    # Tags that may contain relevant content
    CONTENT_TAGS = ["p", "text", "desc", "para"]
    
    def clean_text(text):
        """Clean and normalize text content."""
        if not text:
            return ""
        return " ".join(text.strip().split())
    
    def is_example_heading(tag):
        """Check if tag is a heading containing example keywords."""
        if not tag or not hasattr(tag, 'name') or tag.name != "heading":
            return False
        text = clean_text(tag.get_text()).lower()
        return any(keyword in text for keyword in KEYWORDS)
    
    try:
        soup = BeautifulSoup(text, "xml")
        examples = []
        
        # Find all example headings
        example_headings = soup.findAll(is_example_heading)
        
        for heading in example_headings:
            current_content = []
            next_sibling = heading.find_next_sibling()
            
            # Get title with fallback options
            title = ""
            if next_sibling:
                if next_sibling.name == "heading":
                    title = clean_text(next_sibling.get_text())
                elif next_sibling.name in CONTENT_TAGS:
                    # Use first paragraph as title if no heading found
                    title = clean_text(next_sibling.get_text())[:100]
            
            # Collect content until next example heading or section end
            sibling = next_sibling
            while sibling:
                if is_example_heading(sibling):
                    break
                    
                if sibling.name in CONTENT_TAGS:
                    content = clean_text(sibling.get_text())
                    if content:
                        current_content.append(content)
                        
                # Handle nested content
                if hasattr(sibling, 'children'):
                    for child in sibling.children:
                        if hasattr(child, 'name') and child.name in CONTENT_TAGS:
                            content = clean_text(child.get_text())
                            if content and content not in current_content:
                                current_content.append(content)
                
                sibling = sibling.find_next_sibling()
            
            # Only add example if it has content
            if current_content:
                example = {
                    "number": clean_text(heading.get_text()),
                    "title": title,
                    "content": current_content,
                    "raw_text": " ".join(current_content)
                }
                examples.append(example)
        
        # Post-process examples
        if examples:
            for example in examples:
                # Remove duplicate content
                example["content"] = list(dict.fromkeys(example["content"]))
                # Ensure content is not empty
                if not example["content"]:
                    example["content"] = [example["raw_text"]]
            
            return [ex for ex in examples if ex["content"]]
            
        return None
        
    except Exception as e:
        print(f"Error processing patent text: {str(e)}")
        return None
    
def process_siblings2(siblings):
    """
    Process XML siblings to extract examples/experiments/tests sections.
    
    Args:
        siblings (list): List of sibling tags to process
        
    Returns:
        list: List of dictionaries containing examples, or None if no examples found
    """
    if not siblings:
        return None
        
    KEYWORDS = ["example", "experiment", "test", "experimental"]
    examples = []
    
    def clean_text(text):
        """Clean and normalize text."""
        if not text:
            return ""
        return " ".join(text.strip().split())
    
    def is_example_heading(tag):
        """Check if tag is an example heading."""
        try:
            return (tag.name == "heading" and 
                   any(keyword in clean_text(tag.text).lower() 
                       for keyword in KEYWORDS))
        except AttributeError:
            return False
    
    try:
        # Find all matching headings
        example_headings = [tag for tag in siblings if is_example_heading(tag)]
        
        for heading in example_headings:
            current_content = []
            try:
                idx = siblings.index(heading)
            except ValueError:
                continue
                
            # Get title with fallback options
            title = ""
            if idx + 1 < len(siblings):
                next_tag = siblings[idx + 1]
                if hasattr(next_tag, 'name'):
                    if next_tag.name == "heading":
                        title = clean_text(next_tag.text)
                    elif next_tag.name == "p":
                        title = clean_text(next_tag.text)[:100]
            
            # Collect content until next example heading
            i = idx + 1
            while i < len(siblings):
                current_tag = siblings[i]
                
                if is_example_heading(current_tag):
                    break
                    
                if hasattr(current_tag, 'name'):
                    if current_tag.name == "p":
                        content = clean_text(current_tag.text)
                        if content:
                            current_content.append(content)
                    
                    # Handle nested content
                    if hasattr(current_tag, 'children'):
                        for child in current_tag.children:
                            if hasattr(child, 'name') and child.name == "p":
                                content = clean_text(child.text)
                                if content and content not in current_content:
                                    current_content.append(content)
                i += 1
            
            if current_content:
                example = {
                    "number": clean_text(heading.text),
                    "title": title,
                    "content": list(dict.fromkeys(current_content)),
                    "raw_text": " ".join(current_content)
                }
                examples.append(example)
                
        return examples if examples else None
        
    except Exception as e:
        print(f"Error processing siblings: {str(e)}")
        return None
    

def extract_all_examples2(text):
    """
    Extracts all numbered examples along with their descriptions from text.
    
    Args:
        text (str): Input text containing numbered examples
        
    Returns:
        dict: Dictionary of examples with titles as keys and descriptions as values,
              or None if no examples found or error occurs
    """
    if not isinstance(text, str) or not text.strip():
        return None
        
    def clean_example_text(text):
        """Clean and normalize example text."""
        if not text:
            return ""
        # Remove excessive whitespace and normalize line endings
        text = re.sub(r'\s+', ' ', text)
        # Remove special characters but keep essential punctuation
        text = re.sub(r'[^\w\s.,;:()\-\'\"]+', '', text)
        return text.strip()
    
    try:
        # More flexible pattern matching for different numbering styles
        patterns = [
            # Standard decimal numbering (1., 2., etc)
            r'(?:Example\s+)?(\d+\.?\s+[^\n]+)[\n\r\s]+([^]*?)(?=(?:\n\s*Example\s+)?\d+\.|\Z)',
            
            # Letter/number combinations (A1., 1a., etc)
            r'(?:Example\s+)?([A-Za-z]?\d+[A-Za-z]?\.?\s+[^\n]+)[\n\r\s]+([^]*?)(?=(?:\n\s*Example\s+)?[A-Za-z]?\d+[A-Za-z]?\.|\Z)',
            
            # Roman numerals (I., II., etc)
            r'(?:Example\s+)?([IVX]+\.?\s+[^\n]+)[\n\r\s]+([^]*?)(?=(?:\n\s*Example\s+)?[IVX]+\.|\Z)'
        ]
        
        examples = {}
        
        # Try each pattern
        for pattern in patterns:
            matches = re.finditer(pattern, text, re.MULTILINE | re.DOTALL)
            
            for match in matches:
                try:
                    title = clean_example_text(match.group(1))
                    description = clean_example_text(match.group(2))
                    
                    # Validate extracted content
                    if title and description:
                        # Add "Example" prefix if not present
                        if not title.lower().startswith('example'):
                            title = f"Example {title}"
                            
                        # Store unique examples only
                        if title not in examples:
                            examples[title] = description
                except IndexError:
                    continue
        
        # Post-process examples
        processed_examples = {}
        for title, desc in examples.items():
            # Remove duplicate descriptions
            if desc not in processed_examples.values():
                # Ensure minimum content length
                if len(desc) >= 10:  # Arbitrary minimum length
                    processed_examples[title] = desc
        
        return processed_examples if processed_examples else None
        
    except Exception as e:
        print(f"Error extracting examples: {str(e)}")
        return None
    
def extract_experiments_w_heading2(text):
    """
    Extract Examples/Experiments sections from patent text with robust heading detection.
    
    Args:
        text (str): Input patent text
        
    Returns:
        dict: Dictionary containing section headings and their content,
              or None if no sections found or error occurs
    """
    if not isinstance(text, str) or not text.strip():
        return None
        
    # Section keywords to match (case-insensitive)
    SECTION_KEYWORDS = {
        'examples', 'example', 'experiments', 'experiment',
        'tests', 'test', 'experimental', 'exemplary'
    }
    
    def is_section_heading(tag):
        """Check if tag is a valid section heading."""
        try:
            if tag.name != "heading":
                return False
            
            text = clean_text(tag.get_text()).lower()
            # Check for exact matches and variations
            return (any(keyword == text for keyword in SECTION_KEYWORDS) or
                   any(f"section {keyword}" in text for keyword in SECTION_KEYWORDS) or
                   any(f"{keyword}s section" in text for keyword in SECTION_KEYWORDS))
        except AttributeError:
            return False
    
    def extract_section_content(heading):
        """Extract content following a section heading."""
        content = []
        current = heading.find_next_sibling()
        
        while current and not is_section_heading(current):
            if current.name in ['p', 'text', 'desc']:
                text = clean_text(current.get_text())
                if text and len(text) >= 10:  # Minimum content length
                    content.append(text)
            
            # Handle nested content
            if hasattr(current, 'children'):
                for child in current.children:
                    if hasattr(child, 'name') and child.name in ['p', 'text', 'desc']:
                        text = clean_text(child.get_text())
                        if text and len(text) >= 10 and text not in content:
                            content.append(text)
                            
            current = current.find_next_sibling()
            
        return content
    
    try:
        soup = BeautifulSoup(text, "xml")
        sections = {}
        
        # Find all matching section headings
        examples_headings = soup.find_all(is_section_heading)
        
        if not examples_headings:
            return None
            
        # Process each section
        for heading in examples_headings:
            heading_text = clean_text(heading.get_text())
            
            # Standardize heading format
            if not any(keyword in heading_text.lower() for keyword in SECTION_KEYWORDS):
                heading_text = f"Examples: {heading_text}"
                
            # Extract and validate content
            content = extract_section_content(heading)
            
            if content:  # Only include sections with content
                sections[heading_text] = {
                    'content': content,
                    'raw_text': ' '.join(content),
                    'position': len(sections) + 1
                }
        
        # Post-process sections
        processed_sections = {}
        for heading, data in sections.items():
            # Remove duplicate content
            if data['raw_text'] not in [d['raw_text'] for d in processed_sections.values()]:
                processed_sections[heading] = data
        
        return processed_sections if processed_sections else None
        
    except Exception as e:
        print(f"Error extracting experiment sections: {str(e)}")
        return None

In [5]:
def extract_examples_from_patents(xml_list):
    """
    Process multiple patent XMLs to extract examples and their content.
    
    Args:
        xml_list (list): List of XML strings containing patent documents
        
    Returns:
        dict: Dictionary with document numbers as keys and their examples as values
    """
    def find_doc_number(xml):
        """Extract document number from patent XML."""
        try:
            soup = BeautifulSoup(xml, "xml")
            doc_num = soup.find("doc-number")
            return clean_text(doc_num.text) if doc_num else "UNKNOWN"
        except Exception:
            return "UNKNOWN"
    
    def get_best_examples(xml):
        """
        Try different example extraction methods and return the best result.
        Returns dict of {heading: content}
        """
        results = {}
        
        # Try each extraction method
        try:
            # Method 1: Extract examples with section headings
            section_headings = extract_experiments_w_heading2(xml)
            if section_headings:
                for heading in section_headings:
                    content = []
                    current = heading.find_next_sibling()
                    while current and current.name != "heading":
                        if current.name == "p":
                            content.append(clean_text(current.text))
                        current = current.find_next_sibling()
                    if content:
                        results[clean_text(heading.text)] = content
            
            # If no results, try numbered examples
            if not results:
                numbered = extract_num_dot_examples2(xml)
                if numbered:
                    results = numbered
            
            # If still no results, try general example extraction
            if not results:
                examples = extract_examples_w_word2(xml)
                if examples:
                    for ex in examples:
                        if isinstance(ex, dict):
                            title = ex.get('number', '')
                            content = ex.get('content', [])
                            if title and content:
                                results[title] = content
        
        except Exception as e:
            print(f"Error in example extraction: {str(e)}")
        
        return results
    
    patent_examples = {}
    
    # Process each XML document
    for i, xml in enumerate(xml_list):
        try:
            doc_number = find_doc_number(xml)
            if not doc_number:
                continue
                
            examples = get_best_examples(xml)
            
            if examples:
                # Store results with metadata
                patent_examples[doc_number] = {
                    'examples': examples,
                    'example_count': len(examples),
                    'extraction_method': 'combined',
                    'has_content': any(bool(content) for content in examples.values())
                }
                
        except Exception as e:
            print(f"Error processing document {i}: {str(e)}")
            continue
    
    return patent_examples

# Example usage:
def process_patent_examples(xml_list):
    """
    Process patents and print summary statistics.
    """
    results = extract_examples_from_patents(xml_list)
    
    # Print summary
    print(f"Processed {len(xml_list)} patent documents")
    print(f"Found examples in {len(results)} documents")
    
    # Document-level statistics
    docs_with_examples = sum(1 for doc in results.values() if doc['has_content'])
    total_examples = sum(doc['example_count'] for doc in results.values())
    
    print(f"Documents with valid examples: {docs_with_examples}")
    print(f"Total examples extracted: {total_examples}")
    
    return results


In [6]:
from utilities.utils_clean import load_from_pickle,find_doc_number
from utilities.test_dataset_utils import remove_leadiong_zeros
import pandas as pd
test_data = load_from_pickle("../data/test_dataset_2015.pkl")

df_test_data = pd.DataFrame(test_data,index=["xml"],).T.reset_index()
df_test_data.columns = ["patentnumber","xml"] 

df = pd.read_csv("../data/freilichdataet_2015.csv")
df["patentnumber"] = df["patentnumber"].astype(str).transform(lambda x: x.replace(".0", ""))
df["patentnumber"] = df["patentnumber"].apply(remove_leadiong_zeros)
merged = df_test_data.merge(df, on="patentnumber", how="left")
print(len(merged))
cols = ["patentnumber","xml","prophetic","nonprophetic","allprophetic","someprophetic"]
merged[cols].head()

Loaded 25081 patents from ../data/test_dataset_2015.pkl
25081


Unnamed: 0,patentnumber,xml,prophetic,nonprophetic,allprophetic,someprophetic
0,RE045323,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...",1.0,12.0,0.0,1.0
1,RE045324,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...",0.0,8.0,0.0,0.0
2,RE045325,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...",0.0,8.0,0.0,0.0
3,8925349,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...",26.0,0.0,1.0,0.0
4,8925551,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...",6.0,5.0,0.0,1.0


In [None]:
class ExampleExtractor:
    """Class to handle example extraction from patent XML documents"""
    
    def __init__(self):
        self.soup = None

    def validate_content(self, examples):
        """Check if examples contain valid content"""
        if not examples:
            return False
        if isinstance(examples, list):
            return bool(examples and examples[0].get("content") 
                      and len(examples[0]["content"]) > 0)
        return bool(examples.get("content") and len(examples["content"]) > 0)

    def extract_from_heading(self, heading) -> list:
        """Extract examples from heading following specific order of methods"""
        extracted_examples = []
        
        # Try process_siblings first
        example_start_w = process_siblings2(heading.find_next_siblings())
        if example_start_w:
            if not example_start_w[0]["content"]:
                # If no content, try extract_examples_start_w_word
                extracted_ex_w_word = extract_examples_start_w_word2(heading.find_next_siblings())
                if extracted_ex_w_word:
                    if isinstance(extracted_ex_w_word, list):
                        if self.validate_content(extracted_ex_w_word):
                            extracted_examples.append(extracted_ex_w_word)
                    elif self.validate_content(extracted_ex_w_word):
                        extracted_examples.append(extracted_ex_w_word)
            else:
                if self.validate_content(example_start_w):
                    extracted_examples.append(example_start_w)
        else:
            # If process_siblings failed, try extract_examples_start_w_word
            extracted_ex_w_word = extract_examples_start_w_word2(heading.find_next_siblings())
            if extracted_ex_w_word:
                if isinstance(extracted_ex_w_word, list):
                    if self.validate_content(extracted_ex_w_word):
                        extracted_examples.append(extracted_ex_w_word)
                elif self.validate_content(extracted_ex_w_word):
                    extracted_examples.append(extracted_ex_w_word)
            else:
                # Last resort: try extract_num_dot_examples
                num_dot_examples = extract_num_dot_examples2(str(heading.find_next_siblings()))
                if num_dot_examples and self.validate_content(num_dot_examples):
                    extracted_examples.append(num_dot_examples)
        
        return extracted_examples

    def extract_examples(self, xml: str) -> list:
        """Extract examples from XML following specific order of methods"""
        try:
            # First check for experiments section
            heading = extract_experiments_w_heading2(xml)
            
            if heading:
                if len(heading) > 1:
                    # Handle multiple headings
                    all_examples = []
                    for h in heading:
                        examples = self.extract_from_heading(h)
                        if examples:
                            all_examples.extend(examples)
                    return all_examples if all_examples else []
                    
                elif len(heading) == 1:
                    # Handle single heading
                    return self.extract_from_heading(heading[0])
                    
                else:
                    # Try other methods in order
                    extracted_ex_w_word = extract_examples_w_word2(xml)
                    if extracted_ex_w_word:
                        if isinstance(extracted_ex_w_word, list):
                            if self.validate_content(extracted_ex_w_word):
                                return [extracted_ex_w_word]
                        elif self.validate_content(extracted_ex_w_word):
                            return [extracted_ex_w_word]
                            
                    # Try process_siblings if previous failed
                    example_start_w = process_siblings2(heading[0].find_next_siblings())
                    if example_start_w and self.validate_content(example_start_w):
                        return [example_start_w]
            else:
                # If no experiments section, try methods in order
                example_start_w = extract_examples_w_word2(xml)
                if example_start_w and self.validate_content(example_start_w):
                    return [example_start_w]
                    
                num_dot_examples = extract_num_dot_examples2(xml)
                if num_dot_examples and self.validate_content(num_dot_examples):
                    return [num_dot_examples]
                    
            return []
            
        except Exception as e:
            logging.error(f"Error extracting examples: {str(e)}")
            return []

# Usage
extractor = ExampleExtractor()
doc_w_exp = {}

for i, xml in enumerate(merged["xml"], start=1):
    if i % 1000 == 0:
        print(f"{i}/{len(merged.xml)} so far found {len(doc_w_exp)} docs with experiments")
    
    extracted_examples = extractor.extract_examples(xml)
    if extracted_examples:
        doc_w_exp[find_doc_number(xml)[0]] = extracted_examples

  k = self.parse_starttag(i)
  text = BeautifulSoup(text, "xml").get_text()


Processed 0/2000 - Found 0 docs with experiments
Processed 0/2000 - Found 0 docs with experiments
Processed 0/2000 - Found 0 docs with experiments
Processed 0/2000 - Found 0 docs with experiments


In [10]:
class ExampleExtractor:
    """Class to handle example extraction from patent XML documents"""
    
    def __init__(self):
        self.soup = None

    def validate_content(self, examples):
        """Check if examples contain valid content"""
        if not examples:
            return False
        if isinstance(examples, list):
            return bool(examples and examples[0].get("content") 
                      and len(examples[0]["content"]) > 0)
        return bool(examples.get("content") and len(examples["content"]) > 0)

    def extract_from_heading(self, heading) -> list:
        """Extract examples from heading following specific order of methods"""
        extracted_examples = []
        
        # Try process_siblings first
        example_start_w = process_siblings2(heading.find_next_siblings())
        if example_start_w:
            if not example_start_w[0]["content"]:
                # If no content, try extract_examples_start_w_word
                extracted_ex_w_word = extract_examples_start_w_word2(heading.find_next_siblings())
                if extracted_ex_w_word:
                    if isinstance(extracted_ex_w_word, list):
                        if self.validate_content(extracted_ex_w_word):
                            extracted_examples.append(extracted_ex_w_word)
                    elif self.validate_content(extracted_ex_w_word):
                        extracted_examples.append(extracted_ex_w_word)
            else:
                if self.validate_content(example_start_w):
                    extracted_examples.append(example_start_w)
        else:
            # If process_siblings failed, try extract_examples_start_w_word
            extracted_ex_w_word = extract_examples_start_w_word2(heading.find_next_siblings())
            if extracted_ex_w_word:
                if isinstance(extracted_ex_w_word, list):
                    if self.validate_content(extracted_ex_w_word):
                        extracted_examples.append(extracted_ex_w_word)
                elif self.validate_content(extracted_ex_w_word):
                    extracted_examples.append(extracted_ex_w_word)
            else:
                # Last resort: try extract_num_dot_examples
                num_dot_examples = extract_num_dot_examples2(str(heading.find_next_siblings()))
                if num_dot_examples and self.validate_content(num_dot_examples):
                    extracted_examples.append(num_dot_examples)
        
        return extracted_examples

    def extract_examples(self, xml: str) -> list:
        """Extract examples from XML following specific order of methods"""
        try:
            # First check for experiments section
            heading = extract_experiments_w_heading2(xml)
            
            if heading:
                if len(heading) > 1:
                    # Handle multiple headings
                    all_examples = []
                    for h in heading:
                        examples = self.extract_from_heading(h)
                        if examples:
                            all_examples.extend(examples)
                    return all_examples if all_examples else []
                    
                elif len(heading) == 1:
                    # Handle single heading
                    return self.extract_from_heading(heading[0])
                    
                else:
                    # Try other methods in order
                    extracted_ex_w_word = extract_examples_w_word2(xml)
                    if extracted_ex_w_word:
                        if isinstance(extracted_ex_w_word, list):
                            if self.validate_content(extracted_ex_w_word):
                                return [extracted_ex_w_word]
                        elif self.validate_content(extracted_ex_w_word):
                            return [extracted_ex_w_word]
                            
                    # Try process_siblings if previous failed
                    example_start_w = process_siblings2(heading[0].find_next_siblings())
                    if example_start_w and self.validate_content(example_start_w):
                        return [example_start_w]
            else:
                # If no experiments section, try methods in order
                example_start_w = extract_examples_w_word2(xml)
                if example_start_w and self.validate_content(example_start_w):
                    return [example_start_w]
                    
                num_dot_examples = extract_num_dot_examples2(xml)
                if num_dot_examples and self.validate_content(num_dot_examples):
                    return [num_dot_examples]
                    
            return []
            
        except Exception as e:
            logging.error(f"Error extracting examples: {str(e)}")
            return []

# Usage
extractor = ExampleExtractor()
doc_w_exp = {}

for i, xml in enumerate(merged["xml"][:2000], start=1):
    if i % 1000 == 0:
        print(f"{i}/{len(merged.xml)} so far found {len(doc_w_exp)} docs with experiments")
    
    extracted_examples = extractor.extract_examples(xml)
    if extracted_examples:
        doc_w_exp[find_doc_number(xml)[0]] = extracted_examples

  text = BeautifulSoup(text, "xml").get_text()


1000/25081 so far found 798 docs with experiments
2000/25081 so far found 1560 docs with experiments


In [21]:
def dic_to_dic_w_tense(doc_w_exp):
    """
    Convert document examples dictionary to tense analysis dictionary.
    
    Args:
        doc_w_exp (dict): Dictionary with doc numbers as keys and examples as values
        
    Returns:
        dict: Dictionary with doc numbers as keys and tense counts as values
    """
    dic = {}
    pattern = r'\(\d+\)\s*([A-Za-z0-9\-\(\)\{\},:;=\[\]\+\*\s\.\^\$\%]+(?:\.(?:sup|delta|Hz|NMR)[^\)]*)?)'
    
    for doc_num, examples_data in doc_w_exp.items():
        tense_counts = {"past": 0, "present": 0, "Unknown": 0}
        
        try:
            # Handle different example formats
            examples = examples_data.get('examples', examples_data)
            
            # Process each example
            for example in examples:
                if isinstance(example, dict):
                    # Single example dict format
                    content = []
                    if 'content' in example:
                        content = example['content']
                    elif 'title' in example:
                        content = [example['title']] + example.get('content', [])
                    
                    desc = " ".join(content) if isinstance(content, list) else str(content)
                    
                elif isinstance(example, list):
                    # List of examples format
                    for ex in example:
                        if isinstance(ex, dict):
                            content = ex.get('content', [])
                            title = ex.get('title', '')
                            desc = f"{title}. {' '.join(content)}"
                        else:
                            continue
                else:
                    continue
                
                # Analyze tense
                tense = check_tense_nltk_updated(clean_text(desc))
                if tense != "Unknown":
                    tense_counts[tense] += 1
                else:
                    # Check for technical patterns indicating past tense
                    matches = re.findall(pattern, desc)
                    if matches:
                        tense_counts["past"] += 1
                    else:
                        tense_counts["Unknown"] += 1
        
        except Exception as e:
            print(f"Error processing document {doc_num}: {str(e)}")
            continue
        
        # Only add documents with valid tense analysis
        if sum(tense_counts.values()) > 0:
            dic[doc_num] = tense_counts
    
    return dic



In [41]:
def dic_to_dic_w_tense(doc_w_exp):
    """
    Convert document examples dictionary to tense analysis dictionary.
    
    Args:
        doc_w_exp (dict): Dictionary with doc numbers as keys and examples as values
        
    Returns:
        dict: Dictionary with doc numbers and their tense counts
    """
    dic = {}
    pattern = r'\(\d+\)\s*([A-Za-z0-9\-\(\)\{\},:;=\[\]\+\*\s\.\^\$\%]+(?:\.(?:sup|delta|Hz|NMR)[^\)]*)?)'
    
    def process_example_content(ex):
        """Helper to extract content from an example"""
        if isinstance(ex, dict):
            title = ex.get('title', '') or ex.get('number', '')
            content = ex.get('content', [])
            if isinstance(content, list):
                return f"{title}. {' '.join(content)}"
            return f"{title}. {content}"
        return ""
    
    for doc_num, examples_data in doc_w_exp.items():
        tense_counts = {"past": 0, "present": 0, "Unknown": 0}
        
        try:
            # Handle list format directly
            examples = examples_data if isinstance(examples_data, list) else [examples_data]
            
            for example_group in examples:
                if isinstance(example_group, list):
                    # Handle nested list of examples
                    for ex in example_group:
                        desc = process_example_content(ex)
                        if desc:
                            tense = check_tense_nltk_updated(desc)
                            if tense != "Unknown":
                                tense_counts[tense] += 1
                            else:
                                if re.findall(pattern, desc):
                                    tense_counts["past"] += 1
                                else:
                                    tense_counts["Unknown"] += 1
                                    #print(desc)
                
                elif isinstance(example_group, dict):
                    # Handle single example dictionary
                    desc = process_example_content(example_group)
                    if desc:
                        tense = check_tense_nltk_updated(desc)
                        if tense != "Unknown":
                            tense_counts[tense] += 1
                        else:
                            if re.findall(pattern, desc):
                                tense_counts["past"] += 1
                            else:
                                tense_counts["Unknown"] += 1
                                #print(desc)
        
        except Exception as e:
            print(f"Error processing document {doc_num}: {str(e)}")
            continue
        
        # Add document if we found any examples
        if sum(tense_counts.values()) > 0:
            dic[doc_num] = tense_counts
    
    return dic

# Usage
final = dic_to_dic_w_tense(doc_w_exp)

# Print summary
total_docs = len(final)
total_examples = sum(sum(counts.values()) for counts in final.values())
print(f"\nProcessed {total_docs} documents")
print(f"Found {total_examples} total examples")

for tense in ["past", "present", "Unknown"]:
    count = sum(counts[tense] for counts in final.values())
    pct = (count / total_examples * 100) if total_examples > 0 else 0
    print(f"{tense}: {count} ({pct:.1f}%)")


Processed 1560 documents
Found 27433 total examples
past: 21030 (76.7%)
present: 5819 (21.2%)
Unknown: 584 (2.1%)


In [42]:
df_final = pd.DataFrame(final).T.reset_index()
df_final.columns = ["patentnumber","past","present","Unknown"]
df_final["patentnumber"] = df_final["patentnumber"].apply(remove_leadiong_zeros)

In [45]:
doc_w_exp["08937339"]

[[{'number': 'Example 1',
   'title': 'The effects of the growth of a Si(1-v-w-x)CwAlxNv layer on a Si substrate was studied in the present',
   'content': ['The effects of the growth of a Si(1-v-w-x)CwAlxNv layer on a Si substrate was studied in the present example.'],
   'raw_text': 'The effects of the growth of a Si(1-v-w-x)CwAlxNv layer on a Si substrate was studied in the present example.'},
  {'number': 'Working Example 1',
   'title': 'In Working Example 1, a Si(1-v-w-x)CwAlxNv substrate 10a was basically manufactured by the method fo',
   'content': ['In Working Example 1, a Si(1-v-w-x)CwAlxNv substrate 10a was basically manufactured by the method for manufacturing a Si(1-v-w-x)CwAlxN, substrate 10a according to the first embodiment with a PLD apparatus illustrated in FIG. 6. Si0.05C0.05(AlN)0.9 wherein v+x was 0.9 was produced as a Si(1-v-w-x)CwAlxNv layer 12.',
    'More specifically, the raw material 103 for a Si0.05C0.05(AlN)0.9 layer 12 was first prepared. The raw material

In [43]:
import numpy as np

df_check = df_final.merge(merged,on= "patentnumber",how="left")[["patentnumber","present","past","Unknown","prophetic","nonprophetic","allprophetic","someprophetic"]]
#df_check["past"] = df_check["past"] + df_check["Unknown"]
df_check["Total_Extracted"] = df_check["past"] + df_check["present"]  #+ df_check["Unknown"]
df_check["Total_Freilich"] = df_check["prophetic"] + df_check["nonprophetic"]
df_check["prophetic_error"] = np.sqrt((df_check["prophetic"] - df_check["present"])**2)
df_check["nonprophetic_error"] = np.sqrt((df_check["nonprophetic"] - df_check["past"])**2)
df_check["Total_Mean_error"] = np.sqrt((df_check["Total_Freilich"] - df_check["Total_Extracted"])**2)
df_check["Sum_error"] = df_check["prophetic_error"] + df_check["nonprophetic_error"] + df_check["Total_Mean_error"]
print(f"Number of exact matches: {len(df_check[(df_check.prophetic_error ==0) & (df_check.nonprophetic_error ==0) & (df_check.Total_Mean_error ==0)])} out of {len(df_check)} , Percentage: {len(df_check[(df_check.prophetic_error ==0) & (df_check.nonprophetic_error ==0) & (df_check.Total_Mean_error ==0)])/len(df_check)*100}")
print(f"Number of exact num of patent extracted: {len(df_check[(df_check.Total_Extracted == df_check.Total_Freilich)])} out of {len(df_check)}")
print(f"Avg Total error: {df_check['Total_Mean_error'].mean()}, num of corrects:  {df_check[df_check['Total_Mean_error'] == 0].shape[0]}")
print(f"Avg Total prophetic error: {df_check['prophetic_error'].mean()}, num of corrects: {df_check[df_check['prophetic_error'] == 0].shape[0]}")
print(f"Avg Total nonprophetic error: {df_check['nonprophetic_error'].mean()}, num of corrects: {df_check[df_check['nonprophetic_error'] == 0].shape[0]}")
print(f"{df_check['Sum_error'].sum()}")
df_check.sort_values("Total_Mean_error",ascending=False).head(20)

Number of exact matches: 282 out of 1560 , Percentage: 18.076923076923077
Number of exact num of patent extracted: 581 out of 1560
Avg Total error: 2.9782051282051283, num of corrects:  581
Avg Total prophetic error: 2.366025641025641, num of corrects: 609
Avg Total nonprophetic error: 3.8365384615384617, num of corrects: 456
14322.0


Unnamed: 0,patentnumber,present,past,Unknown,prophetic,nonprophetic,allprophetic,someprophetic,Total_Extracted,Total_Freilich,prophetic_error,nonprophetic_error,Total_Mean_error,Sum_error
289,8927710,30,223,350,0.0,564.0,0.0,0.0,253,564.0,30.0,341.0,311.0,682.0
582,8933105,3,266,0,0.0,108.0,0.0,0.0,269,108.0,3.0,158.0,161.0,322.0
578,8933099,0,132,0,0.0,33.0,0.0,0.0,132,33.0,0.0,99.0,99.0,198.0
173,8927484,1,95,2,0.0,2.0,0.0,0.0,96,2.0,1.0,93.0,94.0,188.0
202,8927546,26,492,0,0.0,428.0,0.0,0.0,518,428.0,26.0,64.0,90.0,180.0
1102,8940778,117,404,0,0.0,601.0,0.0,0.0,521,601.0,117.0,197.0,80.0,394.0
1448,8946210,81,175,0,0.0,177.0,0.0,0.0,256,177.0,81.0,2.0,79.0,162.0
1449,8946211,69,187,0,0.0,177.0,0.0,0.0,256,177.0,69.0,10.0,79.0,158.0
1065,8940725,19,28,82,0.0,121.0,0.0,0.0,47,121.0,19.0,93.0,74.0,186.0
1007,8940412,4,75,0,0.0,12.0,0.0,0.0,79,12.0,4.0,63.0,67.0,134.0


In [None]:
## Method 3
def extract_examples_from_heading(heading):
    """
    Extract examples from a heading and its siblings using multiple methods.
    
    Args:
        heading: BeautifulSoup tag object representing the heading
        
    Returns:
        list: List of extracted examples
    """
    def validate_examples(examples):
        """Helper to validate extracted examples have content"""
        if not examples:
            return False
        if isinstance(examples, list):
            return bool(examples and examples[0].get("content"))
        return bool(examples.get("content"))
    
    extracted_examples = []
    
    try:
        # Get siblings once
        siblings = heading.find_next_siblings()
        if not siblings:
            return []
            
        # Try different extraction methods in order of preference
        extraction_methods = [
            ("process_siblings", lambda: process_siblings(siblings)),
            ("extract_examples_start_w_word", lambda: extract_examples_start_w_word(siblings)),
            ("extract_num_dot_examples", lambda: extract_num_dot_examples(str(siblings)))
        ]
        
        for method_name, extractor in extraction_methods:
            examples = extractor()
            if validate_examples(examples):
                if isinstance(examples, list):
                    extracted_examples.extend(examples)
                else:
                    extracted_examples.append(examples)
                break  # Stop after first successful extraction
                
    except Exception as e:
        print(f"Error extracting from heading: {str(e)}")
    
    return extracted_examples

def extract_examples(xml):
    """
    Extract examples from patent XML using multiple methods.
    
    Args:
        xml: Patent XML string
        
    Returns:
        list: List of extracted examples
    """
    def validate_content(examples):
        """Helper to validate example content"""
        if not examples:
            return False
        if isinstance(examples, list):
            return bool(examples and examples[0].get("content"))
        return bool(examples.get("content"))
    
    try:
        # Try extracting from section headings first
        headings = extract_experiments_w_heading(xml)
        
        if headings:
            if len(headings) > 1:
                # Multiple headings - process each one
                all_examples = []
                for heading in headings:
                    examples = extract_examples_from_heading(heading)
                    if examples:
                        all_examples.extend(examples)
                return all_examples if all_examples else []
            
            elif len(headings) == 1:
                # Single heading - process normally
                examples = extract_examples_from_heading(headings[0])
                return examples if examples else []
        
        # If no headings found or no content extracted, try other methods
        extraction_methods = [
            ("extract_examples_w_word", lambda: extract_examples_w_word(xml)),
            ("extract_num_dot_examples", lambda: extract_num_dot_examples(xml))
        ]
        
        for method_name, extractor in extraction_methods:
            examples = extractor()
            if validate_content(examples):
                return [examples] if not isinstance(examples, list) else [examples]
                
        return []
        
    except Exception as e:
        print(f"Error extracting examples: {str(e)}")
        return []

# Process documents
def process_patent_documents(xml_list):
    """
    Process a list of patent XMLs and extract examples.
    
    Args:
        xml_list: List of patent XML strings
        
    Returns:
        dict: Dictionary of document numbers and their examples
    """
    doc_w_exp = {}
    total = len(xml_list)
    
    for i, xml in enumerate(xml_list, start=1):
        try:
            if i % 1000 == 0:
                print(f"Processing {i}/{total} - Found {len(doc_w_exp)} docs with experiments")
            
            doc_number = find_doc_number(xml)
            if not doc_number:
                continue
                
            extracted_examples = extract_examples(xml)
            if extracted_examples:
                doc_w_exp[doc_number] = {
                    'examples': extracted_examples,
                    'count': len(extracted_examples),
                    'methods_used': [ex.get('method', 'unknown') for ex in extracted_examples]
                }
                
        except Exception as e:
            print(f"Error processing document {i}: {str(e)}")
            continue
            
    return doc_w_exp

# Usage
results = process_patent_documents(merged["xml"].tolist())