In [24]:
!pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
#or use rtf for accuracy

Collecting en-core-web-sm==any
  Downloading https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m917.2 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[33mDEPRECATION: omegaconf 2.0.6 has a non-standard dependency specifier PyYAML>=5.1.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of omegaconf or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [33]:
import json
import spacy
from dateparser import parse as date_parse

# Load the spaCy English model
nlp = spacy.load("en_core_web_sm")

def extract_entities(text):
    """
    Parse the user text using spaCy and extract:
    - Participants (PERSON entities)
    - Date and Time (DATE, TIME entities) via spaCy + dateparser
    - Task type (using a simple rule-based approach)
    - Priority (using a simple rule-based approach)
    - Locations: from spaCy (if labeled as FAC, GPE, LOC, or ORG)
      and from rule-based detection if the words "at" or "to" are followed by a noun phrase,
      with special checks for "to" (to avoid catching infinitive markers).
    - Description (fallback to the original text)
    """
    doc = nlp(text)

    # Prepare a dictionary to hold extracted info
    extracted = {
        "task": None,
        "participants": [],
        "date": None,
        "time": None,
        "priority": None,
        "locations": [],
        "description": None
    }

    # Define location labels from spaCy
    location_labels = {"FAC", "GPE", "LOC", "ORG"}

    # Identify named entities from spaCy
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            extracted["participants"].append(ent.text)
        elif ent.label_ == "DATE":
            dt = date_parse(ent.text)
            if dt:
                extracted["date"] = dt.strftime("%Y-%m-%d")
        elif ent.label_ == "TIME":
            dt = date_parse(ent.text)
            if dt:
                extracted["time"] = dt.strftime("%H:%M")
        elif ent.label_ in location_labels:
            extracted["locations"].append(ent.text)

    # Simple rule-based approach for task and priority
    task_keywords = ["meeting", "call", "email", "reminder", "schedule"]
    priority_keywords = ["urgent", "high-priority", "low-priority"]

    for token in doc:
        # Check if token matches any known task keywords
        if token.lemma_.lower() in task_keywords:
            extracted["task"] = token.lemma_.lower()
        # Check for priority keywords
        if token.lemma_.lower() in priority_keywords:
            extracted["priority"] = token.lemma_.lower()

    # Set a default priority if none was found
    if not extracted["priority"]:
        extracted["priority"] = "normal"

    # Rule-based detection for locations after "at" or "to"
    for token in doc:
        if token.text.lower() in {"at", "to"} and token.i < len(doc) - 1:
            # For "to", skip if the following token is a verb (likely an infinitive marker)
            if token.text.lower() == "to" and doc[token.i+1].pos_ == "VERB":
                continue
            # Also skip if the immediate next token is labeled as TIME
            if doc[token.i+1].ent_type_ == "TIME":
                continue

            candidate = None
            # Attempt to get the following noun chunk as the location
            for chunk in doc.noun_chunks:
                if chunk.start == token.i + 1:
                    candidate = chunk.text
                    break
            # If no noun chunk was found, just use the immediate token.
            if not candidate:
                candidate = doc[token.i+1].text

            if candidate not in extracted["locations"]:
                extracted["locations"].append(candidate)

    # For the description, store the entire original text
    extracted["description"] = text

    return extracted

# Main testing code
if __name__ == "__main__":
    # Some example inputs to test
    test_sentences = [
        "remind me to go to the church for CS225 today at 10am with Ashley",
        "Schedule a meeting at the downtown office with John next Tuesday",
        "Me and Jane need to go to UIUC test tomorrow."
    ]

    for sentence in test_sentences:
        result = extract_entities(sentence)
        print(f"\nInput: {sentence}")
        print("Extracted Entities:", result)



Input: remind me to go to the church for CS225 today at 10am with Ashley
Extracted Entities: {'task': None, 'participants': ['Ashley'], 'date': '2025-03-02', 'time': '10:00', 'priority': 'normal', 'locations': ['the church'], 'description': 'remind me to go to the church for CS225 today at 10am with Ashley'}

Input: Schedule a meeting at the downtown office with John next Tuesday
Extracted Entities: {'task': 'meeting', 'participants': ['John'], 'date': None, 'time': None, 'priority': 'normal', 'locations': ['the downtown office'], 'description': 'Schedule a meeting at the downtown office with John next Tuesday'}

Input: Me and Jane need to go to UIUC test tomorrow.
Extracted Entities: {'task': None, 'participants': ['Jane'], 'date': '2025-03-03', 'time': None, 'priority': 'normal', 'locations': ['UIUC', 'UIUC test'], 'description': 'Me and Jane need to go to UIUC test tomorrow.'}
