In [None]:
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import pandas as pd

# Fetch the NLTK resources used below: sentence/word tokenizer models,
# the POS tagger model, and the English stop-word list.
# NOTE(review): newer NLTK releases may look for differently named resources
# (e.g. 'punkt_tab') - confirm against the installed NLTK version.
for _resource in ("punkt", "averaged_perceptron_tagger", "stopwords"):
    nltk.download(_resource)

# Category name -> action verbs that place a task in that category.
# Lookups walk the categories in insertion order and stop at the first hit,
# so a verb listed twice (e.g. "review") resolves to the earlier category.
CATEGORY_WORDS = dict(
    Shopping=["buy", "purchase", "order", "pick"],
    Work=["submit", "send", "email", "schedule", "prepare", "review", "discuss"],
    Household=["clean", "wash", "arrange", "organize", "fix"],
    Education=["study", "attend", "read", "write", "review"],
    Finance=["pay", "deposit", "withdraw", "invest", "transfer"],
)

# Obligation phrases that mark a sentence as a task.
TASK_KEYWORDS = {
    "has to",
    "should",
    "must",
    "needs to",
    "is required to",
}

# Prepositions that typically introduce a deadline.
DEADLINE_INDICATORS = {"by", "before", "at", "on", "until"}

# Stand-alone words that denote a point in time (all lowercase).
TIME_KEYWORDS = {
    "tomorrow", "next", "today",
    "morning", "afternoon", "evening",
    "pm", "am",
    "monday", "tuesday", "wednesday", "thursday",
    "friday", "saturday", "sunday",
}

# English stop words from the NLTK corpus.
STOPWORDS = set(stopwords.words("english"))

# Step 1: Preprocessing
def preprocess_text(text):
    """Split *text* into sentences and strip punctuation from each one.

    Stop words are deliberately kept: the task heuristics downstream rely on
    phrases such as "has to" and "should", which are themselves stop words.

    Args:
        text: Raw input text, possibly containing several sentences.

    Returns:
        A list of cleaned sentences, each a single-space-joined string of
        tokens with punctuation removed. Sentences that contain nothing but
        punctuation are dropped instead of being returned as empty strings.
    """
    cleaned_sentences = []

    for sent in sent_tokenize(text):
        # Anything that is neither a word character nor whitespace counts as
        # punctuation and is removed before tokenizing.
        without_punct = re.sub(r"[^\w\s]", "", sent.strip())
        tokens = word_tokenize(without_punct)
        if tokens:
            cleaned_sentences.append(" ".join(tokens))

    return cleaned_sentences

# Step 2: Identify Task Sentences
def identify_tasks(sentences):
    """Return the sentences that look like actionable tasks.

    A sentence qualifies when either:
      * it opens with a base-form verb (imperative, tag ``VB``), or with a
        modal (``MD``) immediately followed by a verb; or
      * it contains one of the obligation phrases in ``TASK_KEYWORDS`` as a
        whole phrase (case-insensitive).

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A list with the qualifying sentences, in their original order.
    """
    actionable_sentences = []

    for sentence in sentences:
        pos_tags = nltk.pos_tag(word_tokenize(sentence))

        if pos_tags:
            first_tag = pos_tags[0][1]
            second_is_verb = len(pos_tags) > 1 and pos_tags[1][1].startswith("VB")
            if first_tag == "VB" or (first_tag == "MD" and second_is_verb):
                actionable_sentences.append(sentence)
                continue

        # Match on word boundaries so that "has to" does not fire on
        # "has tomatoes" and "should" does not fire on "shoulder".
        lowered = sentence.lower()
        if any(
            re.search(r"\b" + re.escape(keyword) + r"\b", lowered)
            for keyword in TASK_KEYWORDS
        ):
            actionable_sentences.append(sentence)

    return actionable_sentences

# Step 3: Extract Entities
def extract_person(words, pos_tags):
    """Extract the responsible person from a sentence using POS tags.

    The first token that is either a proper noun (``NNP``) or a capitalized
    sentence opener is returned, provided it is not a time word such as a
    weekday name. A capitalized opener tagged as a verb or modal is skipped,
    so an imperative like "Submit the report" yields no person instead of
    reporting the verb as one.

    Args:
        words: Tokens of the sentence (unused; kept for interface parity with
            the other extractors).
        pos_tags: ``(word, tag)`` pairs for the sentence.

    Returns:
        The matching token, or ``None`` when no candidate is found.
    """
    for index, (word, tag) in enumerate(pos_tags):
        is_proper_noun = tag == "NNP"
        # word[:1] rather than word[0] so an empty token cannot raise.
        is_capitalized_opener = (
            index == 0
            and word[:1].isupper()
            and not tag.startswith(("VB", "MD"))
        )
        if (is_proper_noun or is_capitalized_opener) and word.lower() not in TIME_KEYWORDS:
            return word
    return None

def extract_action(words, pos_tags):
    """Extract the main action verb from a sentence.

    Returns the first verb-tagged token that is not part of an obligation
    phrase ("has to", "needs to", "must", "should", "is required to"), so the
    result is the verb describing the task rather than its auxiliary.

    Args:
        words: Tokens of the sentence (unused; kept for interface parity with
            the other extractors).
        pos_tags: ``(word, tag)`` pairs for the sentence.

    Returns:
        The action verb as it appears in the sentence, or ``None`` when the
        sentence has no suitable verb.
    """
    # "is" and "required" are skipped as well, so "is required to pay"
    # resolves to "pay" instead of "is".
    auxiliaries = {"has", "needs", "must", "should", "is", "required"}
    return next(
        (
            word
            for word, tag in pos_tags
            if tag.startswith("VB") and word.lower() not in auxiliaries
        ),
        None,
    )

def extract_deadline(words):
    """Extract a deadline phrase from a tokenized sentence.

    Tokens are scanned left to right and compared case-insensitively, so
    capitalized forms such as "Monday" or a sentence-initial "By" are
    recognized:

      * a deadline indicator (``DEADLINE_INDICATORS``) followed by another
        token yields "<indicator> <next token>" in the original casing;
      * a stand-alone time word (``TIME_KEYWORDS``) yields that word
        capitalized.

    Args:
        words: Tokens of the sentence.

    Returns:
        The deadline string, or ``None`` when no time expression is found.
    """
    for i, word in enumerate(words):
        # The keyword sets are lowercase; normalize before the lookup.
        lowered = word.lower()
        if lowered in DEADLINE_INDICATORS and i + 1 < len(words):
            return " ".join(words[i:i + 2])
        if lowered in TIME_KEYWORDS:
            return word.capitalize()
    return None

def categorize_task(action):
    """Map an action verb to one of the predefined task categories.

    The comparison is case-insensitive, so an imperative such as "Submit" is
    categorized like "submit". Categories are checked in the order they are
    defined in ``CATEGORY_WORDS`` and the first match wins.

    Args:
        action: The action verb, or ``None``/empty when none was extracted.

    Returns:
        The matching category name, or "Other" when the action is missing or
        not listed under any category.
    """
    if not action:
        return "Other"

    normalized = action.lower()
    for category, verbs in CATEGORY_WORDS.items():
        if normalized in verbs:
            return category
    return "Other"

def extract_entities(sentence):
    """Build a structured record describing one task sentence.

    The sentence is tokenized and POS-tagged once; the tokens and tags are
    then passed to the individual extractors.

    Returns:
        A dict with the keys "Task" (the sentence itself), "Person",
        "Action", "Deadline" and "Category".
    """
    tokens = word_tokenize(sentence)
    tagged = nltk.pos_tag(tokens)

    # The category is derived from the action, so resolve the action first.
    action = extract_action(tokens, tagged)

    return {
        "Task": sentence,
        "Person": extract_person(tokens, tagged),
        "Action": action,
        "Deadline": extract_deadline(tokens),
        "Category": categorize_task(action),
    }

# Sample input: five sentences, one per line
text = """David needs to submit the quarterly financial report by Monday. 
Sarah should clean the living room before the guests arrive. 
Alex must send the project proposal to the manager. 
John wants to buy a new laptop tomorrow. 
Emma is planning a vacation next week."""

# Pipeline: clean the text, keep the actionable sentences,
# then extract the structured fields from each of them.
clean_sentences = preprocess_text(text)
tasks = identify_tasks(clean_sentences)
extracted_tasks = list(map(extract_entities, tasks))

# One row per task, one column per extracted field
df = pd.DataFrame(extracted_tasks)

# Render the table in the notebook
display(df)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Task,Person,Action,Deadline,Category
0,Jake must call the plumber before noon,Jake,call,before noon,Other
1,Emma should finish reading the research paper,Emma,finish,,Other
2,Tom needs to prepare for his upcoming job inte...,Tom,prepare,,Work
