### Importing Libraries

In [44]:
import spacy
import re
import nltk 
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

from nltk import pos_tag, word_tokenize
from nltk.chunk import ne_chunk

In [34]:
# Shared spaCy pipeline; get_sentences, extract_tasks, get_task_entity and
# get_task_deadline all call this module-level `nlp` object.
# NOTE(review): spacy.load needs the "en_core_web_lg" model installed in the
# kernel's environment — confirm before Restart & Run All.
nlp = spacy.load("en_core_web_lg")


In [35]:

# Fetch the NLTK stopword corpus and keep the English list as a set;
# preprocess_text filters tokens against this module-level `stop_words`.
# NOTE(review): only "stopwords" is downloaded here, while get_task_entity also
# calls word_tokenize / pos_tag / ne_chunk — confirm the NLTK data those need is
# installed on a fresh environment.
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jatin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Text Preprocessing

In [36]:
def preprocess_text(text):
    """
    Normalise raw text into a space-separated string of lowercase content words.

    Every character that is not an ASCII letter or whitespace is dropped, the
    remainder is lowercased and split on whitespace, and any word found in the
    module-level ``stop_words`` set is discarded. The surviving words are
    returned joined by single spaces (an empty string if nothing survives).
    """
    # Strip digits, punctuation and any other non-letter characters first.
    letters_only = re.sub(r"[^a-zA-Z\s]", "", text)

    # Lowercase, split on whitespace, and keep only non-stopwords.
    kept_words = filter(
        lambda candidate: candidate not in stop_words,
        letters_only.lower().split(),
    )

    return " ".join(kept_words)




In [None]:
# Sample paragraph used by the demo cells in this notebook.
text = (
    "Rahul wakes up early every day. "
    "He goes to college in the morning and comes back at 3 pm. "
    "At present, Rahul is outside. "
    "He has to buy the snacks for all of us."
)

# Show the stopword-free, lowercase version of the sample.
cleaned_text = preprocess_text(text)
print(cleaned_text)

rahul wakes early every day goes college morning comes back pm present rahul outside buy snacks us


In [58]:
def get_sentences(text):
    """
    Split ``text`` into sentences with the spaCy pipeline held in ``nlp``.

    Returns a list containing the ``.text`` of every span yielded by
    ``doc.sents``, in document order.
    """
    sentence_texts = []
    for span in nlp(text).sents:
        sentence_texts.append(span.text)
    return sentence_texts

# Split the sample `text` into sentences; the task-extraction cell below
# consumes the resulting `sentences` list.
sentences = get_sentences(text)
print(sentences)


['Rahul wakes up \nearly every day.', 'He goes to college in the morning and comes back at 3 pm.', 'At present, Rahul is \noutside.', 'He has to buy the snacks for all of us.']


### Task extraction 

In [59]:
# Modal / obligation phrases that mark a sentence as describing a task.
task_keywords = {"must", "should", "has to", "needs to", "required to", "is to", "have to", "is supposed to"}


def _task_keyword_pattern(keywords):
    """Compile one case-insensitive, whole-word regex matching any keyword phrase (None if there are none)."""
    alternatives = [
        # Allow any whitespace run (including a line break) between the words
        # of a multi-word keyword such as "has to".
        r"\s+".join(re.escape(part) for part in keyword.split())
        for keyword in sorted(keywords, key=len, reverse=True)
    ]
    alternatives = [alt for alt in alternatives if alt]
    if not alternatives:
        return None
    return re.compile(r"\b(?:" + "|".join(alternatives) + r")\b", re.IGNORECASE)


def extract_tasks(sentences):
    """
    Extracts sentences that contain task indicators and tracks the last mentioned entity.

    Args:
        sentences: iterable of sentence strings, in document order.

    Returns:
        A list of ``(sentence, last_subject)`` tuples, one per sentence that
        contains a phrase from ``task_keywords``. ``last_subject`` is the text
        of the most recent token tagged ``PROPN`` seen so far (including in the
        current sentence), or ``None`` if no proper noun has appeared yet.

    Keywords are matched case-insensitively and only on word boundaries, so
    "Must" at the start of a sentence counts while "mustard" or "this town"
    (which contains the letters "is to") do not.
    """
    tasks = []
    last_subject = None  # Text of the most recent proper noun seen so far
    keyword_re = _task_keyword_pattern(task_keywords)  # built once, not per sentence

    for sentence in sentences:
        # Update the tracked subject before testing for a task, so a name that
        # appears in the task sentence itself is the one recorded.
        doc = nlp(sentence)
        for token in doc:
            if token.pos_ == "PROPN":
                # NOTE(review): any proper noun qualifies, not only person
                # names — a place or brand name will also become the subject.
                last_subject = token.text

        if keyword_re is not None and keyword_re.search(sentence):
            tasks.append((sentence, last_subject))

    return tasks


# Each entry of `tasks` is a (sentence, last_subject) tuple; the cells below
# use task[0] as the sentence text.
tasks = extract_tasks(sentences)
print(tasks)



[('He has to buy the snacks for all of us.', 'Rahul')]


### Entity and deadline extraction 

In [60]:



def get_task_entity(task_tuple, last_subject=None):
    """
    Pick the entity a task sentence should be attributed to.

    Args:
        task_tuple: ``(sentence, subject)`` pair as produced by ``extract_tasks``.
        last_subject: optional fallback used only when the tuple's own subject
            is ``None``.

    Returns:
        The first match, tried in this order:
          1. the first spaCy entity labelled ``PERSON`` in the sentence,
          2. the first spaCy token tagged ``PROPN``,
          3. the first ``PERSON`` chunk found by NLTK's ``ne_chunk``,
          4. the tracked subject from the tuple (or ``last_subject``); this may
             be ``None``.

    NOTE(review): steps 1-3 return the first name found anywhere in the
    sentence, which is not necessarily its grammatical subject.
    """
    sentence, tracked_subject = task_tuple
    # The tuple's subject wins; the keyword argument only fills in when the
    # tuple carries no subject (previously the argument was silently discarded).
    fallback_subject = tracked_subject if tracked_subject is not None else last_subject

    doc = nlp(sentence)

    # 1. Named entities labelled PERSON by spaCy.
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            return ent.text

    # 2. Fall back to the first proper noun spaCy tagged.
    for token in doc:
        if token.pos_ == "PROPN":
            return token.text

    # 3. NLTK NER as a backup. NLTK raises LookupError when a required data
    #    package is not installed; treat that as "backup unavailable" instead
    #    of aborting, since a tracked subject is still available.
    try:
        nltk_entities = ne_chunk(pos_tag(word_tokenize(sentence)))
    except LookupError:
        return fallback_subject

    for subtree in nltk_entities:
        if isinstance(subtree, nltk.Tree) and subtree.label() == "PERSON":
            return " ".join(token for token, _pos in subtree.leaves())

    # 4. Nothing found in the sentence itself.
    return fallback_subject

# Map each task sentence (task[0]) to the entity get_task_entity picks for it.
task_entities = {task[0]: get_task_entity(task) for task in tasks}
print(task_entities)

{'He has to buy the snacks for all of us.': 'Rahul'}


In [61]:
def get_task_deadline(sentence):
    """
    Extracts all deadline-related information.

    Collects the text of every spaCy entity labelled ``DATE`` or ``TIME`` and,
    if present, the first "before/by/until <words>" phrase in the sentence.

    Args:
        sentence: the task sentence to inspect.

    Returns:
        The collected pieces joined with ", ", or the string
        "No deadline specified" when nothing was found.
    """
    doc = nlp(sentence)
    deadlines = [ent.text for ent in doc.ents if ent.label_ in ("DATE", "TIME")]

    # The leading \b keeps "by" from matching inside words such as "baby" or
    # "nearby"; IGNORECASE lets a sentence-initial "By Friday" match too.
    # The phrase runs until the first character that is not a letter, digit or
    # whitespace (typically punctuation).
    match = re.search(
        r"\b(?:before|by|until)\s+[0-9A-Za-z\s]+",
        sentence,
        flags=re.IGNORECASE,
    )
    if match:
        deadlines.append(match.group(0).strip())  # e.g. "before 5 PM"

    return ", ".join(deadlines) if deadlines else "No deadline specified"



# Map each task sentence (task[0]) to its deadline string from get_task_deadline.
task_deadlines = {task[0]: get_task_deadline(task[0]) for task in tasks}
print(task_deadlines)


{'He has to buy the snacks for all of us.': 'No deadline specified'}


### Task categorization

In [62]:
# Category name -> keyword stems that indicate it. Dict order decides which
# category wins when a sentence matches more than one.
CATEGORY_KEYWORDS = {
    "Work":["report", "assignment", "presentation", "submit", "form"],
    "Shopping": ["buy", "purchase"],
    "Cleaning": ["clean", "wash"],
    "Communication": ["call", "email", "message"]
}

# Original name of the mapping, kept as an alias for backward compatibility.
# A later cell rebinds `task_categories` to the per-sentence results, which is
# why categorize_task reads CATEGORY_KEYWORDS instead of this name.
task_categories = CATEGORY_KEYWORDS


def categorize_task(sentence):
    """
    Categorizes tasks based on predefined keyword mappings.

    Args:
        sentence: the task text; it is converted with ``str()`` and lowercased
            before matching.

    Returns:
        The first category in ``CATEGORY_KEYWORDS`` that has a keyword
        occurring at the start of a word in the sentence, or "Uncategorized".

    Keywords are anchored at a word start, so inflected forms still match
    ("submitted", "cleaning", "calls") while unrelated words that merely
    contain a keyword do not ("information" / "form", "recall" / "call").
    """
    sentence_lower = str(sentence).lower()  # Ensure it's a string
    for category, keywords in CATEGORY_KEYWORDS.items():
        if any(
            re.search(r"\b" + re.escape(word.lower()), sentence_lower)
            for word in keywords
        ):
            return category
    return "Uncategorized"

# `tasks` holds (sentence, subject) tuples, so task[0] is the sentence text.
# NOTE: this assignment rebinds `task_categories` to the per-sentence results.
task_categories = {task[0]: categorize_task(task[0]) for task in tasks}  # key and classify by the sentence text
print(task_categories)


{'He has to buy the snacks for all of us.': 'Shopping'}


### Final summary

In [73]:
def generate_task_summary(tasks, task_entities, task_deadlines, task_categories):
    """
    Generates structured output for extracted tasks.

    Args:
        tasks: iterable of ``(sentence, subject)`` tuples; ``subject`` may be
            ``None``.
        task_entities: dict mapping sentence -> responsible entity.
        task_deadlines: dict mapping sentence -> deadline string.
        task_categories: dict mapping sentence -> category name.

    Returns:
        A list with one dict per task, with the keys "Task", "Who", "Deadline"
        and "Category". Missing lookups fall back to "Unknown",
        "No deadline" and "Uncategorized" respectively.
    """
    extracted_tasks = []
    for task in tasks:
        task_sentence, subject = task[0], task[1]

        # Start from the sentence itself so a task without a known subject
        # still gets its own text (instead of an unbound or stale value).
        task_text = task_sentence
        if subject:  # If a valid subject is found (e.g., Rahul)
            # Drop the subject's name and third-person pronouns wherever they occur.
            task_text = re.sub(
                rf"\b({re.escape(subject)}|he|she|they|it)\b",
                "",
                task_sentence,
                flags=re.IGNORECASE,
            )
            # Removing a word from the middle leaves two adjacent spaces.
            task_text = re.sub(r" {2,}", " ", task_text)
        task_text = task_text.strip()

        extracted_tasks.append({
            "Task": task_text,
            # `or` also covers an entity that was looked up but resolved to None.
            "Who": task_entities.get(task_sentence) or "Unknown",
            "Deadline": task_deadlines.get(task_sentence, "No deadline"),
            "Category": task_categories.get(task_sentence, "Uncategorized")
        })

    return extracted_tasks


# Combine the per-sentence lookups into one summary dict per task and print each.
structured_tasks = generate_task_summary(tasks, task_entities, task_deadlines, task_categories)
for task in structured_tasks:
    print(task)



{'Task': 'has to buy the snacks for all of us.', 'Who': 'Rahul', 'Deadline': 'No deadline specified', 'Category': 'Shopping'}
