# TASK A : Extract and Categorize Tasks from Unannotated Text

In [1]:
!pip install nltk



## Importing Required Libraries

In [2]:
import nltk
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from collections import defaultdict

In [3]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\SANJAY
[nltk_data]     KUMAR\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\SANJAY KUMAR\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to C:\Users\SANJAY
[nltk_data]     KUMAR\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Sample text

In [4]:
text = "Rahul wakes up early every day. Rahul must goes to college in the morning and comes back at 3 pm. At present, Rahul is outside and has to buy the snacks for all of us. Rahul should clean the room by 5 pm today."

## Preprocessing
- Clean the text (remove stop words, punctuation, and irrelevant metadata).
- Tokenize sentences and perform POS tagging to identify actionable verbs (e.g., 
"schedule," "discuss," "review").

In [5]:
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    sentences = sent_tokenize(text)
    cleaned_sentences = []
    
    for sentence in sentences:
        words = word_tokenize(sentence)
        words = [word.lower() for word in words if word.isalnum() and word.lower() not in stop_words]
        cleaned_sentences.append(' '.join(words))
    
    return cleaned_sentences

## Task Identification
- Develop heuristics to identify sentences likely representing tasks.
- Identify task-related sentences using heuristic-based approach.
- Extract the person responsible and the deadline if available.

In [5]:
def extract_tasks(text):
    sentences = sent_tokenize(text)
    task_sentences = []
    task_keywords = {"has to", "should", "must", "need to", "required to", "scheduled to"}
    
    for sentence in sentences:
        if any(keyword in sentence.lower() for keyword in task_keywords):
            task_sentences.append(sentence)
    
    return task_sentences

In [6]:
def extract_entities(sentence):
    words = word_tokenize(sentence)
    tagged_words = pos_tag(words)
    responsible_person = None
    deadline = None
    
    for i, (word, tag) in enumerate(tagged_words):
        if tag == 'NNP':  # Proper Noun as a potential person
            responsible_person = word
        if word.lower() in {'by', 'before', 'at'} and i + 1 < len(tagged_words):
            deadline = ' '.join(words[i+1:i+3])  # Get deadline phrase (simple heuristic)
    
    return responsible_person, deadline

## Processing the text

In [11]:
cleaned_sentences = preprocess_text(text)
tasks = extract_tasks(text)
structured_tasks = []

In [12]:
for task in tasks:
    person, time = extract_entities(task)
    structured_tasks.append({"task": task, "who": person, "when": time})

## Output results
- Generate a structured list of tasks with categories. Also, for the tasks that have information, list who is to do that task and when is the deadline.

In [13]:
for task in structured_tasks:
    print(f"Task: {task['task']}")
    print(f"Who: {task['who'] if task['who'] else 'Not mentioned'}")
    print(f"When: {task['when'] if task['when'] else 'Not mentioned'}")
    print("-")

Task: Rahul must goes to college in the morning and comes back at 3 pm.
Who: Rahul
When: 3 pm
-
Task: At present, Rahul is outside and has to buy the snacks for all of us.
Who: Rahul
When: present ,
-
Task: Rahul should clean the room by 5 pm today.
Who: Rahul
When: 5 pm
-
