In [None]:
import re
import nltk
import gensim
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag, ne_chunk
from datetime import datetime

# Fetch the NLTK resources used further down: sentence/word tokenizer models,
# the stopword lists, the POS tagger, and the named-entity chunker with its
# supporting word list.
for _resource in (
    'punkt',
    'stopwords',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
):
    nltk.download(_resource)

# Sample input for the pipeline: five short sentences, one candidate task per
# line. It is passed to identify_tasks() below.
text = """John needs to complete the report by Monday. 
Sarah should schedule a meeting tomorrow. 
Email the client about the project update. 
Finalize the budget plan this week. 
Tom to review the design document. """

# Step 1: Preprocessing
def preprocess_text(text):
    """Normalize *text* into a list of lowercase, stopword-free tokens.

    The text is lowercased, every character that is not a letter, digit or
    whitespace is removed, the result is tokenized with ``word_tokenize``,
    and English stopwords are dropped.

    Args:
        text: Raw input string (a sentence or a longer passage).

    Returns:
        list[str]: The remaining tokens, in their original order.
    """
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove punctuation
    # Load the stopword list once per call and keep it as a set: the original
    # re-read the list for every token and searched it linearly.
    stop_words = set(stopwords.words('english'))
    return [word for word in word_tokenize(text) if word not in stop_words]

# Step 2: Task Identification
def identify_tasks(text):
    """Return the sentences of *text* that look like tasks.

    A sentence is kept when at least one of these heuristics fires:

    * any token is POS-tagged as a verb (tag starts with ``VB``),
    * any token is tagged ``NNP`` (proper noun, used as a stand-in for a
      responsible person),
    * any token is one of the known deadline keywords.

    Args:
        text: Raw input text containing one or more sentences.

    Returns:
        list[str]: The matching sentences, unmodified and in document order.
    """
    deadline_keywords = {"today", "tomorrow", "monday", "week"}
    tasks = []

    for sentence in sent_tokenize(text):
        words = word_tokenize(sentence)
        pos_tags = pos_tag(words)

        # Heuristic: Check if the sentence contains an imperative verb or structured task
        contains_verb = any(tag.startswith("VB") for _, tag in pos_tags)
        contains_person = any(tag == "NNP" for _, tag in pos_tags)
        # Match whole tokens rather than substrings of the sentence, so that
        # e.g. "weekend" does not count as the keyword "week". This is the
        # same token-level rule extract_entities() uses for deadlines.
        contains_deadline = any(word.lower() in deadline_keywords for word in words)

        if contains_verb or contains_person or contains_deadline:
            tasks.append(sentence)

    return tasks

# Step 3: Extract Key Entities (Person, Action, Deadline)
def extract_entities(sentence):
    """Pull the responsible person, the action and the deadline from a sentence.

    Args:
        sentence: A single task sentence.

    Returns:
        tuple: ``(person, action, deadline)`` where

        * ``person`` is the text of the first chunk labelled ``PERSON`` by
          ``ne_chunk``,
        * ``action`` is the first token whose POS tag starts with ``VB``,
        * ``deadline`` is the lowercased first token found in the deadline
          keyword list.

        Each element is ``None`` when nothing matched.
    """
    tokens = word_tokenize(sentence)
    tagged = pos_tag(tokens)

    # Person: only labelled subtrees carry a label(); plain (word, tag)
    # tuples in the chunk tree are skipped by the hasattr check.
    person = next(
        (
            ' '.join(leaf[0] for leaf in node)
            for node in ne_chunk(tagged)
            if hasattr(node, 'label') and node.label() == 'PERSON'
        ),
        None,
    )

    # Action: first verb-tagged token.
    action = next((token for token, tag in tagged if tag.startswith("VB")), None)

    # Deadline: first token that is a known deadline keyword.
    deadline_terms = ["today", "tomorrow", "monday", "week"]
    deadline = next(
        (token.lower() for token in tokens if token.lower() in deadline_terms),
        None,
    )

    return person, action, deadline

# Step 4: Categorization with Word2Vec and LDA
# One token list per detected task sentence (lowercased, stopwords removed).
sentences = [preprocess_text(sent) for sent in identify_tasks(text)]
# NOTE(review): this Word2Vec model is trained here but is not referenced
# again in the visible part of the file; categorization below uses LDA only.
model = Word2Vec(sentences, vector_size=50, window=3, min_count=1, workers=4)

# Prepare LDA model
dictionary = Dictionary(sentences)
corpus = [dictionary.doc2bow(sent) for sent in sentences]
# Seed the LDA training so topic ids (and therefore the "Category N" labels
# produced by categorize_task) are reproducible from run to run.
lda = LdaModel(corpus, num_topics=3, id2word=dictionary, passes=10, random_state=42)

def categorize_task(sentence):
    """Label *sentence* with its most probable LDA topic.

    The sentence is preprocessed, converted to a bag-of-words with the
    module-level ``dictionary`` and scored by the module-level ``lda`` model.

    Args:
        sentence: A single task sentence.

    Returns:
        str: ``"Category <topic id>"`` for the highest-probability topic.
    """
    tokens = preprocess_text(sentence)
    topic_scores = lda[dictionary.doc2bow(tokens)]
    # Each entry is indexed as (topic id, probability); pick by probability.
    best_match = max(topic_scores, key=lambda scored: scored[1])
    topic = best_match[0]
    return f"Category {topic}"

# Step 5: Generate Structured Output
# Build one record per detected task, substituting placeholders for any
# entity that could not be extracted.
structured_tasks = []
for task in identify_tasks(text):
    person, action, deadline = extract_entities(task)
    category = categorize_task(task)
    structured_tasks.append(dict(zip(
        ("Task", "Person", "Action", "Deadline", "Category"),
        (task, person or "Unknown", action or "Unknown", deadline or "None", category),
    )))

# Tabulate the records and show them
df = pd.DataFrame(structured_tasks)
print(df)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


                                           Task   Person    Action  Deadline  \
0  John needs to complete the report by Monday.     John     needs    monday   
1     Sarah should schedule a meeting tomorrow.    Sarah  schedule  tomorrow   
2    Email the client about the project update.  Unknown   Unknown      None   
3           Finalize the budget plan this week.  Unknown  Finalize      week   
4            Tom to review the design document.      Tom    review      None   

     Category  
0  Category 0  
1  Category 1  
2  Category 2  
3  Category 1  
4  Category 1  


# Challenges Faced

#### - Identifying Proper Names: Extracting responsible persons required handling capitalization and POS tagging carefully.

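#### - Task Detection Heuristics: Simple keyword-based matching sometimes led to false positives, because a sentence is kept as a task if it contains any verb, any proper noun, or any deadline keyword.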

#### - Categorization Accuracy: Without pre-trained embeddings, the Word2Vec and LDA models are trained only on the handful of input sentences, so task categorization required fine-tuning.

# Insights Gained
#### - Preprocessing Matters: Removing stopwords and careful tokenization significantly improved entity extraction.

#### - POS Tagging is Crucial: Using POS tags helped refine task detection and person identification.

#### - Iterative Refinement: Debugging outputs for different inputs helped refine heuristics for better accuracy.
