In [1]:
!pip install spacy pandas nltk



In [2]:
import pandas as pd
import spacy
import re
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from spacy import displacy

# Fetch the NLTK data packages this notebook relies on
# (sentence tokenizer models and the stop-word corpus)
for nltk_resource in ("punkt", "stopwords"):
    nltk.download(nltk_resource)

# spaCy's small English pipeline; later cells use it through the name `nlp`
nlp = spacy.load("en_core_web_sm")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
# Define a function for text preprocessing
def preprocess_text(text):
    """Normalize a raw task description.

    The text is lowercased and every character that is neither a word
    character nor whitespace is removed. Underscores are removed as well,
    even though the regex class ``\\w`` would otherwise keep them.

    Args:
        text: Raw text. Missing values (``None`` or a float NaN, which is what
            an empty CSV cell becomes when loaded with pandas) are accepted.
            Any other non-string value is converted with ``str()``.

    Returns:
        str: The cleaned text, or ``""`` for a missing value.
    """
    # A missing cell has no `.lower()`; NaN is the only float unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()  # Convert to lowercase
    # `\w` matches "_" too, so it has to be listed explicitly to be stripped.
    text = re.sub(r'[^\w\s]|_', '', text)  # Remove punctuation
    return text
# Load the raw dataset and attach a preprocessed copy of every "Raw Text" value
df = pd.read_csv("task_dataset_100.csv").assign(
    **{"Cleaned Text": lambda frame: frame["Raw Text"].apply(preprocess_text)}
)

# Persist the preprocessed table
df.to_csv("processed_task_dataset.csv", index=False)
print("✅ Preprocessing done! Saved as 'processed_task_dataset.csv'.")


✅ Preprocessing done! Saved as 'processed_task_dataset.csv'.


In [12]:
# Fetch the "punkt_tab" NLTK data package in addition to "punkt".
# NOTE(review): presumably required by sent_tokenize on recent NLTK releases — confirm.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [13]:
import pandas as pd
from nltk.tokenize import sent_tokenize
import nltk

# Make sure the NLTK "punkt" tokenizer data is available
nltk.download("punkt")

# Verbs whose presence marks a sentence as describing a task
action_verbs = [
    "buy", "submit", "clean", "finish", "schedule",
    "send", "prepare", "book", "review", "fix",
]

# Function to identify sentences that contain a task
def extract_tasks(text):
    """Return the sentences of ``text`` that contain an action verb.

    A sentence counts as a task when at least one of its words, compared
    case-insensitively and without attached punctuation, appears in the
    module-level ``action_verbs`` list.

    Args:
        text: Free-form text that may hold several sentences. Non-string or
            blank values (for example NaN from an empty CSV cell) are treated
            as containing no task.

    Returns:
        list[str] | None: The matching sentences in their original order, or
        ``None`` when no sentence matches.
    """
    # Missing or blank input cannot be tokenized into task sentences.
    if not isinstance(text, str) or not text.strip():
        return None

    verbs = set(action_verbs)
    task_sentences = []

    for sentence in sent_tokenize(text):  # Tokenize into sentences
        # Extract bare words so punctuation glued to a verb ("submit.", "buy,")
        # cannot hide it, which a plain whitespace split would allow.
        words = re.findall(r"\w+", sentence.lower())

        # Rule: Check if an action verb is present
        if not verbs.isdisjoint(words):
            task_sentences.append(sentence)

    return task_sentences if task_sentences else None

# Re-read the source CSV (expected in the same directory as this notebook) and
# tag every row with the task sentences found in its "Raw Text" value
df = pd.read_csv("task_dataset_100.csv").assign(
    **{"Extracted Task": lambda frame: frame["Raw Text"].apply(extract_tasks)}
)

# Write the table, including the new column, to its own CSV
df.to_csv("extracted_tasks.csv", index=False)
print("✅ Task extraction complete! Saved as 'extracted_tasks.csv'.")


✅ Task extraction complete! Saved as 'extracted_tasks.csv'.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
# Category name -> keywords that assign a task to that category
categories = dict([
    ("Shopping", ["buy", "order"]),
    ("Work", ["submit", "prepare", "review", "complete"]),
    ("Household", ["clean", "fix", "repair"]),
    ("Meetings", ["schedule", "send", "arrange"]),
    ("Personal", ["book", "call"]),
])

# Function to categorize a task
def categorize_task(task, category_keywords=None):
    """Map a task description to the first category whose keyword it mentions.

    A keyword matches when it occurs at the start of a word in the lowercased
    text. Inflected forms still match ("clean" matches "cleaning"), but a
    keyword buried inside another word does not ("book" does not match
    "notebook", "call" does not match "recall").

    Args:
        task: Task text, or a list/tuple of task sentences, which are joined
            with spaces before matching. Falsy values (``None``, ``""``, an
            empty list) yield ``"Other"``.
        category_keywords: Optional mapping of category name to a list of
            keywords, checked in iteration order. Defaults to the module-level
            ``categories`` mapping.

    Returns:
        str: The name of the first matching category, or ``"Other"``.
    """
    if not task:
        return "Other"

    # Accept the list of sentences as well as plain text.
    if isinstance(task, (list, tuple)):
        task = " ".join(str(part) for part in task)
    text = str(task).lower()

    if category_keywords is None:
        category_keywords = categories

    for category, keywords in category_keywords.items():
        # `\b` anchors the keyword to a word start; a bare substring test would
        # also fire in the middle of unrelated words.
        if any(re.search(r"\b" + re.escape(keyword), text) for keyword in keywords):
            return category
    return "Other"

# Apply categorization to the text of the extracted sentences. "Extracted Task"
# holds a list of sentences or None; joining the sentences (and using "" when
# there are none) keeps the categorizer from seeing the list's Python repr or
# the literal string "None".
df["Category"] = df["Extracted Task"].apply(
    lambda sentences: categorize_task(
        " ".join(sentences) if isinstance(sentences, (list, tuple)) else ""
    )
)

# Save categorized tasks
df.to_csv("categorized_tasks.csv", index=False)
print("✅ Task categorization complete! Saved as 'categorized_tasks.csv'.")


✅ Task categorization complete! Saved as 'categorized_tasks.csv'.


In [15]:
# Function to extract names and deadlines
def extract_person_and_deadline(text):
    """Extract a person name and a deadline phrase from ``text``.

    Runs the module-level ``nlp`` pipeline on the text and scans the entities
    it reports. When several entities of the same kind are present, the last
    one encountered is kept.

    Args:
        text: Raw task text. Non-string or blank values (for example NaN from
            an empty CSV cell) return ``(None, None)`` without invoking the
            pipeline.

    Returns:
        tuple: ``(person, deadline)``. ``person`` is the text of the last
        entity labelled ``PERSON``; ``deadline`` is the text of the last
        entity labelled ``DATE`` or ``TIME``. Either is ``None`` when no such
        entity was found.
    """
    # The pipeline expects text; a missing value would make nlp() fail and
    # blank text has no entities, so skip the pipeline in both cases.
    if not isinstance(text, str) or not text.strip():
        return None, None

    person = None
    deadline = None

    for ent in nlp(text).ents:
        if ent.label_ == "PERSON":  # Extract person
            person = ent.text
        elif ent.label_ in ("DATE", "TIME"):  # Extract deadline
            deadline = ent.text

    return person, deadline

# Run entity extraction on every raw text, then split the (person, deadline)
# pairs into two parallel sequences, one per new column
people, deadlines = zip(*df["Raw Text"].apply(extract_person_and_deadline))
df["Person"] = people
df["Deadline"] = deadlines

# Write the fully annotated table
df.to_csv("final_tasks.csv", index=False)
print("✅ Task extraction with people and deadlines complete! Saved as 'final_tasks.csv'.")


✅ Task extraction with people and deadlines complete! Saved as 'final_tasks.csv'.


In [16]:
# Preview the first ten rows of the columns produced by the cells above
print(
    df.loc[
        :, ["Raw Text", "Extracted Task", "Person", "Deadline", "Category"]
    ].head(10)
)


                                            Raw Text  \
0  The manager assigned David to prepare the meet...   
1  Daniel has to submit the tax documents by Apri...   
2      Ava needs to clean the garden this afternoon.   
3      Ava needs to clean the garden this afternoon.   
4  Noah should prepare a presentation for the con...   
5    Liam must schedule the meeting with the vendor.   
6  Olivia should finalize the budget report befor...   
7             John must submit the report by Monday.   
8  Noah should prepare a presentation for the con...   
9  The manager assigned David to prepare the meet...   

                                      Extracted Task Person        Deadline  \
0  [The manager assigned David to prepare the mee...  David            None   
1  [Daniel has to submit the tax documents by Apr...   None        April 15   
2    [Ava needs to clean the garden this afternoon.]   None  this afternoon   
3    [Ava needs to clean the garden this afternoon.]   None  this a