In [152]:
import numpy as np
import os
import pandas as pd

from datetime import datetime
import utils

# Utilities

In [153]:
np.random.seed(69)

In [154]:
def read_csvs(dir: str) -> pd.DataFrame:
    """Read all CSVs in a directory onto a single DataFrame."""
    
    dfs = []
    
    for file in os.listdir(dir):
        if not file.endswith(".csv"):
            continue
        path = os.path.join(dir, file)
        dfs.append(pd.read_csv(path))

    return pd.concat(dfs)

# Gemini

In [155]:
gemini = read_csvs("../data/gemini/")
gemini.head()

Unnamed: 0,Task,Category
0,Finish Quarterly Report,Urgent and Important
1,Schedule Meeting with Client X,Urgent and Important
2,Respond to Urgent Email,Urgent but Not Important
3,Clean Desk,Neither Urgent nor Important
4,Learn New Programming Language,Important but Not Urgent


In [156]:
gemini.describe()

Unnamed: 0,Task,Category
count,495,495
unique,177,9
top,Update Resume,Important but Not Urgent
freq,6,154


In [157]:
gemini.drop_duplicates(inplace=True)
gemini.describe()

Unnamed: 0,Task,Category
count,278,278
unique,177,9
top,Exercise Regularly,Important but Not Urgent
freq,3,93


In [158]:
gemini["Category"].unique()

array(['Urgent and Important', 'Urgent but Not Important',
       'Neither Urgent nor Important', 'Important but Not Urgent',
       'Important but Not Important', ' Urgent and Important',
       ' Important but Not Urgent', ' Neither Urgent nor Important',
       ' Urgent but Not Important'], dtype=object)

In [159]:
gemini = gemini.map(lambda x: x.strip())
gemini.describe()

Unnamed: 0,Task,Category
count,278,278
unique,177,5
top,Exercise Regularly,Important but Not Urgent
freq,3,143


In [160]:
gemini["Category"].unique()

array(['Urgent and Important', 'Urgent but Not Important',
       'Neither Urgent nor Important', 'Important but Not Urgent',
       'Important but Not Important'], dtype=object)

In [161]:
gemini["Category"] = gemini["Category"].replace({"Important but Not Important": "Important but Not Urgent"})
gemini["Category"].unique()

array(['Urgent and Important', 'Urgent but Not Important',
       'Neither Urgent nor Important', 'Important but Not Urgent'],
      dtype=object)

In [162]:
gemini.describe()

Unnamed: 0,Task,Category
count,278,278
unique,177,4
top,Exercise Regularly,Important but Not Urgent
freq,3,183


In [163]:
gemini.drop_duplicates(subset="Task", inplace=True)
gemini.describe()

Unnamed: 0,Task,Category
count,177,177
unique,177,4
top,Finish Quarterly Report,Important but Not Urgent
freq,1,96


In [164]:
gemini["Category"] = gemini["Category"].map({
    "Urgent and Important": "urgent important", 
    "Urgent but Not Important": "urgent not-important",
    "Neither Urgent nor Important": "not-urgent not-important", 
    "Important but Not Urgent": "not-urgent important",
})

gemini.sample(10)

Unnamed: 0,Task,Category
87,Develop Targeted Marketing Campaigns,urgent important
31,Learn New Skill,not-urgent important
32,Attend Webinar,not-urgent important
115,Solve Practice Problems,not-urgent important
57,Improve Customer Satisfaction,urgent important
13,Fix Broken Printer,urgent not-important
47,Create Email Marketing Campaign,urgent important
8,Attend Networking Event,not-urgent important
56,Analyze Sales Data,not-urgent important
102,Volunteer at Local Animal Shelter,not-urgent important


# GPT

In [165]:
gpt = read_csvs("../data/gpt/")
gpt.head()

Unnamed: 0,Task,Category
0,Respond to client emails,Urgent & Important
1,Prepare monthly sales report,Urgent & Important
2,Review budget proposal,Important but Not Urgent
3,Attend weekly team meeting,Urgent & Important
4,Organize department files,Not Urgent & Not Important


In [166]:
gpt.describe()

Unnamed: 0,Task,Category
count,2309,2309
unique,2288,5
top,Update LinkedIn profile,Urgent but Not Important
freq,3,619


In [167]:
gpt.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2309 entries, 0 to 101
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Task      2309 non-null   object
 1   Category  2309 non-null   object
dtypes: object(2)
memory usage: 54.1+ KB


In [168]:
gpt.drop_duplicates(subset="Task", inplace=True)
gpt.describe()

Unnamed: 0,Task,Category
count,2288,2288
unique,2288,5
top,Respond to client emails,Urgent but Not Important
freq,1,613


In [169]:
gpt["Category"].unique()

array(['Urgent & Important', 'Important but Not Urgent',
       'Not Urgent & Not Important', 'Urgent but Not Important',
       'Urgent but Not Important  '], dtype=object)

In [170]:
gpt["Category"] = gpt["Category"].map({
    "Urgent & Important": "urgent important",
    "Important but Not Urgent": "not-urgent important",
    "Urgent but Not Important": "urgent not-important",
    "Not Urgent & Not Important": "not-urgent not-important",
})

gpt.sample(10)

Unnamed: 0,Task,Category
169,take an online personality quiz,not-urgent not-important
126,prepare the proposal for the university resear...,urgent important
210,check out new tech tools for virtual meetings,not-urgent not-important
679,review lecture notes,urgent not-important
244,submit the client proposal for the new project,urgent important
60,explore funny posts on Reddit,not-urgent not-important
256,approve the new marketing campaign strategy,urgent important
333,check with IT about office Wi-Fi issue,urgent not-important
58,check the weather forecast for fun,not-urgent not-important
264,prepare the lab report for the chemistry exper...,urgent important


# Claude

In [171]:
claude = read_csvs("../data/claude/")
claude.head()

Unnamed: 0,Task,Category
0,Reorganize backpack contents,Not Important & Not Urgent
1,Update phone home screen layout,Not Important & Not Urgent
2,Sort photos from class events,Not Important & Not Urgent
3,Customize notebook covers,Not Important & Not Urgent
4,Organize pencil case,Not Important & Not Urgent


In [172]:
claude.describe()

Unnamed: 0,Task,Category
count,596,596
unique,582,4
top,Organize digital notes,Not Important & Not Urgent
freq,3,321


In [173]:
claude.drop_duplicates(subset="Task", inplace=True)
claude.describe()

Unnamed: 0,Task,Category
count,582,582
unique,582,4
top,Reorganize backpack contents,Not Important & Not Urgent
freq,1,311


In [174]:
claude["Category"].unique()

array(['Not Important & Not Urgent', 'Important & Urgent',
       'Important & Not Urgent', 'Not Important & Urgent'], dtype=object)

In [175]:
claude["Category"] = claude["Category"].map({
    "Important & Urgent": "urgent important",
    "Important & Not Urgent": "not-urgent important",
    "Not Important & Urgent": "urgent not-important",
    "Not Important & Not Urgent": "not-urgent not-important",
})

gpt.sample(10)

Unnamed: 0,Task,Category
575,revisit past exams for practice,urgent not-important
512,email team about project delay,urgent not-important
474,review the draft training modules for accuracy...,urgent important
69,set personal goals for the next month,not-urgent important
111,prepare for the university interview next week,urgent important
225,remind team to set out-of-office,urgent not-important
74,call for water delivery,
228,organize your desk with better storage solutions,not-urgent not-important
38,prepare casual invites,
697,review course prerequisites,urgent not-important


# Manual

In [176]:
manual = pd.read_csv("../data/manual.csv")
manual.head()

Unnamed: 0,Task,Category
0,"Browse and buy a birthday present for her mom,...",not-urgent important
1,Study for Chemistry Quiz,Urgent and Important
2,Read English Literature Book,Important but Not Urgent
3,Practice Guitar,Important but Not Urgent
4,Join Study Group,Important but Not Urgent


In [177]:
manual["Category"].unique()

array(['not-urgent important', 'Urgent and Important',
       'Important but Not Urgent', 'Neither Urgent nor Important',
       'Urgent but Not Important'], dtype=object)

In [178]:
manual["Category"] = manual["Category"].replace({
    "Urgent and Important": "urgent important",
    "Important but Not Urgent": "not-urgent important",
    "Neither Urgent nor Important": "not-urgent not-important",
    "Urgent but Not Important": "urgent not-important"
})

manual.head()

Unnamed: 0,Task,Category
0,"Browse and buy a birthday present for her mom,...",not-urgent important
1,Study for Chemistry Quiz,urgent important
2,Read English Literature Book,not-urgent important
3,Practice Guitar,not-urgent important
4,Join Study Group,not-urgent important


# Combine

In [179]:
combined = pd.concat([gemini, gpt, claude, manual])
combined.describe()

Unnamed: 0,Task,Category
count,3195,3054
unique,3178,4
top,Journal,not-urgent important
freq,2,790


# Apply Lemmatization

In [180]:
combined["Task"] = combined["Task"].apply(utils.lemmatize)
combined.head()

Unnamed: 0,Task,Category
0,finish quarterly report,urgent important
1,schedule meeting with client x,urgent important
2,respond to urgent email,urgent not-important
3,clean desk,not-urgent not-important
4,learn new programming language,not-urgent important


# Save Dataset

In [181]:
now = datetime.now().strftime(r"%Y-%m-%d_%H-%M-%S")
combined.to_csv(f"../data/clean/{now}.csv", index=False)