In [29]:
import numpy as np
import os
import pandas as pd

from datetime import datetime

# Utilities

In [30]:
np.random.seed(69)

In [31]:
def read_csvs(dir: str) -> pd.DataFrame:
    """Read all CSVs in a directory onto a single DataFrame."""
    
    dfs = []
    
    for file in os.listdir(dir):
        if not file.endswith(".csv"):
            continue
        path = os.path.join(dir, file)
        dfs.append(pd.read_csv(path))

    return pd.concat(dfs)

# Gemini

In [32]:
gemini = read_csvs("../data/gemini/")
gemini.head()

Unnamed: 0,Task,Category
0,Finish Quarterly Report,Urgent and Important
1,Schedule Meeting with Client X,Urgent and Important
2,Respond to Urgent Email,Urgent but Not Important
3,Clean Desk,Neither Urgent nor Important
4,Learn New Programming Language,Important but Not Urgent


In [33]:
gemini.describe()

Unnamed: 0,Task,Category
count,495,495
unique,177,9
top,Update Resume,Important but Not Urgent
freq,6,154


In [34]:
gemini.drop_duplicates(inplace=True)
gemini.describe()

Unnamed: 0,Task,Category
count,278,278
unique,177,9
top,Exercise Regularly,Important but Not Urgent
freq,3,93


In [35]:
gemini["Category"].unique()

array(['Urgent and Important', 'Urgent but Not Important',
       'Neither Urgent nor Important', 'Important but Not Urgent',
       'Important but Not Important', ' Urgent and Important',
       ' Important but Not Urgent', ' Neither Urgent nor Important',
       ' Urgent but Not Important'], dtype=object)

In [36]:
gemini = gemini.map(lambda x: x.strip())
gemini.describe()

Unnamed: 0,Task,Category
count,278,278
unique,177,5
top,Exercise Regularly,Important but Not Urgent
freq,3,143


In [37]:
gemini["Category"].unique()

array(['Urgent and Important', 'Urgent but Not Important',
       'Neither Urgent nor Important', 'Important but Not Urgent',
       'Important but Not Important'], dtype=object)

In [38]:
gemini["Category"] = gemini["Category"].replace({"Important but Not Important": "Important but Not Urgent"})
gemini["Category"].unique()

array(['Urgent and Important', 'Urgent but Not Important',
       'Neither Urgent nor Important', 'Important but Not Urgent'],
      dtype=object)

In [39]:
gemini.describe()

Unnamed: 0,Task,Category
count,278,278
unique,177,4
top,Exercise Regularly,Important but Not Urgent
freq,3,183


In [40]:
gemini.drop_duplicates(subset="Task", inplace=True)
gemini.describe()

Unnamed: 0,Task,Category
count,177,177
unique,177,4
top,Finish Quarterly Report,Important but Not Urgent
freq,1,96


In [41]:
gemini["Category"] = gemini["Category"].map({
    "Urgent and Important": "urgent important", 
    "Urgent but Not Important": "urgent not-important",
    "Neither Urgent nor Important": "not-urgent not-important", 
    "Important but Not Urgent": "not-urgent important",
})

gemini.sample(10)

Unnamed: 0,Task,Category
87,Develop Targeted Marketing Campaigns,urgent important
31,Learn New Skill,not-urgent important
32,Attend Webinar,not-urgent important
115,Solve Practice Problems,not-urgent important
57,Improve Customer Satisfaction,urgent important
13,Fix Broken Printer,urgent not-important
47,Create Email Marketing Campaign,urgent important
8,Attend Networking Event,not-urgent important
56,Analyze Sales Data,not-urgent important
102,Volunteer at Local Animal Shelter,not-urgent important


# GPT

In [42]:
gpt = read_csvs("../data/gpt/")
gpt.head()

Unnamed: 0,Task,Category
0,Respond to urgent emails,Urgent & Important
1,Prepare project report,Urgent & Important
2,Plan next month's budget,Important but Not Urgent
3,Learn a new software tool,Important but Not Urgent
4,Attend team meeting,Urgent & Important


In [43]:
gpt.describe()

Unnamed: 0,Task,Category
count,504,504
unique,492,4
top,Update LinkedIn profile,Important but Not Urgent
freq,3,265


In [44]:
gpt.info()

<class 'pandas.core.frame.DataFrame'>
Index: 504 entries, 0 to 95
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Task      504 non-null    object
 1   Category  504 non-null    object
dtypes: object(2)
memory usage: 11.8+ KB


In [45]:
gpt.drop_duplicates(subset="Task", inplace=True)
gpt.describe()

Unnamed: 0,Task,Category
count,492,492
unique,492,4
top,Respond to urgent emails,Important but Not Urgent
freq,1,260


In [46]:
gpt["Category"].unique()

array(['Urgent & Important', 'Important but Not Urgent',
       'Urgent but Not Important', 'Not Urgent & Not Important'],
      dtype=object)

In [47]:
gpt["Category"] = gpt["Category"].map({
    "Urgent & Important": "urgent important",
    "Important but Not Urgent": "not-urgent important",
    "Urgent but Not Important": "urgent not-important",
    "Not Urgent & Not Important": "not-urgent not-important",
})

gpt.sample(10)

Unnamed: 0,Task,Category
4,Organize department files,not-urgent not-important
78,Record meeting notes,urgent not-important
13,Proofread essay before submission,urgent important
45,Review company guidelines,not-urgent not-important
10,Reconcile company credit card transactions,urgent important
69,Schedule project milestones,not-urgent important
69,Create email templates for client responses,not-urgent important
39,Schedule social media posts,urgent not-important
5,Create quarterly project plan,not-urgent important
7,Submit class project,urgent important


# Claude

In [48]:
claude = read_csvs("../data/claude/")
claude.head()

Unnamed: 0,Task,Category
0,Complete quarterly financial report,Important & Urgent
1,Respond to client emergency,Important & Urgent
2,Submit tax documents,Important & Urgent
3,Fix critical system bug,Important & Urgent
4,Attend emergency team meeting,Important & Urgent


In [49]:
claude.describe()

Unnamed: 0,Task,Category
count,596,596
unique,582,4
top,Organize digital notes,Not Important & Not Urgent
freq,3,321


In [50]:
claude.drop_duplicates(subset="Task", inplace=True)
claude.describe()

Unnamed: 0,Task,Category
count,582,582
unique,582,4
top,Complete quarterly financial report,Not Important & Not Urgent
freq,1,311


In [51]:
claude["Category"].unique()

array(['Important & Urgent', 'Important & Not Urgent',
       'Not Important & Urgent', 'Not Important & Not Urgent'],
      dtype=object)

In [52]:
claude["Category"] = claude["Category"].map({
    "Important & Urgent": "urgent important",
    "Important & Not Urgent": "not-urgent important",
    "Not Important & Urgent": "urgent not-important",
    "Not Important & Not Urgent": "not-urgent not-important",
})

gpt.sample(10)

Unnamed: 0,Task,Category
26,Plan weekend study session,not-urgent important
64,Respond to email inquiries,urgent important
49,Schedule exam preparation time,not-urgent important
42,Organize files by client or project,not-urgent not-important
51,Brainstorm club activity ideas,not-urgent not-important
43,Check office maintenance requests,urgent important
18,Arrange office supplies restock,urgent not-important
0,Respond to urgent emails,urgent important
70,Look up class study resources,not-urgent not-important
18,Set reminders for exam dates,not-urgent important


# Manual

In [53]:
manual = pd.read_csv("../data/manual.csv")
manual.head()

Unnamed: 0,Task,Category
0,"Browse and buy a birthday present for her mom,...",not-urgent important


# Combine

In [54]:
combined = pd.concat([gemini, gpt, claude, manual])
combined.describe()

Unnamed: 0,Task,Category
count,1252,1252
unique,1245,4
top,Update contact list,not-urgent important
freq,2,456


In [55]:
now = datetime.now().strftime(r"%Y-%m-%d_%H-%M-%S")
combined.to_csv(f"../data/clean/{now}.csv", index=False)