In [1]:
import pandas as pd
from email.parser import Parser
import re

# Load your existing CSV with 'file' and 'message' columns
df = pd.read_csv('enron_emails.csv')

# Function to extract email details from the raw email content
def extract_email_data(raw_email):
    email = Parser().parsestr(raw_email)
    
    
    # Extract the other email components
    return {
        "From": email.get("From"),
        "To": email.get("To"),
        "CC": email.get("CC"),
        "BCC": email.get("BCC"),
        "Subject": email.get("Subject"),
        "Body": email.get_payload()
    }

# Apply the extraction function to each 'message' field in the DataFrame
extracted_data = df['message'].apply(extract_email_data)

# Convert the extracted data (list of dictionaries) into a DataFrame
email_details_df = pd.json_normalize(extracted_data)

# Retain only the original 'file' column and the new extracted columns
df_final = pd.concat([df[['file']], email_details_df], axis=1)

# Save the modified DataFrame to a new CSV
df_final.to_csv('modified_enron_emails.csv', index=False)

print("CSV modified and saved as 'modified_enron_emails.csv'")



CSV modified and saved as 'modified_enron_emails.csv'


In [None]:
import pandas as pd

# Set pandas options for better display
pd.set_option('display.max_colwidth', None)  # Show full column content
pd.set_option('display.width', 1000)  # Set max width of the display
pd.set_option('display.max_columns', None)  # Ensure all columns are displayed
pd.set_option('display.max_rows', 10)  # Limit the number of rows shown at once

# Load your CSV file
df = pd.read_csv('modified_enron_emails.csv')

# Define a function to output 10 rows at a time
def display_in_batches(df, batch_size=10):
    total_rows = len(df)
    for start in range(0, total_rows, batch_size):
        # Get the batch of rows
        batch = df.iloc[start:start + batch_size]
        
        # Display the batch of rows
        print(batch.to_string(index=False))  # to_string ensures no ellipsis for truncation
        
        # Wait for user input to show the next batch
        input("\nPress Enter to see the next batch...")

# Call the function to display data in batches of 10
display_in_batches(df)




In [7]:
import pandas as pd
import numpy as np

# Path to your dataset
input_file = 'modified_enron_emails.csv'
output_file = 'modified_enron_emails_cleaned.csv'

print(f"Loading data from {input_file}...")
df = pd.read_csv(input_file)

# Print initial info about columns with NaN
print("\nColumns with NaN values before cleaning:")
for col in df.columns:
    nan_count = df[col].isna().sum()
    if nan_count > 0:
        print(f"Column '{col}': {nan_count} NaN values ({nan_count/len(df)*100:.2f}%)")

# List columns to convert NaN to empty string
string_columns = ['CC', 'BCC', 'To', 'From', 'Subject']
available_columns = [col for col in string_columns if col in df.columns]

# Convert NaN values to empty strings
for col in available_columns:
    df[col] = df[col].fillna('none')
    print(f"Converted NaN values in '{col}' to empty strings")

# Verify the changes
print("\nColumns with NaN values after cleaning:")
for col in df.columns:
    nan_count = df[col].isna().sum()
    if nan_count > 0:
        print(f"Column '{col}': {nan_count} NaN values ({nan_count/len(df)*100:.2f}%)")

# Save the cleaned dataset
df.to_csv(output_file, index=False)
print(f"\nCleaned data saved to {output_file}")
print(f"Total rows: {len(df)}")

Loading data from modified_enron_emails.csv...

Columns with NaN values before cleaning:
Column 'To': 21847 NaN values (4.22%)
Column 'CC': 389520 NaN values (75.28%)
Column 'BCC': 389520 NaN values (75.28%)
Column 'Subject': 19187 NaN values (3.71%)
Converted NaN values in 'CC' to empty strings
Converted NaN values in 'BCC' to empty strings
Converted NaN values in 'To' to empty strings
Converted NaN values in 'From' to empty strings
Converted NaN values in 'Subject' to empty strings

Columns with NaN values after cleaning:

Cleaned data saved to modified_enron_emails_cleaned.csv
Total rows: 517401


In [None]:
import pandas as pd
from tabulate import tabulate
import os

# Load the CSV
df = pd.read_csv('modified_enron_emails_cleaned.csv')

# Add an "Action_Item" column if it doesn't exist
if "Action_Item" not in df.columns:
    df["Action_Item"] = None  # Using None instead of "" for unlabeled

# Output file for labeled data
save_file = "labeled_enron.csv"

# Function to display and label batches
def label_batches(df, batch_size=10):
    # Make a working copy of the main dataframe
    working_df = df.copy()
    
    # Check if we have existing labeled data
    if os.path.exists(save_file):
        labeled_df = pd.read_csv(save_file)
        
        # Get indices of emails that have already been labeled
        merge_columns = [col for col in df.columns if col != 'Action_Item']
        
        # Mark rows that have already been labeled
        already_labeled = pd.merge(working_df, labeled_df, on=merge_columns, how='inner').index
        print(f"Found {labeled} already labeled emails")
        
        # Create fresh output file with headers
        if not already_labeled.empty:
            labeled_df.to_csv(save_file, index=False)
    else:
        # Create an empty file with headers
        working_df.head(0).to_csv(save_file, index=False)
        already_labeled = []
    
    # Filter out already labeled emails
    unlabeled_df = working_df.drop(already_labeled) if labeled > 0 else working_df
    
    total_unlabeled = len(unlabeled_df)
    print(f"Starting labeling process with {total_unlabeled} unlabeled emails")
    
    # Process in batches
    for start in range(0, total_unlabeled, batch_size):
        batch = unlabeled_df.iloc[start:start + batch_size].copy()
        
        # Display batch information
        print("\n" + "="*20 + f" BATCH {start//batch_size + 1}/{(total_unlabeled-1)//batch_size + 1} " + "="*20)
        
        # Create a display version of the batch that hides the 'file' column
        display_columns = [col for col in batch.columns if col.lower() != 'file']
        print(tabulate(batch[display_columns], headers="keys", tablefmt="fancy_grid", showindex=True))
        
        # Get user input for each row
        for i, (idx, row) in enumerate(batch.iterrows()):
            label = input(f"Email {start + i + 1}/{total_unlabeled} - Enter action item (press Enter for 'No action'): ").strip()
            
            # Explicitly mark as "No action" if skipped
            batch.at[idx, "Action_Item"] = label if label else "No action"
            
            # Show the current state of the batch with updated action items
            # This shows the progress as each email is labeled
            if i < len(batch) - 1:  # Don't show after the last item in batch
                print("\nCurrent labeling progress:")
                current_progress = batch.iloc[:i+1].copy()
                display_columns = [col for col in current_progress.columns if col.lower() != 'file']
                print(tabulate(current_progress[display_columns], headers="keys", tablefmt="fancy_grid", showindex=True))
        
        # Append newly labeled data to the CSV
        batch.to_csv(save_file, mode='a', header=False, index=False)
        
        print("\n✅ Batch saved.")
        continue_labeling = input("Continue to next batch? (y/n): ").lower()
        if continue_labeling != 'y':
            break

# Start labeling
label_batches(df)

Found 10 already labeled emails
Starting labeling process with 517391 unlabeled emails

╒════╤═════════════════════════╤════════════════════════════╤══════╤═══════╤═════════════════════════════════════════════════════════════╤════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╤═══════════════╕
│    │ From                    │ To                         │ CC   │ BCC   │ Subject                                                     │ Body                                                                                                                       │ Action_Item   │
╞════╪═════════════════════════╪════════════════════════════╪══════╪═══════╪═════════════════════════════════════════════════════════════╪════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╪═══════════════╡
│ 10 │ phillip.allen@enron.com │ buck.buckner@honeywell.com │ none │ non

In [1]:
import pandas as pd
from tabulate import tabulate
import os
import numpy as np

# Load the CSV
df = pd.read_csv('modified_enron_emails_cleaned.csv')

# Add an "Action_Item" column if it doesn't exist
if "Action_Item" not in df.columns:
    df["Action_Item"] = None  # Using None instead of "" for unlabeled

# Output file for labeled data
save_file = "test_enron.csv"
already_labeled = 0
# Function to display and label batches
def label_batches(df, batch_size=25):
    # Make a working copy of the main dataframe
    working_df = df.copy()
    # Check if we have existing labeled data
    labeled = 0
    if os.path.exists(save_file):
        labeled_df = pd.read_csv(save_file)
        
        # Get indices of emails that have already been labeled
        merge_columns = [col for col in df.columns if col != 'Action_Item']
        global already_labeled
        # Mark rows that have already been labeled
        already_labeled = pd.merge(working_df, labeled_df, on=merge_columns, how='inner').index 
        labeled = len(already_labeled)
        print(already_labeled)
        print(f"Found {labeled} already labeled emails")
        
        # Create fresh output file with headers
        if not already_labeled.empty:
            labeled_df.to_csv(save_file, index=False)
    else:
        # Create an empty file with headers
        working_df.head(0).to_csv(save_file, index=False)
        already_labeled = []
    
    # Filter out already labeled emails
    unlabeled_df = working_df.drop(already_labeled) if labeled > 0 else working_df
    
    # Shuffle the unlabeled emails randomly
    unlabeled_df = unlabeled_df.sample(frac=1, random_state=42).reset_index(drop=True)
    
    total_unlabeled = len(unlabeled_df)
    print(f"Starting labeling process with {total_unlabeled} unlabeled emails")
    
    # Process in batches
    for start in range(0, total_unlabeled, batch_size):
        batch = unlabeled_df.iloc[start + 1250:start + batch_size + 1250].copy()
        
        # Display batch information
        print("\n" + "="*20 + f" BATCH {start//batch_size + 1}/{(total_unlabeled-1)//batch_size + 1} " + "="*20)
        
        # Create a display version of the batch that hides the 'file' column
        display_columns = [col for col in batch.columns if col.lower() != 'file']
        
        # Display the entire batch table before any input
        print(tabulate(batch[display_columns], headers="keys", tablefmt="fancy_grid", showindex=True))
        
        # Get user input for each row
        for i, (idx, row) in enumerate(batch.iterrows()):
            # Display which email we're labeling now
            print(f"\nCurrently labeling Email {i+1} (shown in row {i} in the table above)")
            
            label = input(f"Email {start + i + 1 + labeled}/{total_unlabeled} - Enter action item (press Enter for 'No action'): ").strip()
            
            # Explicitly mark as "No action" if skipped
            batch.at[idx, "Action_Item"] = label if label else "No action"
        
        # Append newly labeled data to the CSV
        batch.to_csv(save_file, mode='a', header=False, index=False)
        
        print("\n✅ Batch saved.")
        continue_labeling = input("Continue to next batch? (y/n): ").lower()
        if continue_labeling != 'y':
            break

# Start labeling
label_batches(df)

RangeIndex(start=0, stop=225, step=1)
Found 225 already labeled emails
Starting labeling process with 517176 unlabeled emails

╒══════╤══════════════════════════════════╤════════════════════════════════════════════════════════════════════════════╤════════════════════════════════════════════════════════════════════════════════╤════════════════════════════════════════════════════════════════════════════════╤════════════════════════════════════════════════════════════════════╤═══════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╤═══════════════╕
│      │ From        

In [5]:
import pandas as pd 

df = pd.read_csv("labeled_enron.csv")
df2 = df[['Body', 'Action_Item']]
df2.to_csv("hf://datasets/DenverDawgs18/ActionItems/final.csv")

In [None]:
import pandas as pd
from tabulate import tabulate
import os
import numpy as np

# Load the existing labeled data
labeled_file = "labeled_enron.csv"

if not os.path.exists(labeled_file):
    print(f"Error: {labeled_file} not found. Please make sure the file exists.")
    exit()

# Load the CSV
df = pd.read_csv(labeled_file)

# Ensure Action_Item column exists
if "Action_Item" not in df.columns:
    print("Warning: Action_Item column not found in the CSV. Adding it now.")
    df["Action_Item"] = None

# Output file for updated data
save_file = "updated_labeled_enron.csv"

# Function to display and update labels
def update_labels(df, batch_size=20):
    # Make a working copy of the main dataframe
    working_df = df.copy()
    
    # Shuffle the emails randomly for review
    working_df = working_df.sample(frac=1, random_state=42).reset_index(drop=True)
    
    total_emails = len(working_df)
    print(f"Starting update process with {total_emails} labeled emails")
    
    # Create fresh output file with headers
    working_df.head(0).to_csv(save_file, index=False)
    
    updates_made = 0
    
    # Process in batches
    for start in range(0, total_emails, batch_size):
        batch = working_df.iloc[start:start + batch_size].copy()
        
        # Display batch information
        print("\n" + "="*20 + f" BATCH {start//batch_size + 1}/{(total_emails-1)//batch_size + 1} " + "="*20)
        
        # Create a display version of the batch that hides the 'file' column
        display_columns = [col for col in batch.columns if col.lower() != 'file']
        
        # Display the entire batch table before any input
        print(tabulate(batch[display_columns], headers="keys", tablefmt="fancy_grid", showindex=True))
        
        # Get user input for each row
        for i, (idx, row) in enumerate(batch.iterrows()):
            # Display which email we're updating now
            print(f"\nCurrently reviewing Email {i+1} (shown in row {i} in the table above)")
            
            # Show the current label
            current_label = row["Action_Item"] if pd.notna(row["Action_Item"]) else "No label"
            print(f"Current label: \"{current_label}\"")
            
            # Ask if user wants to update the label
            update_choice = input("Update this label? (y/n, default=n): ").strip().lower()
            
            if update_choice == 'y':
                new_label = input("Enter new label (press Enter for 'No action'): ").strip()
                # Explicitly mark as "No action" if skipped
                batch.at[idx, "Action_Item"] = new_label if new_label else "No action"
                updates_made += 1
                print(f"Label updated to: \"{batch.at[idx, 'Action_Item']}\"")
            else:
                print("Label unchanged.")
        
        # Append updated data to the CSV
        batch.to_csv(save_file, mode='a', header=False, index=False)
        
        print("\n✅ Batch saved.")
        continue_updating = input("Continue to next batch? (y/n, default=y): ").strip().lower()
        if continue_updating == 'n':
            break
    
    print(f"\nUpdate process completed. {updates_made} labels were updated.")
    print(f"Updated data saved to {save_file}")

# Prompt for batch size
try:
    batch_size = int(input("Enter batch size (default=1): ") or "1")
except ValueError:
    print("Invalid input. Using default batch size of 1.")
    batch_size = 1

# Start updating labels
update_labels(df, batch_size)

Starting update process with 1250 labeled emails

╒════╤═════════════════════════════╤═════════════════════════════════════════════════════════════════════════════╤════════════════════════════════════════════════════════════════════════════╤════════════════════════════════════════════════════════════════════════════╤═════════════════════════════════════════════════════════════╤═════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╤════════════════════════════════════════════════════════════════════════════════════════════╕
│    │ From                        │ To                                                                          │ CC                               