In [None]:
import pandas as pd
import random
import os

# Label vocabulary for the tool. The order is significant: the interactive menu
# numbers these 1..N in list order, and each name becomes a 0/1 column in the
# saved CSV.
CATEGORIES = ["sexism", "racism", "violence", "appearance", "ability", "non-offensive"]

def create_multi_label_tool(data_path, output_file, num_samples=100):
    """Interactively label sampled comments with one or more CATEGORIES.

    Reads the dataset CSV, asks which column holds the comment text, draws up
    to ``num_samples`` rows (fixed ``random_state`` so repeated runs draw the
    same sample), and prompts for category numbers for every comment that is
    not yet labeled. If ``output_file`` already exists, its rows are kept and
    only comments not already present are appended, so a session can be
    resumed.

    Args:
        data_path: Path to the source CSV containing the comments.
        output_file: Path of the CSV the labels are written to (and resumed
            from, when it already exists). It is rewritten after every
            comment and on quit.
        num_samples: Maximum number of rows to sample from the dataset.

    Returns:
        None. Results are written to ``output_file``. Returns early, without
        writing, if the chosen column is missing from the dataset or the
        existing output file has no 'comment' column.
    """
    # Load the data
    df = pd.read_csv(data_path)

    # Show the available columns so the user can pick the text column
    print(f"Columns in your dataset: {df.columns.tolist()}")

    text_column = input("Which column contains the comments? Enter a name from above: ")

    if text_column not in df.columns:
        print(f"Error: Column '{text_column}' not found.")
        return

    # Select random samples
    if len(df) > num_samples:
        sample_df = df.sample(num_samples, random_state=42)
    else:
        sample_df = df

    # Build the label table on a fresh positional index. Keeping the sampled
    # rows' original index would make the "Comment i/N" counter show source
    # row numbers, and a non-unique source index would make the .at[] writes
    # below hit more than one row.
    labeled_df = pd.DataFrame({'comment': sample_df[text_column]}).reset_index(drop=True)

    # One 0/1 column per category, plus a flag tracking labeling progress
    for category in CATEGORIES:
        labeled_df[category] = 0
    labeled_df['is_labeled'] = False

    # Resume from an existing output file if there is one
    if os.path.exists(output_file):
        existing_df = pd.read_csv(output_file)
        print(f"Found existing labeled data with {len(existing_df)} entries.")

        if 'comment' not in existing_df.columns:
            print(f"Error: Column 'comment' not found in existing file '{output_file}'.")
            return

        # Make sure all category columns exist in the existing data
        for category in CATEGORIES:
            if category not in existing_df.columns:
                existing_df[category] = 0

        # Older files may lack the flag: treat any row with a category set as labeled
        if 'is_labeled' not in existing_df.columns:
            existing_df['is_labeled'] = existing_df[CATEGORIES].sum(axis=1) > 0

        # Add only comments that are not already in the existing file
        existing_comments = set(existing_df['comment'].tolist())
        new_comments = [comment for comment in labeled_df['comment'] if comment not in existing_comments]

        if new_comments:
            new_df = pd.DataFrame({'comment': new_comments})
            for category in CATEGORIES:
                new_df[category] = 0
            new_df['is_labeled'] = False
            labeled_df = pd.concat([existing_df, new_df], ignore_index=True)
        else:
            labeled_df = existing_df
            print("No new comments to label.")

    # Interactive labeling
    total = len(labeled_df)
    for idx, row in labeled_df.iterrows():
        # Skip if already labeled
        if row['is_labeled']:
            continue

        # Display the comment
        print("\n" + "="*80)
        print(f"Comment {idx+1}/{total}:")
        print(row['comment'])
        print("="*80)

        # Show category options
        print("\nCategories (select multiple by entering numbers separated by spaces):")
        for i, category in enumerate(CATEGORIES):
            print(f"{i+1}. {category}")

        # Keep prompting until we get a valid selection, a skip, or a quit
        while True:
            raw_choice = input("\nEnter category numbers (or 's' to skip, 'q' to quit): ")
            # Tolerate stray whitespace/case around the commands ("q ", " S")
            choice = raw_choice.strip().lower()

            if choice == 'q':
                labeled_df.to_csv(output_file, index=False)
                print(f"Progress saved to {output_file}.")
                print(f"Labeled {labeled_df['is_labeled'].sum()} comments.")
                return

            if choice == 's':
                break

            try:
                selections = [int(num) for num in choice.split()]
            except ValueError:
                print("Please enter numbers separated by spaces, 's', or 'q'.")
                continue

            # A bare Enter yields an empty list; all([]) is True, so without
            # this guard the comment would be marked labeled with no category.
            if not selections:
                print("Please enter numbers separated by spaces, 's', or 'q'.")
                continue

            if not all(1 <= num <= len(CATEGORIES) for num in selections):
                print(f"Invalid choice. Please enter numbers between 1 and {len(CATEGORIES)}.")
                continue

            # Write the full 0/1 row for this comment in one pass
            chosen = {CATEGORIES[num - 1] for num in selections}
            for category in CATEGORIES:
                labeled_df.at[idx, category] = 1 if category in chosen else 0

            # Mark as labeled
            labeled_df.at[idx, 'is_labeled'] = True
            break

        # Save progress after each comment so an interrupted session loses nothing
        labeled_df.to_csv(output_file, index=False)

    # Final save (also covers the case where every comment was already labeled)
    labeled_df.to_csv(output_file, index=False)
    print(f"Labeling complete! Saved to {output_file}")
    print(f"Labeled {labeled_df['is_labeled'].sum()} comments.")

if __name__ == "__main__":
    # Collect the run parameters interactively, then start the labeling session.
    # Paths are stripped so a trailing space from a paste does not break the lookup.
    data_path = input("Enter the path to your dataset CSV file: ").strip()
    output_file = input("Enter the path for saving labeled data (e.g., labeled_data.csv): ").strip()

    # Re-prompt instead of crashing: int() raises on non-numeric text, and a
    # negative sample size would be rejected later by DataFrame.sample.
    while True:
        raw_count = input("How many comments do you want to label? ")
        try:
            num_samples = int(raw_count)
        except ValueError:
            print("Please enter a whole number.")
            continue
        if num_samples >= 0:
            break
        print("Please enter a number that is 0 or greater.")

    create_multi_label_tool(data_path, output_file, num_samples)

Columns in your dataset: ['video_id', 'comment']
Found existing labeled data with 100 entries.

Comment 97/192:
Stream full match DAZN, open all territories, look they all so wet 🥹

Categories (select multiple by entering numbers separated by spaces):
1. sexism
2. racism
3. violence
4. appearance
5. ability
6. non-offensive

Comment 101/192:
Brilliant 👍😊

Categories (select multiple by entering numbers separated by spaces):
1. sexism
2. racism
3. violence
4. appearance
5. ability
6. non-offensive

Comment 102/192:
I love them all, but I'm missing Caroline Graham Hansen's goal against Chelsea!

Categories (select multiple by entering numbers separated by spaces):
1. sexism
2. racism
3. violence
4. appearance
5. ability
6. non-offensive

Comment 103/192:
I definitely think they should make more these types of series for both the men and women games.

Categories (select multiple by entering numbers separated by spaces):
1. sexism
2. racism
3. violence
4. appearance
5. ability
6. non-offensive

: 