For training your model you'll typically use the labeled data. In this case, the main file to focus on is:

train.csv (File #8)
The other files may be used for:

Evaluation or further analysis:
For instance, the test files (Files #4, #5, and #6) can be used for testing or validation.

Additional features or biases:
Files #2 and #7 (identity_individual_annotations.csv and toxicity_individual_annotations.csv) can help address bias or incorporate additional features.

Submission template:
File #3 (sample_submission.csv) is usually for organizing your submission.

Complete dataset:
File #1 (all_data.csv) may offer more extensive coverage, depending on your intended use case.

In [None]:
import os
import pandas as pd
import gdown
import string
import re
import nltk
import spacy
from tqdm.notebook import tqdm

# Request the NLTK "punkt" resource; nltk.download is invoked on every run.
nltk.download("punkt")

# Shared spaCy pipeline used by spacy_tokenize below. The listed components
# are passed to `disable` so they are not run.
# Prerequisite: python -m spacy download en_core_web_sm
nlp = spacy.load(
    "en_core_web_sm",
    disable=["parser", "ner", "textcat"],
)

def vectorized_clean(series):
    """
    Lowercase every string in `series` and strip ASCII punctuation.

    Works through the pandas `.str` accessor, so the whole Series is
    processed without a Python-level loop. Characters listed in
    `string.punctuation` are deleted (not replaced by a space).
    Returns a new Series; the input is not modified.
    """
    # Character class matching any single punctuation character.
    punct_class = f"[{re.escape(string.punctuation)}]"
    lowered = series.str.lower()
    return lowered.str.replace(punct_class, '', regex=True)

def spacy_tokenize(texts, batch_size=500):
    """
    Tokenize `texts` with the module-level spaCy pipeline, in batches.

    Each text is streamed through nlp.pipe (single process) with a tqdm
    progress bar sized by len(texts). Whitespace-only tokens are dropped
    and the remaining token texts are re-joined with single spaces.
    Returns a list of strings, one per input text, in input order.
    """
    docs = nlp.pipe(texts, batch_size=batch_size, n_process=1)
    return [
        " ".join(tok.text for tok in doc if not tok.is_space)
        for doc in tqdm(docs, total=len(texts))
    ]

# Directories for the raw download and the cleaned output; created on demand.
raw_dir = os.path.join("../raw_data", "jigsaw_unintended")
processed_dir = os.path.join("../processed_data", "jigsaw_unintended")
os.makedirs(raw_dir, exist_ok=True)
os.makedirs(processed_dir, exist_ok=True)

# Download train.csv using the direct download link
train_url = 'https://drive.google.com/uc?export=download&id=1N-orSYsJCubW2SXLXVukT9zfFf8aAg-C'
train_file = os.path.join(raw_dir, 'train.csv')
cleaned_train = os.path.join(processed_dir, 'cleaned_train.csv')

# Only download when the raw file is not already on disk.
if not os.path.exists(train_file):
    gdown.download(train_url, train_file, quiet=False)
else:
    print(f"{train_file} already exists; skipping download.")


# The cleaned CSV acts as a cache: if it exists, the whole pipeline is skipped.
if os.path.exists(cleaned_train):
    print(f"{cleaned_train} already exists; skipping processing.")
else:
    # Read the file in chunks so each cleaning pass works on a bounded frame.
    chunk_size = 10**6  # Adjust based on your memory capacity
    processed_chunks = []

    for chunk in pd.read_csv(train_file, chunksize=chunk_size):
        # De-duplication is per chunk: identical rows that land in different
        # chunks are not detected.
        chunk.drop_duplicates(inplace=True)

        if 'comment_text' in chunk.columns:
            # Fill missing values only in the text column. Filling the whole
            # frame with '' pushes a string into numeric columns, which
            # coerces them to object dtype and triggers a pandas warning;
            # to_csv already writes NaN as an empty field, so the other
            # columns need no filling.
            comments = chunk['comment_text'].fillna('')

            # Vectorized cleaning: lowercase & remove punctuation
            comments = vectorized_clean(comments)

            # Option 1: If tokenization isn't strictly needed here, comment out the next line.
            # Option 2: If tokenization is needed, use spaCy's nlp.pipe for improved performance.
            chunk['comment_text'] = spacy_tokenize(comments.tolist(), batch_size=500)

        processed_chunks.append(chunk)

    # Concatenate all processed chunks and save the cleaned data
    df = pd.concat(processed_chunks)
    df.to_csv(cleaned_train, index=False)

    print(f"Cleaned data saved to {cleaned_train}")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/crownedprinz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


../raw_data/jigsaw_unintended/train.csv already exists; skipping download.


  chunk.fillna('', inplace=True)


  0%|          | 0/1000000 [00:00<?, ?it/s]

  chunk.fillna('', inplace=True)


  0%|          | 0/804874 [00:00<?, ?it/s]

Cleaned data saved to ../processed_data/jigsaw_unintended/cleaned_train.csv
