In [17]:
import pandas as pd
import contractions
import emoji
import re
import unicodedata
import nltk
import spacy
from tqdm import tqdm

In [18]:
# Loading data from source 
df = pd.read_csv(r"C:\Playground\Toxicity_Classification\data\training\train.csv")

In [19]:
df.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [20]:
# dropping irrelevant features
df.drop('id', axis=1, inplace=True)

In [21]:
# Checking for duplicate instances
print(f"There are {df.duplicated().sum()} duplicate instaces")

There are 0 duplicate instaces


In [22]:
# Checking for null values
df.isna().sum().sum()

np.int64(0)

### Lower Case

In [23]:
# Making a deep copy of our data
df_copy = df.copy(deep=True)

In [24]:
def to_lowercase(text):
    """
    Convert the input text to lowercase.

    Parameters
    ----------
    text : str
        The text to convert.

    Returns
    -------
    str
        The text in lowercase.
    """
    return text.lower()


df_copy["preprocessed_text"] = df_copy["comment_text"].apply(to_lowercase)

### Removing links

In [25]:
def remove_urls(text):
    """
    Remove URLs from the input text using a regular expression.

    Parameters
    ----------
    text : str
        The input text from which URLs will be removed.

    Returns
    -------
    str
        The text with all URLs removed.
    """
    return re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)


# Apply the function to each text entry
df_copy['preprocessed_text'] = df_copy['preprocessed_text'].apply(remove_urls)


### Expanding Contractions

In [26]:
def expand_contractions(text):
    """
    Expand contractions in the input text.

    Parameters
    ----------
    text : str
        The text containing contractions to expand.

    Returns
    -------
    str
        The text with all contractions expanded.
    """
    return contractions.fix(text)


df_copy["preprocessed_text"] = df_copy["preprocessed_text"].apply(expand_contractions)


### Removing Accents/Diacritics

In [27]:
def remove_accents_diacritics(text):
    """
    Remove accents and diacritics from the input text.

    Parameters
    ----------
    text : str
        The text to normalize.

    Returns
    -------
    str
        The text with all accents and diacritics removed.
    """
    text = unicodedata.normalize('NFKD', text)
    text = ''.join([c for c in text if not unicodedata.combining(c)])
    return text


df_copy['preprocessed_text'] = df_copy['preprocessed_text'].apply(remove_accents_diacritics)


### De-emojify the emojies into respective text

In [28]:
def convert_emojis(text):
    """
    Convert emojis in the input text into descriptive text labels.

    Parameters
    ----------
    text : str
        The text containing emojis to convert.

    Returns
    -------
    str
        The text with emojis replaced by their descriptive names.
    """
    return emoji.demojize(text)


df_copy['preprocessed_text'] = df_copy['preprocessed_text'].apply(convert_emojis)


### Removing mentions to other users

In [None]:
def remove_mentions(text: str) -> str:
    """
    Remove @mentions from the input text and normalize spacing.

    Parameters
    ----------
    text : str
        Input text that may contain @mentions.

    Returns
    -------
    str
        Text with all @mentions removed and extra spaces cleaned.
    """
    text = re.sub(r'@[A-Za-z0-9_.-]+', '', text)
    return " ".join(text.split())


### Removing numbers and punctuation

In [29]:
nlp = spacy.load("en_core_web_sm")

def spacy_remove_punct_numbers_pipe(text_list):
    """
    Clean a list of text documents using spaCy by removing punctuation and numbers.

    This function processes texts efficiently using spaCy's `nlp.pipe()` which supports
    batching and multiprocessing. It removes:
        - punctuation tokens
        - numeric tokens
        - any tokens that are not purely alphabetic

    Each token that passes the filter is lowercased and joined back into a space-
    separated cleaned string.

    Parameters
    ----------
    text_list : list of str
        A list containing text documents (one per row of your dataframe).

    Returns
    -------
    list of str
        A list of cleaned text strings corresponding to each input document, with
        punctuation and numbers removed.

    """
    
    total_docs = len(text_list)
    cleaned = []

    for doc in tqdm(
        nlp.pipe(text_list, batch_size=500, n_process=-1),
        total=total_docs,
        desc="Cleaning text (punct+numbers)",
        colour="green",
        ncols=100
    ):
        tokens = [token.text.lower() for token in doc if token.is_alpha]
        cleaned.append(" ".join(tokens))

    return cleaned


# Applying on column
df_copy["preprocessed_text"] = spacy_remove_punct_numbers_pipe(
    df_copy["preprocessed_text"].tolist()
)


Cleaning text (punct+numbers): 100%|[32m███████████████████████[0m| 159571/159571 [08:07<00:00, 327.39it/s][0m


### Lemmetization

In [30]:
def spacy_lemmatize_pipe(text_list):
    """
    Lemmatize a list of text documents using spaCy's optimized processing pipeline.

    This function applies lemmatization to each text in `text_list` using spaCy's
    `nlp.pipe()`, which supports efficient batched and parallel processing.
    Every token in each document is replaced with its lemma form, and the lemmas
    are joined back into a space-separated string.

    Parameters
    ----------
    text_list : list of str
        The input list of text documents to lemmatize.

    Returns
    -------
    list of str
        A list of lemmatized text strings, one for each input document.


    """
    
    total_docs = len(text_list)
    lemmatized = []

    for doc in tqdm(
        nlp.pipe(text_list, batch_size=500, n_process=-1),
        total=total_docs,
        desc="Lemmatizing with spaCy",
        colour="green",
        ncols=100
    ):
        lemmas = [token.lemma_ for token in doc]
        lemmatized.append(" ".join(lemmas))

    return lemmatized


# Apply on column
df_copy["preprocessed_text"] = spacy_lemmatize_pipe(
    df_copy["preprocessed_text"].tolist()
)


Lemmatizing with spaCy: 100%|[32m██████████████████████████████[0m| 159571/159571 [06:18<00:00, 421.78it/s][0m


### Converting Raw text into Tokens

In [31]:
def spacy_tokenize_pipe(text_list):
    """
    Tokenize a list of text documents using spaCy's high-performance pipeline.

    This function efficiently tokenizes each document in `text_list` by leveraging
    spaCy's `nlp.pipe()` for parallel, batched processing. It returns each document
    as a list of raw token strings.

    Parameters
    ----------
    text_list : list of str
        List of input text documents (e.g., each row of a DataFrame column).

    Returns
    -------
    list of list of str
        A list where each element corresponds to one document and contains the
        tokenized word strings.

    """

    total_docs = len(text_list)
    tokenized = []

    for doc in tqdm(
        nlp.pipe(text_list, batch_size=500, n_process=-1),
        total=total_docs,
        desc="Tokenizing with spaCy",
        colour="green",
        ncols=100
    ):
        tokenized.append([token.text for token in doc])

    return tokenized


# apply on the whole column
df_copy["preprocessed_text"] = spacy_tokenize_pipe(df_copy["preprocessed_text"].tolist())


Tokenizing with spaCy: 100%|[32m███████████████████████████████[0m| 159571/159571 [06:02<00:00, 440.67it/s][0m


In [32]:
df_copy

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,preprocessed_text
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,"[explanation, why, the, edit, make, under, my,..."
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,"[he, match, this, background, colour, I, be, s..."
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,"[hey, man, I, be, really, not, try, to, edit, ..."
3,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,"[more, I, can, not, make, any, real, suggestio..."
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,"[you, sir, be, my, hero, any, chance, you, rem..."
...,...,...,...,...,...,...,...,...
159566,""":::::And for the second time of asking, when ...",0,0,0,0,0,0,"[and, for, the, second, time, of, ask, when, y..."
159567,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0,"[you, should, be, ashamed, of, yourself, that,..."
159568,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0,"[spitzer, umm, there, be, no, actual, article,..."
159569,And it looks like it was actually you who put ...,0,0,0,0,0,0,"[and, it, look, like, it, be, actually, you, w..."
