# CSCK507 Mid-Module Assignment
### Toxic comment classification challenge

## Table of Contents
[Section 1. Introduction](#introduction)
- [Import Dependencies](#import-dependencies)

[Section 2. Data Exploration & Analysis](#data-exploration-&-analysis)
  - [2.1 Data Preprocessing](#data-preprocessing)
  - [2.2 Data Imbalance](#data-imbalance)

## 1. Introduction 

### Importing Dependencies

In [20]:
import pandas as pd 
import numpy as np

# For Data Preprocessing
from imblearn.over_sampling import RandomOverSampler
from nltk.corpus import stopwords
import nltk
import re  

# For Visualisation
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec 
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
from PIL import Image

#For Feature Extraction  
import spacy
import string 
from nltk import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer   
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
# Load the dataset into pandas DataFrame with relative path

df = pd.read_csv('./train.csv')
df_test = pd.read_csv('./test_labels.csv')
df_testcomments = pd.read_csv('./test.csv')

In [22]:
try:
    spacy.prefer_gpu()
    spacy.load('en_core_web_sm')
except LookupError:
    print('Run: python -m spacy download en_core_web_sm')

try:
    nltk_stop = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')

In [23]:
# Initialise SpaCy Model 
spacy.prefer_gpu()
nlp = spacy.load('en_core_web_sm')

## 2. Data Exploration & Analysis

In [24]:
class_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

for column in class_columns:
    class_counts = df[column].value_counts()
    print(class_counts)
    print()

toxic
0    144277
1     15294
Name: count, dtype: int64

severe_toxic
0    157976
1      1595
Name: count, dtype: int64

obscene
0    151122
1      8449
Name: count, dtype: int64

threat
0    159093
1       478
Name: count, dtype: int64

insult
0    151694
1      7877
Name: count, dtype: int64

identity_hate
0    158166
1      1405
Name: count, dtype: int64



In [25]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB
None


In [27]:
# obtain class labels of the dataset
class_labels = list(df.columns[2:])
class_labels

# remove rows with -1 from df_test as it is not used for scoring
print(f'Before removing -1: {df_test.shape}')
for class_label in class_labels:
    df_test = df_test[df_test[class_label] != -1]
print(f'After removing -1: {df_test.shape}')

# left join 'df_test' and 'df_testcomments' on 'id' column
df_test = pd.merge(df_test, df_testcomments, on='id', how='left')

# rearraange columns to be the same as df
df_test = df_test[['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]

Before removing -1: (153164, 7)
After removing -1: (63978, 7)


In [28]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB
None


### 2.1 Data Preprocessing

In [32]:
def preprocess_text(text):
    """
    Clean and preprocess a text string.

    Operations performed:
    - Replace special characters, URLs, and numbers with spaces.
    - Remove extra spaces and replace "\n" with a space.
    - Remove Non-English characters.
    - Remove start and end white spaces.
    - Remove single characters.
    - Remove punctuations.
    - Convert the text to lowercase.
    - Remove common stopwords.

    :param text: Input text (string).
    :return: Cleaned text (string).

    Example:
    >>> input_text = "An example text with special characters: $100 and URLs like https://example.com."
    >>> preprocess_text(input_text)
    'example text special characters URLs like'
    """
    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    # Remove extra spaces and replace "\n" with a space
    text = re.sub("\s\s+", " ", text).replace("\n", " ")
    # Remove Non-English characters
    text = re.sub(r'[^\x00-\x7F]+', "", text)
    # Remove start and end white spaces
    text = text.strip()
    # Remove single characters
    text = re.sub(r"\s+[a-zA-Z]\s+", " ", text)
    # Remove punctuations
    text = re.sub(r"[^a-zA-Z0-9]+", " ", text)
    # Lowercase the text
    text = text.lower()
    # Stopword Removal
    text = ' '.join([word for word in text.split() if word not in nltk_stop])

    return text

In [33]:
def tokenize_text(documents):
    """
    Tokenize a list of documents and perform the following:
    1. Break text into individual words or subword tokens.
    2. Reduce words to their base or root form using lemmatization.
    3. Remove stop words and non-alphabetic characters.

    Using spaCy's nlp.pipe to batch process texts and yield Doc objects.

    :param documents: List of strings representing documents.
    :return: List of lists of strings, where each list corresponds to the lemmatized tokens of a document.

    Example:
    >>> input_documents = ["Tokenize this document.", "And tokenize another one."]
    >>> tokenize_text(input_documents)
    [['tokenize', 'document'], ['tokenize']]
    """
    lemma_list = []
    # Disable "ner" and "parser" components for faster processing
    for doc in nlp.pipe(documents, disable=["ner", "parser"], batch_size=1000):
        # Generate lemmatized tokens
        lemmatized_tokens = [token.lemma_ for token in doc]
        # Remove stop words and non-alphabetic characters
        lemmatized_tokens = [token for token in lemmatized_tokens
                             if token not in nlp.Defaults.stop_words
                             and token.isalpha()]
        lemma_list.append(lemmatized_tokens)

    return lemma_list


In [31]:
# Apply preprocessing to train data
df['comment_text'] = df['comment_text'].apply(preprocess_text)
df.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation why the edits made under my userna...,0,0,0,0,0,0
1,000103f0d9cfb60f,d aww he matches this background colour i m se...,0,0,0,0,0,0
2,000113f07ec002fd,hey man i m really not trying to edit war it s...,0,0,0,0,0,0
3,0001b41b1c6bb37e,more can t make any real suggestions on impro...,0,0,0,0,0,0
4,0001d958c54c6e35,you sir are my hero any chance you remember wh...,0,0,0,0,0,0
5,00025465d4725e87,congratulations from me as well use the tools...,0,0,0,0,0,0
6,0002bcb3da6cb337,cocksucker before you piss around on my work,1,1,1,0,1,0
7,00031b1e95af7921,your vandalism to the matt shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,sorry if the word nonsense was offensive to yo...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [None]:
    # Create a TF-IDF (similar to bag of words but it provides importance based on word)