# Final Project: Toxic Chat Classification Using NLP

## Introduction
In this project, we will use Natural Language Processing (NLP) to classify toxic comments in chat messages. We will be using the following datasets:
1. [Toxic Comment Classification Challenge](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge) dataset from Kaggle. The dataset consists of comments from Wikipedia’s talk page edits. The comments are labeled as toxic, severe toxic, obscene, threat, insult, and identity hate. The goal of this project is to build a model that can classify the comments into these categories.
2. [Sensai](https://www.kaggle.com/datasets/uetchy/sensai), a toxic chat dataset consisting of live chats from Virtual YouTubers' live streams. Download it with:
   `curl -L -o ~/Downloads/archive.zip https://www.kaggle.com/api/v1/datasets/download/uetchy/sensai`
3. [The Toxicity Dataset](https://github.com/surge-ai/toxicity), a repo containing 500 toxic and 500 non-toxic comments from a variety of popular social media platforms.

## Members - Cluster D Table 25
1. ) HTUN HTET MYAT
2. )
3. )
4. )
5. )

In [17]:
# Import Statements 
import os
import re
import torch
import requests
import nltk
import string
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset, Dataset
import sentencepiece
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.corpus import words
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from symspellpy import SymSpell, Verbosity
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('averaged_perceptron_tagger')
lemmatizer= WordNetLemmatizer()
from nltk.corpus import wordnet

[nltk_data] Downloading package wordnet to /Users/blake/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/blake/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [14]:
# Default Functions to clean the NLP Dataset, remove stopwords, lemmatize, and stem the words
def pos_tagger(nltk_tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    The decision is made on the tag's leading letter only:
    ``J`` -> adjective, ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb.

    Args:
        nltk_tag: POS tag string (as found in the pairs returned by
            ``nltk.pos_tag``).

    Returns:
        ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN`` or ``wordnet.ADV``,
        or ``None`` when the tag starts with none of the four letters.
    """
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    if nltk_tag.startswith('V'):
        return wordnet.VERB
    if nltk_tag.startswith('N'):
        return wordnet.NOUN
    if nltk_tag.startswith('R'):
        return wordnet.ADV
    # Any other tag has no WordNet counterpart here.
    return None
    
def tagged_lemma(string):
    """Lemmatize every token of a sentence using its part-of-speech tag.

    The text is tokenized with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``. Each tag is mapped through ``pos_tagger``; tokens whose
    tag maps to ``None`` are kept unchanged, all others are passed to the
    module-level ``lemmatizer`` together with their WordNet POS.

    Args:
        string: The sentence to lemmatize.

    Returns:
        The processed tokens joined by single spaces.
    """
    tokens = nltk.word_tokenize(string)

    lemmas = []
    for token, raw_tag in nltk.pos_tag(tokens):
        wordnet_pos = pos_tagger(raw_tag)
        if wordnet_pos is not None:
            lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
        else:
            # No usable POS information: keep the token exactly as it was.
            lemmas.append(token)

    return " ".join(lemmas)


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loaded once and reused."""
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(nltk.corpus.stopwords.words("english"))
        _english_stopwords._cache = cached
    return cached


def clean_string(text, stem="None"):
    """Normalize a raw comment into a space-separated string of tokens.

    Steps, in order: lower-case, turn line breaks into spaces, strip every
    character in ``string.punctuation``, drop English stop words, remove
    digit-bearing character runs, then optionally stem or lemmatize.

    Requires the NLTK ``stopwords`` corpus (and ``wordnet`` for ``stem="Lem"``).

    Args:
        text: The raw text. ``None`` and float NaN (what pandas uses for
            missing cells) are treated as empty text.
        stem: ``"Stem"`` applies ``PorterStemmer``; ``"Lem"`` applies
            ``WordNetLemmatizer``; any other value (including the default
            ``"None"``) leaves the tokens as they are.

    Returns:
        The cleaned tokens joined by single spaces; ``""`` when nothing
        survives the cleaning.
    """
    # Missing values would otherwise crash on .lower(); NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = text.lower()

    # Replace line breaks with a space (not with nothing) so the last word of
    # one line is not fused with the first word of the next.
    text = text.replace('\n', ' ')

    # Strip punctuation in a single pass.
    # NOTE(review): this runs before stop-word filtering, so contractions such
    # as "don't" become "dont" and are not matched by the stop-word list.
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove stop words; the set is cached and gives O(1) membership tests.
    stop_words = _english_stopwords()
    tokens = [word for word in text.split() if word not in stop_words]

    # Remove runs of word characters that contain a digit, and discard tokens
    # that end up empty so the result has no doubled or stray spaces.
    stripped = (re.sub(r'\w*\d\w*', '', token) for token in tokens)
    tokens = [token for token in stripped if token]

    # Stem or lemmatize
    if stem == 'Stem':
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]
    elif stem == 'Lem':
        lem = WordNetLemmatizer()
        tokens = [lem.lemmatize(token) for token in tokens]

    return ' '.join(tokens)

In [21]:
# Load the three raw datasets: Kaggle Toxic Comment, Sensai and Surge AI Toxicity

df_kaggle = pd.read_csv('./data/kaggle-toxic-comment-challange/kaggle_train.csv')

# Sensai is read from every parquet file in its folder. glob.glob returns paths
# in arbitrary order, so sort them to keep the concatenated row order
# reproducible between runs and machines.
sensai_files = sorted(glob.glob('./data/sensai/*.parquet'))
if not sensai_files:
    # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
    raise FileNotFoundError("No parquet files found in ./data/sensai/")
df_sensai = pd.concat([pd.read_parquet(x) for x in sensai_files], ignore_index=True)

df_surge = pd.read_csv('./data/surge-ai-toxicity-repo/toxicity_en.csv')

## Print the columns of all the dataframes to see which hold the text and the labels
print(df_kaggle.columns)
print(df_sensai.columns)
print(df_surge.columns)


Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')
Index(['body', 'label'], dtype='object')
Index(['text', 'is_toxic'], dtype='object')


In [None]:
# Run clean_string (lemmatizing mode) over each dataset's raw text column and
# store the result next to it in a new "*_cleaned" column.
# Series.apply forwards extra keyword arguments (stem="Lem") to clean_string.
df_kaggle['comment_text_cleaned'] = df_kaggle['comment_text'].apply(clean_string, stem="Lem")
df_sensai['body_cleaned'] = df_sensai['body'].apply(clean_string, stem="Lem")
df_surge['text_cleaned'] = df_surge['text'].apply(clean_string, stem="Lem")

In [None]:
# Back up each dataframe (raw and cleaned columns) as CSV, without the index column
for frame, csv_path in (
    (df_kaggle, './data/kaggle_train_cleaned.csv'),
    (df_sensai, './data/sensai_cleaned.csv'),
    (df_surge, './data/toxicity_en_cleaned.csv'),
):
    frame.to_csv(csv_path, index=False)

