<center><h1>Toxic Comment Classification</h1></center>

<br>
<center><div class="alert alert-block alert-danger" style="margin: 2em;line-height: 1.7em; font-family: Verdana;"><b style="font-size: 18px;">🛑 &nbsp; WARNING:</b><br><br><b>The dataset for this competition contains text that may be considered profane, vulgar, or offensive.</b><br></div></center>

# Imports

In [None]:
import os
import sys
import gc
import random
import re
import json
import pickle
import warnings
warnings.filterwarnings('ignore')

from path import Path
from tqdm import tqdm
from PIL import Image
from collections import Counter
from wordcloud import WordCloud, STOPWORDS

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp

from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords

import plotly.express as px
import plotly.figure_factory as ff
from plotly import graph_objs as go


# Utils

In [None]:
def set_seed(seed=42):
    """Seed Python's ``random`` module and NumPy's global RNG with ``seed``.

    Args:
        seed: Value handed to both ``random.seed`` and ``np.random.seed``.
            Defaults to 42.
    """
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

In [None]:
# Directory holding the competition files, plus the archive names inside it.
# `path` ends with "/", so plain string concatenation below yields a full path.
path = "../input/jigsaw-toxic-comment-classification-challenge/"
train_data_path = 'train.csv.zip'
# The test archive name is defined here but not read in this cell.
test_data_path = 'test.csv.zip'

# Load the training split and print its shape, followed by a banner and a
# preview of the first rows.
train_df = pd.read_csv(path + train_data_path)
print(f'shape of train_dataset: {train_df.shape}')
print('\n\n')
print('========== Train Dataset ==========')
print('\n')
# `display` renders the frame as a rich table in the notebook.
display(train_df.head())

# EDA

In [None]:
# Build a long-format table of label frequencies: one row per
# (label column, label value) pair, then draw it as a coloured bar chart.
toxicity = []
comment_type = []
count = []
for col in train_df.columns:
    if col in ('id', 'comment_text'):
        # Every column except the id and the raw text is treated as a label.
        continue
    # Count once and reuse the result so values and counts always line up.
    label_counts = train_df[col].value_counts()
    # Repeat the column name once per distinct value rather than assuming
    # each label column holds exactly two values.
    toxicity.extend([col] * len(label_counts))
    comment_type.extend(label_counts.index.tolist())
    count.extend(label_counts.values)

temp_df = pd.DataFrame({
    'toxicity': toxicity,
    # Label values are cast to strings (presumably so plotly colours them as
    # discrete categories). The builtin `str` is used because the `np.str`
    # alias no longer exists in current NumPy releases.
    'comment_type': np.array(comment_type, dtype=str),
    'count': count,
})

fig = px.bar(temp_df, x='toxicity', y='count', color='comment_type', title='Value Counts',
             color_discrete_sequence=['#1616A7','#FB0D0D'],width=500)
fig.show()

In [None]:
def preprocessing(text):
    """Strip timestamps, digit spans and punctuation noise from one comment.

    The substitutions run in a fixed order and each one operates on the
    result of the previous one.

    Args:
        text (str): Raw comment text.

    Returns:
        str: The cleaned text. Whitespace is not collapsed, so runs of spaces
        can remain where characters were replaced.
    """
    substitutions = (
        (r" -", ""),
        # Signature timestamps such as "21:51, January 11, 2016 (UTC)".
        (r"\d+:\d+, \w+ \d+, \d+ \(\w+\)", ""),
        # NOTE(review): `.+` is greedy and `.` is unescaped, so a match here
        # deletes everything from the first digit run through the last digit
        # on the same line, words in between included -- confirm intended.
        (r"\d+.+\d", ""),
        (r"\n", " "),
        (r"\\", ""),
        (r"\.\.\.|\.\.", " "),
        (r":|_|#", " "),
        # ":" was already replaced by the previous rule, so that alternative
        # never matches at this point.
        (r"@|\||\(|\)|!|:|;|\"", ""),
        (r"\.|\?|,", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text



# Work on a copy so `train_df` keeps the raw comments.
temp_df = train_df.copy()
temp_df['comment_text'] = temp_df['comment_text'].apply(preprocessing)
temp_df['tokenized_text'] = temp_df['comment_text'].apply(word_tokenize)
# Character length of each cleaned comment.
temp_df['len'] = temp_df['comment_text'].apply(len)

# Number of comments observed at each length, ordered by length so the line
# plot runs left to right.
len_counts = temp_df['len'].value_counts()
temp_df2 = pd.DataFrame({
    'len': len_counts.index,
    'count': len_counts.values,
}).sort_values('len')

fig = go.Figure()
length_trace = go.Scatter(
    x=temp_df2['len'].values,
    y=temp_df2['count'].values,
    mode='lines',
    name='len',
)
fig.add_trace(length_trace)
fig.show()

In [None]:
# One sub-frame per label: only the rows where that label equals 1, with the
# index renumbered from 0.
toxic, severe_toxic, obscene, threat, insult, identity_hate = (
    temp_df[temp_df[label] == 1].reset_index(drop=True)
    for label in (
        "toxic",
        "severe_toxic",
        "obscene",
        "threat",
        "insult",
        "identity_hate",
    )
)

# Shared normalisers: tokens are lemmatized first and stemmed second wherever
# these two objects are used together below.
stem = SnowballStemmer("english")
lemma = WordNetLemmatizer()
def most_common(data,n,col):
    """Rank the most frequent stemmed, non-stopword tokens in ``data[col]``.

    Each token is kept only if its lower-cased form is not in ``STOPWORDS``;
    surviving tokens are lemmatized, then stemmed, then counted.

    Args:
        data: Frame (or mapping) whose ``col`` entry iterates over token lists.
        n: Number of top tokens to return.
        col: Name of the column holding the token lists.

    Returns:
        A pandas ``Styler`` over a frame with columns ``common_word`` and
        ``count``, shaded with the "Blues" colormap.
    """
    frequencies = Counter(
        stem.stem(lemma.lemmatize(token))
        for tokens in data[col]
        for token in tokens
        if token.lower() not in STOPWORDS
    )
    # Use the caller's `n` as the cutoff. Column names go to the constructor
    # so that an empty result still yields a well-formed two-column frame
    # instead of failing on a column-count mismatch.
    temp = pd.DataFrame(frequencies.most_common(n),
                        columns=["common_word", "count"])
    return temp.style.background_gradient(cmap="Blues")
# Show the top-20 table for every label subset, each under its own banner.
print("top 20 common words in each toxicity")
for _label, _frame in (
    ('toxic', toxic),
    ('severe_toxic', severe_toxic),
    ('obscene', obscene),
    ('threat', threat),
    ('insult', insult),
    ('identity_hate', identity_hate),
):
    print('\n')
    print(f'========== {_label} ==========')
    display(most_common(_frame, 20, 'tokenized_text'))
print('\n')

In [None]:
def similer_color_func(word=None, font_size=None,
                       position=None, orientation=None,
                       font_path=None, random_state=None):
    """Return an ``hsl(...)`` colour with fixed hue/saturation and random lightness.

    The keyword signature lets the function be passed as ``color_func`` to
    ``WordCloud``; only ``random_state`` is actually read.

    Args:
        word, font_size, position, orientation, font_path: Unused.
        random_state: Object exposing ``randint(a, b)``. When ``None`` a fresh
            ``random.Random()`` is used.

    Returns:
        str: ``'hsl(24, 100%, L%)'`` where ``L`` comes from
        ``random_state.randint(30, 70)``.
    """
    if random_state is None:
        # Without this the declared default fails with AttributeError on
        # `None.randint`.
        random_state = random.Random()
    hue = 24
    saturation = 100
    lightness = random_state.randint(30, 70)
    return f'hsl({hue}, {saturation}%, {lightness}%)'

# Lemmatize and stem every non-stopword token of the toxic comments, then join
# them into one space-separated string for the word cloud.
text = ' '.join(
    stem.stem(lemma.lemmatize(token.lower()))
    for tokens in toxic['tokenized_text']
    for token in tokens
    if token.lower() not in STOPWORDS
)

# The image array is passed as the cloud mask; the canvas size is taken from
# its shape (rows -> height, columns -> width).
mask = np.array(Image.open('../input/images/butterfly-shadow-animal-icon-silhouettes-isolated-dark-black-graphical-white-background-184947266.jpg'))
wc = WordCloud(
    font_path="../input/font-style/SalmaAlfasans-Light.otf",
    width=mask.shape[1],
    height=mask.shape[0],
    mask=mask,
    background_color='black',
    max_words=2000,
    max_font_size=256,
    stopwords=STOPWORDS,
    collocations=False,
    color_func=similer_color_func,
)
wc.generate(text)

fig = plt.figure(figsize=(15, 15))
plt.title("- Most Common Words within Toxic -", size=22, weight="bold")
plt.imshow(wc, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
def similer_color_func(word=None, font_size=None,
                       position=None, orientation=None,
                       font_path=None, random_state=None):
    """Return an ``hsl(...)`` colour with fixed hue/saturation and random lightness.

    The keyword signature lets the function be passed as ``color_func`` to
    ``WordCloud``; only ``random_state`` is actually read.

    Args:
        word, font_size, position, orientation, font_path: Unused.
        random_state: Object exposing ``randint(a, b)``. When ``None`` a fresh
            ``random.Random()`` is used.

    Returns:
        str: ``'hsl(24, 100%, L%)'`` where ``L`` comes from
        ``random_state.randint(30, 70)``.
    """
    if random_state is None:
        # Without this the declared default fails with AttributeError on
        # `None.randint`.
        random_state = random.Random()
    hue = 24
    saturation = 100
    lightness = random_state.randint(30, 70)
    return f'hsl({hue}, {saturation}%, {lightness}%)'

# Lemmatize and stem every non-stopword token of the severe_toxic comments,
# then join them into one space-separated string for the word cloud.
text = ' '.join(
    stem.stem(lemma.lemmatize(token.lower()))
    for tokens in severe_toxic['tokenized_text']
    for token in tokens
    if token.lower() not in STOPWORDS
)

# The image array is passed as the cloud mask; the canvas size is taken from
# its shape (rows -> height, columns -> width).
mask = np.array(Image.open('../input/images/20854.Jpg'))
wc = WordCloud(
    font_path="../input/font-style/SalmaAlfasans-Light.otf",
    width=mask.shape[1],
    height=mask.shape[0],
    mask=mask,
    background_color='black',
    max_words=2000,
    max_font_size=256,
    stopwords=STOPWORDS,
    collocations=False,
    color_func=similer_color_func,
)
wc.generate(text)

fig = plt.figure(figsize=(15, 15))
plt.title("- Most Common Words within Severe toxic -", size=22, weight="bold")
plt.imshow(wc, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
def similer_color_func(word=None, font_size=None,
                       position=None, orientation=None,
                       font_path=None, random_state=None):
    """Return an ``hsl(...)`` colour with fixed hue/saturation and random lightness.

    The keyword signature lets the function be passed as ``color_func`` to
    ``WordCloud``; only ``random_state`` is actually read. This definition
    draws the lightness from the 60-100 range.

    Args:
        word, font_size, position, orientation, font_path: Unused.
        random_state: Object exposing ``randint(a, b)``. When ``None`` a fresh
            ``random.Random()`` is used.

    Returns:
        str: ``'hsl(24, 100%, L%)'`` where ``L`` comes from
        ``random_state.randint(60, 100)``.
    """
    if random_state is None:
        # Without this the declared default fails with AttributeError on
        # `None.randint`.
        random_state = random.Random()
    hue = 24
    saturation = 100
    lightness = random_state.randint(60, 100)
    return f'hsl({hue}, {saturation}%, {lightness}%)'

# Lemmatize and stem every non-stopword token of the obscene comments, then
# join them into one space-separated string for the word cloud.
text = ' '.join(
    stem.stem(lemma.lemmatize(token.lower()))
    for tokens in obscene['tokenized_text']
    for token in tokens
    if token.lower() not in STOPWORDS
)

# The image array is passed as the cloud mask; the canvas size is taken from
# its shape (rows -> height, columns -> width).
mask = np.array(Image.open('../input/images/istockphoto-858216614-612x612.jpg'))
wc = WordCloud(
    font_path="../input/font-style/SalmaAlfasans-Light.otf",
    width=mask.shape[1],
    height=mask.shape[0],
    mask=mask,
    background_color='black',
    max_words=2000,
    max_font_size=256,
    stopwords=STOPWORDS,
    collocations=False,
    color_func=similer_color_func,
)
wc.generate(text)

fig = plt.figure(figsize=(15, 15))
plt.title("- Most Common Words within Obscene -", size=22, weight="bold")
plt.imshow(wc, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
def similer_color_func(word=None, font_size=None,
                       position=None, orientation=None,
                       font_path=None, random_state=None):
    """Return an ``hsl(...)`` colour with fixed hue/saturation and random lightness.

    The keyword signature lets the function be passed as ``color_func`` to
    ``WordCloud``; only ``random_state`` is actually read.

    Args:
        word, font_size, position, orientation, font_path: Unused.
        random_state: Object exposing ``randint(a, b)``. When ``None`` a fresh
            ``random.Random()`` is used.

    Returns:
        str: ``'hsl(24, 100%, L%)'`` where ``L`` comes from
        ``random_state.randint(30, 70)``.
    """
    if random_state is None:
        # Without this the declared default fails with AttributeError on
        # `None.randint`.
        random_state = random.Random()
    hue = 24
    saturation = 100
    lightness = random_state.randint(30, 70)
    return f'hsl({hue}, {saturation}%, {lightness}%)'

# Lemmatize and stem every non-stopword token of the threat comments, then
# join them into one space-separated string for the word cloud.
text = ' '.join(
    stem.stem(lemma.lemmatize(token.lower()))
    for tokens in threat['tokenized_text']
    for token in tokens
    if token.lower() not in STOPWORDS
)

# The image array is passed as the cloud mask; the canvas size is taken from
# its shape (rows -> height, columns -> width).
mask = np.array(Image.open('../input/images/Picsart_22-05-18_16-33-42-111.jpg'))
wc = WordCloud(
    font_path="../input/font-style/SalmaAlfasans-Light.otf",
    width=mask.shape[1],
    height=mask.shape[0],
    mask=mask,
    background_color='black',
    max_words=2000,
    max_font_size=256,
    stopwords=STOPWORDS,
    collocations=False,
    color_func=similer_color_func,
)
wc.generate(text)

fig = plt.figure(figsize=(15, 15))
plt.title("- Most Common Words within Threat -", size=22, weight="bold")
plt.imshow(wc, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
def similer_color_func(word=None, font_size=None,
                       position=None, orientation=None,
                       font_path=None, random_state=None):
    """Return an ``hsl(...)`` colour with fixed hue/saturation and random lightness.

    The keyword signature lets the function be passed as ``color_func`` to
    ``WordCloud``; only ``random_state`` is actually read.

    Args:
        word, font_size, position, orientation, font_path: Unused.
        random_state: Object exposing ``randint(a, b)``. When ``None`` a fresh
            ``random.Random()`` is used.

    Returns:
        str: ``'hsl(24, 100%, L%)'`` where ``L`` comes from
        ``random_state.randint(30, 70)``.
    """
    if random_state is None:
        # Without this the declared default fails with AttributeError on
        # `None.randint`.
        random_state = random.Random()
    hue = 24
    saturation = 100
    lightness = random_state.randint(30, 70)
    return f'hsl({hue}, {saturation}%, {lightness}%)'

# Lemmatize and stem every non-stopword token of the insult comments, then
# join them into one space-separated string for the word cloud.
text = ' '.join(
    stem.stem(lemma.lemmatize(token.lower()))
    for tokens in insult['tokenized_text']
    for token in tokens
    if token.lower() not in STOPWORDS
)

# The image array is passed as the cloud mask; the canvas size is taken from
# its shape (rows -> height, columns -> width).
mask = np.array(Image.open('../input/images/red-kite-rapture-black-silhouette-cut-out-and-isolated-on-a-white-background-2C4B3E9.jpg'))
wc = WordCloud(
    font_path="../input/font-style/SalmaAlfasans-Light.otf",
    width=mask.shape[1],
    height=mask.shape[0],
    mask=mask,
    background_color='black',
    max_words=2000,
    max_font_size=256,
    stopwords=STOPWORDS,
    collocations=False,
    color_func=similer_color_func,
)
wc.generate(text)

fig = plt.figure(figsize=(15, 15))
plt.title("- Most Common Words within Insult -", size=22, weight="bold")
plt.imshow(wc, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
def similer_color_func(word=None, font_size=None,
                       position=None, orientation=None,
                       font_path=None, random_state=None):
    """Return an ``hsl(...)`` colour with fixed hue/saturation and random lightness.

    The keyword signature lets the function be passed as ``color_func`` to
    ``WordCloud``; only ``random_state`` is actually read.

    Args:
        word, font_size, position, orientation, font_path: Unused.
        random_state: Object exposing ``randint(a, b)``. When ``None`` a fresh
            ``random.Random()`` is used.

    Returns:
        str: ``'hsl(24, 100%, L%)'`` where ``L`` comes from
        ``random_state.randint(30, 70)``.
    """
    if random_state is None:
        # Without this the declared default fails with AttributeError on
        # `None.randint`.
        random_state = random.Random()
    hue = 24
    saturation = 100
    lightness = random_state.randint(30, 70)
    return f'hsl({hue}, {saturation}%, {lightness}%)'

# Lemmatize and stem every non-stopword token of the identity_hate comments,
# then join them into one space-separated string for the word cloud.
text = ' '.join(
    stem.stem(lemma.lemmatize(token.lower()))
    for tokens in identity_hate['tokenized_text']
    for token in tokens
    if token.lower() not in STOPWORDS
)

# The image array is passed as the cloud mask; the canvas size is taken from
# its shape (rows -> height, columns -> width).
mask = np.array(Image.open('../input/images/2d27acd5d288284587e13b0411e6e48a.jpg'))
wc = WordCloud(
    font_path="../input/font-style/SalmaAlfasans-Light.otf",
    width=mask.shape[1],
    height=mask.shape[0],
    mask=mask,
    background_color='black',
    max_words=500,
    max_font_size=256,
    stopwords=STOPWORDS,
    collocations=False,
    color_func=similer_color_func,
)
wc.generate(text)

fig = plt.figure(figsize=(15, 15))
plt.title("- Most Common Words within Identity hate -", size=22, weight="bold")
plt.imshow(wc, interpolation="bilinear")
plt.axis('off')
plt.show()