<a href="https://colab.research.google.com/github/Bambani2003/Data_Science_Projects/blob/main/NLP/Comment_Toxicity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Using NLP to identify toxic comments.**

In [None]:
# Install the required modules

!pip install gradio jinja2

In [None]:
# Import the required modules

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy
import matplotlib.pyplot as plt
import gradio as gr

In [None]:
# Pick the device the model will be built on: the first GPU TensorFlow
# reports, otherwise fall back to the CPU.

detected_gpu = tf.test.gpu_device_name()
if not detected_gpu:
    # An empty string means TensorFlow found no GPU.
    device_name = "/device:CPU:0"
    print("No GPU, using {}.".format(device_name))
else:
    device_name = detected_gpu
    print("Found GPU at: {}".format(device_name))

**EDA**

In [None]:
# Read the training CSV into a DataFrame and preview the first rows.
# NOTE(review): the path points into Google Drive; presumably the drive has to
# be mounted before this cell runs -- confirm.

train_csv_path = '/content/drive/MyDrive/Colab Notebooks/DataSets/Toxic_Comments/train.csv'
df = pd.read_csv(train_csv_path)
display(df.head())

**Preprocessing**

In [None]:
# Split the frame into model inputs and targets:
#   x_train -- the raw comment strings
#   y_train -- every column from the third onward (the label columns)

label_columns = df.columns[2:]
x_train = df['comment_text'].values
y_train = df[label_columns].values
display(x_train, y_train)

In [None]:
# Create our vectoriser to tokenise the words
# Each comment is mapped to a fixed-length sequence of 1500 integer token ids.
# The vocabulary is capped at len(x_train) entries.

vectorizer = TextVectorization(max_tokens=len(x_train), output_sequence_length=1500, output_mode='int')
vectorizer.adapt(x_train)

# Preview only the head of the vocabulary; rendering the full list would
# flood the cell output.
vectorizer.get_vocabulary()[:20]

In [None]:
# Looking at the tokenised comments
# Runs the adapted vectorizer over every training comment; the result is kept
# in `vectorized_text` and echoed as the cell output for inspection.

vectorized_text = vectorizer(x_train)
vectorized_text

In [None]:
# Pipelining the training data for our model
# Steps: slice (features, labels) pairs -> cache -> shuffle -> batch -> prefetch.

dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y_train))
dataset = dataset.cache()
# The train / CV / test partitions are carved out of this dataset with
# take()/skip(). By default shuffle() reshuffles on every iteration, which
# would hand each partition a different set of rows every epoch and leak
# validation/test examples into training. Freezing the order (and seeding it)
# keeps the partitions disjoint and reproducible.
dataset = dataset.shuffle(150000, seed=42, reshuffle_each_iteration=False)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)

# Peek at a single batch to sanity-check the pipeline.
next(dataset.as_numpy_iterator())

In [None]:
# Partition the batched dataset by batch count:
#   first 70%  -> train
#   next 20%   -> cross_val
#   from 90%   -> test (10% of the batches)

total_batches = len(dataset)
train_batches = int(total_batches * 0.7)
cross_val_batches = int(total_batches * 0.2)
test_offset = int(total_batches * 0.9)
test_batches = int(total_batches * 0.1)

train = dataset.take(train_batches)
cross_val = dataset.skip(train_batches).take(cross_val_batches)
test = dataset.skip(test_offset).take(test_batches)

**MACHINE LEARNING**

In [None]:
# Building the model
# Embedding -> bidirectional LSTM -> stack of dense layers -> 6 sigmoid
# outputs (one independent probability per label), trained with binary
# cross-entropy.

with tf.device(device_name):
    model = Sequential([
        # The embedding table must have one row per token id the vectorizer
        # can emit; take that size from the vectorizer itself instead of
        # tying it to the number of training rows.
        Embedding(vectorizer.vocabulary_size(), 32),
        Bidirectional(LSTM(32, activation='tanh')),
        Dense(48, activation='relu'),
        Dense(96, activation='relu'),
        Dense(48, activation='relu'),
        Dense(24, activation='relu'),
        Dense(12, activation='relu'),
        Dense(6, activation='sigmoid'),
    ])
    model.compile(loss='BinaryCrossentropy', optimizer='Adam')

In [None]:
# Train for 10 epochs on the training split, validating on the CV split after
# each epoch; the returned History object is kept for plotting.

history = model.fit(
    x=train,
    validation_data=cross_val,
    epochs=10,
)

In [None]:
# Plot the training and CV loss per epoch (not informative when only 1 epoch
# was run, since each curve is then a single point).

# Create the axes explicitly and hand them to pandas: calling plt.figure()
# followed by DataFrame.plot() would open two figures, leaving the sized one
# empty and drawing the curves on a default-sized one.
fig, ax = plt.subplots(figsize=(8, 5))
pd.DataFrame(history.history).plot(ax=ax)
ax.set(title='Training history', xlabel='Epoch', ylabel='Loss')
plt.show()

**EVALUATION**

In [None]:
# Calculating Precision, Recall and Overall Accuracy of the model
#
# Metrics are accumulated over the `test` split only, so the numbers are not
# inflated by batches the model was trained on.

pre = Precision()
re = Recall()
# Each of the 6 outputs is an independent yes/no decision, so accuracy is
# measured per label with BinaryAccuracy. CategoricalAccuracy would compare
# a single argmax over the flattened vector, which is meaningless here.
acc = tf.keras.metrics.BinaryAccuracy()
for X_true, y_true in test.as_numpy_iterator():
    # verbose=0 avoids printing a progress bar for every batch.
    yhat = model.predict(X_true, verbose=0)
    # Flatten (batch, labels) to 1-D so every label counts as one sample.
    y_true = y_true.flatten()
    yhat = yhat.flatten()
    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)
print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')

**PREDICTIONS**

In [None]:
# Given an input, check if it is some form of toxic

# Read one comment from the user and score it.
text = input()
input_text = vectorizer(text)
# The model expects a batch dimension, so add one in front.
res = model.predict(np.expand_dims(input_text, 0))
# Only the last expression of a cell is rendered automatically; use display()
# so the intermediate 0/1 predictions are actually shown.
display((res > 0.5).astype(int))

# Score one batch from the test split for comparison.
batch_X, batch_y = test.as_numpy_iterator().next()
display((model.predict(batch_X) > 0.5).astype(int))

res.shape

**TESTING using UI**

In [None]:
# Persist the trained model to an HDF5 file, then reload it from disk so the
# following cells run against the saved artefact.

model_path = 'NLP_Comment_Toxicity.h5'
model.save(model_path)
model = tf.keras.models.load_model(model_path)

In [None]:
# Score one hard-coded comment: vectorise it, prepend a batch axis, and show
# the raw per-label outputs.

input_str = vectorizer('I freaking hate you!')
batched_input = np.expand_dims(input_str, axis=0)
res = model.predict(batched_input)
res

In [None]:
# Using Gradio and HuggingFace to create a UI

def score_comment(comment):
    """Score a single comment and return a printable per-label report.

    The comment is vectorised as a one-element batch and passed to the model.
    The result has one ``"<column>: <True|False>"`` line for every column of
    ``df`` from the third onward, where the flag is whether the corresponding
    model output exceeds 0.5. Each line ends with a newline.
    """
    predictions = model.predict(vectorizer([comment]))[0]
    label_names = df.columns[2:]
    return ''.join(
        '{}: {}\n'.format(label, predictions[position] > 0.5)
        for position, label in enumerate(label_names)
    )
# Wire score_comment into a simple text-in / text-out Gradio UI.
# `gr.Textbox` is used instead of `gr.inputs.Textbox`: the `gr.inputs`
# namespace was removed in Gradio 4, and the install cell does not pin a
# version.
interface = gr.Interface(
    fn=score_comment,
    inputs=gr.Textbox(lines=2, placeholder='Comment to score'),
    outputs='text',
)
# share=True asks Gradio to create a public link in addition to the local one.
interface.launch(share=True)