# Installing Libraries & Loading .csv Data

In [2]:
!pip install tensorflow tensorflow-gpu pandas matplotlib sklearn

In [1]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

2024-05-06 11:54:00.857477: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-06 11:54:00.857593: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-06 11:54:00.860781: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-06 11:54:01.142661: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# The default parser failed on this file with a tokenizing error, so the CSV is read
# with the Python parser engine instead.
train_csv_path = os.path.join('jigsaw-toxic-comment-classification-challenge', 'train.csv')
data = pd.read_csv(train_csv_path, engine="python")

In [4]:
# Preview the first five rows to check the column layout.
data.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [22]:
# Report the dataset dimensions as (rows, columns).
rows_and_columns = data.shape
print(rows_and_columns)

(159571, 8)


In [25]:
# In this dataset each comment is uniquely identified by its 'id' column; more importantly, each comment carries six label columns (toxic, severe_toxic, obscene, threat, insult, identity_hate) marking the types of toxicity it exhibits.

# Data Preprocessing

In [5]:
# Use TensorFlow's built-in TextVectorization layer to tokenize the comment text.
from tensorflow.keras.layers import TextVectorization

In [6]:
# Separate the comment text (model input) from the toxicity label columns (targets).
# Every column from position 2 onward is treated as a label.
label_columns = data.columns[2:]
x = data['comment_text']
y = data[label_columns].values

In [7]:
# Upper bound on the vocabulary size: a larger value may improve accuracy
# at the cost of a slower model.
MAX_WORDS = 200_000

In [8]:
# Every comment is encoded as exactly SEQUENCE_LENGTH integer token ids
# (the layer pads or truncates to that length), drawn from a vocabulary
# of at most MAX_WORDS tokens.
SEQUENCE_LENGTH = 1800
vectorizer = TextVectorization(
    max_tokens=MAX_WORDS,
    output_mode='int',
    output_sequence_length=SEQUENCE_LENGTH,
)

2024-05-06 11:54:22.665684: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


In [9]:
# Fit the vectorizer's vocabulary on every comment in the dataset.
comment_strings = x.values
vectorizer.adapt(comment_strings)

# Encode all comments with the fitted vectorizer; vectorized_text holds the
# integer token ids for each comment.
vectorized_text = vectorizer(comment_strings)

In [10]:
# Creating data pipeline
SHUFFLE_SEED = 42  # fixed so the shuffled order is reproducible across runs

dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
# This dataset is later carved into train/val/test with take()/skip(), and each split
# iterates the pipeline independently. With the default reshuffle_each_iteration=True,
# every iteration would see a different permutation, so the same examples could land in
# several splits (validation/test data leaking into training). Freezing the order keeps
# the splits disjoint. The buffer (160000) exceeds the 159571 rows, so this is still a
# full shuffle of the data.
dataset = dataset.shuffle(160000, seed=SHUFFLE_SEED, reshuffle_each_iteration=False)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)  # overlap input preparation with model execution

In [19]:
# Creating training, validation and testing dataset splits (counted in batches)
# ~70% training, ~20% validation, remaining ~10% testing
# NOTE: take()/skip() only yield disjoint splits if `dataset` produces its batches in
# the same order on every iteration.
n_batches = len(dataset)
n_train = int(n_batches * .7)
n_val = int(n_batches * .2)

train = dataset.take(n_train)
val = dataset.skip(n_train).take(n_val)
# Test starts exactly where validation ends and runs to the end of the dataset, so no
# batch is lost to rounding between the splits or dropped from the tail.
test = dataset.skip(n_train + n_val)

# Sequential (Deep Learning) Model

In [25]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [27]:
# Build the classifier as a single stack of layers.
model = Sequential([
    # Embedding table with MAX_WORDS+1 rows, mapping each token id to a 32-dimensional vector
    Embedding(MAX_WORDS + 1, 32),
    # Bidirectional LSTM (tanh activation): the sequence is read in both directions so
    # context from either side of a token contributes to the sentence representation
    Bidirectional(LSTM(32, activation='tanh')),
    # Fully connected feature-extraction layers
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    # Output layer: one unit per toxicity label (6), sigmoid squashes each output into 0-1
    Dense(6, activation='sigmoid'),
])

In [28]:
# Configure training: Adam optimizer minimizing binary cross-entropy.
model.compile(optimizer='Adam', loss='BinaryCrossentropy')

In [29]:
# Print a per-layer overview of the model (layer types, output shapes, parameter counts).
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirection  (None, 64)                16640     
 al)                                                             
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                      

In [None]:
# Train for one epoch on the training split, evaluating on the validation split;
# the object returned by fit() is kept so its recorded history can be plotted afterwards.
model_training = model.fit(
    x=train,
    validation_data=val,
    epochs=1,
)



In [None]:
from matplotlib import pyplot as plt

# Plot the per-epoch values recorded during model.fit().
# The axes are created explicitly and passed to pandas: calling plt.figure() first and
# then DataFrame.plot() without `ax` opens a second figure, leaving a blank one behind
# and ignoring the requested figsize.
fig, ax = plt.subplots(figsize=(8, 5))
# Markers keep the series visible even when only a single epoch was recorded
# (a line through one point draws nothing).
pd.DataFrame(model_training.history).plot(ax=ax, marker='o')
ax.set(title='Training history', xlabel='Epoch', ylabel='Loss')
plt.show()