# Installing and importing Dependencies

In [None]:
!pip install tensorflow pandas matplotlib sklearn
# !pip install tensorflow-gpu
#since the tensorflow-gpu is now removed and after tf2.1, the gpu is merged to the tensorflow package

In [None]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

## Importing dataset
- The data comes from Kaggle's Jigsaw Toxic Comment Classification Challenge: https://www.kaggle.com/competitions/jigsaw-toxic-comment-classification-challenge/data

In [None]:
# Read the training CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="dataset/train.csv")
df.head(n=5)

In [None]:
df.iloc[6]['comment_text']

In [None]:
df[df.columns[2:]].iloc[6]

## Preprocessing
- Batch processing is used

In [None]:
from tensorflow.keras.layers import TextVectorization

- Text vectorization maps text features to integer sequences

In [None]:
# divide the data into features and target
X = df['comment_text']
y = df[df.columns[2:]].values

In [None]:
# feature
X

In [None]:
# target
y

In [None]:
#limiting the number of words allowed
MAX_FEATURES = 200000

In [None]:
vectorizer = TextVectorization(max_tokens=MAX_FEATURES,
 output_sequence_length=1800,output_mode='int')

In [None]:
# Check the container type of the feature column selected from `df`.
type(X)

In [None]:
# Check the type exposed by `.values`, which is what the following cells pass
# to the vectorizer.
type(X.values)
# `X.values` converts X into a NumPy array that can be passed to TensorFlow.

In [None]:
vectorizer.adapt(X.values)

In [None]:
vectorizer.get_vocabulary()

In [None]:
vectorizer("Hello world")

In [None]:
vectorized_text = vectorizer(X.values)

In [None]:
vectorized_text

In [None]:
# TensorFlow input pipeline, built from in-memory tensors in the usual order:
# (map,) cache, shuffle, batch, prefetch.
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
# The train/validation/test subsets are later carved out of this dataset with
# take()/skip(), and each subset iterates the pipeline independently. With the
# default reshuffle_each_iteration=True every iteration draws a new permutation,
# so the subsets would overlap and validation/test samples would leak into
# training. Fixing the shuffle order keeps the subsets disjoint. If per-epoch
# reshuffling of the training data is wanted, shuffle the training subset
# separately after the split.
dataset = dataset.shuffle(160000, reshuffle_each_iteration=False)
dataset = dataset.batch(16)    # group samples into batches of 16
dataset = dataset.prefetch(8)  # prepare upcoming batches while the current one is consumed

In [None]:
# this gives the batch of 16 samples
batch_X,batch_y=dataset.as_numpy_iterator().next()

In [None]:
batch_X.shape
# it is 1800 since we limited the size 
# output_sequence_length=1800

In [None]:
batch_y.shape

In [None]:
# seperating dataset
# the split is in ratio of 7:2:1 train:validation:test
train = dataset.take(int(len(dataset)*0.7))
validation = dataset.skip(int(len(dataset)*0.7)).take(int(len(dataset)*0.2))
test = dataset.skip(int(len(dataset)*0.9)).take(int(len(dataset)*0.1))

In [None]:
len(train), len(validation), len(test)

In [None]:
# Peek at the first training batch as a (features, targets) pair of NumPy arrays.
train_generator = train.as_numpy_iterator()
next(train_generator)

## Building Deep learning model
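The first layer is an embedding layer: it maps each word index to a learned dense vector, which can be thought of as a "personality profile" for that word.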

### Creating Sequential Model

In [None]:
from tensorflow.keras.models import Sequential

from tensorflow.keras.layers import LSTM, Dropout, Bidirectional,Dense, Embedding

In [None]:
# Build the network by passing the layers, in order, to the Sequential constructor.
model = Sequential([
    # Embedding: every token id is mapped to a learned 32-dimensional vector.
    Embedding(MAX_FEATURES + 1, 32),
    # Bidirectional LSTM with 32 units per direction, so each sequence is read
    # both forwards and backwards. tanh is kept as the activation so the layer
    # stays eligible for the cuDNN-accelerated GPU implementation.
    Bidirectional(LSTM(32, activation='tanh')),
    # Fully connected feature-extractor layers.
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    # Output layer: 6 sigmoid units, matching the shape of the target.
    Dense(6, activation='sigmoid'),
])

In [None]:
model.compile(loss='BinaryCrossentropy',optimizer="Adam")

#BinaryCrossentropy is used since we are using 6 different binary classifier and to reduce loss for cases where one classifier affects another

In [None]:
model.summary()

In [None]:
history = model.fit(train,epochs=1,validation_data=validation)

In [None]:
history.history