# Importing the dataset

In [78]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [101]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [102]:
filepath = '/content/drive/MyDrive/dataset_proyecto3.csv'
df  = pd.read_csv(filepath)

In [103]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,simple_toxic
0,5f140a8f41763bad,"""== A barnstar for you! ==\r\n\r\n The Minor ...",0,0,0,0,0,0,0
1,c57b608d96f057f2,"""==Glenn Beck Hoax==\r\nThis topic should be d...",0,0,0,0,0,0,0
2,1d9867f0f60ba9bd,"""\r\n\r\n Date of establishment \r\n\r\nCharle...",0,0,0,0,0,0,0
3,17eb2a66bfce1023,==AfD nomination of Untitled Chilli Album== \r...,0,0,0,0,0,0,0
4,b8324615a1bcaf58,scum who look up to Che deserve the worst kind...,1,0,1,1,1,0,1


#Preprocess

In [104]:
from tensorflow.keras.layers import TextVectorization

In [105]:
#Variables dependientes e independientes
X = df['comment_text']
y = df[df.columns[2:]].values

In [106]:
#Cantidad maximas de palabras
MAX_FEATURES = 200000

In [107]:
#Vectorizamos las palabras de las frases, en un conjunto de ints
vectorizer = TextVectorization(max_tokens=MAX_FEATURES,
                               output_sequence_length=1800,
                               output_mode='int')

In [108]:
vectorizer.adapt(X.values)

In [109]:
vectorized_text = vectorizer(X.values)

In [110]:
#Optimizamos el dataset
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8) # helps bottlenecks

In [111]:
#Obtenemos los distitnos 
train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

# Create LSTM

In [112]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [113]:
model = Sequential()
model.add(Embedding(MAX_FEATURES+1, 32))
model.add(Bidirectional(LSTM(32, activation='tanh')))
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(7, activation='sigmoid'))

In [114]:
model.compile(loss='BinaryCrossentropy', optimizer='Adam',metrics=['accuracy'])

In [80]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 32)          6400032   
                                                                 
 bidirectional_1 (Bidirectio  (None, 64)               16640     
 nal)                                                            
                                                                 
 dense_4 (Dense)             (None, 128)               8320      
                                                                 
 dense_5 (Dense)             (None, 256)               33024     
                                                                 
 dense_6 (Dense)             (None, 128)               32896     
                                                                 
 dense_7 (Dense)             (None, 7)                 903       
                                                      

In [None]:
history = model.fit(train, epochs=1, validation_data=val)



In [81]:
from matplotlib import pyplot as plt

In [82]:
history.history

{'accuracy': [0.25504857301712036],
 'loss': [0.10808716714382172],
 'val_accuracy': [0.032258063554763794],
 'val_loss': [0.0722779929637909]}

#Make Predictions

In [84]:
input_text = vectorizer(['You freaking suck! I am going to hit you.'])

In [85]:
res = model.predict(input_text)

In [86]:
(res > 0.5).astype(int)

array([[1, 0, 1, 0, 0, 0, 1]])

#Tests

In [87]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [88]:
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [89]:
for batch in test.as_numpy_iterator(): 
    # Unpack the batch 
    X_true, y_true = batch
    # Make a prediction 
    yhat = model.predict(X_true)
    
    # Flatten the predictions
    y_true = y_true.flatten()
    yhat = yhat.flatten()
    
    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)

In [90]:
print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')

Precision: 0.7902946472167969, Recall:0.6523604989051819, Accuracy:0.28776979446411133


In [92]:
import tensorflow as tf

In [93]:
model.save('toxicity.h5')

In [94]:
model = tf.keras.models.load_model('toxicity.h5')

In [95]:
input_str = vectorizer('hello!')

In [96]:
res = model.predict(np.expand_dims(input_str,0))

In [97]:
res

array([[0.04491901, 0.0004732 , 0.0232141 , 0.00072953, 0.01400644,
        0.00868499, 0.05356401]], dtype=float32)

In [98]:
input_str = vectorizer('fuck you!')

In [99]:
res = model.predict(np.expand_dims(input_str,0))

In [100]:
res

array([[0.930926  , 0.10078266, 0.8486693 , 0.01088205, 0.67878467,
        0.09543145, 0.92155576]], dtype=float32)