# Modelo de clasificación de textos con TensorFlow

In [None]:
import requests
import json
import pandas as pd
import numpy as np
import re
import io
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
import tensorflow_hub as hub

### Importación del dataset

In [None]:
url = "https://storage.googleapis.com/tensorflow-1-public/course3/sarcasm.json"
filename = "sarcasm.json"

# Download the dataset. A timeout is passed explicitly because requests
# otherwise waits indefinitely when the server stops responding.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Keep a local copy of the raw bytes exactly as they were served
with open(filename, "wb") as file:
    file.write(response.content)

# Load the JSON file; the encoding is given explicitly so decoding does not
# depend on the platform's default locale.
with open(filename, 'r', encoding="utf-8") as f:
    datastore = json.load(f)

# Print one sample record (index 1889) to inspect the structure of an entry
print(datastore[1889])

{'article_link': 'https://www.theonion.com/rugged-new-sport-utility-vehicle-takes-on-mall-parking-1819586239', 'headline': 'rugged new sport-utility vehicle takes on mall parking lot', 'is_sarcastic': 1}


In [None]:
# Build a DataFrame from the loaded JSON records and preview its first five rows
df = pd.DataFrame(data=datastore)
df.head(n=5)

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


#### División del dataset en datos de entrenamiento, validación y test
- Entrenamiento 80%
- Validacion 10%
- Test 10%


Se toma una muestra aleatoria del 100% de los datos (es decir, se barajan todas las filas) antes de dividirlos.

In [None]:
# Shuffle every row once with a fixed seed so the split is reproducible across
# kernel restarts.
shuffled = df.sample(frac=1, random_state=42)

# Cut points at the 80% and 90% marks of the shuffled frame
train_end = int(0.8 * len(shuffled))
val_end = int(0.9 * len(shuffled))

# Positional slicing is used instead of np.split, which on a DataFrame relies
# on DataFrame.swapaxes (deprecated in recent pandas releases).
train = shuffled.iloc[:train_end]
val = shuffled.iloc[train_end:val_end]
test = shuffled.iloc[val_end:]
len(train), len(val), len(test)

(21367, 2671, 2671)

#### Convertir un DataFrame en un objeto Dataset de TensorFlow

- Esta función se copió de la documentación de TensorFlow sobre datos estructurados:
https://www.tensorflow.org/tutorials/structured_data/preprocessing_layers?hl=es-419

In [None]:
def df_to_dataset(dataframe, shuffle=True, batch_size=104):
  """Build a batched ``tf.data.Dataset`` of (headline, label) pairs.

  Args:
    dataframe: DataFrame with an 'is_sarcastic' column (used as labels) and a
      'headline' column (used as features). It is not modified.
    shuffle: When true, shuffle the elements using a buffer as large as the
      whole frame.
    batch_size: Number of elements grouped into each batch.

  Returns:
    The dataset of (headline, label) slices, optionally shuffled, then
    batched and prefetched with ``tf.data.AUTOTUNE``.
  """
  # Read the two columns directly from the input frame; nothing is mutated.
  targets = dataframe['is_sarcastic']
  headlines = dataframe["headline"]
  dataset = tf.data.Dataset.from_tensor_slices((headlines, targets))
  if shuffle:
    dataset = dataset.shuffle(buffer_size=len(dataframe))
  return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [None]:
# Only the training split is shuffled. Shuffling does not change evaluation
# metrics, so the validation and test splits keep a fixed order, which avoids
# filling a shuffle buffer on every evaluation pass.
train_data = df_to_dataset(train)
valid_data = df_to_dataset(val, shuffle=False)
test_data = df_to_dataset(test, shuffle=False)

# Peek at a single batch without materializing the whole dataset as a list
next(iter(train_data))

(<tf.Tensor: shape=(104,), dtype=string, numpy=
 array([b'is the gates foundation investing in the abuse of palestinian prisoners?',
        b'historical archives: to be sold - tri-cornered shoes',
        b'less than half of the money pledged to fight ebola reached affected countries',
        b"ctrl+ plus: a closer look at amc's halt and catch fire",
        b'in defense of the promposal', b'fussy eater 38',
        b'sunday roundup',
        b'thankful for our power: a thankful discourse in a time of reckoning',
        b"here's why you need to know broadway and tv star andy mientus",
        b'30 things about anxiety nobody talks about',
        b"'entourage' the movie -- who cares?",
        b'secretary of education under investigation for falsifying hall passes',
        b'martin shkreli faces rough stay in prison system where inmates who funded hair theft are lowest caste',
        b'nasa issues formal apology for 1969 genocide of moon natives',
        b"in the weeks before tru

#### Crear una capa de embedding con un modelo de texto preentrenado llamado "nnlm-en-dim50", disponible en TensorFlow Hub

In [None]:
# TensorFlow Hub handle of the pre-trained nnlm-en-dim50 text embedding
embedding = "https://tfhub.dev/google/nnlm-en-dim50/2"

# Wrap the module as a Keras layer with string inputs; trainable=True marks
# the layer's weights as trainable.
hub_layer = hub.KerasLayer(
    embedding,
    dtype=tf.string,
    trainable=True,
)

In [None]:
# Embed one batch of headlines to inspect the layer's output. Only the first
# batch is pulled, instead of converting the entire dataset into a list.
sample_headlines = next(iter(train_data))[0]
hub_layer(sample_headlines)

<tf.Tensor: shape=(104, 50), dtype=float32, numpy=
array([[ 0.37682086, -0.15507877, -0.2031259 , ...,  0.29483482,
         0.11270191, -0.00884049],
       [ 0.09828453,  0.23514761, -0.1905469 , ..., -0.0322963 ,
        -0.11530775, -0.18959038],
       [ 0.20288342, -0.19590287,  0.01126989, ...,  0.05338063,
         0.34669214, -0.13541856],
       ...,
       [ 0.26411903, -0.22366555, -0.1783577 , ..., -0.22504963,
         0.19972871, -0.10343295],
       [ 0.24107094, -0.09137625, -0.37233743, ..., -0.07471686,
         0.28699568,  0.08105996],
       [ 0.0302416 ,  0.01768361,  0.18121883, ...,  0.0176788 ,
        -0.07933996,  0.23331033]], dtype=float32)>

#### Modelo de red neuronal secuencial

In [None]:
# Sequential classifier: the hub embedding, then two Dense(16, relu) layers
# each followed by Dropout(0.4), and a single sigmoid output unit.
model = tf.keras.Sequential([
    hub_layer,
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

In [None]:
# Configure training: Adam with learning rate 0.001, binary cross-entropy
# loss, and accuracy as the reported metric.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)

In [None]:
# Baseline: evaluate on the training split before model.fit has been called
model.evaluate(train_data)



[0.68378084897995, 0.5719099640846252]

In [None]:
# Baseline: evaluate on the validation split before model.fit has been called
model.evaluate(valid_data)



[0.6833930611610413, 0.5649569630622864]

In [None]:
# Train for 3 epochs using valid_data as validation data; the value returned by fit is kept in `history`
history = model.fit(train_data, epochs=3, validation_data=valid_data)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
# Evaluate the fitted model on the test split
model.evaluate(test_data)



[0.3802497684955597, 0.8464994430541992]

Prueba prediciendo sentencias

In [None]:
# Headline to classify
sentencia = 'ohio governor makes desperate plea to aquaman'

# Wrap the single headline in a list so it is passed as a one-element batch
preprocessed_sentence = [sentencia]

# Get the model's prediction for that batch
prediction = model.predict(preprocessed_sentence)

# Scores below 0.5 are reported as not sarcastic, everything else as sarcastic
print("La sentencia no es sarcástica" if prediction[0] < 0.5 else "La sentencia es sarcástica")

La sentencia es sarcástica


In [None]:
# Boolean mask selecting the rows whose headline equals the sentence predicted above
mask = df["headline"].eq('ohio governor makes desperate plea to aquaman')
df.loc[mask]

Unnamed: 0,article_link,headline,is_sarcastic
11330,https://politics.theonion.com/ohio-governor-ma...,ohio governor makes desperate plea to aquaman,1


In [None]:
# Show the raw model output for the first (and only) sentence in the batch
prediction[0]

array([0.66217107], dtype=float32)

En los siguientes ejemplos se utilizan titulares de noticias que no se encuentran en el dataset

In [None]:
# The following headline is not sarcastic
sentencia = "police say kansas city shooting was the result of dispute"

# Wrap the single headline in a list so it is passed as a one-element batch
preprocessed_sentence = [sentencia]

# Get the model's prediction for that batch
prediction = model.predict(preprocessed_sentence)

# Scores below 0.5 are reported as not sarcastic, everything else as sarcastic
print("La sentencia no es sarcástica" if prediction[0] < 0.5 else "La sentencia es sarcástica")

La sentencia no es sarcástica


In [None]:
# Show the raw model output for the first (and only) sentence in the batch
prediction[0]

array([0.00216962], dtype=float32)

In [None]:
# The following headline is sarcastic; it comes from the website babylonbee.com

sentencia = "Capitol Janitors Deep Clean Senate Chamber With Flamethrowers"

# Preprocess the sentence: the headlines in the training data are all
# lowercase, so the input is lowercased to match what the model was trained
# on, then wrapped in a list to form a one-element batch.
preprocessed_sentence = [sentencia.lower()]

# Get the model's prediction
prediction = model.predict(preprocessed_sentence)

# Interpret the prediction: scores below 0.5 mean "not sarcastic"
if prediction[0] < 0.5:
    print("La sentencia no es sarcástica")
else:
    print("La sentencia es sarcástica")

La sentencia es sarcástica


In [None]:
# Show the raw model output for the first (and only) sentence in the batch
prediction[0]

array([0.55804247], dtype=float32)