In [57]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D

In [58]:
# df = pd.read_csv("./tinker/checking_tinker.csv", encoding='iso-8859-1')
df = pd.read_csv("../data/Checking.csv", encoding='iso-8859-1')
df.head()

Unnamed: 0,Date,nps,Comment
0,4/17/2023 20:23,9,PRECIO
1,4/17/2023 20:21,9,ES LA PRIMERA VEZ Y SI ME SIENTO CON ALGO DE I...
2,4/17/2023 20:17,9,Es muy rapido
3,4/17/2023 20:17,10,Bueno
4,4/17/2023 20:10,9,Prcticidad


In [59]:
df.dropna(inplace=True)

bins = [6, 7, 8, 10]
df["labels"] = pd.cut(df.nps.astype(int), bins=bins, labels=["M", "R", "B"])

In [60]:
df.drop(columns=["Date", "nps"], inplace=True)

In [61]:
df.head()

Unnamed: 0,Comment,labels
0,PRECIO,B
1,ES LA PRIMERA VEZ Y SI ME SIENTO CON ALGO DE I...,B
2,Es muy rapido,B
3,Bueno,B
4,Prcticidad,B


In [62]:
batch_size = 128

dataset = tf.data.Dataset.from_tensor_slices(df["Comment"].to_numpy()).batch(batch_size)
# list(dataset.as_numpy_iterator())

In [67]:
import string
import re

strip_chars = string.punctuation + "¿"
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")


def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  return tf.strings.regex_replace(lowercase, "[%s]" % re.escape(strip_chars), "")

In [68]:
vocab_size = 2000

vectorizer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    # output_mode='int',
    output_sequence_length=100)

vectorizer.adapt(df["Comment"].to_numpy())

In [65]:
vectorizer.vocabulary_size()

2000

In [70]:
print(vectorizer.get_vocabulary()[5:15])

output = vectorizer([["hola"]])
print(output.numpy()[0, :6])

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc3 in position 2: unexpected end of data

In [None]:
classes = ["B", "M", "R"]

In [None]:
embedding_dim=300
n_classes = len(classes)

embedding_layer = Embedding(vocab_size, embedding_dim, name="embedding")

In [None]:
from sklearn.model_selection import train_test_split

train_samples, val_samples, train_labels, val_labels = train_test_split(df["Comment"]to_numpy(), pd.get_dummies(df.labels).to_numpy(), test_size=0.2, random_state=42)

In [None]:
x_train = vectorizer(np.array([[s] for s in train_samples])).numpy()
x_val = vectorizer(np.array([[s] for s in val_samples])).numpy()

y_train = np.array(train_labels)
y_val = np.array(val_labels)

In [None]:
from tensorflow.keras import layers

int_sequences_input = keras.Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)
x = layers.Conv1D(128, 5, activation="relu")(embedded_sequences)
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)
preds = layers.Dense(n_classes, activation="softmax")(x)
model = keras.Model(int_sequences_input, preds)
model.summary()

Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_7 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 300)         600000    
                                                                 
 conv1d_4 (Conv1D)           (None, None, 128)         192128    
                                                                 
 global_max_pooling1d_2 (Glo  (None, 128)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_4 (Dense)             (None, 128)               16512     
                                                                 
 dropout_2 (Dropout)         (None, 128)               0         
                                                           

In [None]:
adam = tf.optimizers.Adam()
cce = tf.keras.losses.CategoricalCrossentropy(from_logits=False)

model.compile(optimizer=adam, loss = cce, metrics=['accuracy'])

In [None]:
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [None]:
model.fit(x=x_train,
    y=y_train,
    epochs=10,
    batch_size=batch_size,
    validation_data=(x_val, y_val),
    callbacks=[tensorboard_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x21647a3cc40>

In [None]:
string_input = keras.Input(shape=(1,), dtype="string")
x = vectorizer(string_input)
preds = model(x)
end_to_end_model = keras.Model(string_input, preds)

probabilities = end_to_end_model.predict(
    [["this is horrible"]]
)

classes[np.argmax(probabilities[0])]



'B'