In [95]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D

In [96]:
# Load the raw survey export; the file is Latin-1 encoded, so the encoding
# is passed explicitly instead of relying on the UTF-8 default.
df = pd.read_csv(
    "../data/Checking.csv",
    encoding="iso-8859-1",
)
df.head()

Unnamed: 0,Date,nps,Comment
0,4/17/2023 20:23,9,PRECIO
1,4/17/2023 20:21,9,ES LA PRIMERA VEZ Y SI ME SIENTO CON ALGO DE I...
2,4/17/2023 20:17,9,Es muy rapido
3,4/17/2023 20:17,10,Bueno
4,4/17/2023 20:10,9,Prcticidad


In [55]:
# Column dtypes and non-null counts. Note that `Comment` has fewer non-null
# entries than there are rows, so some comments are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17860 entries, 0 to 17859
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Date     17860 non-null  object
 1   nps      17860 non-null  int64 
 2   Comment  17835 non-null  object
dtypes: int64(1), object(2)
memory usage: 418.7+ KB


In [91]:
# Drop the timestamp column; it is not used as a model input.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps
# this cell idempotent: re-running it after the column is already gone no
# longer raises KeyError.
df = df.drop(columns="Date", errors="ignore")

In [88]:
# Bucket NPS scores into three classes: "M" (<= 7), "R" (8), "B" (9-10).
# pd.cut intervals are open on the left, so with a lowest edge of 6 every
# score <= 6 fell outside all bins and became NaN (silently vanishing from
# value_counts). An unbounded lower edge folds those scores into "M" while
# leaving the mapping of 7, 8, 9 and 10 exactly as before.
# NOTE(review): the conventional NPS split is 0-6 / 7-8 / 9-10, i.e. edges
# [-inf, 6, 8, 10] -- confirm which thresholds are intended.
bins = [-np.inf, 7, 8, 10]
labels = pd.cut(df.nps, bins=bins, labels=["M", "R", "B"])
labels.value_counts()

In [58]:
# Small hand-written sample of (English) survey comments, handy for quick
# experiments with the text vectorizer.
sents = [
    "too easy",
    "agile",
    "rapid",
    "FOR THE RAPIDITY",
    "super facil",
    "TOO EASY",
    "and",
    "fast and efficient page",
    "Very efficient",
    "It is very easy and fast.",
    "great",
    "Everything is very simple and fast, thank you",
    "Easy to use",
    "convenient and fast",
    "by the ease",
    "service on webb",
    "BECAUSE OF THE EASE OF THE PROCESS",
    "Friendly",
    "EASILY DOING IT, I AVOID PROCEDURES AT THE AIRPORT",
    "Flexibility.",
    "good service",
    "IT IS WITH WHOM I FLY",
    "all excellent",
    "Easy access",
    "Simplicity",
    "THE SERVICE IS FAST",
    "VERY EASY",
    "SPEED AND PRACTICALITY",
    "for the speed",
    "very good service",
    "good and easy to use",
    "Very easy",
    "The whole process was quick and easy.",
    "FOR ITS SPEED",
    "ALL VERY WELL",
    "excellent service",
    "The website is very practical",
    "very well",
    "Of course",
    "AGILE AND EASY PROCESS",
]

In [60]:
def custom_standardization(data):
    """Return `data` lower-cased with ``tf.strings.lower``.

    Args:
        data: Value accepted by ``tf.strings.lower`` (e.g. a string tensor).

    Returns:
        The result of ``tf.strings.lower(data)``.
    """
    return tf.strings.lower(data)

In [None]:
# Batched tf.data pipeline yielding (comment_text, nps_score) pairs.
# - The column is named "Comment" (capital C); "comment" raised KeyError.
# - Text and score are passed as separate arrays: a single mixed int/str
#   object array cannot be converted to a tensor.
# - Rows without a comment are dropped, since NaN is not a valid string.
rated_comments = df.dropna(subset=["Comment"])
dataset = tf.data.Dataset.from_tensor_slices(
    (
        rated_comments["Comment"].astype(str).to_numpy(),
        rated_comments["nps"].to_numpy(),
    )
).batch(32)

In [62]:
# Upper bound on the vocabulary the vectorizer may learn; also reused as the
# Embedding layer's input dimension further down.
vocab_size = 10_000

# Maps raw text to fixed-length integer token-id sequences: text is
# lower-cased and stripped of punctuation, then every sequence is padded or
# truncated to 100 ids.
vectorizer = TextVectorization(
    max_tokens=vocab_size,
    standardize="lower_and_strip_punctuation",
    output_mode="int",
    output_sequence_length=100,
)

In [63]:
# Learn the vocabulary from the survey comments. Missing comments are
# removed first: `Comment` contains nulls, and the resulting NaN floats in
# the object array cannot be converted to a string tensor.
vectorizer.adapt(df["Comment"].dropna().astype(str).to_numpy())



In [64]:
# Size of the vocabulary the vectorizer learned during adapt().
# NOTE(review): the recorded output (61) and the English tokens printed in the
# next cell look like the result of adapting on the small `sents` sample, not
# on df.Comment -- re-run after adapt() to confirm.
vectorizer.vocabulary_size()

61

In [65]:
# Show ten learned vocabulary entries (positions 5 through 14).
print(vectorizer.get_vocabulary()[5:15])

# Vectorize one example sentence and print the first six token ids of its row.
output = vectorizer([["i like it a lot"]])
print(output.numpy()[0, :6])

['and', 'service', 'is', 'fast', 'speed', 'process', 'of', 'it', 'good', 'for']

In [67]:
# Width of each learned token embedding.
embedding_dim = 16
# One output unit per distinct NPS score present in the data.
n_classes = df.nps.nunique()

# Bag-of-embeddings classifier: embed token ids, average over the sequence,
# then classify. The model consumes integer token ids: the notebook applies
# `vectorizer` up front when it builds x_train / x_val, so the vectorizer
# must not be repeated as the first layer (it would receive already
# vectorized integers instead of strings).
model = Sequential([
    Embedding(vocab_size, embedding_dim, name="embedding"),
    GlobalAveragePooling1D(),
    Dense(16, activation="relu"),
    Dense(n_classes),  # no activation: the outputs are raw logits
])

In [None]:
from sklearn.model_selection import train_test_split

# Text inputs: the column is "Comment" (capital C; `df.comment` raised
# AttributeError). Rows without a comment are dropped because they cannot
# be vectorized.
comments = df["Comment"].dropna().astype(str)

# One-hot targets, one column per distinct NPS score. They are encoded on the
# full frame and then aligned to the kept rows, so the number of columns
# always equals the number of distinct scores in `df`. float32 avoids the
# boolean dummies produced by recent pandas versions.
one_hot_nps = pd.get_dummies(df["nps"], dtype="float32").loc[comments.index]

# 80/20 split with a fixed seed so the partition is reproducible.
train_samples, val_samples, train_labels, val_labels = train_test_split(
    comments, one_hot_nps, test_size=0.2, random_state=42)

In [None]:
# Feed every comment through the vectorizer as its own one-element row and
# keep the resulting integer token-id arrays as NumPy arrays.
x_train, x_val = (
    vectorizer(np.array([[text] for text in samples])).numpy()
    for samples in (train_samples, val_samples)
)

# Targets as plain NumPy arrays, in the same train/validation order.
y_train, y_val = (
    np.array(targets) for targets in (train_labels, val_labels)
)

In [68]:
# The targets are one-hot rows (built with pd.get_dummies) and the model's
# final Dense layer has no activation, so the loss must be the non-sparse
# categorical cross-entropy operating on logits. The previous
# SparseCategoricalCrossentropy(from_logits=False) expected integer class
# ids and probability outputs, matching neither.
model.compile(
    optimizer="adam",
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"])

In [69]:
# TensorBoard callback; training logs are written under the "logs" directory.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [None]:
# Train for 15 epochs in mini-batches of 32, evaluating on the validation
# split after each epoch and logging through the TensorBoard callback.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=15,
    validation_data=(x_val, y_val),
    callbacks=[tensorboard_callback],
)