# Práctica 2 - Natural Language Processing

In [91]:
import os
import pandas as pd

# Both CSV files live in the same folder and are read with identical options.
# NOTE(review): with this encoding the train header shows garbled characters in
# the two "Km" columns while the test header does not -- confirm that both files
# really share the same encoding.
data_dir = "data"
csv_options = {"encoding": "ISO-8859-1", "index_col": "textID"}

train_path = os.path.join(data_dir, "train.csv")
df_train = pd.read_csv(train_path, **csv_options)

test_path = os.path.join(data_dir, "test.csv")
df_test = pd.read_csv(test_path, **csv_options)

df_train.head()

Unnamed: 0_level_0,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Kmï¿½),Density (P/Kmï¿½)
textID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26


In [92]:
# Number of missing values in each column of the training split.
df_train.isna().sum(axis=0)

text                 1
selected_text        1
sentiment            0
Time of Tweet        0
Age of User          0
Country              0
Population -2020     0
Land Area (Kmï¿½)    0
Density (P/Kmï¿½)    0
dtype: int64

In [93]:
# Number of missing values in each column of the test split.
df_test.isna().sum(axis=0)

text                1281
sentiment           1281
Time of Tweet       1281
Age of User         1281
Country             1281
Population -2020    1281
Land Area (Km²)     1281
Density (P/Km²)     1281
dtype: int64

In [94]:
# Discard every row that contains at least one missing value, in both splits.
df_train, df_test = (split.dropna() for split in (df_train, df_test))

In [95]:
# Sanity check: count the tweets that are still missing after dropping incomplete rows.
df_train.loc[:, "text"].isna().sum()

np.int64(0)

In [96]:
from tensorflow.keras.layers import TextVectorization

# Raw tweets: used both to fit the vocabulary and as the data to encode.
corpus = df_train["text"].values

# Lower-case the text, strip punctuation, split on whitespace and map every
# token to an integer id.
vectorize_layer = TextVectorization(
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    output_mode="int",
)
vectorize_layer.adapt(corpus)

# Encode the whole corpus; the displayed tensor has one zero-padded row per tweet.
vectorized_train = vectorize_layer(corpus)
vectorized_train

<tf.Tensor: shape=(27480, 33), dtype=int64, numpy=
array([[  293,    17, 15185, ...,     0,     0,     0],
       [  413,   115,     2, ...,     0,     0,     0],
       [    6,  1335,    10, ...,     0,     0,     0],
       ...,
       [  225,    31,    12, ...,     0,     0,     0],
       [   20,     9,    28, ...,     0,     0,     0],
       [   29,    30,  6480, ...,     0,     0,     0]])>

In [97]:
# Show the first ten vocabulary entries learned by the vectorization layer.
vectorize_layer.get_vocabulary()[:10]

['',
 '[UNK]',
 np.str_('i'),
 np.str_('to'),
 np.str_('the'),
 np.str_('a'),
 np.str_('my'),
 np.str_('and'),
 np.str_('you'),
 np.str_('it')]

In [98]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import skipgrams

# Skip-gram settings: context window size and number of distinct token ids.
window_size = 3
vocab_size = len(vectorize_layer.get_vocabulary())

# One dataset element per vectorized tweet.
train_ds = tf.data.Dataset.from_tensor_slices(vectorized_train)

def tf_skipgrams(sequence):
    """Turn one vectorized sentence into a dataset of skip-gram id pairs.

    The sequence is cast to int32 and handed to the Keras `skipgrams` helper
    through `tf.py_function`. Pairs are generated with the module-level
    `vocab_size` and `window_size`, and without negative samples.

    Args:
        sequence: 1-D tensor of token ids for a single sentence.

    Returns:
        A `tf.data.Dataset` whose elements are int32 tensors of shape (2,).
        The dataset is empty when `skipgrams` produces no pairs.
    """

    def _pairs_for(token_ids):
        # Runs eagerly inside py_function, so `.numpy()` is available here.
        couples, _labels = skipgrams(
            token_ids.numpy(),
            vocabulary_size=vocab_size,
            window_size=window_size,
            negative_samples=0,  # positive pairs only
        )
        if couples:
            return tf.convert_to_tensor(couples, dtype=tf.int32)
        # No pairs: return an empty (0, 2) tensor so the declared shape still holds.
        return tf.zeros((0, 2), dtype=tf.int32)

    int_sequence = tf.cast(sequence, tf.int32)
    pairs_tensor = tf.py_function(func=_pairs_for, inp=[int_sequence], Tout=tf.int32)
    # py_function loses static shape information; restore it for from_tensor_slices.
    pairs_tensor.set_shape([None, 2])
    return tf.data.Dataset.from_tensor_slices(pairs_tensor)

# Expand every tweet into its skip-gram pairs and flatten them into one stream.
train_ds = train_ds.flat_map(tf_skipgrams)

# Peek at the first generated pair.
next(iter(train_ds.as_numpy_iterator()))

array([15185,    17], dtype=int32)

In [99]:
# Inspect the element structure of the skip-gram pair dataset.
train_ds.element_spec

TensorSpec(shape=(2,), dtype=tf.int32, name=None)

In [100]:
def _to_one_hot_pair(pair):
    """One-hot encode both ids of a skip-gram pair over the vocabulary."""
    first_id, second_id = pair[0], pair[1]
    return (
        tf.one_hot(first_id, depth=vocab_size),
        tf.one_hot(second_id, depth=vocab_size),
    )


# Replace each id pair with a tuple of two one-hot vectors.
train_ds = train_ds.map(_to_one_hot_pair)
next(train_ds.as_numpy_iterator())

(array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32))

In [101]:
# Inspect the element structure again, now that both ids are one-hot encoded.
train_ds.element_spec

(TensorSpec(shape=(29164,), dtype=tf.float32, name=None),
 TensorSpec(shape=(29164,), dtype=tf.float32, name=None))

In [102]:
from tensorflow.keras import Model, Input, layers
from tensorflow import keras

embedding_dim = 128

# Inputs are integer word indices
input_target = tf.keras.Input(shape=(), dtype=tf.int32)
input_context = tf.keras.Input(shape=(), dtype=tf.int32)

# A single embedding table is shared by the target and the context word.
embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)

target_embed = embedding(input_target)
context_embed = embedding(input_context)

# Dot product of the two embeddings: one similarity score per pair.
dot_product = tf.keras.layers.Dot(axes=-1)([target_embed, context_embed])
# Make the single-score shape explicit before the output layer.
dot_product = layers.Reshape((1,))(dot_product)

# Output layer: squashes the score into a probability that the pair is a real one.
output = layers.Dense(
    1,
    activation = 'sigmoid')(dot_product)

# The model has to be wired from the Input placeholders, not from the embedding
# outputs. Passing the embedded tensors as inputs either fails to build or
# leaves the Embedding layer out of the model, depending on the Keras version.
model = Model(
    inputs = [input_target, input_context],
    outputs = output)

model.compile(
    loss = 'binary_crossentropy',
    optimizer = keras.optimizers.Adam(learning_rate = 0.025,
                                             beta_1 = 0.9,
                                             beta_2 = 0.999),
    metrics=['accuracy'])

model.summary()


In [None]:
import numpy as np

# Build the integer (target, context) training pairs. `train_ds` holds one-hot
# vectors at this point, while the model's inputs are declared as integer word
# indices, so the pairs are regenerated from the vectorized tweets.
skipgram_pairs = []
for token_ids in vectorized_train.numpy():
    couples, _ = skipgrams(
        token_ids,
        vocabulary_size=vocab_size,
        window_size=window_size,
        negative_samples=0,
    )
    skipgram_pairs.extend(couples)

# reshape(-1, 2) keeps a valid (0, 2) array even if no pair was produced.
skipgram_pairs = np.asarray(skipgram_pairs, dtype=np.int32).reshape(-1, 2)
X_target = skipgram_pairs[:, 0]
X_context = skipgram_pairs[:, 1]
X_train = [X_target, X_context]

# Only positive pairs are generated above, so every label is 1.
# NOTE(review): without negative samples the classifier can minimise the loss by
# always predicting 1 -- consider negative_samples > 0 and using the returned labels.
y_train = np.ones(len(X_target))

# Definition of the TF Board callback
tensorboard_callback = keras.callbacks.TensorBoard(
    log_dir=os.path.join("logs", "skipgram"))

# Training process execution
history = model.fit(
    X_train,
    y_train,
    epochs = 25,
    batch_size = 64,
    callbacks = [tensorboard_callback])

In [None]:
from tensorflow import keras
from tensorflow.keras import Model, Input, layers
from tensorflow.keras.layers import Embedding, Dot, Reshape, Dense

embedding_dim = 64

# We simulate an embedding layer with a bias-free Dense layer: applied to a
# one-hot vector it selects one row of its weight matrix.
shared_embedding_layer = layers.Dense(embedding_dim, use_bias=False, name='embedding_dense')

# One one-hot input per element of the (target, context) pair.
target_input = Input(shape=(vocab_size,), name='target_onehot')
context_input = Input(shape=(vocab_size,), name='context_onehot')

# The same projection is applied to both inputs.
target_vector = shared_embedding_layer(target_input)
context_vector = shared_embedding_layer(context_input)

# Combine both vectors with an (unnormalised) dot-product similarity, then
# make the single-score shape explicit.
similarity = layers.Dot(axes=-1)([target_vector, context_vector])
dot_product = layers.Reshape((1,))(similarity)

# Output layer: sigmoid over the similarity score.
output = layers.Dense(1, activation='sigmoid')(dot_product)

model = Model(inputs=[target_input, context_input], outputs=output)

optimizer = keras.optimizers.Adam(learning_rate=0.025, beta_1=0.9, beta_2=0.999)
model.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

model.summary()