# Practica 2 - Natural Language processing

In [9]:
import gdown
import os

# Crear el directorio de datos si no existe
data_dir = "data"
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

# URL de Google Drive en formato correcto para gdown
url = "https://drive.google.com/uc?id=1GXSUzaXDvrlcimgTwRR9tmDr6xr8gu49"
zip_filename = "sentiment_analysis.zip"
zip_path = os.path.join(data_dir, zip_filename)

# Descargar el archivo
if not os.path.exists(zip_path):
    gdown.download(url, zip_path, quiet=False)
else:
    print("Data zipfile already exists")

Downloading...
From: https://drive.google.com/uc?id=1GXSUzaXDvrlcimgTwRR9tmDr6xr8gu49
To: /content/data/sentiment_analysis.zip
100%|██████████| 1.86M/1.86M [00:00<00:00, 185MB/s]


In [10]:
import shutil
from zipfile import ZipFile
from concurrent.futures import ThreadPoolExecutor

data_dir = "data"
zip_filename = "sentiment_analysis.zip"
zip_path = os.path.join(data_dir, zip_filename)
files = ["train.csv", "test.csv"]
full_paths = [os.path.join(data_dir, file) for file in files]

if not all(os.path.isfile(path) for path in full_paths):
    with ZipFile(zip_path, 'r') as zf:
        with ThreadPoolExecutor() as exe:
            for file in zf.namelist():
                if not file.startswith("__MACOSX"):
                    exe.submit(zf.extract, file, path=data_dir)
else:
    print("test, train and valid folders already exist")

In [None]:
import os
import pandas as pd

data_dir = "data"
train_path = os.path.join(data_dir, "train.csv")
test_path = os.path.join(data_dir, "test.csv")
df_train = pd.read_csv(train_path, encoding='ISO-8859-1', index_col="textID")
df_test = pd.read_csv(test_path, encoding='ISO-8859-1', index_col="textID")
df_train.head()

Unnamed: 0_level_0,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
textID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26


In [None]:
df_train.isna().sum()

Unnamed: 0,0
text,1
selected_text,1
sentiment,0
Time of Tweet,0
Age of User,0
Country,0
Population -2020,0
Land Area (Km²),0
Density (P/Km²),0


In [None]:
df_test.isna().sum()

Unnamed: 0,0
text,1281
sentiment,1281
Time of Tweet,1281
Age of User,1281
Country,1281
Population -2020,1281
Land Area (Km²),1281
Density (P/Km²),1281


In [None]:
df_train = df_train.dropna()
df_test = df_test.dropna()

In [None]:
df_train["text"].isna().sum()

np.int64(0)

In [None]:
!pip install tensorflow



In [None]:
from tensorflow.keras.layers import TextVectorization

vectorize_layer = TextVectorization(
    output_mode='int',
    standardize="lower_and_strip_punctuation",
    split="whitespace",
)

corpus = df_train["text"].values

vectorize_layer.adapt(corpus)

vectorized_train = vectorize_layer(corpus)
vectorized_train

<tf.Tensor: shape=(27480, 33), dtype=int64, numpy=
array([[  293,    17, 15185, ...,     0,     0,     0],
       [  413,   115,     2, ...,     0,     0,     0],
       [    6,  1335,    10, ...,     0,     0,     0],
       ...,
       [  225,    31,    12, ...,     0,     0,     0],
       [   20,     9,    28, ...,     0,     0,     0],
       [   29,    30,  6480, ...,     0,     0,     0]])>

In [None]:
vectorize_layer.get_vocabulary()[:10]

['',
 '[UNK]',
 np.str_('i'),
 np.str_('to'),
 np.str_('the'),
 np.str_('a'),
 np.str_('my'),
 np.str_('and'),
 np.str_('you'),
 np.str_('it')]

In [None]:
import tensorflow as tf
train_ds = tf.data.Dataset.from_tensor_slices(vectorized_train)

vocab_size = len(vectorize_layer.get_vocabulary())
window_size = 3

print(train_ds.element_spec)
train_ds.as_numpy_iterator().next()

TensorSpec(shape=(33,), dtype=tf.int64, name=None)


array([  293,    17, 15185,    69,     2,   120,    47,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0])

In [None]:
import tensorflow as tf

window_size = 2  # Change this as needed

def generate_skipgram_pairs(sequence):
    seq_len = tf.shape(sequence)[0]
    positions = tf.range(seq_len)

    def extract_context(i):
        start = tf.maximum(0, i - window_size)
        end = tf.minimum(seq_len, i + window_size + 1)

        # Exclude the center word itself
        left = sequence[start:i]
        right = sequence[i + 1:end]
        context = tf.concat([left, right], axis=0)

        target = sequence[i]
        targets = tf.fill([tf.shape(context)[0]], target)

        return tf.data.Dataset.from_tensor_slices((targets, context))

    return tf.data.Dataset.from_tensor_slices(positions).flat_map(extract_context)


In [None]:
# your input dataset of shape (33,), dtype=tf.int64

# Convert to skipgram pairs
skipgram_ds = train_ds.flat_map(generate_skipgram_pairs)
print(skipgram_ds.element_spec)
skipgram_ds.as_numpy_iterator().next()

(TensorSpec(shape=(), dtype=tf.int64, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))


(np.int64(293), np.int64(17))

In [None]:
train_ds = (
    train_ds
    .map(
        lambda x: ((tf.one_hot(x[0], depth=vocab_size), tf.one_hot(x[1], depth=vocab_size)), tf.constant(1.0)),
        num_parallel_calls=tf.data.AUTOTUNE,
    )
)
print(train_ds.element_spec)
next(train_ds.as_numpy_iterator())

((TensorSpec(shape=(29164,), dtype=tf.float32, name=None), TensorSpec(shape=(29164,), dtype=tf.float32, name=None)), TensorSpec(shape=(), dtype=tf.float32, name=None))


((array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
  array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)),
 np.float32(1.0))

In [None]:
import tensorflow as tf
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Dense, Dot, Activation

# vocab_size = 29164
embedding_dim = 128

# One-hot inputs (shape = (vocab_size,))
input_target = Input(shape=(vocab_size,))
input_context = Input(shape=(vocab_size,))

# Shared embedding layer simulated with Dense layer
embedding_layer = Dense(embedding_dim, use_bias=False)

target_embedding = embedding_layer(input_target)   # shape: (embedding_dim,)
context_embedding = embedding_layer(input_context) # same

# Dot product of embeddings
dot_product = Dot(axes=-1)([target_embedding, context_embedding])

# Sigmoid output for skipgram-style binary prediction
output = Activation('sigmoid')(dot_product)

model = Model(inputs=[input_target, input_context], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy')


In [None]:
tf.data.experimental.cardinality(train_ds).numpy()

np.int64(27480)

In [None]:
model.fit(train_ds.batch(128).prefetch(tf.data.AUTOTUNE), epochs=5)

Epoch 1/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 218ms/step - loss: 0.6478
Epoch 2/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 202ms/step - loss: 0.3077
Epoch 3/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 204ms/step - loss: 0.1741
Epoch 4/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 202ms/step - loss: 0.1095
Epoch 5/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 199ms/step - loss: 0.0722


<keras.src.callbacks.history.History at 0x7fb6e95e4510>

In [None]:
embeddings = embedding_layer.get_weights()[0]

print(embeddings.shape)

(29164, 128)


In [None]:
# prompt: given vectorize_layer how to obtain the index for the word "what"

import numpy as np

# Get the vocabulary from the vectorize_layer
vocabulary = vectorize_layer.get_vocabulary()

# Find the index of the word "what"
try:
    what_index = vocabulary.index("what")
    print(f"The index of 'what' in the vocabulary is: {what_index}")
except ValueError:
    print("'what' is not found in the vocabulary.")


The index of 'what' in the vocabulary is: 57


In [None]:
embeddings[what_index].shape

(128,)

In [None]:
print(train_ds.element_spec)
next(train_ds.as_numpy_iterator())

((TensorSpec(shape=(29164,), dtype=tf.float32, name=None), TensorSpec(shape=(29164,), dtype=tf.float32, name=None)), TensorSpec(shape=(), dtype=tf.float32, name=None))


((array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
  array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)),
 np.float32(1.0))

In [None]:
classifier_train_ds = tf.data.Dataset.from_tensor_slices((vectorized_train, df_train["sentiment"].values))

print(classifier_train_ds.element_spec)
next(classifier_train_ds.as_numpy_iterator())

(TensorSpec(shape=(33,), dtype=tf.int64, name=None), TensorSpec(shape=(), dtype=tf.string, name=None))


(array([  293,    17, 15185,    69,     2,   120,    47,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0]),
 b'neutral')

In [None]:
embedding_layer = tf.keras.layers.Embedding(
    input_dim=embeddings.shape[0],
    output_dim=embeddings.shape[1],
    weights=[embeddings],
    trainable=False
)

def embed_and_flatten(indices, label):
    embedded = embedding_layer(indices)        # shape: (seq_len, embed_dim)
    flat = tf.reshape(embedded, [-1])          # shape: (seq_len * embed_dim,)
    return flat, label

In [None]:
classifier_train_ds = classifier_train_ds.map(embed_and_flatten)
print(classifier_train_ds.element_spec)
next(classifier_train_ds.as_numpy_iterator())

(TensorSpec(shape=(4224,), dtype=tf.float32, name=None), TensorSpec(shape=(), dtype=tf.string, name=None))


(array([-0.17132318, -0.15126888, -0.17530611, ...,  0.4272256 ,
        -0.41871053, -0.40135565], dtype=float32),
 b'neutral')

In [None]:
# Define a lookup table to convert string → integer
label_lookup = tf.keras.layers.StringLookup(
    vocabulary=df_train["sentiment"].unique().tolist(),
    num_oov_indices=0,
)

# Optional: one-hot encode
num_classes = label_lookup.vocabulary_size()

classifier_train_ds = classifier_train_ds.map(lambda x, y: (x, tf.one_hot(label_lookup(y), depth=num_classes)))

print(classifier_train_ds.element_spec)
next(classifier_train_ds.as_numpy_iterator())

(TensorSpec(shape=(4224,), dtype=tf.float32, name=None), TensorSpec(shape=(3,), dtype=tf.float32, name=None))


(array([-0.17132318, -0.15126888, -0.17530611, ...,  0.4272256 ,
        -0.41871053, -0.40135565], dtype=float32),
 array([1., 0., 0.], dtype=float32))

In [None]:
classifier_train_ds = classifier_train_ds.batch(128)

In [None]:
import tensorflow as tf

model = tf.keras.Sequential([
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax')  # 3-class classification
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(classifier_train_ds, epochs=3)


Epoch 1/3
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.3925 - loss: 0.6410
Epoch 2/3
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.4302 - loss: 0.6215
Epoch 3/3
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.4364 - loss: 0.6138


<keras.src.callbacks.history.History at 0x7fb6dc72b1d0>