# Práctica 2 - Natural Language Processing

In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd

from tensorflow import keras
from tensorflow.keras.utils import to_categorical

from tensorflow.keras import Model, Input, layers
from tensorflow.keras.layers import Embedding, Dot, Reshape, Dense, TextVectorization

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import skipgrams

from keras.callbacks import TensorBoard

import seaborn as sns
import os

In [2]:
# Both CSV files share the same layout, so the parsing options are declared once.
csv_read_options = dict(sep=',', header=0, encoding='ISO-8859-1', index_col="textID")

df_train = pd.read_csv(os.path.join("data", "train.csv"), **csv_read_options)
df_test = pd.read_csv(os.path.join("data", "test.csv"), **csv_read_options)

Comprobamos si hay NAs en el dataset: 

In [3]:
# Number of missing values in each column of the training set.
df_train.isna().sum(axis=0)

text                1
selected_text       1
sentiment           0
Time of Tweet       0
Age of User         0
Country             0
Population -2020    0
Land Area (Km²)     0
Density (P/Km²)     0
dtype: int64

Podemos observar que hay una única fila con valores nulos (en `text` y `selected_text`):

In [4]:
# Select every training row that has at least one missing value.
rows_with_na = df_train.isna().any(axis=1)
df_train.loc[rows_with_na]

Unnamed: 0_level_0,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
textID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
fdb77c3752,,,neutral,night,31-45,Namibia,2540905,823000.0,3


In [5]:
# Drop every row containing at least one missing value, then re-check the counts.
df_train = df_train.dropna(axis=0, how="any")
df_train.isna().sum(axis=0)

text                0
selected_text       0
sentiment           0
Time of Tweet       0
Age of User         0
Country             0
Population -2020    0
Land Area (Km²)     0
Density (P/Km²)     0
dtype: int64

In [6]:
# Number of missing values in each column of the test set.
df_test.isna().sum(axis=0)

text                1281
sentiment           1281
Time of Tweet       1281
Age of User         1281
Country             1281
Population -2020    1281
Land Area (Km²)     1281
Density (P/Km²)     1281
dtype: int64

In [7]:
# Preview the first five test rows that have at least one missing value.
rows_with_na = df_test.isna().any(axis=1)
df_test.loc[rows_with_na].head()

Unnamed: 0_level_0,text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
textID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
,,,,,,,,
,,,,,,,,
,,,,,,,,
,,,,,,,,
,,,,,,,,


In [8]:
# Drop every row containing at least one missing value, then re-check the counts.
df_test = df_test.dropna(axis=0, how="any")
df_test.isna().sum(axis=0)

text                0
sentiment           0
Time of Tweet       0
Age of User         0
Country             0
Population -2020    0
Land Area (Km²)     0
Density (P/Km²)     0
dtype: int64

In [9]:
# Raw tweet texts of the (NA-free) training set; used below to fit the
# vectorization layer's vocabulary.
corpus = df_train['text'].values

In [10]:
# Text -> integer token ids: lowercase, strip punctuation, split on whitespace.
vectorize_layer = TextVectorization(
    output_mode="int",
    split="whitespace",
    standardize="lower_and_strip_punctuation",
)

In [11]:
# Fit the layer's vocabulary on the training texts.
vectorize_layer.adapt(data=corpus)

In [12]:
# `corpus` already holds df_train["text"].values, so it is reused here.
vectorized_train = vectorize_layer(corpus)

In [13]:
# Display the vectorized training texts (integer token ids, zero-padded).
vectorized_train

<tf.Tensor: shape=(27480, 33), dtype=int64, numpy=
array([[  293,    17, 15185, ...,     0,     0,     0],
       [  413,   115,     2, ...,     0,     0,     0],
       [    6,  1335,    10, ...,     0,     0,     0],
       ...,
       [  225,    31,    12, ...,     0,     0,     0],
       [   20,     9,    28, ...,     0,     0,     0],
       [   29,    30,  6480, ...,     0,     0,     0]])>

In [21]:
# Vocabulary held by the vectorization layer after `adapt`; a token's
# position in this list is used below as its integer id.
vocab = vectorize_layer.get_vocabulary()

In [27]:
# Two-way lookup between tokens and their position in the vocabulary list.
index_to_token = dict(enumerate(vocab))
token_to_index = {token: position for position, token in index_to_token.items()}

In [52]:
# One dataset element per training tweet (raw string).
train_texts = df_train["text"].values
text_ds = tf.data.Dataset.from_tensor_slices(train_texts)

In [54]:
# The layer is itself callable, so it can be mapped directly over the dataset.
vectorized_ds = text_ds.map(vectorize_layer)

In [None]:
# Vocabulary of the adapted vectorization layer and its size; `vocab_size`
# is what `tf_skipgrams` passes to `skipgrams` as `vocabulary_size`.
vocab = vectorize_layer.get_vocabulary()
vocab_size = len(vocab)

def tf_skipgrams(sequence, window_size=4, negative_samples=1.0, vocabulary_size=None, seed=None):
    """Generate skip-gram (target, context) pairs and labels for one sequence.

    Intended to run eagerly (e.g. through ``tf.py_function``), because it
    calls ``.numpy()`` on its input.

    Args:
        sequence: Eager tensor of integer token ids for a single text.
        window_size: Context window size forwarded to ``skipgrams``.
            Defaults to 4, the value previously hard-coded here.
        negative_samples: Ratio of negative to positive samples forwarded to
            ``skipgrams``. Defaults to 1.0 (the Keras default).
        vocabulary_size: Size of the vocabulary used for negative sampling.
            When ``None``, falls back to the notebook-level ``vocab_size``.
        seed: Optional random seed forwarded to ``skipgrams`` to make the
            shuffling and negative sampling reproducible. ``None`` keeps the
            original non-deterministic behaviour.

    Returns:
        A ``(couples, labels)`` tuple of ``tf.int32`` tensors. ``couples`` has
        one ``[word, context]`` row per sample and ``labels`` has the matching
        label for each row. If ``skipgrams`` produces no pairs, a single
        placeholder sample ``([[0, 0]], [0])`` is returned instead.
    """
    if vocabulary_size is None:
        # Backward-compatible default: use the vocabulary size computed at
        # notebook level from the adapted vectorization layer.
        vocabulary_size = vocab_size

    token_ids = sequence.numpy()
    couples, labels = skipgrams(
        sequence=token_ids,
        vocabulary_size=vocabulary_size,
        window_size=window_size,
        negative_samples=negative_samples,
        seed=seed,
    )
    if len(couples) == 0:
        # NOTE(review): this placeholder keeps every output non-empty, but it
        # also injects a (0, 0) -> 0 sample into the data. Consider filtering
        # it out downstream if that matters for training.
        return (tf.constant([[0, 0]], dtype=tf.int32), tf.constant([0], dtype=tf.int32))
    return (tf.constant(couples, dtype=tf.int32), tf.constant(labels, dtype=tf.int32))

def tf_skipgrams_wrapper(sequence):
    """Adapter that lets ``tf_skipgrams`` be used inside ``Dataset.map``.

    Runs ``tf_skipgrams`` through ``tf.py_function`` and then declares the
    static shapes of both outputs explicitly.

    Args:
        sequence: Tensor of integer token ids for a single text.

    Returns:
        ``(couples, labels)`` as ``tf.int32`` tensors with static shapes
        ``[None, 2]`` and ``[None]`` respectively.
    """
    output_dtypes = [tf.int32, tf.int32]
    pairs, targets = tf.py_function(func=tf_skipgrams, inp=[sequence], Tout=output_dtypes)

    # Declare a variable number of samples, each a pair of token ids with one label.
    pairs.set_shape([None, 2])
    targets.set_shape([None])
    return pairs, targets

# One (couples, labels) element per vectorized text.
skipgram_ds = vectorized_ds.map(map_func=tf_skipgrams_wrapper)

In [None]:
# Show the dataset's element_spec to confirm the declared shapes and dtypes.
skipgram_ds

<_MapDataset element_spec=(TensorSpec(shape=(None, 2), dtype=tf.int32, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>