# Practica 2 - Natural Language processing

## Librerías

In [1]:
import numpy as np

import tensorflow as tf

import pandas as pd
from collections import Counter
import re

from tensorflow import keras


import numpy as np
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras.utils import to_categorical

from tensorflow.keras import Model, Input, layers
from tensorflow.keras.layers import Embedding, Dot, Reshape, Dense

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import skipgrams

from keras.callbacks import TensorBoard

import re
from collections import Counter
from tensorflow.keras.preprocessing.text import Tokenizer

import numpy as np
from tensorflow.keras.preprocessing.sequence import skipgrams

## Funciones

## Descarga y proceso de datos

### Carga de datos

In [2]:
import os
import pandas as pd

# 1. Directorio portable a P2_NaturalLanguageProcessing/data
cwd      = os.getcwd()
data_dir = os.path.join(cwd, 'data')

# 2. Rutas a los CSV
train_path = os.path.join(data_dir, 'train.csv')
test_path  = os.path.join(data_dir, 'test.csv')

# 3. Lista de codificaciones a probar
encodings = ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']

def load_csv(path, enc_list):
    for enc in enc_list:
        try:
            df = pd.read_csv(path, encoding=enc)
            print(f"Cargado '{os.path.basename(path)}' con encoding {enc}")
            return df, enc
        except UnicodeDecodeError:
            print(f"Fallo con encoding {enc}, probando siguiente...")
    raise ValueError(f"No se pudo decodificar {path} con las codificaciones {enc_list}")

# 4. Cargar ambos conjuntos usando la misma codificación
train_df, used_enc = load_csv(train_path, encodings)
test_df, _      = load_csv(test_path, [used_enc])

# 5. Verificación
print(f"Train shape: {train_df.shape}")
print(f"Test  shape: {test_df.shape}")


Fallo con encoding utf-8, probando siguiente...
Cargado 'train.csv' con encoding latin1
Cargado 'test.csv' con encoding latin1
Train shape: (27481, 10)
Test  shape: (4815, 9)


### Exploración de datos

In [3]:
# 5. Juntar los DataFrames para facilitar el preprocesado
data_df = pd.concat([train_df, test_df], ignore_index=True)      # concatenación[2]

# 6. Inspeccionar estructura básica
print(data_df.info())
print(data_df.head())

# 7. Contar palabras únicas en la columna 'text' para estimar VOCAB_SIZE
all_text = ' '.join(data_df['text'].astype(str)).lower()
words = re.findall(r'\b\w+\b', all_text)
word_counts = Counter(words)
unique_words = len(word_counts)

print(f"Total de muestras: {data_df.shape[0]}")
print(f"Palabras únicas encontradas: {unique_words}")
print("Top 10 palabras más frecuentes:", word_counts.most_common(10))

# 8. Definir parámetros para TensorFlow/Keras
BUFFER_SIZE = 32768      # chivo mayor al dataset para buen shuffle 2^15
BATCH_SIZE = 128        # potencia de 2 adecuada para GPU
VOCAB_SIZE = 8192      # tamaño del vocabulario[4]
                 
print("BUFFER_SIZE =", BUFFER_SIZE)
print("BATCH_SIZE =", BATCH_SIZE)
print("VOCAB_SIZE =", VOCAB_SIZE)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32296 entries, 0 to 32295
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   textID            31015 non-null  object 
 1   text              31014 non-null  object 
 2   selected_text     27480 non-null  object 
 3   sentiment         31015 non-null  object 
 4   Time of Tweet     31015 non-null  object 
 5   Age of User       31015 non-null  object 
 6   Country           31015 non-null  object 
 7   Population -2020  31015 non-null  float64
 8   Land Area (Km²)   31015 non-null  float64
 9   Density (P/Km²)   31015 non-null  float64
dtypes: float64(3), object(7)
memory usage: 2.5+ MB
None
       textID                                               text  \
0  cb774db0d1                I`d have responded, if I were going   
1  549e992a42      Sooo SAD I will miss you here in San Diego!!!   
2  088c60f138                          my boss is bullying me...  

### Proceso de datos

#### Creación del corpus

In [4]:
# 1. Seleccionar las 4 096 palabras más frecuentes
most_common_words = {w for w, _ in word_counts.most_common(VOCAB_SIZE)}

# 2. Filtrar cada texto para quedarnos solo con tokens en el top 4 096
corpus_filtered = []
for text in data_df['text'].dropna().astype(str):
    tokens = re.findall(r'\b\w+\b', text.lower())
    filtered_tokens = [t for t in tokens if t in most_common_words]
    corpus_filtered.append(" ".join(filtered_tokens))

# 3. Tokenizar el corpus filtrado
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(corpus_filtered)

# 4. Convertir textos a secuencias de índices
sequences = tokenizer.texts_to_sequences(corpus_filtered)

# 5. Diccionarios de mapeo y tamaño final de vocabulario
word2idx = tokenizer.word_index
idx2word = {i: w for w, i in word2idx.items()}
vocab_size = min(len(word2idx) + 1, VOCAB_SIZE)

# 6. Mostrar resultados de prueba
print("Primeros 5 textos filtrados:", corpus_filtered[:5])
print("Primeras 5 secuencias:", sequences[:5])
print("Tamaño de vocabulario efectivo:", vocab_size)


Primeros 5 textos filtrados: ['i d have responded if i were going', 'sooo sad i will miss you here in san diego', 'my boss is me', 'what interview leave me alone', 'sons of why couldn t they put them on the releases we already bought']
Primeras 5 secuencias: [[1, 163, 19, 7648, 71, 1, 151, 49], [421, 117, 1, 63, 94, 7, 91, 10, 1447, 2230], [5, 1410, 9, 16], [51, 1193, 350, 16, 495], [4254, 13, 118, 472, 14, 72, 332, 131, 17, 3, 7649, 50, 210, 569]]
Tamaño de vocabulario efectivo: 8152


#### Generación de los pares de entrenamiento

In [5]:
# Parámetros
window_size      = 2
negative_samples = 0.0
seed_value       = 42               # cualquier entero

pairs = []
for seq in sequences:
    if len(seq) < 2:
        continue

    # pasar seed para evitar el randint interno con float
    sg_pairs, _ = skipgrams(
        sequence=seq,
        vocabulary_size=vocab_size,
        window_size=window_size,
        negative_samples=negative_samples,
        shuffle=True,
        seed=seed_value
    )
    pairs.extend(sg_pairs)

# separar en arrays
if pairs:
    targets, contexts = zip(*pairs)
    targets  = np.array(targets,  dtype='int32')
    contexts = np.array(contexts, dtype='int32')
else:
    targets  = np.zeros((0,), dtype='int32')
    contexts = np.zeros((0,), dtype='int32')

print("Primeros pares (t→c):", list(zip(targets[:10], contexts[:10])))
print("Total pares:", len(pairs))


Primeros pares (t→c): [(np.int32(71), np.int32(151)), (np.int32(7648), np.int32(1)), (np.int32(7648), np.int32(163)), (np.int32(49), np.int32(151)), (np.int32(1), np.int32(151)), (np.int32(19), np.int32(163)), (np.int32(19), np.int32(1)), (np.int32(7648), np.int32(19)), (np.int32(71), np.int32(1)), (np.int32(151), np.int32(1))]
Total pares: 1384586
