## **05 ETL - TWEETS REALES**
## José Eduardo Viveros Escamilla | A01710605

In [4]:
import os
import pickle
import re
import string

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [16]:
# Location of the raw tweets CSV.
# NOTE(review): this is an absolute path on the author's Windows machine, so
# the cell only runs as written on a machine with the same directory layout.
RAW_TWEETS_CSV = r"C:\Users\josed\Documents\IA\BENJI\data\raw_data\real_tweets_tech_10000.csv"

# Load the raw tweets and preview the first rows.
df_live = pd.read_csv(RAW_TWEETS_CSV)
df_live.head()

Unnamed: 0,date,user,text,label
0,2024-04-01T18:28:41,user_1293,Deep Learning makes my workflow so much easier!,1
1,2024-05-16T00:56:18,user_3162,OpenAI is trending again.,0
2,2025-01-09T17:55:06,user_833,Hot take: OpenAI is overrated.,0
3,2024-07-11T12:43:01,user_1163,"Wow, Deep Learning just blew my mind!",1
4,2024-12-03T06:04:02,user_485,People keep talking about Deep Learning today.,1


In [17]:
# ================================================================
# BLOCK 3 — Text-cleaning function (same cleaning steps as v1 and v2)
# ================================================================

# Translation table that deletes every ASCII punctuation character. Built once
# here instead of being rebuilt on every clean_text call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def clean_text(text):
    """Normalize one raw tweet into lowercase, punctuation-free ASCII text.

    Steps, in order: lowercase, drop URLs, drop @mentions, drop the '#'
    character (the hashtag word is kept), drop every non-ASCII character,
    drop ASCII punctuation, collapse whitespace runs and trim the ends.

    Args:
        text: The raw tweet. Non-string values are converted with str().
            None and float NaN are treated as missing.

    Returns:
        The cleaned string. Missing input yields "" so that a missing tweet
        does not turn into the literal word "none" or "nan".
    """
    # str(None) -> "None" and str(float("nan")) -> "nan" would otherwise be
    # lowercased and kept as if they were real tweet content.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()

    # Remove URLs
    text = re.sub(r"http\S+|www\S+", "", text)

    # Remove mentions
    text = re.sub(r"@\w+", "", text)

    # Remove hashtags (only the '#' symbol; the tag text stays)
    text = re.sub(r"#", "", text)

    # Remove emojis. This drops EVERY non-ASCII character, so accented
    # letters (e.g. "é", "ñ") are removed as well, not just emojis.
    text = text.encode("ascii", "ignore").decode()

    # Remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)

    # Collapse repeated whitespace and trim
    text = re.sub(r"\s+", " ", text).strip()

    return text

# Clean every tweet and store the result next to the raw text.
df_live["text_clean"] = df_live["text"].map(clean_text)

# Side-by-side preview of raw vs. cleaned text.
df_live.loc[:, ["text", "text_clean"]].head()


Unnamed: 0,text,text_clean
0,Deep Learning makes my workflow so much easier!,deep learning makes my workflow so much easier
1,OpenAI is trending again.,openai is trending again
2,Hot take: OpenAI is overrated.,hot take openai is overrated
3,"Wow, Deep Learning just blew my mind!",wow deep learning just blew my mind
4,People keep talking about Deep Learning today.,people keep talking about deep learning today


In [18]:
# ================================================================
# BLOCK 4 — Select X (text) and y (labels)
# ================================================================

# Targets are the labels cast to int; features are the cleaned tweets.
y = df_live["label"].astype(int)
X = df_live["text_clean"]

print("Total muestras:", X.shape[0])


Total muestras: 10000


In [19]:
# ================================================================
# BLOCK 5 — Train / Val / Test split
# ================================================================

# First cut: hold out 40% of the samples, stratified on the label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.40,
    random_state=42,
    stratify=y,
)

# Second cut: divide the held-out part in half into validation and test,
# again stratified, with the same fixed seed.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,
    random_state=42,
    stratify=y_temp,
)

# Report the size of each split.
print("Tamaños:")
for split_label, split_texts in (("Train:", X_train), ("Val:", X_val), ("Test:", X_test)):
    print(split_label, len(split_texts))


Tamaños:
Train: 6000
Val: 2000
Test: 2000


In [21]:
# ================================================================
# BLOCK 6 — Load the original tokenizer from v1_base
# ================================================================

# `pickle` is imported in the notebook's top import cell.
#
# SECURITY: pickle.load can execute arbitrary code while unpickling, so
# "tokenizer.pkl" must be a trusted artifact produced by this project. Never
# point this at a file obtained from an untrusted source.
# The path is relative, so it resolves against the kernel's working directory.
with open("tokenizer.pkl", "rb") as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

# Show the tokenizer's num_words setting.
tokenizer.num_words


20000

In [22]:
# ================================================================
# BLOCK 7 — Tokenize all the text
# ================================================================
# Tokenization uses the EXACT configuration of v1_base:
#   - same vocabulary
#   - same maximum length (MAX_LEN = 50)
# ================================================================

MAX_LEN = 50

# Convert each split's texts to integer sequences with the loaded tokenizer.
X_train_seq, X_val_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split_texts)
    for split_texts in (X_train, X_val, X_test)
)

# Bring every sequence to MAX_LEN, padding at the end ("post"). The truncation
# side is not set here, so it stays at the pad_sequences default.
X_train_pad, X_val_pad, X_test_pad = (
    pad_sequences(split_seqs, maxlen=MAX_LEN, padding="post")
    for split_seqs in (X_train_seq, X_val_seq, X_test_seq)
)

# Shapes of the three padded arrays, in train / val / test order.
tuple(padded.shape for padded in (X_train_pad, X_val_pad, X_test_pad))


((6000, 50), (2000, 50), (2000, 50))

In [23]:
# Directory where the processed arrays are written. The path is relative, so
# it is created under the kernel's working directory.
# (`os` is imported in the notebook's top import cell.)
OUTPUT_DIR = "data/processed_data/v3_realtweets/"

# exist_ok=True makes this cell safe to re-run when the folder already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("Carpeta creada:", OUTPUT_DIR)

Carpeta creada: data/processed_data/v3_realtweets/


In [25]:
# Persist the padded inputs and the labels of every split as .npy files.
# File names are unchanged; each maps to the object that is written.
arrays_to_save = {
    "X_train_live.npy": X_train_pad,
    "X_val_live.npy": X_val_pad,
    "X_test_live.npy": X_test_pad,
    "y_train_live.npy": y_train,
    "y_val_live.npy": y_val,
    "y_test_live.npy": y_test,
}

for file_name, array in arrays_to_save.items():
    # os.path.join builds a valid path whether or not OUTPUT_DIR ends with a
    # separator; plain string concatenation silently relied on the trailing "/".
    np.save(os.path.join(OUTPUT_DIR, file_name), array)

print("Archivos guardados.")

Archivos guardados.
