**Vectorizaciones para TODAS las variantes preprocesadas**
- Split único, fijo (random_state=0), reutilizado por variante
- Salida: PKL por (variante, representación)

In [11]:
import pandas as pd
import numpy as np
from pathlib import Path
from joblib import dump
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.sparse import csr_matrix

**CONFIG GENERAL**

In [12]:
# Folder holding the preprocessed Excel variants listed in XLSX_FILES.
# NOTE(review): absolute, machine-specific path; the notebook cannot run on
# another machine without editing it. Consider a path relative to the project.
DATA_DIR = Path(r"C:\Users\Miner\OneDrive\Documentos\7to semestre\Procesamiento de Lenguaje Natural\Practice IV - Sentiment Analysis\PLN_P4\Archivos_Procesados")

# Root folder where the split ids and every generated artifact are written.
OUT_DIR  = Path(r"C:\Users\Miner\OneDrive\Documentos\7to semestre\Procesamiento de Lenguaje Natural\Practice IV - Sentiment Analysis\PLN_P4\artifacts_all")

# Create the output folder (and missing parents); no error if it already exists.
OUT_DIR.mkdir(parents=True, exist_ok=True)

In [13]:
# Column names expected in every Excel variant (matched case-insensitively by
# the loaders, except DOCID_COL which is looked up verbatim).
# NOTE(review): the recorded output of the main loop shows identical vocabulary
# sizes for every variant, which is unexpected if the variants really differ in
# their text. Presumably these names point at raw columns that are present in
# all files -- confirm which column holds the preprocessed text.
TEXT_COL_TITLE = "title"
TEXT_COL_OPIN  = "opinion"
TARGET_COL     = "polarity"
DOCID_COL      = "doc_id"  # generated from the row position when the file has no such column

In [14]:
# One Excel file per preprocessing variant; the variant name is the suffix that
# follows "Rest_Mex_2022_preprocessed__" in the file name.
XLSX_FILES = [
    f"Rest_Mex_2022_preprocessed__{variant}.xlsx"
    for variant in (
        "all_on",
        "minimal",
        "no_accents",
        "no_emoji_feats",
        "no_en2es",
        "no_lemma",
        "no_negjoin",
        "no_sel",
        "no_stopwords",
    )
]


**Parámetros de split**

In [15]:
TEST_SIZE = 0.2  # fraction of documents passed to train_test_split as the test set
SEED = 0         # random_state of the split, fixed for reproducibility
# File where the train/test ids are persisted. While it exists the split is
# loaded from it instead of being recomputed (see make_or_load_split_ids).
SPLIT_IDS_PATH = OUT_DIR / "split_ids.parquet"

**UTILIDADES**

In [16]:
def load_xlsx_text_target(path: Path):
    """Read one Excel variant and return its documents and labels.

    The title, opinion and target columns are located case-insensitively
    (ignoring surrounding whitespace in the header). When the file has no
    ``DOCID_COL`` column, ids are generated from the row position.

    Returns
    -------
    (DataFrame, ndarray)
        A frame with the columns ``[DOCID_COL, "text"]`` where ``text`` is
        "title opinion" (missing values become empty strings, then the result
        is stripped), and the integer labels in the same row order.
    """
    frame = pd.read_excel(path)

    # lower-cased/stripped header -> header as it really appears in the file
    by_lower = {name.lower().strip(): name for name in frame.columns}

    def _resolve(wanted):
        # Fall back to the configured name so a missing column fails below with a KeyError.
        return by_lower.get(wanted.lower(), wanted)

    title_col = _resolve(TEXT_COL_TITLE)
    opinion_col = _resolve(TEXT_COL_OPIN)
    target_col = _resolve(TARGET_COL)

    # Synthesise positional ids when the file does not provide them.
    if DOCID_COL not in frame.columns:
        frame[DOCID_COL] = np.arange(len(frame), dtype=int)

    # text = title + " " + opinion
    title = frame[title_col].fillna("").astype(str)
    opinion = frame[opinion_col].fillna("").astype(str)
    frame["text"] = (title + " " + opinion).str.strip()

    labels = frame[target_col].astype(int).values
    return frame[[DOCID_COL, "text"]], labels

def make_or_load_split_ids(df_ids_text: pd.DataFrame, y: np.ndarray):
    """Return ``(train_ids, test_ids)`` as sets, shared by every variant.

    If ``SPLIT_IDS_PATH`` already exists the ids are read from it. Otherwise a
    stratified, seeded split of the ``DOCID_COL`` values is computed, written
    to ``SPLIT_IDS_PATH`` (needs pyarrow or fastparquet) and returned.

    Note that the parquet file keeps the ids in the shuffled order produced by
    ``train_test_split``; consumers that need a specific row order must sort.
    """
    if not SPLIT_IDS_PATH.exists():
        # Stratified and reproducible split over the document ids.
        train_arr, test_arr = train_test_split(
            df_ids_text[DOCID_COL].values,
            test_size=TEST_SIZE,
            random_state=SEED,
            shuffle=True,
            stratify=y,
        )

        # One frame per partition, stacked with pd.concat (DataFrame.append is gone in pandas 2).
        parts = [
            pd.DataFrame({DOCID_COL: arr, "split": name})
            for name, arr in (("train", train_arr), ("test", test_arr))
        ]
        pd.concat(parts, ignore_index=True).to_parquet(SPLIT_IDS_PATH, index=False)
        return set(train_arr), set(test_arr)

    cached = pd.read_parquet(SPLIT_IDS_PATH)
    is_train = cached["split"] == "train"
    is_test = cached["split"] == "test"
    return (
        set(cached.loc[is_train, DOCID_COL].tolist()),
        set(cached.loc[is_test, DOCID_COL].tolist()),
    )
 

def get_texts_by_ids(df_ids_text: pd.DataFrame, train_ids: set, test_ids: set):
    """Select the train and test documents and order each group by ``DOCID_COL``.

    Returns ``(train_texts, test_texts, train_ids_sorted, test_ids_sorted)`` as
    lists. Ids that are absent from ``df_ids_text`` are silently skipped.
    """
    def _subset(wanted):
        keep = df_ids_text[DOCID_COL].isin(wanted)
        return df_ids_text[keep].sort_values(DOCID_COL)

    train_part = _subset(train_ids)
    test_part = _subset(test_ids)
    return (
        train_part["text"].tolist(),
        test_part["text"].tolist(),
        train_part[DOCID_COL].tolist(),
        test_part[DOCID_COL].tolist(),
    )

def get_targets_by_ids(full_df_path: Path, train_ids: list, test_ids: list):
    """Read the labels from ``full_df_path`` and return them in the given id order.

    The target column is located case-insensitively; positional ids are
    generated when the file has no ``DOCID_COL`` column. Returns
    ``(y_train, y_test)`` as integer arrays aligned with ``train_ids`` and
    ``test_ids``. An id missing from the file raises ``KeyError``.
    """
    table = pd.read_excel(full_df_path)
    by_lower = {name.lower().strip(): name for name in table.columns}
    label_col = by_lower.get(TARGET_COL.lower(), TARGET_COL)

    if DOCID_COL not in table.columns:
        table[DOCID_COL] = np.arange(len(table), dtype=int)

    # Index once by document id and reuse it for both partitions.
    by_id = table[[DOCID_COL, label_col]].set_index(DOCID_COL)

    def _labels(wanted):
        return by_id.loc[wanted][label_col].astype(int).values

    return _labels(train_ids), _labels(test_ids)

**Vectorizadores**

In [17]:
def vec_binary(train_texts, test_texts, ngram=(1,1), min_df=1, max_features=None, lowercase=True):
    """Presence/absence bag of words.

    The vectorizer is fitted on ``train_texts`` only and then applied to
    ``test_texts``. Returns ``(X_train, X_test, fitted_vectorizer)``.
    """
    vectorizer = CountVectorizer(
        binary=True,
        ngram_range=ngram,
        min_df=min_df,
        max_features=max_features,
        lowercase=lowercase,
    )
    train_matrix = vectorizer.fit_transform(train_texts)
    test_matrix = vectorizer.transform(test_texts)
    return train_matrix, test_matrix, vectorizer

def vec_count(train_texts, test_texts, ngram=(1,2), min_df=3, max_features=None, lowercase=True):
    """Term-frequency bag of n-grams.

    The vectorizer is fitted on ``train_texts`` only and then applied to
    ``test_texts``. Returns ``(X_train, X_test, fitted_vectorizer)``.
    """
    vectorizer = CountVectorizer(
        binary=False,
        ngram_range=ngram,
        min_df=min_df,
        max_features=max_features,
        lowercase=lowercase,
    )
    train_matrix = vectorizer.fit_transform(train_texts)
    test_matrix = vectorizer.transform(test_texts)
    return train_matrix, test_matrix, vectorizer

def vec_tfidf(train_texts, test_texts, ngram=(1,2), min_df=2, max_features=50000, lowercase=True):
    """TF-IDF over n-grams, with IDF weighting and sublinear term frequency.

    The vectorizer is fitted on ``train_texts`` only and then applied to
    ``test_texts``. Returns ``(X_train, X_test, fitted_vectorizer)``.
    """
    vectorizer = TfidfVectorizer(
        ngram_range=ngram,
        min_df=min_df,
        max_features=max_features,
        lowercase=lowercase,
        use_idf=True,
        sublinear_tf=True,
    )
    train_matrix = vectorizer.fit_transform(train_texts)
    test_matrix = vectorizer.transform(test_texts)
    return train_matrix, test_matrix, vectorizer

def save_artifacts(Xtr, Xte, ytr, yte, vec, out_dir: Path, tag: str):
    """Dump matrices, labels and the fitted vectorizer into ``out_dir``.

    ``out_dir`` is created when missing. The matrix and vectorizer file names
    carry ``tag``; the label files are always ``y_train.pkl`` / ``y_test.pkl``.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    payloads = (
        (f"X_train_{tag}.pkl", Xtr),
        (f"X_test_{tag}.pkl", Xte),
        ("y_train.pkl", ytr),
        ("y_test.pkl", yte),
        (f"vectorizer_{tag}.pkl", vec),
    )
    for filename, obj in payloads:
        dump(obj, out_dir / filename)

**FLUJO PRINCIPAL**

In [18]:
# 1) Use one "anchor" variant (minimal) to create the split, or load the
#    persisted one when SPLIT_IDS_PATH already exists.
anchor_path = DATA_DIR / "Rest_Mex_2022_preprocessed__minimal.xlsx"
df_anchor, y_anchor = load_xlsx_text_target(anchor_path)
train_ids, test_ids = make_or_load_split_ids(df_anchor, y_anchor)

print(f"Split fijo -> train: {len(train_ids)}  test: {len(test_ids)}")

# One entry per representation:
# (label printed, output sub-folder, file tag, vectorizer function, its arguments)
REPRESENTATIONS = [
    ("Binaria",    "binary", "binary",       vec_binary, dict(ngram=(1, 1), min_df=1)),
    ("Frecuencia", "count",  "count12_min3", vec_count,  dict(ngram=(1, 2), min_df=3)),
    ("TF-IDF",     "tfidf",  "tfidf12_50k",  vec_tfidf,  dict(ngram=(1, 2), min_df=2, max_features=50000)),
]

# 2) Walk every variant and build the three representations on the fixed split.
# NOTE(review): the recorded output of this cell shows the same matrix shapes and
# vocabulary sizes for every variant, which is unexpected if the variants really
# differ in their text. Presumably TEXT_COL_TITLE / TEXT_COL_OPIN name raw columns
# that exist unchanged in all files -- verify the column with the processed text.
for fname in XLSX_FILES:
    fpath = DATA_DIR / fname
    var_name = fname.replace("Rest_Mex_2022_preprocessed__", "").replace(".xlsx", "")
    print(f"\n=== Variante: {var_name} ===")

    # Texts of the split, each partition ordered by doc_id.
    df_ids_text, y_all = load_xlsx_text_target(fpath)
    tr_texts, te_texts, tr_ids_sorted, te_ids_sorted = get_texts_by_ids(df_ids_text, train_ids, test_ids)

    # Reuse the labels that were just loaded instead of reading the Excel file a
    # second time; indexing by doc_id keeps them aligned with the sorted texts.
    labels_by_id = pd.Series(y_all, index=df_ids_text[DOCID_COL].to_numpy())
    ytr = labels_by_id.loc[tr_ids_sorted].to_numpy()
    yte = labels_by_id.loc[te_ids_sorted].to_numpy()

    for label, subdir, tag, vectorize, params in REPRESENTATIONS:
        Xtr, Xte, vec = vectorize(tr_texts, te_texts, **params)
        save_artifacts(Xtr, Xte, ytr, yte, vec, OUT_DIR / var_name / subdir, tag=tag)
        print(f"  {label}:", Xtr.shape, Xte.shape, "Vocab:", len(vec.vocabulary_))

Split fijo -> train: 24169  test: 6043

=== Variante: all_on ===
  Binaria: (24169, 45690) (6043, 45690) Vocab: 45690
  Frecuencia: (24169, 118054) (6043, 118054) Vocab: 118054
  TF-IDF: (24169, 50000) (6043, 50000) Vocab: 50000

=== Variante: minimal ===
  Binaria: (24169, 45690) (6043, 45690) Vocab: 45690
  Frecuencia: (24169, 118054) (6043, 118054) Vocab: 118054
  TF-IDF: (24169, 50000) (6043, 50000) Vocab: 50000

=== Variante: no_accents ===
  Binaria: (24169, 45690) (6043, 45690) Vocab: 45690
  Frecuencia: (24169, 118054) (6043, 118054) Vocab: 118054
  TF-IDF: (24169, 50000) (6043, 50000) Vocab: 50000

=== Variante: no_emoji_feats ===
  Binaria: (24169, 45690) (6043, 45690) Vocab: 45690
  Frecuencia: (24169, 118054) (6043, 118054) Vocab: 118054
  TF-IDF: (24169, 50000) (6043, 50000) Vocab: 50000

=== Variante: no_en2es ===
  Binaria: (24169, 45690) (6043, 45690) Vocab: 45690
  Frecuencia: (24169, 118054) (6043, 118054) Vocab: 118054
  TF-IDF: (24169, 50000) (6043, 50000) Vocab: 50

**Extras (Léxicos + Emojis/Emoticonos) y concatenación**

In [30]:
import re, json
import numpy as np
import pandas as pd
from pathlib import Path
from joblib import load, dump
from scipy.sparse import csr_matrix, hstack

**Parámetros del experimento de extras**

In [20]:
VARIANT_NAME = "all_on"          # e.g. "all_on", "no_emoji_feats", "no_lemma", ...
BASE_REPR    = "tfidf"           # "binary" | "count" | "tfidf"
BASE_TAGS = {                    # file tags used when the base artifacts were saved
    "binary": "binary",
    "count":  "count12_min3",
    "tfidf":  "tfidf12_50k",
}

In [21]:
# Folder with the base artifacts of the chosen variant/representation.
BASE_DIR = OUT_DIR / VARIANT_NAME / BASE_REPR
# Sibling folder that receives the matrices extended with the extra features.
OUT_EXTRAS_DIR = OUT_DIR / VARIANT_NAME / f"{BASE_REPR}_extras"
OUT_EXTRAS_DIR.mkdir(parents=True, exist_ok=True)



**Cargar split e insumos de texto**  
Reutilizamos tus utilidades y constantes: DATA_DIR, TEXT_COL_TITLE, TEXT_COL_OPIN, TARGET_COL, DOCID_COL, SPLIT_IDS_PATH

In [24]:
# Excel file of the selected variant (same naming scheme as XLSX_FILES).
xlsx_path = DATA_DIR / f"Rest_Mex_2022_preprocessed__{VARIANT_NAME}.xlsx"


**Cargamos textos + y para alinear**

In [25]:
# Frame with [DOCID_COL, "text"] plus the labels in file row order.
df_ids_text, _y_all = load_xlsx_text_target(xlsx_path)

**Cargamos ids del split fijo**

In [26]:
# Load the persisted split.
# The parquet file keeps the ids in the shuffled order returned by
# train_test_split, but the base matrices and labels were built from rows
# sorted by doc_id (get_texts_by_ids sorts before vectorizing). Sort here so
# that everything derived from these ids lines up row by row with
# X_train_base / X_test_base; otherwise the extra features end up attached
# to the wrong documents.
ids = pd.read_parquet(SPLIT_IDS_PATH)
train_ids = sorted(ids.loc[ids["split"] == "train", DOCID_COL].tolist())
test_ids  = sorted(ids.loc[ids["split"] == "test",  DOCID_COL].tolist())

**Alineamos textos en el orden del split**

In [27]:
def _align_texts(df_ids_text, ids):
    """Return the texts of the documents in ``ids``, in exactly that order.

    Raises ``KeyError`` when an id is not present in ``df_ids_text``.
    """
    by_id = df_ids_text.set_index(DOCID_COL)
    selected = by_id.loc[ids]
    return selected["text"].astype(str).tolist()

X_train_text, X_test_text = (
    _align_texts(df_ids_text, wanted) for wanted in (train_ids, test_ids)
)

**Cargar matrices base (sparse) y etiquetas**

In [31]:
# Load the base matrices and labels saved by save_artifacts for this
# variant/representation. Their rows follow the order used when they were
# saved (documents sorted by doc_id within each partition).
# joblib.load unpickles the files, so only load artifacts you produced yourself.
TAG = BASE_TAGS[BASE_REPR]
X_train_base = load(BASE_DIR / f"X_train_{TAG}.pkl")
X_test_base  = load(BASE_DIR / f"X_test_{TAG}.pkl")
y_train      = load(BASE_DIR / "y_train.pkl")
y_test       = load(BASE_DIR / "y_test.pkl")

**Construcción de FEATURES EXTRA**

In [32]:
# Unicode ranges treated as emoji; each entry is one range of the character class.
_EMOJI_RANGES = (
    r"\U0001F300-\U0001F5FF",  # symbols and pictographs
    r"\U0001F600-\U0001F64F",  # emoticons
    r"\U0001F680-\U0001F6FF",  # transport and map symbols
    r"\U0001F700-\U0001F77F",
    r"\U0001F780-\U0001F7FF",
    r"\U0001F800-\U0001F8FF",
    r"\U0001F900-\U0001F9FF",
    r"\U0001FA00-\U0001FAFF",
    r"\u2600-\u26FF",
    r"\u2700-\u27BF",
)

# Matches a single character that falls in any of the ranges above.
emoji_pattern = re.compile("[" + "".join(_EMOJI_RANGES) + "]", flags=re.UNICODE)

def extract_emojis(text):
    """Return every emoji character found in ``text``, in order of appearance.

    Anything that is not a ``str`` yields an empty list.
    """
    if not isinstance(text, str):
        return []
    return emoji_pattern.findall(text)


**Clasificación de emojis por polaridad APRENDIDA desde TRAIN**

In [33]:
# Etiquetas: 1= muy negativo, 2= negativo, 3= neutro, 4= positivo, 5= muy positivo
# Labels: 1 = very negative, 2 = negative, 3 = neutral, 4 = positive, 5 = very positive
POS_LABELS = {4, 5}
NEG_LABELS = {1, 2}

# Labels in the same order as X_train_text / X_test_text.
y_train_ordered, y_test_ordered = get_targets_by_ids(xlsx_path, train_ids, test_ids)

# Count, on TRAIN only, how often each emoji occurs in positive and in negative
# documents. Neutral documents (label 3) do not contribute.
pos_counts, neg_counts = {}, {}
for txt, label in zip(X_train_text, y_train_ordered):
    polarity = int(label)
    if polarity in POS_LABELS:
        tally = pos_counts
    elif polarity in NEG_LABELS:
        tally = neg_counts
    else:
        continue
    for e in extract_emojis(txt):
        tally[e] = tally.get(e, 0) + 1

# An emoji takes the polarity in which it was seen strictly more often; ties
# belong to neither set.
# NOTE(review): raw counts are compared without normalising by class size, so
# if the classes are imbalanced the majority polarity tends to win.
positive_emojis = {e for e, seen in pos_counts.items() if seen > neg_counts.get(e, 0)}
negative_emojis = {e for e, seen in neg_counts.items() if seen > pos_counts.get(e, 0)}


**Emoticonos comunes**

In [34]:
positive_emoticons = {":)", ":-)", ":D", "(:", "=)", ";)", ";-)", ":')"}
negative_emoticons = {":(", ":-(", "):", "='(", ":'(", "D:", ">:(", ":-/"}

# Single left-to-right scanner over all known emoticons, longest alternatives
# first. Counting every emoticon separately with str.count() counts overlapping
# matches more than once (">:(" also contains ":(", ":):" contains both ":)"
# and "):"); here each character belongs to at most one emoticon.
_EMOTICON_RE = re.compile(
    "|".join(
        re.escape(emo)
        for emo in sorted(positive_emoticons | negative_emoticons, key=lambda s: (-len(s), s))
    )
)

def emoji_emoticon_features(text):
    """Count polarity cues in one document.

    Returns a tuple ``(positive, negative, exclamations)`` where ``positive``
    and ``negative`` add the emojis found in ``positive_emojis`` /
    ``negative_emojis`` to the emoticons found in ``positive_emoticons`` /
    ``negative_emoticons``, and ``exclamations`` is the number of "!".
    ``text`` is converted with ``str()`` first.
    """
    t = str(text)
    # emojis (polarity learned elsewhere from the training set)
    any_emojis = extract_emojis(t)
    pos_e = sum(ch in positive_emojis for ch in any_emojis)
    neg_e = sum(ch in negative_emojis for ch in any_emojis)
    # emoticons: non-overlapping matches, classified by the set they belong to
    found = _EMOTICON_RE.findall(t)
    pos_em = sum(emo in positive_emoticons for emo in found)
    neg_em = sum(emo in negative_emoticons for emo in found)
    # intensity
    exclam = t.count("!")
    return (pos_e + pos_em, neg_e + neg_em, exclam)

def build_emoji_features(text_list):
    """Stack ``emoji_emoticon_features`` for every text into a float32 array.

    The result always has shape ``(n_docs, 3)``, including ``(0, 3)`` for an
    empty input so that it can still be stacked with other feature blocks.
    """
    feats = [emoji_emoticon_features(t) for t in text_list]
    return np.asarray(feats, dtype=np.float32).reshape(-1, 3)



**Léxicos simples**

In [35]:
positive_lexicon = {
    "bueno","excelente","perfecto","maravilloso","increíble","bonito","agradable",
    "recomendado","limpio","rápido","delicioso","rico","amable","fantástico","genial"
}
negative_lexicon = {
    "malo","terrible","horrible","pésimo","lento","sucio","caro",
    "decepcionante","desagradable","asqueroso","frío","tardado","estresante","feo"
}

# Accent folding applied to both the lexicons and the text, so the lookup works
# whether or not accents were stripped from the text being scored.
_ACCENT_FOLD = str.maketrans("áéíóúü", "aeiouu")

def _fold_word(word):
    """Lower-case ``word`` and drop Spanish accent marks (ñ is kept)."""
    return word.lower().translate(_ACCENT_FOLD)

_POSITIVE_FOLDED = frozenset(_fold_word(w) for w in positive_lexicon)
_NEGATIVE_FOLDED = frozenset(_fold_word(w) for w in negative_lexicon)

# Word tokens: splitting on whitespace alone leaves punctuation glued to the
# word ("excelente," / "caro."), which then never matches the lexicon.
_WORD_RE = re.compile(r"\w+")

def count_sentiment_words(text):
    """Count lexicon hits in one document.

    Returns ``(positive, negative)``: the number of tokens of ``text`` found in
    ``positive_lexicon`` and in ``negative_lexicon``. Matching ignores case,
    accent marks and surrounding punctuation. ``text`` is converted with
    ``str()`` first.
    """
    toks = [_fold_word(t) for t in _WORD_RE.findall(str(text))]
    pos = sum(t in _POSITIVE_FOLDED for t in toks)
    neg = sum(t in _NEGATIVE_FOLDED for t in toks)
    return (pos, neg)

def build_lexicon_features(text_list):
    """Stack ``count_sentiment_words`` for every text into a float32 array.

    The result always has shape ``(n_docs, 2)``, including ``(0, 2)`` for an
    empty input so that it can still be stacked with other feature blocks.
    """
    feats = [count_sentiment_words(t) for t in text_list]
    return np.asarray(feats, dtype=np.float32).reshape(-1, 2)

# 0) Alignment guard. hstack joins rows purely by position, so the texts used
#    for the extras must be in the same document order as the base matrices.
#    The labels stored with the base matrices (y_train / y_test) must therefore
#    equal the labels taken in the order of the texts (y_train_ordered /
#    y_test_ordered). This is a necessary check, not a proof, but it catches
#    ordering mix-ups before corrupted artifacts are written.
if not (np.array_equal(y_train, y_train_ordered) and np.array_equal(y_test, y_test_ordered)):
    raise ValueError(
        "El orden de los textos no coincide con el de las matrices base: "
        "las features extra quedarían asignadas a documentos equivocados. "
        "Ordena train_ids/test_ids igual que al generar las matrices base (por doc_id)."
    )

# 1) Build the extra features for TRAIN/TEST
lex_train = build_lexicon_features(X_train_text)  # (n,2)
lex_test  = build_lexicon_features(X_test_text)   # (m,2)
emo_train = build_emoji_features(X_train_text)    # (n,3)
emo_test  = build_emoji_features(X_test_text)     # (m,3)

# 2) Append the extras to the sparse base matrix
extra_train = np.hstack([lex_train, emo_train])   # (n, 5) -> [lex_pos, lex_neg, emoji_pos, emoji_neg, exclam]
extra_test  = np.hstack([lex_test,  emo_test])    # (m, 5)

X_train_ext = hstack([X_train_base, csr_matrix(extra_train)], format="csr")
X_test_ext  = hstack([X_test_base,  csr_matrix(extra_test)],  format="csr")

print(f"[{VARIANT_NAME} | {BASE_REPR}] Base:", X_train_base.shape, "→ Con extras:", X_train_ext.shape)

# 3) Save the extended artifacts
dump(X_train_ext, OUT_EXTRAS_DIR / f"X_train_{TAG}_extras.pkl")
dump(X_test_ext,  OUT_EXTRAS_DIR / f"X_test_{TAG}_extras.pkl")
dump(y_train,     OUT_EXTRAS_DIR / "y_train.pkl")
dump(y_test,      OUT_EXTRAS_DIR / "y_test.pkl")

# Small JSON description of what was written, stored next to the matrices.
meta = {
    "variant": VARIANT_NAME,
    "base_repr": BASE_REPR,
    "base_tag": TAG,
    "extra_columns": ["lex_pos","lex_neg","emoji_pos","emoji_neg","exclam"],
    "train_shape": list(X_train_ext.shape),
    "test_shape":  list(X_test_ext.shape),
    "notes": "Emojis positivos/negativos aprendidos desde TRAIN según polaridad (1-5)."
}
with open(OUT_EXTRAS_DIR / "meta_extras.json", "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)

print("Extras guardados en:", OUT_EXTRAS_DIR)
print("Meta:", meta)

[all_on | tfidf] Base: (24169, 50000) → Con extras: (24169, 50005)
Extras guardados en: C:\Users\Miner\OneDrive\Documentos\7to semestre\Procesamiento de Lenguaje Natural\Practice IV - Sentiment Analysis\PLN_P4\artifacts_all\all_on\tfidf_extras
Meta: {'variant': 'all_on', 'base_repr': 'tfidf', 'base_tag': 'tfidf12_50k', 'extra_columns': ['lex_pos', 'lex_neg', 'emoji_pos', 'emoji_neg', 'exclam'], 'train_shape': [24169, 50005], 'test_shape': [6043, 50005], 'notes': 'Emojis positivos/negativos aprendidos desde TRAIN según polaridad (1-5).'}
