In [1]:
# Importaciones básicas e instalación de recursos NLTK


import os, re, json
from collections import Counter, defaultdict

import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords, wordnet as wn

# Descarga simple de recursos NLTK si faltan
try:
    _ = stopwords.words("english")
except LookupError:
    nltk.download("stopwords")
try:
    _ = wn.synsets("good")
except LookupError:
    nltk.download("wordnet")
    nltk.download("omw-1.4")


In [2]:
# Configuracion

DATA_PATH = "dataset\df_200k.csv"  # ruta al CSV
TEXT_COL  = "text"                               # nombre de la columna de texto
ID_COL    = None                                 # usa None si no tienes id
LANG      = "en"                                 # "en" para inglés, "es" para español
USE_WORDNET = True                               # activa/desactiva expansión con WordNet
TAU_PERCENTILE = 35                               # umbral de neutralidad (percentil)
ASPECT_WINDOW = 5                                 # no se usa aquí (dependencias), lo dejamos por claridad

# Opcional: si quieres muestrear para pruebas rápidas (ej. 10000)
SAMPLE_N = None  # ej. 10000 o None para todo


In [3]:
# ⬅️ Celda 3: Carga del dataset

df = pd.read_csv(DATA_PATH, encoding="utf-8", on_bad_lines="skip")
if SAMPLE_N:
    df = df.sample(SAMPLE_N, random_state=42).reset_index(drop=True)

assert TEXT_COL in df.columns, f"No encuentro la columna de texto '{TEXT_COL}'"
texts = df[TEXT_COL].astype(str).tolist()
ids = df[ID_COL].tolist() if (ID_COL and ID_COL in df.columns) else list(range(len(texts)))

print(f"Docs cargados: {len(texts)}")
df.head(3)


Docs cargados: 200000


Unnamed: 0,target,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...


In [4]:
# Preprocesado y tokenización
# Buscamos: lowercase, normalizar URLs/usuarios/hashtags, reducir elongaciones, tokenizar y quitar stopwords.

URL_RE       = re.compile(r"https?://\S+|www\.\S+")
USER_RE      = re.compile(r"@\w+")
HASHTAG_RE   = re.compile(r"#")
MULTICHAR_RE = re.compile(r"(.)\1{2,}")  # "buenoooo" -> "buenoo"
TOKEN_RE     = re.compile(r"[a-záéíóúñü]+(?:'[a-záéíóúñü]+)?")

def normalize_text(s: str) -> str:
    s = s.lower()
    s = URL_RE.sub(" <url> ", s)
    s = USER_RE.sub(" <user> ", s)
    s = HASHTAG_RE.sub("", s)           # quita "#"
    s = MULTICHAR_RE.sub(r"\1\1", s)    # limita repes a 2
    # separa "muyBueno" -> "muy bueno" (simple)
    s = re.sub(r"([a-záéíóúñ])([A-ZÁÉÍÓÚÑ])", r"\1 \2", s)
    return s

def tokenize(s: str, lang: str="en"):
    tokens = TOKEN_RE.findall(s)
    sw_lang = "spanish" if lang == "es" else "english"
    sw = set(stopwords.words(sw_lang))
    # quitamos stopwords muy básicas; queremos quedarnos con señales de opinión
    tokens = [t for t in tokens if t not in sw]
    return tokens

norm_texts = [normalize_text(t) for t in texts]
token_docs = [tokenize(t, LANG) for t in norm_texts]

# Frecuencias por si luego queremos filtrar WordNet por "existe en corpus"
token_freq = Counter([tok for doc in token_docs for tok in doc])

print("Ejemplo tokens:", token_docs[0][:20] if token_docs else [])


Ejemplo tokens: ['user', 'url', 'aww', "that's", 'bummer', 'shoulda', 'got', 'david', 'carr', 'third', 'day']


In [5]:
# Léxico base + expansión con WordNet
# Mantenemos semillas cortas y claras. La expansión se filtra por "aparece al menos 1 vez en corpus".

SEEDS_ES_POS = {"bueno","excelente","fantastico","fantástico","util","útil","rapido","rápido","genial","positivo","satisfecho","perfecto","increible","increíble","mejor","encanta"}
SEEDS_ES_NEG = {"malo","terrible","pesimo","pésimo","inutil","inútil","lento","horrible","negativo","insatisfecho","defectuoso","odio","peor","asco"}
SEEDS_EN_POS = {"good","great","excellent","fantastic","useful","fast","awesome","positive","happy","satisfied","perfect","amazing","love","better"}
SEEDS_EN_NEG = {"bad","terrible","awful","useless","slow","horrible","negative","unsatisfied","defective","hate","worst","poor","worse"}

NEGATORS_ES = {"no","nunca","jamas","jamás","sin"}
NEGATORS_EN = {"no","not","never","without"}
INTENSIFIERS_ES = {"muy":1.5,"super":1.5,"súper":1.5,"re":1.3,"poco":0.5}
INTENSIFIERS_EN = {"very":1.5,"super":1.5,"really":1.3,"so":1.3,"slightly":0.7,"little":0.5}

def expand_wordnet(words: set, lang: str, token_freq: Counter) -> set:
    """Expansión: sinónimos de WordNet; nos quedamos con unigrams que aparezcan al menos 1 vez."""
    out = set(words)
    for w in list(words):
        if lang == "es":
            syns = wn.synsets(w, lang="spa")
            for s in syns:
                for lemma in s.lemma_names("spa"):
                    lemma = lemma.replace("_", " ").lower()
                    if " " in lemma: 
                        continue
                    if token_freq.get(lemma, 0) < 1:
                        continue
                    out.add(lemma)
        else:
            syns = wn.synsets(w)
            for s in syns:
                for lemma in s.lemma_names():
                    lemma = lemma.replace("_", " ").lower()
                    if " " in lemma:
                        continue
                    if token_freq.get(lemma, 0) < 1:
                        continue
                    out.add(lemma)
    return out

if LANG == "es":
    pos_set, neg_set = set(SEEDS_ES_POS), set(SEEDS_ES_NEG)
    NEGATORS = NEGATORS_ES
    INTENS   = INTENSIFIERS_ES
else:
    pos_set, neg_set = set(SEEDS_EN_POS), set(SEEDS_EN_NEG)
    NEGATORS = NEGATORS_EN
    INTENS   = INTENSIFIERS_EN

if USE_WORDNET:
    pos_set = expand_wordnet(pos_set, LANG, token_freq)
    neg_set = expand_wordnet(neg_set, LANG, token_freq)

lexicon = {w:  1.0 for w in pos_set}
lexicon.update({w: -1.0 for w in neg_set})

print(f"Tamaño léxico: {len(lexicon)} (pos={len(pos_set)}, neg={len(neg_set)})")
list(lexicon.items())[:10]


Tamaño léxico: 190 (pos=121, neg=74)


[('fast', 1.0),
 ('splendid', 1.0),
 ('positive', 1.0),
 ('groovy', 1.0),
 ('dearest', 1.0),
 ('honey', 1.0),
 ('everlasting', 1.0),
 ('gravel', 1.0),
 ('screw', 1.0),
 ('break', 1.0)]

In [6]:
# Clasificador por reglas (score + etiquetas)
# Los intensificadores multiplican el siguiente término polar; negación invierte 3 términos siguientes.

def score_document(tokens, lexicon, negators, intensifiers):
    score = 0.0
    negate_left = 0
    mult = 1.0
    for tok in tokens:
        if tok in negators:
            negate_left = 3
            continue
        if tok in intensifiers:
            mult = intensifiers[tok]
            continue
        if tok in lexicon:
            s = lexicon[tok]
            if negate_left > 0:
                s = -s
            s = s * mult
            score += s
            mult = 1.0
            if negate_left > 0:
                negate_left -= 1
        else:
            # si no aporta, reseteamos multiplicador y avanzamos negación si toca
            mult = 1.0
            if negate_left > 0:
                negate_left -= 1
    return score

def label_from_scores(scores, tau_percentile=35):
    abs_s = np.abs(scores)
    tau = np.percentile(abs_s, tau_percentile) if len(abs_s) else 0.0
    labels = []
    for s in scores:
        if abs(s) < tau:
            labels.append("neu")
        elif s > 0:
            labels.append("pos")
        else:
            labels.append("neg")
    return labels, tau

scores = np.array([score_document(doc, lexicon, NEGATORS, INTENS) for doc in token_docs], dtype=float)
labels, tau = label_from_scores(scores, TAU_PERCENTILE)

pred_df = pd.DataFrame({"id": ids, "score": scores, "label_pred": labels})
pred_df.head(10)


Unnamed: 0,id,score,label_pred
0,0,0.0,neg
1,1,0.0,neg
2,2,0.0,neg
3,3,0.0,neg
4,4,0.0,neg
5,5,0.0,neg
6,6,0.0,neg
7,7,0.0,neg
8,8,0.0,neg
9,9,0.0,neg


In [7]:
# Resumen pequeño de resultados de sentimiento (sanity check)

summary = {
    "docs": len(pred_df),
    "tau": float(tau),
    "%neu": round((pred_df["label_pred"]=="neu").mean()*100, 2),
    "%pos": round((pred_df["label_pred"]=="pos").mean()*100, 2),
    "%neg": round((pred_df["label_pred"]=="neg").mean()*100, 2),
    "lexicon_size": len(lexicon)
}
summary


{'docs': 200000,
 'tau': 0.0,
 '%neu': np.float64(0.0),
 '%pos': np.float64(21.81),
 '%neg': np.float64(78.19),
 'lexicon_size': 190}

In [8]:
# Aspectos por dependecias
#   !python -m spacy download es_core_news_sm  (para español)

import spacy

model_name = "es_core_news_sm" if LANG == "es" else "en_core_web_sm"
try:
    nlp = spacy.load(model_name)
except Exception as e:
    # Intento de descarga automática (requiere internet). Si falla, te digo cómo instalar.
    try:
        from spacy.cli import download
        download(model_name)
        nlp = spacy.load(model_name)
    except Exception as e2:
        nlp = None
        print(f"No pude cargar '{model_name}'. Instala manualmente con: python -m spacy download {model_name}")

def extract_aspects_dep(texts, nlp, lexicon):
    """
    Reglas MUY sencillas:
      - amod: NOUN <- ADJ    -> aspecto=NOUN, opinión=ADJ
      - copular: ADJ con nsubj NOUN
      - obj: VERB(opinión) -> obj/dobj NOUN
    Sumamos +1/-1 al aspecto según polaridad en el léxico.
    """
    if nlp is None:
        # si no hay modelo, devolvemos vacío para no romper el notebook
        return pd.DataFrame(columns=["aspect","pos_count","neg_count","total"])

    counts = defaultdict(lambda: {"pos":0,"neg":0,"total":0})
    for doc in nlp.pipe(texts, disable=["ner"]):
        # 1) amod
        for tok in doc:
            if tok.dep_ == "amod" and tok.pos_ == "ADJ" and tok.head.pos_ == "NOUN":
                aspect = tok.head.lemma_.lower()
                opinion = tok.lemma_.lower()
                pol = lexicon.get(opinion, 0)
                if pol != 0:
                    if pol > 0: counts[aspect]["pos"] += 1
                    else:        counts[aspect]["neg"] += 1
                    counts[aspect]["total"] += 1

        # 2) copular (ADJ con sujeto nominal)
        for tok in doc:
            if tok.pos_ == "ADJ":
                subs = [c for c in tok.children if c.dep_.startswith("nsubj") and c.pos_ == "NOUN"]
                if subs:
                    aspect = subs[0].lemma_.lower()
                    opinion = tok.lemma_.lower()
                    pol = lexicon.get(opinion, 0)
                    if pol != 0:
                        if pol > 0: counts[aspect]["pos"] += 1
                        else:        counts[aspect]["neg"] += 1
                        counts[aspect]["total"] += 1

        # 3) verbo de opinión + objeto
        for tok in doc:
            if tok.pos_ == "VERB":
                opinion = tok.lemma_.lower()
                pol = lexicon.get(opinion, 0)
                if pol == 0:
                    continue
                objs = [c for c in tok.children if c.dep_ in ("obj","dobj") and c.pos_ in ("NOUN","PROPN")]
                for obj in objs:
                    aspect = obj.lemma_.lower()
                    if pol > 0: counts[aspect]["pos"] += 1
                    else:        counts[aspect]["neg"] += 1
                    counts[aspect]["total"] += 1

    rows = [{"aspect":a, "pos_count":d["pos"], "neg_count":d["neg"], "total":d["total"]} 
            for a,d in counts.items()]
    df_aspects = pd.DataFrame(rows).sort_values(["total","pos_count"], ascending=[False, False])
    return df_aspects

aspects_df = extract_aspects_dep(norm_texts, nlp, lexicon)
aspects_df.head(15)


Unnamed: 0,aspect,pos_count,neg_count,total
2,day,854,619,1473
14,time,480,360,840
46,thing,350,349,699
62,morning,530,49,579
84,night,365,170,535
121,friend,448,31,479
28,luck,312,53,365
347,weekend,204,91,295
25,sleep,264,29,293
344,mother,272,20,292
