# Laboratorio: Bag of Words para Desambiguación de "hard" y "serve"

Este cuaderno construye vectores de características Bag of Words (BoW) para las palabras ambiguas "hard" y "serve", usando ejemplos simulados y clasificándolos según los sentidos definidos por WordNet/Senseval 2:

- **HARD1**: difícil o que requiere esfuerzo
- **HARD2**: físicamente duro o sólido
- **HARD3**: severo o estricto 
- **SERVE1**: Atender / prestar servicio
- **SERVE2**: Cumplir (un periodo), especialmente en contextos legales o laborales
- **SERVE3**: Realizar un saque (deporte, especialmente tenis o vóleibol)


In [None]:
# Required library imports
import nltk  # Main natural-language-processing library
from nltk.corpus import stopwords  # Stop-word lists
import string  # Source of the punctuation characters stripped from tokens
from collections import Counter  # Frequency counting
import pandas as pd  # Renders the feature vectors as a table
from nltk import pos_tag # Part-of-speech tagger used to inspect grammatical forms

# Fetch the stop-word list and the POS-tagger model (NLTK reports when they
# are already up to date)
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True



En esta sección se definen 10 contextos (oraciones) para cada una de las palabras ambiguas **"hard"** y **"serve"**, cada uno etiquetado con su sentido correcto según WordNet/Senseval 2:

- **HARD1**: difícil o que requiere esfuerzo.
- **HARD2**: físicamente duro o sólido.
- **HARD3**: severo o estricto.
- **SERVE1**: Atender / prestar servicio
- **SERVE2**: Cumplir (un periodo), especialmente en contextos legales o laborales
- **SERVE3**: Realizar un saque (deporte, especialmente tenis o vóleibol)


In [None]:
# Hand-written examples for the ambiguous words "hard" and "serve".
# Each entry is a (token list, sense label) pair.
# HARD1: difficult, or requiring effort
# HARD2: physically hard or solid
# HARD3: severe or strict

# SERVE1: to wait on / provide a service
# SERVE2: to complete (a term), especially in legal or work contexts
# SERVE3: to put the ball in play (sport, especially tennis or volleyball)


fake_contextsHard = [
    (["life", "is", "hard", "and", "full", "of", "difficult", "choices"], "HARD1"),
    (["he", "worked", "hard", "to", "find", "answers", "to", "the", "same", "questions"], "HARD1"),
    (["the", "man", "is", "hard", "to", "understand", "and", "hard", "to", "trust"], "HARD3"),
    (["she", "had", "a", "hard", "life", "and", "a", "strong", "character"], "HARD1"),
    (["those", "were", "very", "hard", "questions", "to", "answer"], "HARD1"),
    (["the", "wood", "felt", "hard", "and", "cold", "like", "stone"], "HARD2"),
    (["he", "was", "a", "hard", "man", "with", "a", "difficult", "past"], "HARD3"),
    (["the", "teacher", "gave", "us", "some", "hard", "questions", "with", "tricky", "meanings"], "HARD1"),
    (["many", "hard", "questions", "were", "asked", "during", "the", "interview"], "HARD1"),
    (["life", "can", "be", "hard", "but", "we", "must", "work", "through", "questions"], "HARD1")
]
fake_contextServer = [
    (["the", "waiter", "will", "serve", "the", "dinner", "shortly"], "SERVE1"),
    (["they", "serve", "fresh", "coffee", "every", "morning", "at", "the", "cafe"], "SERVE1"),
    (["the", "nurse", "will", "serve", "the", "patients", "during", "the", "night", "shift"], "SERVE1"),
    (["he", "had", "to", "serve", "two", "years", "in", "the", "army"], "SERVE2"),
    (["the", "criminal", "will", "serve", "five", "years", "in", "prison"], "SERVE2"),
    (["she", "is", "expected", "to", "serve", "as", "mayor", "for", "the", "next", "term"], "SERVE2"),
    (["the", "player", "was", "ready", "to", "serve", "the", "ball", "across", "the", "court"], "SERVE3"),
    (["she", "learned", "how", "to", "serve", "properly", "in", "her", "tennis", "class"], "SERVE3"),
    (["he", "prepared", "to", "serve", "with", "full", "strength", "during", "the", "match"], "SERVE3"),
    (["the", "coach", "told", "him", "to", "serve", "again", "because", "the", "first", "attempt", "was", "invalid"], "SERVE3")
]

Extraer y mostrar las formas gramaticales asociadas a "hard" y "serve"

In [None]:
import nltk
import os

# Register the download directory with NLTK's search path if it is missing
if '/root/nltk_data' not in nltk.data.path:
    nltk.data.path.append('/root/nltk_data')

# Download the "_eng" variant of the perceptron POS-tagger model into that
# directory
nltk.download('averaged_perceptron_tagger_eng', download_dir='/root/nltk_data')

# Fail early with a clear error if the tagger did not land where expected
tagger_path = '/root/nltk_data/taggers/averaged_perceptron_tagger_eng'
if not os.path.exists(tagger_path):
    raise RuntimeError("El recurso averaged_perceptron_tagger_eng no está disponible en la ruta esperada.")

# Import the POS tagger only after the resource check has passed
from nltk import pos_tag

# Simulated sentences as (token list, sense label) pairs.
# NOTE(review): `fake_contexts` repeats the entries of `fake_contextsHard`
# and `fake_contextServer` is re-declared with the same entries as in the
# earlier data cell; keep the copies in sync when editing either one.
fake_contexts = [
    (["life", "is", "hard", "and", "full", "of", "difficult", "choices"], "HARD1"),
    (["he", "worked", "hard", "to", "find", "answers", "to", "the", "same", "questions"], "HARD1"),
    (["the", "man", "is", "hard", "to", "understand", "and", "hard", "to", "trust"], "HARD3"),
    (["she", "had", "a", "hard", "life", "and", "a", "strong", "character"], "HARD1"),
    (["those", "were", "very", "hard", "questions", "to", "answer"], "HARD1"),
    (["the", "wood", "felt", "hard", "and", "cold", "like", "stone"], "HARD2"),
    (["he", "was", "a", "hard", "man", "with", "a", "difficult", "past"], "HARD3"),
    (["the", "teacher", "gave", "us", "some", "hard", "questions", "with", "tricky", "meanings"], "HARD1"),
    (["many", "hard", "questions", "were", "asked", "during", "the", "interview"], "HARD1"),
    (["life", "can", "be", "hard", "but", "we", "must", "work", "through", "questions"], "HARD1")
]
fake_contextServer = [
    (["the", "waiter", "will", "serve", "the", "dinner", "shortly"], "SERVE1"),
    (["they", "serve", "fresh", "coffee", "every", "morning", "at", "the", "cafe"], "SERVE1"),
    (["the", "nurse", "will", "serve", "the", "patients", "during", "the", "night", "shift"], "SERVE1"),
    (["he", "had", "to", "serve", "two", "years", "in", "the", "army"], "SERVE2"),
    (["the", "criminal", "will", "serve", "five", "years", "in", "prison"], "SERVE2"),
    (["she", "is", "expected", "to", "serve", "as", "mayor", "for", "the", "next", "term"], "SERVE2"),
    (["the", "player", "was", "ready", "to", "serve", "the", "ball", "across", "the", "court"], "SERVE3"),
    (["she", "learned", "how", "to", "serve", "properly", "in", "her", "tennis", "class"], "SERVE3"),
    (["he", "prepared", "to", "serve", "with", "full", "strength", "during", "the", "match"], "SERVE3"),
    (["the", "coach", "told", "him", "to", "serve", "again", "because", "the", "first", "attempt", "was", "invalid"], "SERVE3")
]

def _matching_tagged_tokens(contexts, prefix):
    """Return (token, POS tag) pairs for tokens whose lower-cased text starts with *prefix*.

    Every sentence in *contexts* (a list of (token list, label) pairs) is run
    through ``pos_tag``; the sense label is ignored.
    """
    matches = []
    for tokens, _label in contexts:
        matches.extend(
            (token, pos)
            for token, pos in pos_tag(tokens)
            if token.lower().startswith(prefix)
        )
    return matches


# Tag every sentence and keep the occurrences of each target word
hard_forms = _matching_tagged_tokens(fake_contexts, "hard")
serve_forms = _matching_tagged_tokens(fake_contextServer, "serve")

# Report the distinct (token, tag) combinations found for "hard"
unique_hard_forms = sorted(set(hard_forms))
print("Formas gramaticales diferentes encontradas para 'hard':")
for form, pos in unique_hard_forms:
    print(f"- {form} (POS tag: {pos})")


# Report the distinct (token, tag) combinations found for "serve"
unique_Serve_forms = sorted(set(serve_forms))
print("Formas gramaticales diferentes encontradas para 'Serve':")
for form, pos in unique_Serve_forms:
    print(f"- {form} (POS tag: {pos})")

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


Formas gramaticales diferentes encontradas para 'hard':
- hard (POS tag: JJ)
- hard (POS tag: VB)


Construcción del vocabulario más frecuente (top 8 palabras)

In [None]:
# Token normalisation: lower-case, then trim punctuation from both ends
def clean_word(word):
    """Return *word* lower-cased with leading and trailing punctuation removed.

    Punctuation inside the token (e.g. an apostrophe) is kept, because only
    the ends are stripped.
    """
    lowered = word.lower()
    return lowered.strip(string.punctuation)

# Stop words and the surface forms of each target word are kept out of the
# vocabulary
stop_words = set(stopwords.words('english'))
ambiguous_forms = {"hard", "harder", "hardest"}
ambiguous_formsServe = {"serve", "serves", "served", "serving"}

# Gather every cleaned token that is neither a stop word nor a target form
all_words = [
    token
    for token in (clean_word(raw) for tokens, _ in fake_contexts for raw in tokens)
    if not (token in stop_words or token in ambiguous_forms)
]
all_wordsServer = [
    token
    for token in (clean_word(raw) for tokens, _ in fake_contextServer for raw in tokens)
    if not (token in stop_words or token in ambiguous_formsServe)
]


# The vocabulary of each word is its 8 most frequent remaining tokens
hard_ranking = Counter(all_words).most_common(8)
vocab = [entry[0] for entry in hard_ranking]
print("Vocabulario usado para el vector Bag of Words para hard:", vocab)

serve_ranking = Counter(all_wordsServer).most_common(8)
vocabServer = [entry[0] for entry in serve_ranking]
print("Vocabulario usado para el vector Bag of Words para Serve:", vocabServer)

Vocabulario usado para el vector Bag of Words: ['questions', 'life', 'difficult', 'man', 'full', 'choices', 'worked', 'find']


In [None]:
# Build the Bag-of-Words feature vector of one "hard" sentence
def extract_bow_features(instance_words, vocab):
    """Return a presence/absence feature dict for one sentence.

    Args:
        instance_words: tokens of the sentence; each is normalised with
            ``clean_word`` before the lookup.
        vocab: vocabulary words to test for.

    Returns:
        A dict with one ``'contains(<word>)'`` key per vocabulary word, whose
        value is True when the word occurs in the sentence.
    """
    present = {clean_word(token) for token in instance_words}
    features = {}
    for term in vocab:
        features[f'contains({term})'] = term in present
    return features

# Build the Bag-of-Words feature vector of one "serve" sentence
def extract_bow_features_S(instance_words, vocab):
    """Return a presence/absence feature dict for one "serve" sentence.

    The encoding is the same as for "hard", so this delegates to
    ``extract_bow_features`` instead of keeping a second copy of the logic.

    Args:
        instance_words: tokens of the sentence.
        vocab: vocabulary words to test for.

    Returns:
        A dict with one ``'contains(<word>)'`` key per vocabulary word, whose
        value is True when the word occurs in the sentence.
    """
    return extract_bow_features(instance_words, vocab)

In [None]:
# Build and print the feature vector of every instance
all_features = []
all_featuresS = []
print("\nVectores de características (Bag of Words):\n")
for context, label in fake_contexts:
    features = extract_bow_features(context, vocab)
    print(f"Etiqueta: {label}")
    print(features, "\n")
    all_features.append((features, label))

# "serve" instances are encoded against the "serve" vocabulary (vocabServer).
# Using the "hard" vocabulary here would test for words such as 'questions'
# or 'life', which were selected from the "hard" sentences only.
for context, label in fake_contextServer:
    features = extract_bow_features_S(context, vocabServer)
    print(f"Etiqueta: {label}")
    print(features, "\n")
    all_featuresS.append((features, label))


Vectores de características (Bag of Words):

Etiqueta: HARD1
{'contains(questions)': False, 'contains(life)': True, 'contains(difficult)': True, 'contains(man)': False, 'contains(full)': True, 'contains(choices)': True, 'contains(worked)': False, 'contains(find)': False} 

Etiqueta: HARD1
{'contains(questions)': True, 'contains(life)': False, 'contains(difficult)': False, 'contains(man)': False, 'contains(full)': False, 'contains(choices)': False, 'contains(worked)': True, 'contains(find)': True} 

Etiqueta: HARD3
{'contains(questions)': False, 'contains(life)': False, 'contains(difficult)': False, 'contains(man)': True, 'contains(full)': False, 'contains(choices)': False, 'contains(worked)': False, 'contains(find)': False} 

Etiqueta: HARD1
{'contains(questions)': False, 'contains(life)': True, 'contains(difficult)': False, 'contains(man)': False, 'contains(full)': False, 'contains(choices)': False, 'contains(worked)': False, 'contains(find)': False} 

Etiqueta: HARD1
{'contains(ques

In [None]:
# Show the "hard" vectors as a table. It gets its own name and an explicit
# print because only the last expression of a cell is rendered, so reusing
# `df` for both tables hid this one.
df_hard = pd.DataFrame([f for f, _ in all_features])
df_hard['Etiqueta'] = [label for _, label in all_features]
print(df_hard)

# Show the "serve" vectors as a table; `df` still ends up bound to this
# table, and the trailing bare expression lets the notebook render it.
df = pd.DataFrame([f for f, _ in all_featuresS])
df['Etiqueta'] = [label for _, label in all_featuresS]
df

Unnamed: 0,contains(questions),contains(life),contains(difficult),contains(man),contains(full),contains(choices),contains(worked),contains(find),Etiqueta
0,False,True,True,False,True,True,False,False,HARD1
1,True,False,False,False,False,False,True,True,HARD1
2,False,False,False,True,False,False,False,False,HARD3
3,False,True,False,False,False,False,False,False,HARD1
4,True,False,False,False,False,False,False,False,HARD1
5,False,False,False,False,False,False,False,False,HARD2
6,False,False,True,True,False,False,False,False,HARD3
7,True,False,False,False,False,False,False,False,HARD1
8,True,False,False,False,False,False,False,False,HARD1
9,True,True,False,False,False,False,False,False,HARD1


In [None]:
from collections import Counter

# Use whichever "hard" list is defined in the notebook (fake_contextsHard
# or, failing that, fake_contexts).
hard_list = globals().get('fake_contextsHard', globals().get('fake_contexts', []))
# The "serve" list has no alternative name: fall back to an empty list, not
# to the "hard" contexts, which would be reported as SERVE counts.
serve_list = globals().get('fake_contextServer', [])

print("HARD counts:", Counter([label for _, label in hard_list]))
print("SERVE counts:", Counter([label for _, label in serve_list]))

# Validation helper (run inside the notebook)
def check_list(name):
    """Print a structural report for the global list called *name*.

    An entry is well formed when it is a 2-tuple of (list, str). The report
    gives the total number of entries, the number of malformed ones and up
    to five malformed examples. If no global with that name exists, a
    one-line notice is printed instead.
    """
    namespace = globals()
    if name not in namespace:
        print(f"{name}: no existe")
        return

    def is_well_formed(item):
        if not isinstance(item, tuple) or len(item) != 2:
            return False
        tokens, label = item
        return isinstance(tokens, list) and isinstance(label, str)

    instances = namespace[name]
    malformed = [
        (idx, item)
        for idx, item in enumerate(instances)
        if not is_well_formed(item)
    ]
    print(f"{name}: total={len(instances)} malformed={len(malformed)}")
    # Cap the examples so a badly broken list does not flood the output
    for idx, item in malformed[:5]:
        print(" example:", idx, item)

# Validate each context list once ('fake_contextServer' used to be listed
# twice, which printed its report two times).
for name in ['fake_contextsHard', 'fake_contexts', 'fake_contextServer']:
    check_list(name)

# Name normalisation: later cells read `fake_contexts`, so alias it when only
# `fake_contextsHard` was defined.
if 'fake_contexts' not in globals() and 'fake_contextsHard' in globals():
    fake_contexts = fake_contextsHard

# The former 'fake_contextServer' branch tested a name for being both absent
# and present, so it could never run; it has been dropped.

In [None]:

def classify():
    """Train and evaluate Naive Bayes sense classifiers for "hard" and "serve".

    Four models are built, one per (word, feature type) pair:

    * Bag-of-Words features, read from the globals ``all_features`` and
      ``all_featuresS`` (treated as empty when they are not defined).
    * Context n-gram features (n=2), computed here from the globals
      ``fake_contexts`` / ``fake_contextServer`` using ``ambiguous_forms`` /
      ``ambiguous_formsServe`` to locate the target word.

    Each dataset is shuffled with a fixed seed, split 70/30 into train and
    test, and the accuracy, most informative features and confusion matrix
    are printed. Two example predictions are printed at the end.

    Returns:
        A dict keyed by 'HARD_BoW', 'HARD_Bigrams', 'SERVE_BoW' and
        'SERVE_Bigrams'. Each value is None when the dataset was empty,
        otherwise a dict with keys 'clf', 'acc', 'cm' and 'test'.
    """
    import random
    import nltk
    from nltk import ConfusionMatrix

    def extract_context_bigrams(instance_words, target_forms, n=2):
        """Return features for the n words before and after each target word.

        A window is emitted only when it is complete: the "Antes" feature
        needs n words before the target and the "Despues" feature needs n
        words after it.
        """
        words = [clean_word(w) for w in instance_words]
        features = {}
        for i, w in enumerate(words):
            if w in target_forms:
                # n words immediately before the target
                if i - n >= 0:
                    seq = "_".join(words[i-n:i])
                    features[f"Antes:{seq}"] = True
                # n words immediately after the target
                if i + n < len(words):
                    seq = "_".join(words[i+1:i+1+n])
                    features[f"Despues:{seq}"] = True
        return features

    # Prepare the datasets (reuse the BoW vectors if they were already
    # computed, and build the n-gram datasets here)
    bow_hard = globals().get('all_features', [])
    bow_serve = globals().get('all_featuresS', [])

    bigram_hard = [(extract_context_bigrams(ctx, ambiguous_forms, n=2), label)
                   for ctx, label in globals().get('fake_contexts', [])]
    bigram_serve = [(extract_context_bigrams(ctx, ambiguous_formsServe, n=2), label)
                    for ctx, label in globals().get('fake_contextServer', [])]

    def train_eval_nltk(feature_label_list, name):
        """Train on 70% of the data, evaluate on the rest and print a report.

        Returns None when there are no instances, otherwise a dict with the
        classifier ('clf'), accuracy ('acc'), confusion matrix ('cm') and the
        test instances ('test').
        """
        if not feature_label_list:
            print(f"{name}: no hay instancias.")
            return None
        # A dedicated generator gives the same seeded shuffle without
        # reseeding the module-level RNG shared with the rest of the session
        rng = random.Random(42)
        data = list(feature_label_list)
        rng.shuffle(data)
        split = max(1, int(0.7 * len(data)))
        train_set = data[:split]
        test_set = data[split:]
        if not test_set:
            # Fallback for very small datasets: evaluate on the training data
            test_set = train_set

        clf = nltk.NaiveBayesClassifier.train(train_set)
        acc = nltk.classify.accuracy(clf, test_set)
        y_test = [label for _, label in test_set]
        y_pred = [clf.classify(feat) for feat, _ in test_set]
        cm = ConfusionMatrix(y_test, y_pred)

        print(f"\n==== {name} ====")
        print("Accuracy:", round(acc, 4))
        print("\nMost informative features:")
        try:
            clf.show_most_informative_features(10)
        except Exception as exc:
            # Displaying the features is best-effort, but the failure is
            # reported instead of being silently discarded
            print(f"No se pudieron mostrar las características más informativas: {exc}")
        print("\nConfusion matrix:")
        print(cm)
        return {"clf": clf, "acc": acc, "cm": cm, "test": test_set}

    # Train and evaluate the four models
    results = {}
    results['HARD_BoW'] = train_eval_nltk(bow_hard, "HARD - BoW")
    results['HARD_Bigrams'] = train_eval_nltk(bigram_hard, "HARD - Context Bigrams (n=2)")
    results['SERVE_BoW'] = train_eval_nltk(bow_serve, "SERVE - BoW")
    results['SERVE_Bigrams'] = train_eval_nltk(bigram_serve, "SERVE - Context Bigrams (n=2)")

    def predict_with(model_info, feat_dict):
        """Classify *feat_dict* with a trained model; None when no model exists."""
        if model_info is None:
            return None
        return model_info['clf'].classify(feat_dict)

    # Example predictions
    example_hard = ["the","teacher","gave","a","hard","question","to","solve"]
    example_serve = ["the","player","was","ready","to","serve","the","ball"]

    print("\nEjemplo predicción HARD (bigrams):",
          predict_with(results['HARD_Bigrams'], extract_context_bigrams(example_hard, ambiguous_forms)))
    print("Ejemplo predicción SERVE (bigrams):",
          predict_with(results['SERVE_Bigrams'], extract_context_bigrams(example_serve, ambiguous_formsServe)))

    return results

# Train and evaluate all four models; the returned results dict is the
# cell's last expression.
classify()