In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

In [2]:
!pip install python-Levenshtein
# 🔹 Cargar BabyLLaMA
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-Levenshtein)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (161 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.7/161.7 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m63.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages:

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [5]:
# ⬛ Instalar dependencias
!pip install -q transformers accelerate bitsandbytes datasets sentencepiece tabulate python-Levenshtein



# ⬛ Importar librerías necesarias
import re
import pandas as pd
from tabulate import tabulate
from Levenshtein import distance as levenshtein_distance

# 🔹 Vocabulario inicial de estímulos
vocabulary = [
    {'Estímulo': {'shape': 1, 'colour': 'blue', 'amount': 2}},
    {'Estímulo': {'shape': 1, 'colour': 'purple', 'amount': 4}},
    {'Estímulo': {'shape': 1, 'colour': 'brown', 'amount': 2}},
    {'Estímulo': {'shape': 1, 'colour': 'cyan', 'amount': 1}},
    {'Estímulo': {'shape': 1, 'colour': 'purple', 'amount': 1}},
    {'Estímulo': {'shape': 2, 'colour': 'green', 'amount': 1}},
    {'Estímulo': {'shape': 2, 'colour': 'pink', 'amount': 3}},
    {'Estímulo': {'shape': 2, 'colour': 'gray', 'amount': 5}},
    {'Estímulo': {'shape': 3, 'colour': 'orange', 'amount': 3}},
    {'Estímulo': {'shape': 3, 'colour': 'red', 'amount': 2}},
    {'Estímulo': {'shape': 3, 'colour': 'yellow', 'amount': 1}},
    {'Estímulo': {'shape': 3, 'colour': 'gold', 'amount': 1}},
    {'Estímulo': {'shape': 4, 'colour': 'pink', 'amount': 1}},
]

vocabulario_tiny = []
DISTANCIA_MINIMA = 4

# 🔹 Función principal de generación
def generar_respuesta_llm(mensaje):
    vocab_existente = ""
    for entrada in vocabulario_tiny:
        stim = entrada['Estímulo']
        palabra = entrada['palabra']
        vocab_existente += f"- Stimulus: shape={stim['shape']}, colour={stim['colour']}, amount={stim['amount']} → \"{palabra}\"\n"

    prompt = f"""
 You are a language learner who has to learn an artificial language with words and their corresponding features.
 Your task is to complete the vocabulary by generating a word that describes the last item. Only respond with the word.
 The Vocabulary so far:
 {vocab_existente}

  Current stimulus:
  shape={mensaje['shape']}, colour={mensaje['colour']}, amount={mensaje['amount']}

  Word:
"""

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_length=inputs['input_ids'].shape[1] + 10, do_sample=False)
    palabra_raw = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extraer solo la palabra generada
    palabra = palabra_raw.split("Word:")[-1].strip().split("\n")[0]
    palabra = palabra.lower()
    palabra = re.sub(r'[^a-z]', '', palabra)
    return palabra

# 🔹 Verifica si la nueva palabra es válida
def es_valida(palabra, palabras_usadas):
    if palabra in palabras_usadas:
        return False
    for existente in palabras_usadas:
        if levenshtein_distance(palabra, existente) < DISTANCIA_MINIMA:
            return False
    return True

# 🔹 Genera y asigna palabra única
def comunicar_agentes_y_guardar(mensaje):
    intentos = 0
    max_intentos = 5
    palabra = None
    palabras_usadas = {entry['palabra'] for entry in vocabulario_tiny}

    while intentos < max_intentos:
        posible = generar_respuesta_llm(mensaje)
        if posible and es_valida(posible, palabras_usadas):
            palabra = posible
            break
        intentos += 1

    if palabra is None:
        palabra = f"stim{len(vocabulario_tiny)}"  # fallback

    vocabulario_tiny.append({'Estímulo': mensaje, 'palabra': palabra})

# 🔹 Procesar todo el vocabulario inicial
for item in vocabulary:
    comunicar_agentes_y_guardar(item['Estímulo'])

# 🔹 Mostrar resultados
df_vocab = pd.DataFrame([
    {
        'Shape': v['Estímulo']['shape'],
        'Colour': v['Estímulo']['colour'],
        'Amount': v['Estímulo']['amount'],
        'Generated Word': v['palabra']
    } for v in vocabulario_tiny
])

print("\nGENERATED ARTIFICIAL VOCABULARY")
print("================================")
print(tabulate(df_vocab, headers="keys", tablefmt="pretty", showindex=False))


KeyboardInterrupt: 

In [10]:
# Usamos un modelo ligero: EleutherAI/gpt-neo-125M
# model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name)

# Definición de un vocabulario de ejemplo (esto simula la parte de "etiquetado" previo)
vocabulary = [
    {"Estímulo": {"shape": 1, "colour": "blue",   "amount": 2}, "palabra": "sunupepi"},
    {"Estímulo": {"shape": 2, "colour": "green",  "amount": 1}, "palabra": "ginisu"},
    {"Estímulo": {"shape": 3, "colour": "orange", "amount": 3}, "palabra": "wipipitite"},
    {"Estímulo": {"shape": 4, "colour": "red",    "amount": 2}, "palabra": "kilatu"},
    {"Estímulo": {"shape": 5, "colour": "yellow", "amount": 1}, "palabra": "lumofe"},
    {"Estímulo": {"shape": 6, "colour": "purple", "amount": 4}, "palabra": "zivuro"},
    {"Estímulo": {"shape": 7, "colour": "pink",   "amount": 3}, "palabra": "noripa"},
    {"Estímulo": {"shape": 8, "colour": "brown",  "amount": 2}, "palabra": "tebokri"},
    {"Estímulo": {"shape": 9, "colour": "cyan",   "amount": 1}, "palabra": "fexanu"},
    {"Estímulo": {"shape": 10, "colour": "gray",  "amount": 5}, "palabra": "dolimexi"}
]

# Función que construye el prompt basado en el vocabulario y el estímulo a adivinar
def build_prompt(vocab, test_stimulus):
    vocab_existente = ""
    for entrada in vocabulary:
        stim = entrada['Estímulo']
        palabra = entrada['palabra']
        vocab_existente += f"- Stimulus: shape={stim['shape']}, colour={stim['colour']}, amount={stim['amount']} → \"{palabra}\"\n"

    prompt = f"""
You are a language creation expert. Your task is to invent a **new word** based on the following stimulus.

**Rules:**
- The word must be a **combination of the letters**: p, g, a, and e.
- You only response with de NEW word

**Stimulus:**
- Shape: {test_stimulus['shape']}
- Colour: {test_stimulus['colour']}
- Amount: {test_stimulus['amount']}

**Your Response:
"""

    print(prompt)
    return prompt

# Función que utiliza el modelo para "adivinar" la señal del estímulo
def guess_signal(test_stimulus):
    prompt = build_prompt(vocabulary, test_stimulus)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Generamos la respuesta sin muestreo (modo determinista)
    outputs = model.generate(**inputs, max_length=inputs['input_ids'].shape[1] + 10, do_sample=False)
    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Extraemos la respuesta que sigue a "Respuesta:"
    answer = generated.split("Response:")[-1].strip().split("\n")[0].strip()
    return answer

# Ejemplo: estímulo de prueba
test_stimulus ={"shape": 1, "colour": "blue",  "amount": 2}
predicted_signal = guess_signal(test_stimulus)
print("---> Señal predicha:", predicted_signal)

test_stimulus ={"shape": 2, "colour": "green", "amount": 1}
predicted_signal = guess_signal(test_stimulus)
print("---> Señal predicha:", predicted_signal)


You are a language creation expert. Your task is to invent a **new word** based on the following stimulus.

**Rules:**
- The word must be a **combination of the letters**: p, g, a, and e.
- You only response with de NEW word

**Stimulus:**
- Shape: 1
- Colour: blue
- Amount: 2

**Your Response:

---> Señal predicha: - New word: PEGA

You are a language creation expert. Your task is to invent a **new word** based on the following stimulus.

**Rules:**
- The word must be a **combination of the letters**: p, g, a, and e.
- You only response with de NEW word

**Stimulus:**
- Shape: 2
- Colour: green
- Amount: 1

**Your Response:

---> Señal predicha: - De new word: **g**l**
