In [1]:
import itertools
from transformers import pipeline
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def score_sequence(polka, words):
    """Build a sentence from ``words`` and score it with the language model.

    The words are joined with single spaces, the first character is
    upper-cased and a trailing period is appended.  The sentence is then
    tokenized and passed through the model with its own token ids as labels.

    Args:
        polka: Object exposing ``.tokenizer`` and ``.model`` (in this notebook
            a ``transformers`` text-generation pipeline).
        words: Iterable of word strings in the order to be scored.

    Returns:
        tuple: ``(loss, sentence)`` where ``loss`` is the Python float taken
        from ``outputs.loss`` (lower means the model finds the sentence more
        likely) and ``sentence`` is the exact text that was scored.
    """
    joined = ' '.join(words)
    # Upper-case only the first character.  str.capitalize() would also
    # lower-case everything after it, which corrupts proper nouns and
    # acronyms inside the sentence before it is scored.
    sentence = joined[:1].upper() + joined[1:] + '.'

    # The tokenizer creates its tensors on the CPU; move them to wherever the
    # model weights live so scoring also works when the pipeline is on a GPU.
    inputs = polka.tokenizer(sentence, return_tensors='pt').to(polka.model.device)

    # Using the input ids as labels makes the model report its loss on the
    # sentence itself; no gradients are needed for scoring.
    with torch.no_grad():
        outputs = polka.model(**inputs, labels=inputs["input_ids"])
    return outputs.loss.item(), sentence

def generate_sentences(words, polka):
    """Score candidate orderings of ``words`` and return them best-first.

    Inputs of up to six words are enumerated exhaustively; longer inputs use
    the reduced candidate set from ``generate_intelligent_permutations``.

    Args:
        words: Sequence of word strings to reorder.
        polka: Scoring object forwarded to ``score_sequence``.

    Returns:
        list: ``(score, sentence)`` tuples in ascending order, i.e. lowest
        loss first; ties fall back to comparing the sentence text.
    """
    # Few words: brute force over every ordering is still affordable.
    if len(words) > 6:
        candidates = generate_intelligent_permutations(words, polka)
    else:
        candidates = itertools.permutations(words)

    # score_sequence already returns (loss, sentence); sorting those tuples
    # ranks by loss first, so the most natural sentence comes out on top.
    return sorted(score_sequence(polka, ordering) for ordering in candidates)

def score_word_pair(polka, word1, word2):
    """Score how well ``word2`` follows ``word1`` according to the model.

    The text ``"{word1} {word2}."`` is tokenized and passed through the model
    with its own token ids as labels.

    Args:
        polka: Object exposing ``.tokenizer`` and ``.model`` (in this notebook
            a ``transformers`` text-generation pipeline).
        word1: First word of the pair.
        word2: Second word of the pair.

    Returns:
        float: The value of ``outputs.loss`` for the two-word text; lower
        means the model finds the pair more likely in this order.
    """
    sentence = f"{word1} {word2}."

    # The tokenizer creates its tensors on the CPU; move them to wherever the
    # model weights live so scoring also works when the pipeline is on a GPU.
    inputs = polka.tokenizer(sentence, return_tensors='pt').to(polka.model.device)

    # Scoring only: no gradients required.
    with torch.no_grad():
        outputs = polka.model(**inputs, labels=inputs["input_ids"])
    return outputs.loss.item()

def generate_clusters(words, polka, threshold=5.0):
    """Greedily group words whose ordered pairs score below ``threshold``.

    Every ordered pair of words at distinct positions is scored with
    ``score_word_pair``.  Pairs are then visited from lowest to highest loss:
    a pair of two unplaced words starts a new cluster, and a pair with exactly
    one placed word appends the other word to that word's cluster.  Words
    still unplaced at the end become single-word clusters.

    Args:
        words: Sequence of word strings.
        polka: Scoring object forwarded to ``score_word_pair``.
        threshold: Only pairs with a loss strictly below this value are
            allowed to form or extend a cluster.

    Returns:
        list: Clusters, each a list of words.
    """
    positions = range(len(words))
    pair_losses = {
        (words[a], words[b]): score_word_pair(polka, words[a], words[b])
        for a in positions
        for b in positions
        if a != b
    }

    clusters = []
    home = {}  # word -> the cluster list that currently holds it
    ranked_pairs = sorted(pair_losses.items(), key=lambda item: item[1])
    for (left, right), loss in ranked_pairs:
        if not loss < threshold:
            continue
        left_home = home.get(left)
        right_home = home.get(right)
        if left_home is None and right_home is None:
            # Neither word is placed yet: the pair seeds a new cluster.
            fresh = [left, right]
            clusters.append(fresh)
            home[left] = home[right] = fresh
        elif right_home is None:
            # Only the left word is placed: the right word joins its cluster.
            left_home.append(right)
            home[right] = left_home
        elif left_home is None:
            # Only the right word is placed: the left word joins its cluster.
            right_home.append(left)
            home[left] = right_home
        # Both words already placed: existing clusters are never merged.

    # Anything that never met the threshold stands alone.
    for word in words:
        if word not in home:
            lone = [word]
            clusters.append(lone)
            home[word] = lone
    return clusters

def generate_intelligent_permutations(words, polka):
    """Enumerate word orderings that keep each cluster's words adjacent.

    The words are first grouped by ``generate_clusters``.  Every ordering of
    the clusters is then combined with every internal ordering of each
    cluster, and each combination is flattened into one word list.

    Args:
        words: Sequence of word strings.
        polka: Scoring object forwarded to ``generate_clusters``.

    Returns:
        list: Candidate orderings, each a list of words.
    """
    clusters = generate_clusters(words, polka)

    orderings = []
    for cluster_order in itertools.permutations(clusters):
        # One iterable of internal orderings per cluster, in this cluster order.
        internal_orders = [itertools.permutations(group) for group in cluster_order]
        for combo in itertools.product(*internal_orders):
            # Flatten the chosen per-cluster orderings into a single sequence.
            orderings.append(list(itertools.chain.from_iterable(combo)))
    return orderings

In [3]:
# Load the text-generation pipeline once; show() below reads this module-level
# name.  No `device` is passed, so the model is placed on the CPU.
polka = pipeline(task="text-generation", model="eryk-mazus/polka-1.1b")

def show(words):
    """Print the ten best and three worst scored orderings of ``words``.

    Orderings come from ``generate_sentences`` using the module-level
    ``polka`` pipeline and are already sorted by ascending score.

    Args:
        words: Sequence of word strings to reorder.
    """
    ranked = generate_sentences(words, polka)

    def report(entries):
        # One line per (score, sentence) entry, score shown to four decimals.
        for loss, text in entries:
            print(f"Score: {loss:.4f}, Sentence: {text}")

    print("Generated Sentences (from most natural to least natural):")
    report(ranked[:10])
    print("3 Worst sequences:")
    report(ranked[-3:])

# Demo inputs: each word list is passed to show(), which prints its ranking.
for example_words in (
    ['babuleńka', 'miała', 'dwa', 'rogate', 'koziołki'],
    ['wiewiórki', 'w', 'parku', 'zaczepiają', 'przechodniów'],
    # Disabled 13-word example, kept for reference:
    # ['wczoraj', 'wieczorem', 'spotkałem', 'pewną', 'wspaniałą', 'kobietę', 'która', 'z', 'pasją', 'opowiadała', 'o', 'modelach', 'językowych'],
    ['byłem', 'wczoraj', 'na', 'grzybach', 'w', 'lesie'],
):
    show(example_words)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Generated Sentences (from most natural to least natural):
Score: 3.2727, Sentence: Rogate babuleńka miała dwa koziołki.
Score: 3.3887, Sentence: Babuleńka miała dwa rogate koziołki.
Score: 3.4266, Sentence: Babuleńka miała dwa koziołki rogate.
Score: 3.4925, Sentence: Dwa koziołki miała rogate babuleńka.
Score: 3.5161, Sentence: Rogate dwa koziołki miała babuleńka.
Score: 3.5230, Sentence: Dwa koziołki rogate babuleńka miała.
Score: 3.5257, Sentence: Dwa rogate koziołki miała babuleńka.
Score: 3.5581, Sentence: Dwa koziołki rogate miała babuleńka.
Score: 3.5789, Sentence: Babuleńka miała rogate dwa koziołki.
Score: 3.5881, Sentence: Rogate koziołki babuleńka miała dwa.
3 Worst sequences:
Score: 4.2164, Sentence: Miała koziołki babuleńka dwa rogate.
Score: 4.2347, Sentence: Babuleńka rogate koziołki dwa miała.
Score: 4.2357, Sentence: Babuleńka koziołki rogate dwa miała.
Generated Sentences (from most natural to least natural):
Score: 2.1428, Sentence: Wiewiórki w parku zaczepiają przec