In [None]:
# Tokenization

from transformers import AutoTokenizer
# To use tokenizers, we import them from the transformers library

# There are many available, use the ID of the model you want to use
# Qwen "Qwen/Qwen2-0.5B"
# GPT-2 "openai-community/gpt2"
# SmolLM "HuggingFaceTB/SomlLM-135M"

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B")
prompt = "It was a dark and stormy"
input_ids = tokenizer(prompt).input_ids
print(input_ids)





[2132, 572, 264, 6319, 323, 13458, 88]


In [None]:
# Print the token number for each word

for t in input_ids:
  print(t, "\t:", tokenizer.decode(t))

2132 	: It
572 	:  was
264 	:  a
6319 	:  dark
323 	:  and
13458 	:  storm
88 	: y


In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

for token_id in range(0, 50):
  token = tokenizer.decode(token_id)
  print(token_id, "\t:", token)


0 	: !
1 	: "
2 	: #
3 	: $
4 	: %
5 	: &
6 	: '
7 	: (
8 	: )
9 	: *
10 	: +
11 	: ,
12 	: -
13 	: .
14 	: /
15 	: 0
16 	: 1
17 	: 2
18 	: 3
19 	: 4
20 	: 5
21 	: 6
22 	: 7
23 	: 8
24 	: 9
25 	: :
26 	: ;
27 	: <
28 	: =
29 	: >
30 	: ?
31 	: @
32 	: A
33 	: B
34 	: C
35 	: D
36 	: E
37 	: F
38 	: G
39 	: H
40 	: I
41 	: J
42 	: K
43 	: L
44 	: M
45 	: N
46 	: O
47 	: P
48 	: Q
49 	: R


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

text = "It was a dark and stormy"
inputs = tokenizer(text, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits[0, -1]
    probs = torch.softmax(logits, dim=-1)

top_probs, top_ids = torch.topk(probs, 20)

print("Top 10 next WORDS:")
words = []
count = 0
for p, tid in zip(top_probs, top_ids):
    token = tokenizer.decode([tid.item()])
    if token.startswith(" ") and token.strip().isalpha():
        print(token.strip(), ":", round(float(p)*100, 2), "%")
        words.append(token.strip())
        count += 1
        if count == 10:
            break

print("\nP+7 word is:", words[6])

Top 10 next WORDS:
night : 46.18 %
day : 23.46 %
evening : 5.87 %
morning : 4.42 %
afternoon : 4.11 %
summer : 1.34 %
time : 1.33 %
winter : 1.22 %
weekend : 0.39 %
one : 0.34 %

P+7 word is: time


## ***P+7 technique***

In [48]:
import re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load GPT-2
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# The poem
poem = """One must have a mind of winter
To regard the frost and the boughs
Of the pine-trees crusted with snow;
And have been cold a long time
To behold the junipers shagged with ice,
The spruces rough in the distant glitter
Of the January sun; and not to think
Of any misery in the sound of the wind,
In the sound of a few leaves,
Which is the sound of the land
Full of the same wind
That is blowing in the same bare place
For the listener, who listens in the snow,
And, nothing himself, beholds
Nothing that is not there and the nothing that is."""

lines = poem.split("\n")

# Function that returns the k-th most likely next word
def p_plus_k(prefix, k):
    inputs = tokenizer(prefix, return_tensors="pt")
    logits = model(**inputs).logits[0, -1]
    probs = torch.softmax(logits, dim=-1)
    # Take the most likely tokens
    top_ids = torch.topk(probs, 2000).indices

    # Keep only tokens that look like real words
    words = []
    for tid in top_ids:
        tok = tokenizer.decode([tid.item()])
        w = tok.strip()
        if tok.startswith(" ") and w.replace("-", "").replace("'", "").isalpha():
            words.append(w)
        if len(words) >= k:
            break

    return words[k-1]

# the value of k
K = 7

new_lines = []
# Replace the last word of each line using P+k
for line in lines:
    prefix = line.rsplit(" ", 1)[0] + " "
    new_word = p_plus_k(prefix, K)
    new_lines.append(prefix + new_word)

# Join all the modified lines back into a poem
result = "\n".join(new_lines)
print(result)

# Save the result
with open(f"P_plus_{K}.txt", "w") as f:
    f.write(result)

One must have a mind of for
To regard the frost and the Glac
Of the pine-trees crusted with and
And have been cold a long and
To behold the junipers shagged with Moines
The spruces rough in the distant Mountains
Of the January sun; and not to do
Of any misery in the sound of the sounds
In the sound of a few tuned
Which is the sound of the elevator
Full of the same though
That is blowing in the same bare of
For the listener, who listens in the Pond
And, nothing himself, did
Nothing that is not there and the nothing that happened


## ***P+39 technique***

In [49]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load GPT-2
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
model.eval()

# The poem
poem = """One must have a mind of winter
To regard the frost and the boughs
Of the pine-trees crusted with snow;
And have been cold a long time
To behold the junipers shagged with ice,
The spruces rough in the distant glitter
Of the January sun; and not to think
Of any misery in the sound of the wind,
In the sound of a few leaves,
Which is the sound of the land
Full of the same wind
That is blowing in the same bare place
For the listener, who listens in the snow,
And, nothing himself, beholds
Nothing that is not there and the nothing that is."""

lines = poem.split("\n")

# Function: get the k-th most likely next word
def p_plus_k(prefix, k):
    inputs = tokenizer(prefix, return_tensors="pt")
    logits = model(**inputs).logits[0, -1]
    probs = torch.softmax(logits, dim=-1)
    # Take the most likely tokens
    top_ids = torch.topk(probs, 2000).indices

    # Keep only tokens that look like real words
    words = []
    for tid in top_ids:
        tok = tokenizer.decode([tid.item()])
        w = tok.strip()
        if tok.startswith(" ") and w.replace("-", "").replace("'", "").isalpha():
            words.append(w)
        if len(words) >= k:
            break

    return words[k-1]

# the value of k
K = 39

new_lines = []
# Replace the last word of each line using P+k
for line in lines:
    prefix = line.rsplit(" ", 1)[0] + " "
    new_word = p_plus_k(prefix, K)
    new_lines.append(prefix + new_word)

# Join all the modified lines back into a poem
result = "\n".join(new_lines)
print(result)

# Save the result
with open(f"P_plus_{K}.txt", "w") as f:
    f.write(result)

One must have a mind of grind
To regard the frost and the zombies
Of the pine-trees crusted with exactly
And have been cold a long tour
To behold the junipers shagged with used
The spruces rough in the distant Inner
Of the January sun; and not to to
Of any misery in the sound of the har
In the sound of a few practices
Which is the sound of the heard
Full of the same Gro
That is blowing in the same bare Hancock
For the listener, who listens in the ring
And, nothing himself, as
Nothing that is not there and the nothing that been
