In [1]:
import torch
from transformers import GPTNeoModel, GPT2Tokenizer

# Load the GPT-Neo-125M model
model = GPTNeoModel.from_pretrained("EleutherAI/gpt-neo-125M")

# Tokenizer for GPT-Neo
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-125M")

# Identify a suitable split point (e.g., after 6 transformer blocks out of 12)
split_point = 6

# Define Part 1 of the model (embedding layers + first few blocks)
class GPTNeoPart1(torch.nn.Module):
    def __init__(self, original_model, split_point):
        super().__init__()
        self.wte = original_model.wte  # Word embeddings
        self.h = torch.nn.ModuleList(original_model.h[:split_point])  # First half of the transformer blocks

    def forward(self, input_ids, attention_mask=None):
        # Compute embeddings
        input_embeds = self.wte(input_ids)
        hidden_states = input_embeds

        # Pass through the transformer blocks in part 1
        for block in self.h:
            hidden_states = block(hidden_states, attention_mask=attention_mask)[0]  # Extract hidden states
        return hidden_states


# Define Part 2 of the model (remaining blocks + final layer norm)
class GPTNeoPart2(torch.nn.Module):
    def __init__(self, original_model, split_point):
        super().__init__()
        self.h = torch.nn.ModuleList(original_model.h[split_point:])  # Remaining transformer blocks
        self.ln_f = original_model.ln_f  # Final layer norm

    def forward(self, hidden_states, attention_mask=None):
        # Pass through the remaining transformer blocks
        for block in self.h:
            hidden_states = block(hidden_states, attention_mask=attention_mask)[0]  # Extract hidden states

        # Apply final normalization
        output_states = self.ln_f(hidden_states)
        return output_states


# Create the split models
model_part1 = GPTNeoPart1(model, split_point)
model_part2 = GPTNeoPart2(model, split_point)

# Save the split models
torch.save(model_part1.state_dict(), "gpt_neo_part1.pth")
torch.save(model_part2.state_dict(), "gpt_neo_part2.pth")
print("Models saved successfully!")

# Reload the models for inference
model_part1 = GPTNeoPart1(model, split_point)
model_part1.load_state_dict(torch.load("gpt_neo_part1.pth"))
model_part1.eval()

model_part2 = GPTNeoPart2(model, split_point)
model_part2.load_state_dict(torch.load("gpt_neo_part2.pth"))
model_part2.eval()
print("Models loaded successfully!")


config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/526M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Models saved successfully!


  model_part1.load_state_dict(torch.load("gpt_neo_part1.pth"))
  model_part2.load_state_dict(torch.load("gpt_neo_part2.pth"))


Models loaded successfully!


In [6]:
# Example input text
text = "How are you?"
inputs = tokenizer(text, return_tensors="pt")
input_ids = inputs["input_ids"]

# Part 1 inference
hidden_states = model_part1(input_ids)

# Part 2 inference
output_states = model_part2(hidden_states)

# Print final output shape
print("Final output shape:", output_states.shape)



Final output shape: torch.Size([1, 4, 768])


In [19]:
import torch
from transformers import GPTNeoForCausalLM, GPT2Tokenizer

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
causal_model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-125M")

# Example input text
text = "How are you?"
inputs = tokenizer(text, return_tensors="pt")
input_ids = inputs["input_ids"]

# Generate text using the full model
generated_ids = causal_model.generate(
    input_ids=input_ids,
    max_length=50,            # Set max tokens in the response
    temperature=0.7,          # Sampling temperature for randomness
    top_k=50,                 # Top-k sampling
    top_p=0.9,                # Nucleus sampling (cumulative probability)
    repetition_penalty=1.2,   # Penalize repeated tokens
    do_sample=True,           # Enable sampling
)

# Decode generated token IDs into text
decoded_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print("Generated text:", decoded_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated text: How are you?

You have been diagnosed with a serious neurological condition, known as "waking eye syndrome". This is when your vision starts to fail and you have trouble seeing or seeing things. You may have a blurred vision, or even


In [26]:
import torch
from transformers import GPTNeoModel, GPT2Tokenizer

# ----- 1) Load base model & tokenizer -----
base_model_name = "EleutherAI/gpt-neo-125M"
base_model = GPTNeoModel.from_pretrained(base_model_name)
tokenizer = GPT2Tokenizer.from_pretrained(base_model_name)

# Some GPT-Neo tokenizers lack a pad_token, so reuse eos_token:
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# ----- 2) Define split modules -----
class GPTNeoPart1(torch.nn.Module):
    def __init__(self, original_model, split_point):
        super().__init__()
        self.wte = original_model.wte  # word embedding
        self.h = torch.nn.ModuleList(original_model.h[:split_point])  # first blocks

    def forward(self, input_ids):
        # input_ids shape: [batch_size, seq_len]
        inputs_embeds = self.wte(input_ids)  # [batch_size, seq_len, hidden_dim]
        hidden_states = inputs_embeds
        for block in self.h:
            hidden_states = block(hidden_states)[0]
        return hidden_states

class GPTNeoPart2(torch.nn.Module):
    def __init__(self, original_model, split_point):
        super().__init__()
        self.h = torch.nn.ModuleList(original_model.h[split_point:])  # remaining blocks
        self.ln_f = original_model.ln_f  # final layer norm

        hidden_size = original_model.config.hidden_size
        vocab_size = original_model.config.vocab_size

        # Language modeling head
        self.lm_head = torch.nn.Linear(hidden_size, vocab_size, bias=False)
        # tie lm_head weights to the input embedding (wte)
        self.lm_head.weight = original_model.wte.weight

    def forward(self, hidden_states):
        for block in self.h:
            hidden_states = block(hidden_states)[0]
        hidden_states = self.ln_f(hidden_states)
        logits = self.lm_head(hidden_states)  # [batch_size, seq_len, vocab_size]
        return logits

# ----- 3) Instantiate the split model parts -----
split_point = 6  # for GPT-Neo-125M, which has 12 total blocks
model_part1 = GPTNeoPart1(base_model, split_point).eval()
model_part2 = GPTNeoPart2(base_model, split_point).eval()

# ----- 4) Autoregressive generation with top-k / top-p sampling -----
@torch.no_grad()
def generate_autoregressive(
    model1,
    model2,
    tokenizer,
    prompt,
    max_new_tokens=50,
    top_k=50,
    top_p=0.9,
    temperature=1.0
):
    """
    Autoregressively generates text using two-part GPT-Neo.
    """
    # Tokenize the initial prompt
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids  # shape: [1, seq_len]
    generated_ids = input_ids.clone()

    for _ in range(max_new_tokens):
        # 1) Pass full sequence through Part1
        hidden_states = model1(generated_ids)
        # 2) Pass the resulting hidden states to Part2 for logits
        logits = model2(hidden_states)  # [1, seq_len, vocab_size]

        # Focus on the last token's logits
        next_token_logits = logits[:, -1, :]  # [1, vocab_size]

        # -- Apply temperature --
        if temperature != 1.0:
            next_token_logits = next_token_logits / temperature

        # -- Top-k filtering --
        if top_k is not None and top_k > 0:
            top_k = min(top_k, next_token_logits.size(-1))  # Safety
            # Get top_k logits
            values_to_keep, _ = torch.topk(next_token_logits, top_k)
            min_val = values_to_keep[0, -1]  # smallest logit in top_k
            next_token_logits[next_token_logits < min_val] = -float('Inf')

        # -- Top-p (nucleus) filtering --
        if top_p is not None and top_p < 1.0:
            sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True)
            cumulative_probs = torch.softmax(sorted_logits, dim=-1).cumsum(dim=-1)
            # find cutoff index
            cutoff_idx = torch.sum(cumulative_probs <= top_p).item()
            # set everything after cutoff to -Inf
            if cutoff_idx < sorted_logits.size(-1):
                sorted_logits[0, cutoff_idx+1:] = -float('Inf')
            # map back
            next_token_logits.fill_(-float('Inf'))
            next_token_logits.scatter_(1, sorted_indices, sorted_logits)

        # -- Sample from the filtered distribution --
        probs = torch.softmax(next_token_logits, dim=-1)
        next_token_id = torch.multinomial(probs, num_samples=1)  # shape: [1, 1]
        
        # Append next token
        generated_ids = torch.cat([generated_ids, next_token_id], dim=1)

        # OPTIONAL: stop at EOS if desired
        if tokenizer.eos_token_id is not None and next_token_id.item() == tokenizer.eos_token_id:
            break

    # Decode everything (prompt + newly generated tokens)
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# ----- 5) Run generation on a single prompt -----
# Use a prompt with enough context to steer the model:
prompt_text = (
    "Explain why ChatGPT sometimes makes mistakes. "
    "Consider the limitations of large language models and training data."
)
output_text = generate_autoregressive(
    model_part1,
    model_part2,
    tokenizer,
    prompt_text,
    max_new_tokens=100,
    top_k=50,
    top_p=0.9,
    temperature=1.0
)

print("PROMPT:")
print(prompt_text)
print("\nGENERATED TEXT:")
print(output_text)


PROMPT:
Explain why ChatGPT sometimes makes mistakes. Consider the limitations of large language models and training data.

GENERATED TEXT:
Explain why ChatGPT sometimes makes mistakes. Consider the limitations of large language models and training data.
 as






 * �
 " (
 " ( in

 � (




 (






 �
 (� (




 (


 �


 * * * � in



 * �





 ( ( ( ( �
 * �
 (
 �: �
� * �
 �


 � in�
 � �

