In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import time

In [8]:
def predict_code(tokenizer, model, input_code, device='cpu', max_length=100):
    """Generate a completion for ``input_code`` and return it as text.

    Args:
        tokenizer: Object exposing ``encode(text, return_tensors='pt')`` and
            ``decode(ids, skip_special_tokens=True)``.
        model: Object exposing ``to(device)`` and
            ``generate(input_ids, max_length=...)``.
        input_code: Prompt string the model should continue.
        device: Device the model and the encoded prompt are moved to.
        max_length: Length cap forwarded to ``model.generate``. Defaults to
            100, the value that used to be hard-coded. For Hugging Face
            models this cap counts the prompt tokens as well as the newly
            generated ones, so long prompts leave less room for output.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens skipped.
    """
    # The return value of .to() is ignored, so this relies on the model being
    # moved in place; the caller's model object ends up on `device`.
    model.to(device)
    input_ids = tokenizer.encode(input_code, return_tensors='pt').to(device)
    # Inference only: no autograd bookkeeping is needed while generating.
    with torch.no_grad():
        output_ids = model.generate(input_ids, max_length=max_length)
    # generate() returns a batch; the single prompt is at index 0.
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

def calculate_throughput(tokenizer, model, input_code, device='cpu', num_runs=1, max_length=100):
    """Measure how many ``model.generate`` calls complete per second.

    The prompt is encoded once, outside the timed region, and then
    ``generate`` is called ``num_runs`` times back to back. The result is
    *generate calls per second*, not tokens per second.

    Args:
        tokenizer: Object exposing ``encode(text, return_tensors='pt')``.
        model: Object exposing ``to(device)`` and
            ``generate(input_ids, max_length=...)``.
        input_code: Prompt string used for every run.
        device: Device the model and the encoded prompt are moved to.
        num_runs: Number of timed ``generate`` calls.
        max_length: Length cap forwarded to ``model.generate``. Defaults to
            100, the value that used to be hard-coded.

    Returns:
        ``num_runs`` divided by the elapsed seconds, as a float. If the clock
        registers no elapsed time, returns ``inf`` (or ``0.0`` when no runs
        were requested) instead of raising ``ZeroDivisionError``.
    """
    # The return value of .to() is ignored, so this relies on the model being
    # moved in place; the caller's model object ends up on `device`.
    model.to(device)
    input_ids = tokenizer.encode(input_code, return_tensors='pt').to(device)

    # CUDA kernels are launched asynchronously; without a synchronize on both
    # sides of the timed region the clock could miss queued GPU work.
    on_cuda = input_ids.device.type == 'cuda'

    with torch.no_grad():
        if on_cuda:
            torch.cuda.synchronize(input_ids.device)
        # perf_counter is monotonic and high-resolution; time.time() can jump
        # with system clock adjustments and has coarse resolution on some OSes.
        start_time = time.perf_counter()
        for _ in range(num_runs):
            model.generate(input_ids, max_length=max_length)
        if on_cuda:
            torch.cuda.synchronize(input_ids.device)
        elapsed = time.perf_counter() - start_time

    if elapsed <= 0.0:
        # Interval too short for the clock to resolve.
        return float('inf') if num_runs > 0 else 0.0
    return num_runs / elapsed

In [3]:
# Prompt handed to the model: a bare function signature for it to complete.
input_code = "def hello_world():"

In [5]:
# Load the tokenizer and the causal-LM weights by repository id.
# SECURITY: trust_remote_code=True lets Python code shipped inside these
# repositories run on this machine; only keep it for repos whose code has
# been reviewed.
# NOTE(review): the tokenizer and the model come from two different repos
# ("budecosystem/..." vs "marco-molinari/...") — presumably the model is a
# fine-tune that shares the base tokenizer; confirm the vocabularies match.
tokenizer = AutoTokenizer.from_pretrained("budecosystem/code-millenials-1b", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("marco-molinari/python-code-millenials-1b", trust_remote_code=True)
# Switch to evaluation mode. As the cell's last expression, the returned
# model is also what the notebook displays (the long module repr below).
model.eval()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/688M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/69.0 [00:00<?, ?B/s]

PhiForCausalLM(
  (transformer): PhiModel(
    (embd): Embedding(
      (wte): Embedding(51200, 2048)
      (drop): Dropout(p=0.0, inplace=False)
    )
    (h): ModuleList(
      (0-23): 24 x ParallelBlock(
        (ln): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.0, inplace=False)
        (mixer): MHA(
          (rotary_emb): RotaryEmbedding()
          (Wqkv): Linear(in_features=2048, out_features=6144, bias=True)
          (out_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (inner_attn): SelfAttention(
            (drop): Dropout(p=0.0, inplace=False)
          )
          (inner_cross_attn): CrossAttention(
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
        (mlp): MLP(
          (fc1): Linear(in_features=2048, out_features=8192, bias=True)
          (fc2): Linear(in_features=8192, out_features=2048, bias=True)
          (act): NewGELUActivation()
        )
      )
    )
  )
  (lm_h

In [12]:
# Run a single completion with the default device and show the decoded text.
completed_code = predict_code(
    tokenizer=tokenizer,
    model=model,
    input_code=input_code,
)
print("Predicted Code:", completed_code)

Predicted Code: def hello_world():
    return "Hello, World!"

if __name__ == "__main__":
    print(hello_world())
```

In this example, the `hello_world` function is defined to return the string "Hello, World!". The `if __name__ == "__main__":` block is used to ensure that the function is only executed when the script is run directly (i.e., not imported as a module
