# Functions

In [41]:
import os
import json 


# FUNCTIONS FOR GENERAL USE

def load_examples_from_json(file_path):
    """
    Loads examples from a JSON file and returns them as a list of dictionaries.
    
    Parameters:
    - file_path (str): Path to the JSON file containing examples.
    
    Returns:
    - examples (list): A list of dictionaries, each with 'prefix', 'middle', and 'suffix'.
    """
    with open(file_path, "r") as f:
        examples = json.load(f)
    return examples


# FUNCTIONS FOR DATASET CREATION

def load_code_files(directory_path, file_extension=".py"):
    """
    Loads code files from a specified directory and returns their content along with the count of files.

    Parameters:
    ---
    - directory_path (str): Path to the directory containing code files.
    - file_extension (str): File extension to filter code files, default is ".py".

    Returns:
    ---
    - tuple: A tuple containing:
        - code_snippets (list of str): List of code content strings from each file.
        - files_num (int): Number of files processed and loaded.

    Example:
    ---
    >>> code_snippets, files_num = load_code_files("path/to/directory")
    >>> print(files_num)
    3  # Assuming there were 3 files with .py extension
    """
    code_snippets = []
    files_num = 0
    for filename in os.listdir(directory_path):
        if filename.endswith(file_extension):
            with open(os.path.join(directory_path, filename), "r") as f:
                code_snippets.append(f.read())
            files_num += 1
    return code_snippets, files_num

def adjust_empty_sections(prefix, middle, suffix):
    """
    Adjusts the prefix, middle, and suffix sections of code to ensure none are empty.
    
    If `middle` is empty, it moves one line from `suffix` to `middle` and removes this line from `suffix`.
    If `suffix` is empty, it shifts one line up from `prefix` to `middle` and from `middle` to `suffix`,
    ensuring all three sections are populated if possible.

    Parameters:
    ---
    - prefix (str): The prefix section of code, representing content before the cursor.
    - middle (str): The middle section, representing code where a user might expect completion.
    - suffix (str): The suffix section of code, representing content after the cursor.

    Returns:
    ---
    - tuple: A tuple containing the adjusted `prefix`, `middle`, and `suffix` strings.

    Example:
    ---
    >>> prefix, middle, suffix = adjust_empty_sections("line1\nline2", "", "line3\nline4")
    >>> print(middle)
    "line3"  # The first line from suffix moves to middle
    """
    # Split prefix, middle, and suffix into lines for easier manipulation
    prefix_lines = prefix.splitlines()
    suffix_lines = suffix.splitlines()

    # If middle is empty, add one line from suffix and remove it from suffix
    if not middle.strip() and suffix_lines:
        middle = suffix_lines.pop(0)
        suffix = "\n".join(suffix_lines)
    
    # If suffix is empty, shift lines up to fill suffix and middle from prefix
    if not suffix.strip():
        # If possible, add the last line of prefix to middle
        if prefix_lines:
            middle = prefix_lines.pop()
            prefix = "\n".join(prefix_lines)
    
    return prefix, middle, suffix


# FUNCTIONS FOR MODEL APPLICATION 

import torch

def generate_completion(model, tokenizer, device, prefix, suffix, max_new_tokens=2000):
    """Generate code completion for the missing middle part given prefix and suffix."""
    
    # Format the input using the <fim_prefix>, <fim_suffix>, and <fim_middle> tokens
    input_text = f"<fim_prefix>{prefix}<fim_suffix>{suffix}<fim_middle>"
    
    # Encode the input and move it to the device (if required by model setup)
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).to(device)
    
    # Generate the completion without specifying max_length, but limiting new tokens
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_new_tokens,  # Limit only the generated tokens
        pad_token_id=tokenizer.pad_token_id
    )
    
    # Decode the output and return only the generated middle part
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)
    print(generated_text)
    print(type(generated_text))
    # Remove prefix and suffix to get only the generated middle part
    start_index = generated_text.find(prefix) + len(prefix)
    end_index = generated_text.find(suffix, start_index)
    
    middle_text = generated_text[generated_text.find("<fim_middle>") + len("<fim_middle>"): len(generated_text) - len("<|endoftext|>")]
    return middle_text


# FUNCTIONS FOR EVALUATIONS

from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.chrf_score import sentence_chrf


def exact_match(predicted, actual):
    """Check if the predicted text matches the actual text exactly."""
    return int(predicted.strip() == actual.strip())

def compute_bleu(predicted, actual):
    """Compute BLEU score."""
    weights = [
        (1.),
        (1./2., 1./2.),
        (1./3., 1./3., 1./3.),
        (1./4., 1./4., 1./4., 1./4.)]
    return sentence_bleu([actual.split()], predicted.split(), weights=weights)

def compute_chrf(predicted, actual):
    """Compute ChrF score."""
    return sentence_chrf([actual], predicted)

# Run the Model

In [42]:
import os
import json
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm 

# Disable parallelism in tokenizers
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("bigcode/tiny_starcoder_py")

model = AutoModelForCausalLM.from_pretrained("bigcode/tiny_starcoder_py")

tokenizer.add_special_tokens({"pad_token": "[PAD]"})
# Load the data
examples = load_examples_from_json("data/code/code.json")

examples = examples[:2]
print(examples[0])

# Generate completions
for example in tqdm(examples):
    example["generated_middle"], outputs, generated_text, start_index, end_index = generate_completion(model, tokenizer, model.device, example["prefix"], example["suffix"])

print(examples[0])

# Optionally, save examples to a JSON file
with open("data/code/code.json", "w") as f:
    json.dump(examples, f, indent=4)

{'prefix': 'import torch \nimport sys \nimport os\n\nimport torch.nn as nn \nfrom config import device\n# Hyperparameters \nsequence_length = 28 \n\n# create RNN\nclass RNN(nn.Module):\n    def __init__(self, input_size, hidden_size, num_layers, num_classes):\n        super(RNN, self).__init__()\n        self.hidden_size = hidden_size\n        self.num_layers = num_layers\n        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)\n        self.fc = nn.Linear(hidden_size*sequence_length, num_classes)\n\n    def forward(self, x):', 'middle': '        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)', 'suffix': '\n        # forward prop \n        out, _ = self.rnn(x, h0)\n        out = out.reshape(out.shape[0], -1)\n        out = self.fc(out)\n        \n        return out\n', 'generated_middle': '<fim_suffix>'}


  0%|          | 0/2 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
 50%|█████     | 1/2 [01:36<01:36, 96.99s/it]

<fim_prefix>import torch 
import sys 
import os

import torch.nn as nn 
from config import device
# Hyperparameters 
sequence_length = 28 

# create RNN
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size*sequence_length, num_classes)

    def forward(self, x):<fim_suffix>
        # forward prop 
        out, _ = self.rnn(x, h0)
        out = out.reshape(out.shape[0], -1)
        out = self.fc(out)
        
        return out
<fim_middle>
        # forward forward prop 
        out, _ = self.rnn(x)
        out = out.reshape(out.shape[0], -1)
        out = self.fc(out)
        
        return out

# create CNN
class CNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        su

100%|██████████| 2/2 [01:37<00:00, 48.92s/it]

<fim_prefix>import torch 
import sys 
import os

import torch.nn as nn 
from config import device
# Hyperparameters 
sequence_length = 28 

# create RNN<fim_suffix>    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size*sequence_length, num_classes)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)

        # forward prop 
        out, _ = self.rnn(x, h0)
        out = out.reshape(out.shape[0], -1)
        out = self.fc(out)
        
        return out
<fim_middle>
class RNN(nn.Module):
<|endoftext|>
<class 'str'>
{'prefix': 'import torch \nimport sys \nimport os\n\nimport torch.nn as nn \nfrom config import device\n# Hyperparameters \nsequence_length = 28 \n\n# create RNN\nclass R




-1

In [40]:
print(generated_text[generated_text.find("<fim_middle>") + len("<fim_middle>"): len(generated_text) - len("<|endoftext|>")])


class RNN(nn.Module):



In [11]:
tokenizer.add_special_tokens({"pad_token": "\t"})
tokenizer.add_special_tokens({"pad_token": "\t"})
tokenizer.add_special_tokens({"pad_token": "\t"})

0

In [12]:
tokenizer()

49152