In [None]:
%pip install transformers==4.25.1

In [None]:
import torch
from tqdm import tqdm
# Set torch's default tensor type to torch.cuda.FloatTensor.
# NOTE(review): presumably this is what places the model on the GPU, since no
# explicit .to()/.cuda() call on the model is visible in this notebook - confirm.
torch.set_default_tensor_type(torch.cuda.FloatTensor)
from transformers import AutoTokenizer, AutoModelForCausalLM

## Load CodeGen Model

In [None]:
# Load the tokenizer published under the "Salesforce/codegen-2B-mono" checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-2B-mono")

In [None]:
# Load the causal language model from the same checkpoint name as the tokenizer.
model = AutoModelForCausalLM.from_pretrained("Salesforce/codegen-2B-mono")

## Sample 1

In [None]:
# Prompt: a bare function signature for the model to complete.
text_input = "def hello_world():"
# Tokenize and move the tensors to whichever device the model is on,
# rather than assuming GPU index 0.
inputs = tokenizer(text_input, return_tensors="pt").to(model.device)
# pad_token_id is passed explicitly (same value generate() would fall back to)
# so the "Setting `pad_token_id` to `eos_token_id`" warning is not emitted.
# max_length bounds the total sequence length (prompt plus generated tokens).
sample = model.generate(**inputs, max_length=128, pad_token_id=tokenizer.eos_token_id)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  attn_weights = torch.where(causal_mask, attn_weights, mask_value)


In [None]:
# Turn the first generated sequence back into text and print it.
# truncate_before_pattern lists the regexes handed to the tokenizer for
# cutting the decoded text short (see CodeGenTokenizer.decode).
print(
    tokenizer.decode(
        sample[0],
        truncate_before_pattern=[r"\n\n^#", "^'''", "\n\n\n"],
    )
)

def hello_world():
    return 'Hello, world!'


## Sample 2

In [None]:
# Prompt: a descriptive function name only, with no docstring or comment.
text_input = "def remove_first_last_character():"
# Tokenize and move the tensors to whichever device the model is on,
# rather than assuming GPU index 0.
inputs = tokenizer(text_input, return_tensors="pt").to(model.device)
# pad_token_id is passed explicitly (same value generate() would fall back to)
# so the "Setting `pad_token_id` to `eos_token_id`" warning is not emitted.
# max_length bounds the total sequence length (prompt plus generated tokens).
sample = model.generate(**inputs, max_length=128, pad_token_id=tokenizer.eos_token_id)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
# Turn the first generated sequence back into text and print it.
# truncate_before_pattern lists the regexes handed to the tokenizer for
# cutting the decoded text short (see CodeGenTokenizer.decode).
print(
    tokenizer.decode(
        sample[0],
        truncate_before_pattern=[r"\n\n^#", "^'''", "\n\n\n"],
    )
)

def remove_first_last_character():
    """
    Remove the first and last character of a string.
    """
    # Get the input
    input_string = input("Enter a string: ")

    # Remove the first and last character
    first_character = input_string[0]
    last_character = input_string[-1]
    new_string = input_string[1:-1]

    # Display the result
    print("The new string is:", new_string)


## Sample 3

In [None]:
# Prompt: a natural-language task description written as a Python comment.
text_input = "# Write a python function to remove first and last occurrence of a given character from the string."
# Tokenize and move the tensors to whichever device the model is on,
# rather than assuming GPU index 0.
inputs = tokenizer(text_input, return_tensors="pt").to(model.device)
# pad_token_id is passed explicitly (same value generate() would fall back to)
# so the "Setting `pad_token_id` to `eos_token_id`" warning is not emitted.
# max_length bounds the total sequence length (prompt plus generated tokens).
sample = model.generate(**inputs, max_length=128, pad_token_id=tokenizer.eos_token_id)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
# Turn the first generated sequence back into text and print it.
# truncate_before_pattern lists the regexes handed to the tokenizer for
# cutting the decoded text short (see CodeGenTokenizer.decode).
print(
    tokenizer.decode(
        sample[0],
        truncate_before_pattern=[r"\n\n^#", "^'''", "\n\n\n"],
    )
)

# Write a python function to remove first and last occurrence of a given character from the string.

def remove_char(str, char):
    return str.replace(char, "")

print(remove_char("Hello World", "W"))



## Sample 4

In [None]:
# Prompt: a terser comment-style task description. The spelling "occurence"
# is part of the text the model sees and is intentionally left untouched.
text_input = "# remove first and last occurence of a given character from a string"
# Tokenize and move the tensors to whichever device the model is on,
# rather than assuming GPU index 0.
inputs = tokenizer(text_input, return_tensors="pt").to(model.device)
# pad_token_id is passed explicitly (same value generate() would fall back to)
# so the "Setting `pad_token_id` to `eos_token_id`" warning is not emitted.
# max_length bounds the total sequence length (prompt plus generated tokens).
sample = model.generate(**inputs, max_length=128, pad_token_id=tokenizer.eos_token_id)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
# Turn the first generated sequence back into text and print it.
# truncate_before_pattern lists the regexes handed to the tokenizer for
# cutting the decoded text short (see CodeGenTokenizer.decode).
print(
    tokenizer.decode(
        sample[0],
        truncate_before_pattern=[r"\n\n^#", "^'''", "\n\n\n"],
    )
)

# remove first and last occurence of a given character from a string
#
# Input:
# str = "geeksforgeeks"
# char = 'e'
# Output:
# 'kgeeksf'
#
# Input:
# str = "geeksforgeeks"
# char = 'g'
# Output:
# 'eeksforge'
#
# Input:
# str = "geeksforgeeks"
# char = 'k'
# Output:
# 'eeksforge'
#
# Input:
# str = "ge


## Sample 5

In [None]:
# Prompt: an MBPP task description (the same text as eval_dataset[7]['text']),
# prefixed with "# " so it reads as a Python comment.
text_input = "# Write a function to remove characters from the first string which are present in the second string."
# Tokenize and move the tensors to whichever device the model is on,
# rather than assuming GPU index 0.
inputs = tokenizer(text_input, return_tensors="pt").to(model.device)
# pad_token_id is passed explicitly (same value generate() would fall back to)
# so the "Setting `pad_token_id` to `eos_token_id`" warning is not emitted.
# max_length bounds the total sequence length (prompt plus generated tokens).
sample = model.generate(**inputs, max_length=128, pad_token_id=tokenizer.eos_token_id)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
# Turn the first generated sequence back into text and print it.
# truncate_before_pattern lists the regexes handed to the tokenizer for
# cutting the decoded text short (see CodeGenTokenizer.decode).
print(
    tokenizer.decode(
        sample[0],
        truncate_before_pattern=[r"\n\n^#", "^'''", "\n\n\n"],
    )
)

# Write a function to remove characters from the first string which are present in the second string.


## Load MBPP dataset

In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset, concatenate_datasets

In [None]:
# Load the 'mbpp' dataset by name. The result is indexed by split name below
# ('train', 'test', 'validation', 'prompt').
dataset = load_dataset('mbpp')



  0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Pull the four named splits out of `dataset` in one unpacking step.
train, test, validation, prompt = (
    dataset[split_name] for split_name in ('train', 'test', 'validation', 'prompt')
)

# Only the test split is used for evaluation.
# Alternative (disabled): concatenate_datasets([train, test, validation, prompt])
# would evaluate on every split instead.
eval_dataset = test

# Display the size of each split, followed by the size of the evaluation set.
tuple(len(split) for split in (train, test, validation, prompt, eval_dataset))

(374, 500, 90, 10, 500)

In [None]:
# Inspect a single evaluation example (index 7) to see the fields each record carries.
eval_dataset[7]

{'task_id': 18,
 'text': 'Write a function to remove characters from the first string which are present in the second string.',
 'code': "NO_OF_CHARS = 256\r\ndef str_to_list(string): \r\n\ttemp = [] \r\n\tfor x in string: \r\n\t\ttemp.append(x) \r\n\treturn temp \r\ndef lst_to_string(List): \r\n\treturn ''.join(List) \r\ndef get_char_count_array(string): \r\n\tcount = [0] * NO_OF_CHARS \r\n\tfor i in string: \r\n\t\tcount[ord(i)] += 1\r\n\treturn count \r\ndef remove_dirty_chars(string, second_string): \r\n\tcount = get_char_count_array(second_string) \r\n\tip_ind = 0\r\n\tres_ind = 0\r\n\ttemp = '' \r\n\tstr_list = str_to_list(string) \r\n\twhile ip_ind != len(str_list): \r\n\t\ttemp = str_list[ip_ind] \r\n\t\tif count[ord(temp)] == 0: \r\n\t\t\tstr_list[res_ind] = str_list[ip_ind] \r\n\t\t\tres_ind += 1\r\n\t\tip_ind+=1\r\n\treturn lst_to_string(str_list[0:res_ind]) ",
 'test_list': ['assert remove_dirty_chars("probasscurve", "pros") == \'bacuve\'',
  'assert remove_dirty_chars("dig

## Load Summarizer Model

## Prediction

In [None]:
# Generate one completion per evaluation problem and collect it alongside the
# reference solution: pred_code[k] and gt[k] refer to the same example.
pred_code = []
gt = []
# Registers '[PAD]' as the tokenizer's pad token. Each prompt below is
# tokenized on its own without padding, so this loop does not rely on it.
# NOTE(review): the model's embeddings are not resized after adding the
# token - confirm whether later batched code needs that.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# Iterate over the examples directly: one record fetch per problem instead of
# three separate eval_dataset[i] lookups.
for example in tqdm(eval_dataset):
  text = example['text']
  gt_code = example['code']
  # Present the task description to the model as a Python comment.
  new_text = "# " + text
  # Move the tensors to the model's device rather than assuming GPU index 0.
  inputs = tokenizer(new_text, return_tensors="pt").to(model.device)
  # Explicit pad_token_id (the EOS id) avoids the
  # "Setting `pad_token_id` to `eos_token_id`" warning on every iteration.
  # max_length bounds the total sequence length (prompt plus generated tokens).
  sample = model.generate(**inputs, max_length=128, pad_token_id=tokenizer.eos_token_id)
  # batch_decode returns one string per generated sequence; only one is produced here.
  output = tokenizer.batch_decode(sample, truncate_before_pattern=[r"\n\n^#", "^'''", "\n\n\n"])[0]
  pred_code.append(output)
  gt.append(gt_code)

In [None]:
# Reference solution collected for the evaluation example at index 3.
gt[3]

'def find_Volume(l,b,h) : \r\n    return ((l * b * h) / 2) '

In [None]:
# Model output collected for the same example (index 3), for comparison with gt[3].
pred_code[3]

'# Write a python function to find the volume of a triangular prism.\n# The function should accept the following inputs:\n# a, b, c, d, e, f\n# The function should return the volume of the prism.\n# The volume of a prism is defined as:\n# V = a * b * c\n# where a, b, c are the lengths of the sides of the prism.\n# Note: The prism is assumed to be a rectangular prism.\n# Hint: You can use the product rule to calculate the volume.\n# Input Format\n# The first line contains the'