## **Download necessary packages and import important libraries**

mBART-50 is a multilingual sequence-to-sequence model pre-trained using the "Multilingual Denoising Pretraining" objective.

In [None]:
!pip install transformers -U -q
!pip install sentencepiece
!pip install datasets

!pip install langid
import langid

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import torch
import os
from tqdm import tqdm

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu

## **Build pre-trained Models**

In [None]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

# Checkpoint shared by the translation model and every tokenizer below.
MBART_CHECKPOINT = "facebook/mbart-large-50-many-to-many-mmt"

# A single model is loaded; the tokenizers are loaded from the same checkpoint
# and differ only in the `src_lang` code they are created with.
model = MBartForConditionalGeneration.from_pretrained(MBART_CHECKPOINT)
English_tokenizer = MBart50TokenizerFast.from_pretrained(MBART_CHECKPOINT, src_lang="en_XX")
Hindi_tokenizer = MBart50TokenizerFast.from_pretrained(MBART_CHECKPOINT, src_lang="hi_IN")
# Tokenizers for the remaining languages are currently disabled:
# Bengali_tokenizer = MBart50TokenizerFast.from_pretrained(MBART_CHECKPOINT, src_lang="bn_IN")
# Gujarati_tokenizer = MBart50TokenizerFast.from_pretrained(MBART_CHECKPOINT, src_lang="gu_IN")
# Malayalam_tokenizer = MBart50TokenizerFast.from_pretrained(MBART_CHECKPOINT, src_lang="ml_IN")
# Marathi_tokenizer = MBart50TokenizerFast.from_pretrained(MBART_CHECKPOINT, src_lang="mr_IN")
# Tamil_tokenizer = MBart50TokenizerFast.from_pretrained(MBART_CHECKPOINT, src_lang="ta_IN")
# Telugu_tokenizer = MBart50TokenizerFast.from_pretrained(MBART_CHECKPOINT, src_lang="te_IN")

Downloading:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/529 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/649 [00:00<?, ?B/s]

## **Language Support**

In [None]:
# Language name -> [tokenizer created with that source language, mBART language code].
# Only the languages whose tokenizer is actually loaded are enabled here.
lang_models = dict(
    English=[English_tokenizer, "en_XX"],
    Hindi=[Hindi_tokenizer, "hi_IN"],
    # Bengali=[Bengali_tokenizer, "bn_IN"],
    # Gujarati=[Gujarati_tokenizer, "gu_IN"],
    # Malayalam=[Malayalam_tokenizer, "ml_IN"],
    # Marathi=[Marathi_tokenizer, "mr_IN"],
    # Tamil=[Tamil_tokenizer, "ta_IN"],
    # Telugu=[Telugu_tokenizer, "te_IN"],
)

# Two-letter language code -> language name (the table detect_language looks codes up in).
lang_codes = dict(
    en="English",
    hi="Hindi",
    bn="Bengali",
    gu="Gujarati",
    ml="Malayalam",
    mr="Marathi",
    ta="Tamil",
    te="Telugu",
)

# Every language name from the table above except English, in table order.
languages = [name for code, name in lang_codes.items() if code != "en"]

## **Multilingual Translation**

In [None]:
def multilingual_translation(input_text, src_lang, target_lang):
  """Translate `input_text` from `src_lang` into `target_lang` with the shared mBART model.

  Both language arguments are keys of `lang_models` (for example "English" or
  "Hindi"); a name missing from that table raises KeyError. The source
  language's tokenizer is used for encoding and for decoding, and the target
  language code is forced as the first generated token.

  Returns the list of strings produced by `batch_decode`.
  """
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
  source_tokenizer = lang_models[src_lang][0]
  encoded_inputs = source_tokenizer(input_text, return_tensors="pt").to(device)
  translator = model.to(device)
  target_token_id = source_tokenizer.lang_code_to_id[lang_models[target_lang][1]]
  generated_tokens = translator.generate(**encoded_inputs, forced_bos_token_id=target_token_id)
  return source_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

## **Detect languages**

In [None]:
def detect_language(input_text):
  """Return the language name for `input_text`.

  The first element of langid's classification result is used as the key into
  `lang_codes`; a code that is missing from that table raises KeyError.
  """
  classification = langid.classify(input_text)
  detected_code = classification[0]
  return lang_codes[detected_code]

## **Import Test Dataset**

IIT Bombay English-Hindi corpus (https://www.cfilt.iitb.ac.in/iitb_parallel/)

In [None]:
from datasets import load_dataset
# Load the "cfilt/iitb-english-hindi" parallel corpus; the evaluation cell below
# reads the "translation" column of its "test" split.
dataset = load_dataset("cfilt/iitb-english-hindi")

In [None]:
# Mean sentence-level BLEU of the Hindi -> English translations over the test split.
# Tokenisation is plain whitespace splitting for both hypothesis and reference.
test_pairs = dataset["test"]["translation"]
score = 0
for translation_pair in tqdm(test_pairs):
  source_sentence = translation_pair["hi"]
  # sentence_bleu expects a list of references, hence the wrapping list.
  target_sentence = [translation_pair["en"].split()]
  output_sentence = multilingual_translation(source_sentence, "Hindi", "English")[0].split()
  score += sentence_bleu(target_sentence, output_sentence)
print("BLEU Score for Hindi to English translation : " + str(score/len(test_pairs)))

## **Translation Interface**

In [None]:
# Read one sentence from the user and print its English -> Hindi translation.
# multilingual_translation returns a list, so the printed value is a one-element list.
input_text = input()
print(multilingual_translation(input_text, "English", "Hindi"))

I went to river bank
['मैं नदी किनारे गया']


## **Import CodeGen pretrained models**

CodeGen is a family of autoregressive language models for program synthesis

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
# Checkpoint shared by the CodeGen tokenizer and model.
CODEGEN_CHECKPOINT = "Salesforce/codegen-350M-mono"

CodeGenTokenizer = AutoTokenizer.from_pretrained(CODEGEN_CHECKPOINT)
CodeGenModel = AutoModelForCausalLM.from_pretrained(CODEGEN_CHECKPOINT)

Downloading:   0%|          | 0.00/240 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/999 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/797M [00:00<?, ?B/s]

## **Helper Functions**

In [None]:
def prompt_generator(prompt, function):
  """Build the model prompt for a function-completion request.

  The result is the function signature, one space, and then the problem
  description wrapped in triple double quotes so that it reads as the
  function's docstring.
  """
  triple_quote = '"""'
  pieces = [function, " ", triple_quote, prompt, triple_quote]
  return "".join(pieces)

In [None]:
def get_function_name(function_signature):
  """Extract the function name from a signature such as "def sort(lst):".

  Leading whitespace, extra spaces around the name and an `async` prefix are
  accepted, so "async def fetch(url):" yields "fetch" rather than "def".

  Raises:
    ValueError: if the text does not start with a `def <name>` header.
  """
  import re

  # `[^\W\d]\w*` is an identifier: a letter or underscore followed by word characters.
  header = re.match(r"\s*(?:async\s+)?def\s+([^\W\d]\w*)", function_signature)
  if header is None:
    raise ValueError(
        "Expected a function signature like 'def sort(lst):', got: " + repr(function_signature))
  return header.group(1)

In [None]:
def output_cleaner(output_code, function_signature):
  """Extract the requested function from generated code and write it to user.py.

  The text is cut in front of every `def` that starts a line. Everything up to
  and including the first definition of the requested function is kept; later
  top-level definitions are dropped. The first triple-quoted block of that
  function (the prompt, which sits on the `def` line) is removed, and anything
  from a following triple quote onwards is discarded, as before.

  Splitting on line-leading `def` and matching the exact function name avoids
  cutting the code at identifiers or strings that merely contain "def" (such
  as `default`) and avoids selecting a function that only calls the target.

  Nothing is written when the requested function is not found.
  """
  import re

  entry_point = get_function_name(function_signature)
  parts = re.split(r"(?m)^(?=def\b)", output_code)
  entry_header = re.compile(r"def\s+" + re.escape(entry_point) + r"\s*\(")
  collection = parts[0]
  for part in parts[1:]:
    if entry_header.match(part):
      comment_parts = part.split("\"\"\"")
      if len(comment_parts) >= 3:
        # Drop the prompt docstring: a one-line `def ...: """..."""` followed by
        # an indented body would not be valid Python.
        function_code = comment_parts[0] + comment_parts[2]
      else:
        # No complete docstring to remove; keep the code untouched.
        function_code = part
      with open("user.py", "w") as text_file:
        print(collection + function_code, file=text_file)
      break
    collection = collection + part

In [None]:
def environment_check():
  """Delete `user.py` from the current working directory when it is listed there."""
  stale_module = "user.py"
  if stale_module not in os.listdir():
    return
  os.remove(stale_module)

## **Code Generation Interface**

## **Naive Interface**

**Example Input** :- Sort a python list of numbers and return the sorted list

In [None]:
# Read the task description and generate code for it on the GPU when one is available.
prompt = input("Enter description of the programming problem: ")
device = "cuda:0" if torch.cuda.is_available() else "cpu"
input_ids = CodeGenTokenizer(prompt, return_tensors="pt").input_ids.to(device)
CodeGenModel = CodeGenModel.to(device)
# `temperature` only affects sampling-based decoding; without do_sample=True
# generate() decodes greedily and the temperature setting is ignored.
generated_ids = CodeGenModel.generate(input_ids, max_length=500, do_sample=True, temperature = 0.8)
print(CodeGenTokenizer.decode(generated_ids[0], skip_special_tokens=True))

## **Advanced Interface**
### **Example Input** :- <br>
Function Signature - **def sort(lst):** <br>
Prompt - **Sort a python list of numbers and return the sorted list**

In [None]:
# Read the signature and description, generate a completion and store the cleaned function in user.py.
function_signature = input("Enter function signature for your programming problem (Eg. def sort(lst): ): ")
prompt = input("Enter description of the programming problem: ")
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Remove a user.py left over from a previous run before writing a new one.
environment_check()
input_ids = CodeGenTokenizer(prompt_generator(prompt, function_signature), return_tensors="pt").input_ids.to(device)
CodeGenModel = CodeGenModel.to(device)
# `temperature` only affects sampling-based decoding; without do_sample=True
# generate() decodes greedily and the temperature setting is ignored.
generated_ids = CodeGenModel.generate(input_ids, max_length=500, do_sample=True, temperature = 0.8)
output_cleaner(CodeGenTokenizer.decode(generated_ids[0], skip_special_tokens=True), function_signature)

### **Testing Interface**

In [None]:
import importlib
import user
# Presumably reloaded so that a user.py rewritten after the first import is picked up.
importlib.reload(user)
# The typed text is pasted between the parentheses of the call, i.e. it is the argument list.
testcase = input()
# NOTE(review): eval() executes whatever is typed above, and user.py contains
# model-generated code; run this cell only in a disposable environment.
eval("user."+get_function_name(function_signature)+"("+testcase+")")

## **HumanEval Benchmark**

In [None]:
!git clone https://github.com/openai/human-eval
!pip install -e human-eval

In [None]:
import gzip 
import json
import pandas as pd    

# The repository is cloned into the working directory by the cell above, so the
# path is given relative to it instead of assuming the Colab-only "/content" root.
HUMAN_EVAL_PATH = "human-eval/data/HumanEval.jsonl.gz"

# One JSON object per line; decode explicitly as UTF-8 rather than the platform default.
with gzip.open(HUMAN_EVAL_PATH, mode="rt", encoding="utf-8") as f:
    data = [json.loads(line) for line in f]

df = pd.DataFrame(data)

## **Function that writes the benchmark's test code for a problem to `checker.py`**

In [None]:
def test(human_eval, index):
  testing_code = human_eval['test'][index]
  l = "def" + testing_code.split("def")[1]
  with open("checker.py", "w") as text_file:
    print(l, file=text_file)

## **Clean the output of the model**

In [None]:
def clean(output_code, human_eval, index):
  """Append the generated solution for problem `index` to checker.py.

  The generated text is cut in front of every `def` that starts a line. The
  text up to and including the first definition of the problem's entry point
  is appended to checker.py; later top-level definitions are dropped.

  Compared with splitting on every "def" substring and matching the entry
  point by substring, this keeps docstrings and bodies containing "def"
  (e.g. 'abcdef', `defaultdict`) intact, ignores functions that merely call the
  entry point, and writes the solution only once.

  Nothing is appended when the entry point is not defined in `output_code`.
  """
  import re

  entry_point = human_eval['entry_point'][index]
  parts = re.split(r"(?m)^(?=def\b)", output_code)
  entry_header = re.compile(r"def\s+" + re.escape(entry_point) + r"\s*\(")
  collection = parts[0]
  for part in parts[1:]:
    if entry_header.match(part):
      with open("checker.py", "a") as text_file:
        print(collection + part, file=text_file)
      break
    collection = collection + part
    

In [None]:
import signal

def signal_handler(signum, frame):
  """Signal handler that aborts the interrupted code by raising Exception("Timed out!")."""
  timeout_message = "Timed out!"
  raise Exception(timeout_message)

## **Pass@k Metric**

In [None]:
def pass_at_k(tokenizer, model, human_eval, k, t):
  """Percentage of problems for which at least one of `k` sampled completions passes.

  Args:
    tokenizer: tokenizer used to encode each prompt and decode the samples.
    model: causal language model; it is moved to the GPU when one is available.
    human_eval: table with 'prompt', 'test' and 'entry_point' columns, indexed 0..len-1.
    k: number of completions sampled per problem.
    t: sampling temperature.

  Returns:
    100 * (solved problems) / (number of problems), or 0.0 for an empty table.

  For every sample, checker.py is rebuilt from the problem's test code, the
  cleaned completion and a final `check(<entry_point>)` call, and then executed
  with a 60 second limit. A problem counts as solved at the first passing sample.

  NOTE: the generated code is executed in this process without any sandboxing;
  run this only in a disposable environment.
  """
  num_problems = len(human_eval)
  if num_problems == 0:
    return 0.0
  total_correct = 0
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
  model = model.to(device)
  # The handler is the same for every sample, so install it once.
  signal.signal(signal.SIGALRM, signal_handler)
  for i in range(num_problems):
    prompt = human_eval['prompt'][i]
    print(str(i) + ". " + prompt)
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    generated_ids = model.generate(input_ids, max_length=500, do_sample=True, top_k=0, top_p=0.95, num_return_sequences=k, temperature = t)
    for j in range(k):
      test(human_eval, i)
      output_code = tokenizer.decode(generated_ids[j], skip_special_tokens=True)
      clean(output_code, human_eval, i)
      with open("checker.py", "a") as text_file:
        print(f"check({human_eval['entry_point'][i]})", file=text_file)
      with open("checker.py") as text_file:
        checker_source = text_file.read()
      passed = False
      signal.alarm(60)
      try:
        # A single fresh namespace serves as both globals and locals, so the
        # generated functions can see their own imports, helpers and themselves;
        # with separate dictionaries those lookups fail at call time.
        exec(checker_source, {"__name__": "__main__"})
        passed = True
      except KeyboardInterrupt:
        # Let the user stop the evaluation.
        raise
      except BaseException:
        # Any failure of the generated code (including the timeout or an
        # exit() call) counts as an incorrect sample.
        passed = False
      finally:
        # Cancel the pending alarm so it cannot fire later, e.g. during generation.
        signal.alarm(0)
        if os.path.exists("checker.py"):
          os.remove("checker.py")
      if passed:
        total_correct += 1
        break
  return total_correct*100/num_problems

In [None]:
# Report pass@k for k = 1, 5 and 10 at temperature 0.8; every k triggers a full benchmark run.
print()
for k in (1, 5, 10):
  print("Pass@" + str(k) + " for this model is " + str(pass_at_k(CodeGenTokenizer, CodeGenModel, df, k, 0.8)))