Adapted from https://huggingface.co/transformers/v3.0.2/notebooks.html

In [None]:
!pip install transformers

In [None]:
from transformers import RobertaTokenizer, T5ForConditionalGeneration


# Load the CodeT5 tokenizer (base checkpoint) together with the
# multi-language code-summarisation checkpoint of the model.
tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-base')
model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base-multi-sum')

# Sample Python function used as a smoke test for the summariser.
text = """def svg_to_image(string, size=None):
if isinstance(string, unicode):
    string = string.encode('utf-8')
    renderer = QtSvg.QSvgRenderer(QtCore.QByteArray(string))
if not renderer.isValid():
    raise ValueError('Invalid SVG data.')
if size is None:
    size = renderer.defaultSize()
    image = QtGui.QImage(size, QtGui.QImage.Format_ARGB32)
    painter = QtGui.QPainter(image)
    renderer.render(painter)
return image"""

# Tokenise the snippet into a PyTorch tensor of token ids.
encoded = tokenizer(text, return_tensors="pt")
input_ids = encoded.input_ids

# Generate a summary of at most 20 tokens and decode it back to text.
generated_ids = model.generate(input_ids, max_length=20)
summary = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(summary)
# this prints: "Convert a SVG string to a QImage."


In [None]:
import torch
import pandas as pd

In [None]:
# Read the training CSV, keep only the code/docstring columns, and drop every
# row where either of the two is missing.
df = (
    pd.read_csv('../input/codedataset/python_extract_train.csv')
    .loc[:, ['code', 'docstring']]
    .dropna()
)

In [None]:
# Model inputs: the docstring column as a plain Python list.
input_sequences = df['docstring'].tolist()

In [None]:
# Model targets: the code column as a plain Python list (row-aligned with
# the docstring list taken from the same DataFrame).
output_sequences = df['code'].tolist()


In [None]:
# The needed columns now live in the two Python lists above, so drop the
# DataFrame reference to let its memory be reclaimed.
del df

In [None]:
# Task-specific token budgets: inputs are truncated to `max_source_length`
# tokens and targets to `max_target_length` tokens.
max_source_length = 128
max_target_length = 512

def train_model(input_sequences, output_sequences):
    """Compute the sequence-to-sequence loss for one batch.

    Despite its name, this function performs only the forward pass: it does
    not call ``backward()`` or step an optimizer. The caller is responsible
    for any parameter update.

    Args:
        input_sequences: list of source strings; each one is prefixed with
            ``"Generate Python: "`` before tokenisation.
        output_sequences: list of target strings, aligned with
            ``input_sequences``.

    Returns:
        The ``loss`` attribute of the model output for this batch.
    """
    prefix = "Generate Python: "
    prompts = [prefix + source_text for source_text in input_sequences]

    # Source side: pad to the longest prompt in the batch, truncate to the
    # source budget, and return PyTorch tensors.
    source = tokenizer(
        prompts,
        padding='longest',
        max_length=max_source_length,
        truncation=True,
        return_tensors="pt",
    )

    # Target side: same padding/truncation policy, but kept as Python lists
    # so the padding positions can be rewritten below.
    target = tokenizer(
        output_sequences,
        padding='longest',
        max_length=max_target_length,
        truncation=True,
    )

    # Swap every padding id for -100, the label value the Transformers loss
    # is documented to ignore, so padding does not contribute to the loss.
    pad_id = tokenizer.pad_token_id
    masked_rows = []
    for token_ids in target.input_ids:
        masked_rows.append([-100 if token == pad_id else token for token in token_ids])
    labels = torch.tensor(masked_rows)

    # Forward pass with labels supplied, so the model returns a loss.
    outputs = model(
        input_ids=source.input_ids,
        attention_mask=source.attention_mask,
        labels=labels,
    )
    return outputs.loss

In [None]:
# Fine-tune the model on (docstring -> code) batches.
#
# Each step now performs a real parameter update: gradients are cleared, the
# batch loss is back-propagated, and the optimizer is stepped. Without these
# calls the loss is only evaluated and the weights never change.
batch_size = 3
# 1e-4 is within the range the Transformers T5 docs suggest for AdamW; tune as needed.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

model.train()  # enable dropout while training
# Iterate up to len(input_sequences) so the trailing (possibly partial)
# batch is included; slicing past the end simply yields a shorter batch.
for i in range(0, len(input_sequences), batch_size):
    batch_in = input_sequences[i:i + batch_size]
    batch_out = output_sequences[i:i + batch_size]

    optimizer.zero_grad()
    loss = train_model(batch_in, batch_out)
    loss.backward()
    optimizer.step()

    # .item() prints the scalar value rather than the tensor repr.
    print(f'loss at {i} is: {loss.item()}')
model.eval()  # restore inference mode for the generation cells that follow

In [None]:
# Display the first input sequence (a docstring) as a quick sanity check.
input_sequences[0]

In [None]:
# Qualitative check: feed each code sample to the model, decode the generated
# text, and print it next to the first line of the matching docstring.
# NOTE(review): this runs code -> docstring, the opposite direction of the
# batches built in train_model (docstring -> code) -- confirm this is intended.
actual = []
predicted = []
for i, text in enumerate(output_sequences):
    # Tokenise one code sample and generate with the default settings.
    input_ids = tokenizer(text, return_tensors='pt').input_ids
    outputs = model.generate(input_ids)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
    predicted.append(pred)

    # Only the first line of the reference docstring is used for comparison.
    first_line, *_ = input_sequences[i].split('\n')
    act = first_line
    actual.append(act)

    print('Predicted: ', pred)
    print('Actual: ', act)
    print()

In [None]:
# Write the model to the local directory 'codeT5_sum'.
# Only the model is saved here; the tokenizer is not.
model.save_pretrained('codeT5_sum')