In [1]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import load_dataset

# Load pre-trained T5 model and tokenizer
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Load the CoNaLa dataset for code summarization
dataset = load_dataset("codeparrot/conala-mined-curated")

print(dataset.column_names)
# Print the first 5 instances from the dataset
for i in range(5):
    print(dataset['train'][i])

# Preprocess the dataset
def tokenize_function(examples):
    # Use the 'snippet' field for input and 'intent' field for output
    inputs = tokenizer(["summarize: " + code for code in examples["snippet"]], truncation=True, padding="max_length", max_length=512)
    labels = tokenizer(examples["rewritten_intent"], truncation=True, padding="max_length", max_length=128)
    inputs["labels"] = labels["input_ids"]
    return inputs

# Split the training data into training and validation sets
dataset = dataset['train'].train_test_split(test_size=0.1)

train_dataset = dataset['train'].shard(index=0, num_shards=300)
validation_dataset = dataset['test'].shard(index=0, num_shards=300)

# Tokenize the datasets
tokenized_train_datasets = train_dataset.map(tokenize_function, batched=True, batch_size=1000)
tokenized_validation_datasets = validation_dataset.map(tokenize_function, batched=True, batch_size=1000)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    eval_steps=250,  # Evaluate every 250 steps
    logging_steps=50,
    learning_rate=2e-3,
    per_device_train_batch_size=16,  # Reduced batch size
    per_device_eval_batch_size=16,  # Reduced batch size
    num_train_epochs=5,  # Increased number of epochs
    weight_decay=0.01,
    logging_dir='./logs',
    save_total_limit=4,
    save_steps=500,
    fp16=True,  # Enable mixed precision training
)

# Move model to GPU if available
if torch.cuda.is_available():
    model.cuda()
    print("Using GPU")

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_datasets,
    eval_dataset=tokenized_validation_datasets,
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()

# Evaluate the model
stats = trainer.evaluate()

print(f"Stats of the trained model: {stats}")

# Save the model and tokenizer
model.save_pretrained("./p2Model")
tokenizer.save_pretrained("./p2Tokenizer")

  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{'train': ['question_id', 'parent_answer_post_id', 'prob', 'snippet', 'intent', 'rewritten_intent', 'id']}
{'question_id': 34705205, 'parent_answer_post_id': 34705233, 'prob': 0.8690001442846342, 'snippet': 'sorted(l, key=lambda x: (-int(x[1]), x[0]))', 'intent': 'Sort a nested list by two elements', 'rewritten_intent': "sort a nested list l by two elements '1' and '0'", 'id': '34705205_34705233_0'}
{'question_id': 13905936, 'parent_answer_post_id': 13905946, 'prob': 0.8526701436370034, 'snippet': '[int(x) for x in str(num)]', 'intent': 'converting integer to list in python', 'rewritten_intent': 'convert integer num to list', 'id': '13905936_13905946_0'}
{'question_id': 13837848, 'parent_answer_post_id': 13838041, 'prob': 0.8521431843789492, 'snippet': "c.decode('unicode_escape')", 'intent': 'Converting byte string in unicode string', 'rewritten_intent': 'convert byte string c to unicode string', 'id': '13837848_13838041_0'}
{'question_id': 23490152, 'parent_answer_post_id': 23490179, 

Map: 100%|██████████| 1782/1782 [00:00<00:00, 3261.05 examples/s]
Map: 100%|██████████| 198/198 [00:00<00:00, 3089.97 examples/s]


Using GPU


  9%|▉         | 50/560 [00:35<04:25,  1.92it/s]

{'loss': 1.4158, 'grad_norm': 0.3081796169281006, 'learning_rate': 0.0018285714285714285, 'epoch': 0.45}


 18%|█▊        | 100/560 [01:02<04:00,  1.91it/s]

{'loss': 0.4053, 'grad_norm': 0.3990699350833893, 'learning_rate': 0.00165, 'epoch': 0.89}


 27%|██▋       | 150/560 [01:37<05:30,  1.24it/s]

{'loss': 0.3552, 'grad_norm': 0.23366963863372803, 'learning_rate': 0.0014714285714285717, 'epoch': 1.34}


 36%|███▌      | 200/560 [02:16<05:05,  1.18it/s]

{'loss': 0.3428, 'grad_norm': 0.2951486110687256, 'learning_rate': 0.001292857142857143, 'epoch': 1.79}


 45%|████▍     | 250/560 [03:14<07:25,  1.44s/it]

{'loss': 0.3091, 'grad_norm': 0.1736857146024704, 'learning_rate': 0.0011142857142857144, 'epoch': 2.23}


                                                 
 45%|████▍     | 250/560 [03:18<07:25,  1.44s/it]

{'eval_loss': 0.33878087997436523, 'eval_runtime': 4.4906, 'eval_samples_per_second': 44.092, 'eval_steps_per_second': 2.895, 'epoch': 2.23}


 54%|█████▎    | 300/560 [04:37<06:19,  1.46s/it]

{'loss': 0.2938, 'grad_norm': 0.21036379039287567, 'learning_rate': 0.0009357142857142857, 'epoch': 2.68}


 62%|██████▎   | 350/560 [05:47<03:19,  1.05it/s]

{'loss': 0.2716, 'grad_norm': 0.28615134954452515, 'learning_rate': 0.0007571428571428572, 'epoch': 3.12}


 71%|███████▏  | 400/560 [06:34<02:28,  1.08it/s]

{'loss': 0.2489, 'grad_norm': 0.21100544929504395, 'learning_rate': 0.0005785714285714286, 'epoch': 3.57}


 80%|████████  | 450/560 [07:19<01:23,  1.31it/s]

{'loss': 0.2538, 'grad_norm': 0.1764555275440216, 'learning_rate': 0.0004, 'epoch': 4.02}


 89%|████████▉ | 500/560 [08:10<01:01,  1.02s/it]

{'loss': 0.2191, 'grad_norm': 0.20747998356819153, 'learning_rate': 0.00022142857142857142, 'epoch': 4.46}


                                                 
 89%|████████▉ | 500/560 [08:13<01:01,  1.02s/it]

{'eval_loss': 0.3472926616668701, 'eval_runtime': 3.2426, 'eval_samples_per_second': 61.062, 'eval_steps_per_second': 4.009, 'epoch': 4.46}


 98%|█████████▊| 550/560 [09:13<00:13,  1.34s/it]

{'loss': 0.229, 'grad_norm': 0.2352881133556366, 'learning_rate': 4.2857142857142856e-05, 'epoch': 4.91}


100%|██████████| 560/560 [09:25<00:00,  1.01s/it]


{'train_runtime': 565.8606, 'train_samples_per_second': 15.746, 'train_steps_per_second': 0.99, 'train_loss': 0.3919362540755953, 'epoch': 5.0}


100%|██████████| 13/13 [00:04<00:00,  3.18it/s]


Stats of the trained model: {'eval_loss': 0.34741973876953125, 'eval_runtime': 4.1381, 'eval_samples_per_second': 47.848, 'eval_steps_per_second': 3.142, 'epoch': 5.0}


('./p2Tokenizer\\tokenizer_config.json',
 './p2Tokenizer\\special_tokens_map.json',
 './p2Tokenizer\\spiece.model',
 './p2Tokenizer\\added_tokens.json')

In [3]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

model = T5ForConditionalGeneration.from_pretrained("./p2Model")
tokenizer = T5Tokenizer.from_pretrained("./p2Tokenizer")

def generate_summary(model, tokenizer, code_snippet):
    # Preprocess the code snippet
    inputs = tokenizer.encode("summarize: " + code_snippet, return_tensors="pt", max_length=512, truncation=True)
    # Move inputs and model to GPU if available
    if torch.cuda.is_available():
        inputs = inputs.cuda()
        model = model.cuda()
    # Generate summary
    summary_ids = model.generate(inputs, max_length=128, num_beams=4, early_stopping=True)
    # Decode the summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Example code snippet
code_snippets = ["sorted(l, key=lambda x: (-int(x[1]), x[0]))",
                 "def multiply(a, b): return a * b",
                 "def divide(a, b): return a / b",
                 "words = text.split(' ') \ last = words[0] \ for word in words: \ if word > last: \ last = word \ return last"]
for i, code_snippet in enumerate(code_snippets):
    summary = generate_summary(model, tokenizer, code_snippet)
    print(f"Code snippet {i+1} summary: {summary}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Code snippet 1 summary: sort list l by key 1
Code snippet 2 summary: multiply a list a by two elements
Code snippet 3 summary: divide a list a by b
Code snippet 4 summary: split string words into two parts
