In [1]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import load_dataset

# Load pre-trained model and tokenizer
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Load the dataset
dataset = load_dataset("codeparrot/conala-mined-curated")
print(dataset.column_names)

# Preprocess the dataset
def tokenize_function(examples):
    inputs = tokenizer(["translate English to Python: " + doc for doc in examples["rewritten_intent"]], truncation=True, padding="max_length", max_length=512)
    labels = tokenizer(examples["snippet"], truncation=True, padding="max_length", max_length=512)
    inputs["labels"] = labels["input_ids"]
    return inputs

# Split the dataset into training and validation sets
dataset = dataset['train'].train_test_split(test_size=0.1)
train_dataset = dataset['train'].shard(index=0, num_shards=300)
validation_dataset = dataset['test'].shard(index=0, num_shards=300)

# Tokenize the datasets
tokenized_train_datasets = train_dataset.map(tokenize_function, batched=True, batch_size=1000)
tokenized_validation_datasets = validation_dataset.map(tokenize_function, batched=True, batch_size=1000)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=30,
    fp16=True,  # Enable mixed precision training
)

# Move model to GPU if available
if torch.cuda.is_available():
    model.cuda()
    print("Using GPU!")

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_datasets,
    eval_dataset=tokenized_validation_datasets,
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()

# Evaluate the model
eval_result = trainer.evaluate()
print(f"Evaluation results: {eval_result}")

# Save the model and tokenizer
model.save_pretrained("./p3Model")
tokenizer.save_pretrained("./p3Tokenizer")

  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{'train': ['question_id', 'parent_answer_post_id', 'prob', 'snippet', 'intent', 'rewritten_intent', 'id']}


Map: 100%|██████████| 1782/1782 [00:01<00:00, 1487.86 examples/s]
Map: 100%|██████████| 198/198 [00:00<00:00, 1379.63 examples/s]


Using GPU!


  7%|▋         | 30/446 [00:39<10:23,  1.50s/it]

{'loss': 1.6864, 'grad_norm': 0.14232756197452545, 'learning_rate': 0.0018744394618834081, 'epoch': 0.13}


 13%|█▎        | 60/446 [01:33<11:45,  1.83s/it]

{'loss': 0.2694, 'grad_norm': 0.11841235309839249, 'learning_rate': 0.0017399103139013453, 'epoch': 0.27}


 20%|██        | 90/446 [02:59<28:41,  4.84s/it]

{'loss': 0.2641, 'grad_norm': 0.10743214190006256, 'learning_rate': 0.0016053811659192826, 'epoch': 0.4}


 27%|██▋       | 120/446 [04:39<16:40,  3.07s/it]

{'loss': 0.2465, 'grad_norm': 0.16799403727054596, 'learning_rate': 0.0014708520179372198, 'epoch': 0.54}


 34%|███▎      | 150/446 [05:31<06:32,  1.32s/it]

{'loss': 0.1845, 'grad_norm': 0.10975343734025955, 'learning_rate': 0.001336322869955157, 'epoch': 0.67}


 40%|████      | 180/446 [06:11<05:47,  1.31s/it]

{'loss': 0.1997, 'grad_norm': 0.13838952779769897, 'learning_rate': 0.0012017937219730942, 'epoch': 0.81}


 47%|████▋     | 210/446 [06:45<03:51,  1.02it/s]

{'loss': 0.2075, 'grad_norm': 0.11344566196203232, 'learning_rate': 0.0010672645739910314, 'epoch': 0.94}


                                                 
 50%|█████     | 223/446 [07:02<03:59,  1.07s/it]

{'eval_loss': 0.1906210333108902, 'eval_runtime': 3.7564, 'eval_samples_per_second': 52.71, 'eval_steps_per_second': 6.655, 'epoch': 1.0}


 54%|█████▍    | 240/446 [07:49<09:22,  2.73s/it]

{'loss': 0.1844, 'grad_norm': 0.07202663272619247, 'learning_rate': 0.0009327354260089686, 'epoch': 1.08}


 61%|██████    | 270/446 [09:21<08:48,  3.00s/it]

{'loss': 0.1567, 'grad_norm': 0.11686361581087112, 'learning_rate': 0.0007982062780269058, 'epoch': 1.21}


 67%|██████▋   | 300/446 [10:14<04:16,  1.76s/it]

{'loss': 0.1657, 'grad_norm': 0.08004018664360046, 'learning_rate': 0.000663677130044843, 'epoch': 1.35}


 74%|███████▍  | 330/446 [11:10<05:11,  2.69s/it]

{'loss': 0.1576, 'grad_norm': 0.16956141591072083, 'learning_rate': 0.0005291479820627803, 'epoch': 1.48}


 81%|████████  | 360/446 [12:35<04:11,  2.92s/it]

{'loss': 0.1874, 'grad_norm': 0.11554933339357376, 'learning_rate': 0.0003946188340807175, 'epoch': 1.61}


 87%|████████▋ | 390/446 [14:02<02:38,  2.84s/it]

{'loss': 0.2097, 'grad_norm': 0.14716693758964539, 'learning_rate': 0.0002600896860986547, 'epoch': 1.75}


 94%|█████████▍| 420/446 [16:24<02:59,  6.90s/it]

{'loss': 0.1741, 'grad_norm': 0.06998071074485779, 'learning_rate': 0.00012556053811659193, 'epoch': 1.88}


                                                 
100%|██████████| 446/446 [17:54<00:00,  2.41s/it]


{'eval_loss': 0.17699161171913147, 'eval_runtime': 5.1608, 'eval_samples_per_second': 38.366, 'eval_steps_per_second': 4.844, 'epoch': 2.0}
{'train_runtime': 1074.5677, 'train_samples_per_second': 3.317, 'train_steps_per_second': 0.415, 'train_loss': 0.3011160944609364, 'epoch': 2.0}


100%|██████████| 25/25 [00:05<00:00,  4.95it/s]


Evaluation results: {'eval_loss': 0.17699161171913147, 'eval_runtime': 5.082, 'eval_samples_per_second': 38.961, 'eval_steps_per_second': 4.919, 'epoch': 2.0}


('./p3Tokenizer\\tokenizer_config.json',
 './p3Tokenizer\\special_tokens_map.json',
 './p3Tokenizer\\spiece.model',
 './p3Tokenizer\\added_tokens.json')

In [2]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import load_dataset

model = T5ForConditionalGeneration.from_pretrained("./p3Model")
tokenizer = T5Tokenizer.from_pretrained("./p3Tokenizer")

if torch.cuda.is_available():
    model.cuda()
    print("Moved model to GPU")

def generate_code(model, tokenizer, text):
    # Preprocess the text
    inputs = tokenizer.encode(text, return_tensors="pt", max_length=512, truncation=True)
    # Move inputs to GPU if available
    if torch.cuda.is_available():
        inputs = inputs.cuda()
    # Generate code
    code_ids = model.generate(inputs, max_length=256, num_beams=4, early_stopping=True)
    # Decode the code
    code = tokenizer.decode(code_ids[0], skip_special_tokens=True)
    return code

# Example text input
text_inputs = ["converting integer to list",
                "append to a list python",
                "remove element from a list by value",
                "get numpy array",]
for text_input in text_inputs:
    generated_code = generate_code(model, tokenizer, text_input)
    print(f"Generated code: {generated_code}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Moved model to GPU
Generated code: def __init__(list): if __init__(list): pass
Generated code: append(python)
Generated code: def remove element from a list by value
Generated code: for i in range(items): if items[items[items[items[items[items[items[items[items[items[items[items[items[items[items[items[items[items[items][items[items[items[items[items][i]]): i in range(items): pass


In [3]:
# Load the CoNaLa dataset for code summarization
dataset = load_dataset("codeparrot/conala-mined-curated")

print(dataset.column_names)
# Print the first 5 instances from the dataset
for i in range(20):
    print(dataset['train'][i])

{'train': ['question_id', 'parent_answer_post_id', 'prob', 'snippet', 'intent', 'rewritten_intent', 'id']}
{'question_id': 34705205, 'parent_answer_post_id': 34705233, 'prob': 0.8690001442846342, 'snippet': 'sorted(l, key=lambda x: (-int(x[1]), x[0]))', 'intent': 'Sort a nested list by two elements', 'rewritten_intent': "sort a nested list l by two elements '1' and '0'", 'id': '34705205_34705233_0'}
{'question_id': 13905936, 'parent_answer_post_id': 13905946, 'prob': 0.8526701436370034, 'snippet': '[int(x) for x in str(num)]', 'intent': 'converting integer to list in python', 'rewritten_intent': 'convert integer num to list', 'id': '13905936_13905946_0'}
{'question_id': 13837848, 'parent_answer_post_id': 13838041, 'prob': 0.8521431843789492, 'snippet': "c.decode('unicode_escape')", 'intent': 'Converting byte string in unicode string', 'rewritten_intent': 'convert byte string c to unicode string', 'id': '13837848_13838041_0'}
{'question_id': 23490152, 'parent_answer_post_id': 23490179, 