In [None]:
pip install pandas numpy scikit-learn transformers torch




In [None]:
import re

def _starts_definition(stripped_line):
    """Return True when a stripped line opens a `def` or `class` statement.

    The word boundary keeps ordinary body lines such as `default = 3`,
    `defaultdict(list)` or `classes = []` from being mistaken for a new
    definition, which a plain `startswith('def')` check would do.
    """
    return re.match(r'(?:def|class)\b', stripped_line) is not None


def extract_snippets(lines):
    """Split raw dataset lines into (code_snippets, comments).

    A snippet starts at a line whose stripped text opens a `def`/`class`
    statement and runs until a blank line, a '# Snippet' label, or the next
    definition. The first `#` comment found in the snippet becomes its target
    comment; snippets without any comment are dropped so both returned lists
    always have the same length.

    Args:
        lines: iterable of text lines (with or without trailing newlines).

    Returns:
        A `(code_snippets, comments)` tuple of two equally long lists of str.
    """
    lines = list(lines)
    code_snippets = []
    comments = []

    i = 0
    while i < len(lines):
        line = lines[i].strip()

        # Snippet labels ('# Snippet 1'), blank lines and stray text are skipped.
        if not _starts_definition(line):
            i += 1
            continue

        # The header line is stored stripped; body lines keep their indentation.
        code_lines = [line]
        i += 1
        while i < len(lines):
            candidate = lines[i].strip()
            if (candidate == ''
                    or candidate.startswith('# Snippet')
                    or _starts_definition(candidate)):
                break
            code_lines.append(lines[i].rstrip())
            i += 1
        code = '\n'.join(code_lines)

        # Use the first comment inside the snippet as its description.
        match = re.search(r'#\s*(.*)', code)
        if match:
            comments.append(match.group(1).strip())
            code_snippets.append(code)

    return code_snippets, comments


# Read the dataset (explicit encoding so the result does not depend on the OS locale)
with open('train.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

code_snippets, comments = extract_snippets(lines)

# Result check
print(f"Extracted {len(code_snippets)} code snippets and {len(comments)} comments.")
if code_snippets:
    print("\nSample Code Snippet:\n", code_snippets[0])
    print("\nSample Comment:\n", comments[0])
else:
    # Avoid an IndexError when the file contains no commented definitions.
    print("\nNo commented snippets were found in train.txt.")


Extracted 181 code snippets and 181 comments.

Sample Code Snippet:
 def add_numbers(a, b):
    # This function adds two numbers
    return a + b

Sample Comment:
 This function adds two numbers


In [None]:
# Preview only the first few extracted comments; printing the whole list floods the output.
print(comments[:10])

['This function adds two numbers', 'This function returns the maximum value in a list', 'This function calculates the factorial of a number', 'This function checks if a string is a palindrome', 'This function implements the merge sort algorithm', 'Helper function to merge two sorted lists', 'This function returns the nth Fibonacci number', 'This function reverses a string', 'This function counts the number of vowels in a string', 'This function performs binary search to find a target in a sorted array', "This function calculates the greatest common divisor of two numbers using Euclid's algorithm", 'This function merges two sorted lists into one sorted list', 'This function checks if a number is prime', 'This function calculates the sum of digits of a number', 'This function removes duplicates from a list', 'This function counts the occurrences of a target element in a list', 'This function implements the quick sort algorithm', 'This function calculates the power of a number (base^exp)'

In [None]:
import re
import string
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def clean_comment(comment):
    """Return `comment` lowercased with every ASCII punctuation character removed.

    Whitespace is left untouched; only the characters listed in
    `string.punctuation` are deleted.
    """
    # A translation table that maps each punctuation character to "deleted".
    drop_punctuation = str.maketrans('', '', string.punctuation)
    return comment.lower().translate(drop_punctuation)

def clean_code(code):
    """Return `code` lowercased and flattened onto a single line.

    Every run of whitespace (spaces, tabs, newlines) collapses to one space and
    leading/trailing whitespace is removed.
    """
    # split() with no argument breaks on whitespace runs and discards empty edges.
    tokens = code.lower().split()
    return ' '.join(tokens)

# Sequence boundary markers for the decoder.
# Plain alphabetic words are used on purpose: the Keras Tokenizer's default
# `filters` strip punctuation such as '<' and '>', so a marker like '<start>'
# would never reach the vocabulary under that spelling.
START_TOKEN = 'startseq'
END_TOKEN = 'endseq'

# Clean all data
cleaned_comments = [clean_comment(c) for c in comments]
cleaned_codes = [clean_code(c) for c in code_snippets]

# Wrap every target comment in start/end markers. With teacher forcing the
# decoder input at step t is token t and the target is token t+1, so without a
# start marker the first word can never be predicted, and without an end marker
# generation has no stopping signal.
decoder_texts = [f"{START_TOKEN} {c} {END_TOKEN}" for c in cleaned_comments]

# Initialize tokenizers
comment_tokenizer = Tokenizer(oov_token='<OOV>')
code_tokenizer = Tokenizer(oov_token='<OOV>')

# Fit tokenizers (the comment vocabulary includes the two boundary markers)
comment_tokenizer.fit_on_texts(decoder_texts)
code_tokenizer.fit_on_texts(cleaned_codes)

# Convert texts to sequences of integer word ids
comment_sequences = comment_tokenizer.texts_to_sequences(decoder_texts)
code_sequences = code_tokenizer.texts_to_sequences(cleaned_codes)

# Pad every sequence to the longest one in its own collection
max_comment_len = max(len(seq) for seq in comment_sequences)
max_code_len = max(len(seq) for seq in code_sequences)

comment_padded = pad_sequences(comment_sequences, maxlen=max_comment_len, padding='post')
code_padded = pad_sequences(code_sequences, maxlen=max_code_len, padding='post')

# Print shapes
print("Padded code shape:", code_padded.shape)
print("Padded comment shape:", comment_padded.shape)


Padded code shape: (181, 58)
Padded comment shape: (181, 15)


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

# Vocabulary sizes
# `word_index` holds one entry per known word; the extra slot accounts for
# index 0, which the padded arrays use as filler (see `mask_zero=True` below).
code_vocab_size = len(code_tokenizer.word_index) + 1
comment_vocab_size = len(comment_tokenizer.word_index) + 1

# Hyperparameters
embedding_dim = 128  # width of both embedding layers
lstm_units = 256     # hidden/cell state size shared by encoder and decoder

# Encoder
# Reads a full padded code sequence and keeps only the final LSTM states.
encoder_inputs = Input(shape=(code_padded.shape[1],))
enc_emb = Embedding(input_dim=code_vocab_size, output_dim=embedding_dim, mask_zero=True)(encoder_inputs)
# Despite its name, `encoder_lstm` is bound to the first value returned by the
# LSTM call (its output tensor), not to the layer object; only the two states
# are used afterwards.
encoder_lstm, state_h, state_c = LSTM(lstm_units, return_state=True)(enc_emb)

# Decoder
# The embedding, LSTM and Dense layers are kept in their own variables so the
# same layer objects can be referenced again outside this cell.
decoder_inputs = Input(shape=(comment_padded.shape[1],))
dec_emb_layer = Embedding(input_dim=comment_vocab_size, output_dim=embedding_dim, mask_zero=True)
dec_emb = dec_emb_layer(decoder_inputs)

# The decoder starts from the encoder's final states and emits one output per timestep.
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=[state_h, state_c])

# Per-timestep probability distribution over the comment vocabulary.
decoder_dense = Dense(comment_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Model
# Inputs: [padded code ids, padded comment ids]; output: per-timestep word probabilities.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# The sparse loss takes integer word ids as targets (no one-hot encoding needed).
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()


In [None]:
import numpy as np

# Teacher-forcing targets: the comment sequence shifted one step to the left.
# Drop the first column, then append a single zero column on the right so the
# result keeps the shape and dtype of `comment_padded`.
decoder_target_data = np.pad(
    comment_padded[:, 1:],
    pad_width=((0, 0), (0, 1)),
    mode='constant',
    constant_values=0,
)


In [None]:
# Train with teacher forcing: the decoder is fed the comment tokens and must
# predict the shifted targets. The targets get a trailing axis of size 1.
# `validation_split=0.1` holds out a tenth of the samples for validation.
history = model.fit(
    x=[code_padded, comment_padded],
    y=decoder_target_data[..., np.newaxis],
    validation_split=0.1,
    epochs=30,
    batch_size=16,
)


Epoch 1/30
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 273ms/step - accuracy: 0.3332 - loss: 5.0933 - val_accuracy: 0.4702 - val_loss: 4.0058
Epoch 2/30
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 214ms/step - accuracy: 0.5736 - loss: 3.7238 - val_accuracy: 0.5088 - val_loss: 3.5665
Epoch 3/30
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 240ms/step - accuracy: 0.6053 - loss: 3.2618 - val_accuracy: 0.5193 - val_loss: 3.4518
Epoch 4/30
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 221ms/step - accuracy: 0.6165 - loss: 3.0908 - val_accuracy: 0.5263 - val_loss: 3.4285
Epoch 5/30
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 215ms/step - accuracy: 0.6063 - loss: 2.9946 - val_accuracy: 0.5544 - val_loss: 3.3251
Epoch 6/30
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 224ms/step - accuracy: 0.6374 - loss: 2.8545 - val_accuracy: 0.5579 - val_loss: 3.2435
Epoch 7/30
[1m11/11[0m [3

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load trained model
# NOTE(review): this rebinds the name `model`, which until this cell referred to
# the Keras seq2seq model built and trained earlier in this notebook.
# NOTE(review): no visible cell creates the "comment_generator_model" directory;
# presumably it was saved by a separate fine-tuning run — confirm it exists
# before running this cell on a fresh kernel.
model = T5ForConditionalGeneration.from_pretrained("comment_generator_model")
tokenizer = T5Tokenizer.from_pretrained("comment_generator_model")
# Switch to evaluation mode; as the last expression, the returned module is
# also echoed as the cell output.
model.eval()


T5ForConditionalGeneration(
  (shared): Embedding(32100, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32100, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

T5ForConditionalGeneration(
  (shared): Embedding(32100, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32100, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [None]:
# Encoder inference model
# Reuses the training graph's `encoder_inputs` tensor and the encoder LSTM's two
# final state tensors, so this model maps a padded code sequence to the
# [state_h, state_c] pair that seeds the decoder.
encoder_model_inf = Model(encoder_inputs, [state_h, state_c])


In [None]:
# Single-step decoder for inference.
#
# This model must share weights with the trained network, so it is wired from
# the SAME layer objects that were used for training (`dec_emb_layer`,
# `decoder_lstm`, `decoder_dense`). Creating new Embedding/LSTM/Dense layers
# here would give randomly initialised weights and meaningless predictions,
# and would also rebind `decoder_lstm` / `decoder_dense` to untrained layers.

# State size: taken from the training hyperparameter so the two cannot drift apart.
latent_dim = lstm_units

# Size of the comment vocabulary (+1 for the padding index 0); kept for reference.
vocab_size = len(comment_tokenizer.word_index) + 1

# Define decoder inputs for inference (1 time step at a time)
decoder_inputs_inf = Input(shape=(1,), name='decoder_inputs_inf')

# Reuse the trained embedding layer (alias kept so the old name still resolves)
decoder_embedding_layer = dec_emb_layer
decoder_embedded_inf = decoder_embedding_layer(decoder_inputs_inf)

# The previous hidden/cell states are explicit inputs so the caller can feed
# the states returned by one step into the next step.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Reuse the trained decoder LSTM
decoder_outputs_inf, state_h_inf, state_c_inf = decoder_lstm(
    decoder_embedded_inf, initial_state=decoder_states_inputs)

# Reuse the trained output projection
decoder_outputs_inf = decoder_dense(decoder_outputs_inf)

# Define the decoder model:
#   inputs  -> [previous token id, state_h, state_c]   (three inputs)
#   outputs -> [next-token probabilities, new state_h, new state_c]
model_decoder = Model(
    [decoder_inputs_inf] + decoder_states_inputs,
    [decoder_outputs_inf, state_h_inf, state_c_inf]
)


In [None]:
def decode_sequence(input_seq, start_token='startseq', end_token='endseq', max_len=None):
    """Greedily decode one comment from a padded code sequence.

    Relies on these notebook-level objects: `encoder_model_inf` (code ->
    [state_h, state_c]), `model_decoder` ([token, state_h, state_c] ->
    [probabilities, state_h, state_c]), `comment_tokenizer` and, when
    `max_len` is not given, `max_comment_len`.

    Args:
        input_seq: array holding a single padded code sequence, with a leading
            batch axis of size 1.
        start_token: word that marks the beginning of a comment; it must be part
            of the comment tokenizer's vocabulary.
        end_token: word that stops generation when it is predicted.
        max_len: maximum number of decoding steps; defaults to `max_comment_len`.

    Returns:
        The generated words joined by single spaces (may be empty).

    Raises:
        KeyError: if `start_token` is not in the comment vocabulary.
    """
    word_index = comment_tokenizer.word_index
    if start_token not in word_index:
        raise KeyError(
            f"start token {start_token!r} is not in the comment vocabulary; "
            "wrap every training comment in start/end markers before fitting "
            "the comment tokenizer"
        )
    if max_len is None:
        max_len = max_comment_len

    # Encode the code once; the resulting states seed the decoder.
    states = list(encoder_model_inf.predict(input_seq, verbose=0))

    # The decoder is fed one token per step, starting with the start marker.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = word_index[start_token]

    decoded_words = []
    # A bounded loop guarantees termination even if the model never emits
    # `end_token` or keeps predicting indices that map to no word.
    for _ in range(max_len):
        # The decoder model takes exactly three inputs: the previous token and
        # the two LSTM states (the encoder input is NOT passed again).
        output_tokens, h, c = model_decoder.predict([target_seq] + states, verbose=0)

        # Greedy choice: the most probable next word.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = comment_tokenizer.index_word.get(sampled_token_index, '')

        if sampled_word == end_token:
            break
        if sampled_word:
            decoded_words.append(sampled_word)

        # Feed the prediction and the updated states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states = [h, c]

    return " ".join(decoded_words)

# Example of how to use the function
sample_index = 0
# Slicing with i:i+1 (instead of indexing with i) keeps the leading batch axis,
# so the model receives a batch containing exactly one sequence.
sample_input = code_padded[sample_index:sample_index + 1]

# Predict the comment
predicted_comment = decode_sequence(sample_input)

# Print the results
# The original (uncleaned) snippet is shown next to the generated comment.
print("Predicted comment:", predicted_comment)
print("Actual code:\n", code_snippets[sample_index])


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step


ValueError: Layer "functional_5" expects 3 input(s), but it received 4 input tensors. Inputs received: [<tf.Tensor 'data:0' shape=(1, 58) dtype=int32>, <tf.Tensor 'data_1:0' shape=(1, 1) dtype=float32>, <tf.Tensor 'data_2:0' shape=(1, 256) dtype=float32>, <tf.Tensor 'data_3:0' shape=(1, 256) dtype=float32>]

In [None]:
# Test with the 5th snippet (index = 4)
sample_index = 4
# Slicing with i:i+1 (instead of indexing with i) keeps the leading batch axis,
# so the model receives a batch containing exactly one sequence.
sample_input = code_padded[sample_index:sample_index + 1]

# Predict the comment
predicted_comment = decode_sequence(sample_input)

# Print the results
# The original (uncleaned) snippet is shown next to the generated comment.
print("Predicted comment:", predicted_comment)
print("Actual code:\n", code_snippets[sample_index])


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 249ms/step


KeyError: '<start>'