In [1]:
import import_ipynb
import Extract_Python_Pairs as EPP
import Python_tokenizer

import io
from tqdm import tqdm
import collections
import numpy as np
import os

#TENSORFLOW
import tensorflow as tf
from tensorflow import keras 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional,  Embedding, LSTM, Attention
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy

import pydot
import graphviz
import matplotlib.pyplot as plt
from keras.utils import plot_model

importing Jupyter notebook from Extract_Python_Pairs.ipynb
importing Jupyter notebook from Python_tokenizer.ipynb


In [2]:
#get the code and comment snippets from the Extract Python Pairs file
code_snippets, comments = EPP.get_python_pairs(EPP.get_all_data())

  0%|          | 61441/16115540 [00:00<00:26, 614379.83it/s]

Completed in 185 seconds


100%|██████████| 16115540/16115540 [00:23<00:00, 671593.78it/s] 


Total pairs: 2508330


In [3]:
#filter out code pairs which will throw errors during tokenization
clean_code,clean_comments = Python_tokenizer.clean_pairs(code_snippets, comments)

100%|██████████| 2508330/2508330 [09:41<00:00, 4310.35it/s]

failed: 0.029694258729911934 %
number of clean pairs: 2433847





"\ndef clean_pairs(code, comment):\n    fail_counter=0\n    clean_code = []\n    clean_comments = []\n    for i in tqdm(range(len(code))):\n        try:\n            #this determins if the code snippt can be tested\n            Python_tokenizer.python_to_token_sequence(code[i])\n            clean_code.append(code[i])\n            clean_comments.append(comment[i])\n        except:\n            fail_counter+=1\n    print('failed:',fail_counter/len(code),'%')\n    print('number of clean pairs:',len(clean_code))\n    return clean_code,clean_comments\n\nclean_code,clean_comments = clean_pairs(code_snippets, comments)\n"

In [108]:
def pad(x, length=None):
    if length is None:
        length = max([len(sentence) for sentence in x])
    return pad_sequences(x, maxlen = length, padding = 'post')

def tokenize_comment(x):
    x_tk = Tokenizer(char_level = False, num_words=10000)
    x_tk.fit_on_texts(x)
    return x_tk.texts_to_sequences(x), x_tk

def tokenize_code(x):
    x_tk = Python_tokenizer.Python_Tokenizer(num_words=10000)
    x_tk.fit_on_texts(x)
    return x_tk.texts_to_sequences(x), x_tk

def logits_to_text(logits, tokenizer):
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'
    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

In [109]:
def preprocess(code, comment):
    preprocess_x, x_tk = tokenize_code(code)
    preprocess_y, y_tk = tokenize_comment(comment)
    preprocess_x = pad(preprocess_x)
    preprocess_y = pad(preprocess_y)
    # Keras's sparse_categorical_crossentropy function requires the labels to be in 3 dimensions
    preprocess_y = preprocess_y.reshape(*preprocess_y.shape, 1)
    return preprocess_x, preprocess_y, x_tk, y_tk


preproc_code_snippets, preproc_english_comments, code_tokenizer, comment_tokenizer = preprocess(clean_code[:100000], clean_comments[:100000])

max_code_sequence_length = preproc_code_snippets.shape[1]
max_comment_sequence_length = preproc_english_comments.shape[1]
code_vocab_size = len(code_tokenizer.word_index)
comment_vocab_size = len(comment_tokenizer.word_index)


print('Data Preprocessed')
print("Max code sentence length:", max_code_sequence_length)
print("Max comment sentence length:", max_comment_sequence_length)
print("code vocabulary size:", code_vocab_size)
print("comment vocabulary size:", comment_vocab_size)

100%|██████████| 100000/100000 [01:41<00:00, 984.60it/s]


Data Preprocessed
Max code sentence length: 422
Max comment sentence length: 180
code vocabulary size: 157930
comment vocabulary size: 48502


In [110]:
# list all data in history
def plot_history(history):
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()
    # summarize history for loss
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()
    

In [112]:
#define the model
def embed_model(input_shape, output_sequence_length, comment_vocab_size):
    
    learning_rate = 1e-3
    model = Sequential()
    #em can only be used in first layer --> Keras Documentation
    model.add(Embedding(comment_vocab_size, 64, input_length=input_shape[1]))
    model.add(Bidirectional(LSTM(64, return_sequences=True, activation="tanh")))
    model.add(TimeDistributed(Dense(comment_vocab_size, activation="softmax")))
    
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    
    return model


checkpoint_path = "RNN_code2comment/RNN.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create a callback that saves the model's weights
callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                              save_weights_only=True,
                                              save_best_only=True,
                                              verbose=1)



tmp_x = pad(preproc_code_snippets, max_comment_sequence_length)
tmp_x = tmp_x.reshape((-1, preproc_english_comments.shape[-2]))

#load the model
embeded_model = embed_model(
    tmp_x.shape,
    max_comment_sequence_length,
    comment_vocab_size+1)


#train the model
embed_model_history = embeded_model.fit(tmp_x, 
                                        preproc_english_comments, 
                                        batch_size=100, 
                                        epochs=10, 
                                        validation_split=0.2,
                                        callbacks=[callback])

#plot the model
plot_history(embed_model_history)


Epoch 1/10

KeyboardInterrupt: 

In [88]:
def translate(x, y, x_tk, y_tk, model, sentence):
    tmp_X = pad(preproc_code_snippets)
    y_id_to_word = {value: key for key, value in y_tk.word_index.items()}
    y_id_to_word = {value: key for key, value in y_tk.word_index.items()}
    y_id_to_word[0] = '<PAD>'
    
    #sentence = [x_tk.word_index[word] for word in sentence.split()]
    #sentence = [x_tk.word_index[word] for word in Python_tokenizer.python_to_token_sequence(sentence)]
    s=[]
    for token in Python_tokenizer.python_to_token_sequence(sentence):
        try:
            s.append(x_tk.word_index[token])
        except:
            pass
    sentence = s
        
    sentence = pad_sequences([sentence], maxlen=x.shape[-1], padding='post')
    sentence = np.array([sentence[0], x[0]])
    
    predictions = model.predict(sentence)
    output = [y_id_to_word[np.argmax(x)] for x in predictions[0]]
    print(output)
    translation = []
    for word in output:
        if word == '<PAD>':
            break
        else:
            translation.append(word)
    print('here')
    print(' '.join(translation))

sentence = clean_code[2002]
print(sentence)
input()
#m = simple_rnn_model
m = embeded_model
#m = bidi_model
#m = encodeco_model
#m = custosm
translate(preproc_code_snippets, preproc_english_comments, code_tokenizer, comment_tokenizer , m, sentence)

class IdfSurface(object):
    
    def __init__(self, name, points, construction):
        self.name = name
        self.points = points
        self.construction = construction
        
class IdfZoneSurface(IdfSurface):



 


ValueError: in user code:

    /opt/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:1462 predict_function  *
        return step_function(self, iterator)
    /opt/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:1452 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /opt/anaconda3/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:1211 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /opt/anaconda3/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:2585 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /opt/anaconda3/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:2945 _call_for_each_replica
        return fn(*args, **kwargs)
    /opt/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:1445 run_step  **
        outputs = model.predict_step(data)
    /opt/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:1418 predict_step
        return self(x, training=False)
    /opt/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/engine/base_layer.py:976 __call__
        self.name)
    /opt/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/engine/input_spec.py:180 assert_input_compatibility
        str(x.shape.as_list()))

    ValueError: Input 0 of layer sequential_20 is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: [None, 356]
