In [2]:
# Notebook-wide configuration.
from pathlib import Path

# Leave this on to skip the expensive fitting/training branches below,
# which allows the notebook to run faster.
use_cache = True

# Folder that the preprocessors, vectorized data and model checkpoint are read from / written to.
OUTPUT_PATH = Path('/ds/hamel/code_search_data/outputs')

# Pre-Requisite: Make Sure you have the right files prepared from Step 1

You should have a directory with these files:

1. `{train/valid/test.function}` - these are Python function definitions tokenized (by space), 1 line per function.
2. `{train/valid/test.docstring}` - these are docstrings that correspond to each of the python function definitions, and have a 1:1 correspondence with the lines in *.function files.
3. `{train/valid/test.lineage}` - every line in this file contains a link back to the original location (github repo link) where the code was retrieved.  There is a 1:1 correspondence with the lines in this file and the other two files. This is useful for debugging.

Note: I have an `outputs` sub-folder where I dump intermediate and final outputs.  This is optional for you.

In [2]:
! ls /ds/hamel/code_search_data -lah

total 2.3G
drwxr-xr-x  3 1001 1001 6.0K May 15 02:38 .
drwxrwxrwx 27 root root 6.0K May 15 01:08 ..
drwxr-xr-x  3 root root 6.0K May 15 03:18 outputs
-rw-r--r--  1 root root 3.5M May 15 02:36 test.docstring
-rw-r--r--  1 root root  16M May 15 02:37 test.function
-rw-r--r--  1 root root 4.4M May 15 02:38 test.lineage
-rw-r--r--  1 root root 325M May 15 02:37 train.docstring
-rw-r--r--  1 root root 1.4G May 15 02:37 train.function
-rw-r--r--  1 root root 413M May 15 02:38 train.lineage
-rw-r--r--  1 root root  14M May 15 02:37 valid.docstring
-rw-r--r--  1 root root  60M May 15 02:37 valid.function
-rw-r--r--  1 root root  18M May 15 02:38 valid.lineage


# Read Text In From File

In [13]:
from general_utils import read_training_files

In [14]:
# Read the Step 1 function/docstring files into train and holdout collections.
# NOTE(review): which files end up in "holdout" is decided inside general_utils - confirm there.
(train_code,
 holdout_code,
 train_comment,
 holdout_comment) = read_training_files('/ds/hamel/code_search_data')



In [15]:
# Sanity check: every split must contain as many docstrings as functions.
assert len(train_comment) == len(train_code)
assert len(holdout_comment) == len(holdout_code)

# Tokenize Text

In [8]:
# `logging` is needed by the cached branch below; import it here so this cell
# works on a fresh kernel instead of relying on an import made in a later cell.
import logging

from ktext.preprocess import processor

if not use_cache:
    # Fit the preprocessor for the code side and vectorize the training functions.
    # The keyword spelling `hueristic_pct_padding` is kept exactly as this call passes it to ktext.
    code_proc = processor(hueristic_pct_padding=.7, keep_n=20000)
    t_code = code_proc.fit_transform(train_code)

    # Fit the preprocessor for the docstring side and vectorize the training docstrings.
    # NOTE(review): the cached comment processor loaded later in this notebook reports a
    # vocabulary of 15,002, which does not line up with keep_n=14000 here - confirm which
    # value the cached artifacts were actually built with.
    comment_proc = processor(append_indicators=True, hueristic_pct_padding=.7, keep_n=14000, padding='post')
    t_comment = comment_proc.fit_transform(train_comment)

else:
    # Nothing is fitted here; later cells load the saved processors and vectors from OUTPUT_PATH.
    logging.warning('Not fitting transform function because use_cache=True')



Save tokenized text

In [9]:
import dill as dpickle
import numpy as np

if not use_cache:
    # Create the output folder on first use so the writes below cannot fail
    # just because the directory has not been made yet.
    OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

    # Save the fitted preprocessors so later cells can reload them.
    with open(OUTPUT_PATH/'py_code_proc.dpkl', 'wb') as f:
        dpickle.dump(code_proc, f)

    with open(OUTPUT_PATH/'py_comment_proc.dpkl', 'wb') as f:
        dpickle.dump(comment_proc, f)

    # Save the vectorized code and docstrings produced by fit_transform.
    np.save(OUTPUT_PATH/'py_t_code_vecs.npy', t_code)
    np.save(OUTPUT_PATH/'py_t_comment_vecs.npy', t_comment)

Arrange data for modeling

In [10]:
%reload_ext autoreload
%autoreload 2
from seq2seq_utils import load_decoder_inputs, load_encoder_inputs, load_text_processor


encoder_input_data, encoder_seq_len = load_encoder_inputs(OUTPUT_PATH/'py_t_code_vecs.npy')
decoder_input_data, decoder_target_data = load_decoder_inputs(OUTPUT_PATH/'py_t_comment_vecs.npy')

Shape of encoder input: (4978625, 55)
Shape of decoder input: (4978625, 14)
Shape of decoder target: (4978625, 14)


In [11]:
# Reload the saved preprocessors; each call also reports the vocabulary size it found.
code_proc_path = OUTPUT_PATH/'py_code_proc.dpkl'
comment_proc_path = OUTPUT_PATH/'py_comment_proc.dpkl'

num_encoder_tokens, enc_pp = load_text_processor(code_proc_path)
num_decoder_tokens, dec_pp = load_text_processor(comment_proc_path)

Size of vocabulary for /ds/hamel/code_search_data/outputs/py_code_proc.dpkl: 20,002
Size of vocabulary for /ds/hamel/code_search_data/outputs/py_comment_proc.dpkl: 15,002


# Build Seq2Seq Model For Summarizing Code - (This Is Only For Transfer Learning)

This model will be reused for the code search task.

In [10]:
from seq2seq_utils import build_seq2seq_model

In [11]:
# Settings for the code-summarization seq2seq network; the sequence length and
# vocabulary sizes come from the data and preprocessors loaded earlier.
seq2seq_params = dict(
    word_emb_dim=800,
    hidden_state_dim=1200,
    encoder_seq_len=encoder_seq_len,
    num_encoder_tokens=num_encoder_tokens,
    num_decoder_tokens=num_decoder_tokens,
)
seq2seq_Model = build_seq2seq_model(**seq2seq_params)

In [7]:
# Print the layer-by-layer summary (output shapes, parameter counts, connections) of the model.
seq2seq_Model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Decoder-Input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Decoder-Word-Embedding (Embeddi (None, None, 800)    12001600    Decoder-Input[0][0]              
__________________________________________________________________________________________________
Encoder-Input (InputLayer)      (None, 55)           0                                            
__________________________________________________________________________________________________
Decoder-Batchnorm-1 (BatchNorma (None, None, 800)    3200        Decoder-Word-Embedding[0][0]     
__________________________________________________________________________________________________
Encoder-Mo

### Warning:

If you set `use_cache = False`, this next part takes 4 days to train on an AWS `p3.2xlarge` instance.

In [3]:
from keras.models import Model, load_model
import pandas as pd
import logging

if not use_cache:

    from keras.callbacks import CSVLogger, ModelCheckpoint
    import numpy as np
    from keras import optimizers

    seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=0.00005), loss='sparse_categorical_crossentropy')

    # Prefix shared by the training log and the checkpoint files.
    # Both are written relative to the current working directory, not OUTPUT_PATH.
    script_name_base = 'py_func_sum_v6_'
    csv_logger = CSVLogger('{:}.log'.format(script_name_base))

    # The doubled braces survive .format(), leaving {epoch:02d} / {val_loss:.5f}
    # placeholders for the checkpoint callback to fill in.
    model_checkpoint = ModelCheckpoint('{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(script_name_base),
                                       save_best_only=True)

    batch_size = 900
    epochs = 500
    # The targets get an extra trailing axis before being handed to fit().
    history = seq2seq_Model.fit([encoder_input_data, decoder_input_data], np.expand_dims(decoder_target_data, -1),
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.12, callbacks=[csv_logger, model_checkpoint])

else:
    logging.warning('Not re-training function summarizer seq2seq model because use_cache=True')
    # NOTE(review): this loads a "v5" checkpoint from OUTPUT_PATH, while the training branch
    # above writes "v6" files to the working directory - confirm which artifact is intended.
    # str(...) keeps this working on older Keras releases that only accept a plain string path.
    seq2seq_Model = load_model(str(OUTPUT_PATH/'py_func_sum_v5_.epoch50-val1.68161.hdf5'))



# Evaluate Seq2Seq Model

In [19]:
from seq2seq_utils import Seq2Seq_Inference
import pandas as pd

# Bundle the model with both preprocessors for inference.
seq2seq_inf = Seq2Seq_Inference(
    encoder_preprocessor=enc_pp,
    decoder_preprocessor=dec_pp,
    seq2seq_model=seq2seq_Model,
)

# Holdout functions with their original docstrings; 'ref' is an empty string on every row.
demo_columns = {'code': holdout_code, 'comment': holdout_comment, 'ref': ''}
demo_testdf = pd.DataFrame(demo_columns)

# Show 15 demo predictions drawn from the holdout frame.
seq2seq_inf.demo_model_predictions(n=15, df=demo_testdf)




Original Input:
 def purge_samples self raise NotImplementedError Not supported
 

Original Output:
 removes all decision samples


****** Predicted Output ******:
 purge samples from the database



Original Input:
 app teardown_appcontext def close_db_connection exception top _app_ctx_stack top if hasattr top sqlite_db top sqlite_db close
 

Original Output:
 closes the database again at the end of the request .


****** Predicted Output ******:
 closes the database again at the end of the request



Original Input:
 def make_dummy_protein_sequence n_supporting_variant_reads n_supporting_variant_sequences n_supporting_reference_transcripts n_total_variant_sequences None n_total_variant_reads None n_total_reference_transcripts None gene TP53 amino_acids MKHW cdna_sequence CCCATGAAACACTGGTAG variant_cdna_interval_start 8 variant_cdna_interval_end 9 variant_aa_interval_start 1 variant_aa_interval_end 2 number_mismatches 1 if n_total_variant_reads is None n_total_variant_reads n_suppo

In [None]:
# Run the model's evaluation over the holdout functions and their docstrings.
# NOTE(review): the metric computed and the meaning of max_len=None are defined in
# seq2seq_utils - confirm there.
seq2seq_inf.evaluate_model(
    input_strings=holdout_code,
    output_strings=holdout_comment,
    max_len=None,
)



HBox(children=(IntProgress(value=0, max=50290), HTML(value='')))