In [2]:
import os
import tf_model_modified as tf_model


import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf

from sklearn.metrics import precision_recall_fscore_support
from official.modeling import tf_utils
from official import nlp
from official.nlp import bert
from cubert_tokenizer import python_tokenizer, code_to_subtokenized_sentences

# Load the required submodules
import official.nlp.optimization
import official.nlp.bert.bert_models
import official.nlp.bert.configs
import official.nlp.bert.run_classifier
import official.nlp.bert.tokenization
import official.nlp.data.classifier_data_lib
import official.nlp.modeling.losses
import official.nlp.modeling.models
import official.nlp.modeling.networks
from tensor2tensor.data_generators import text_encoder
import pandas as pd

import json
from sklearn import preprocessing
import itertools


In [26]:
# Only let TensorFlow log messages at ERROR level through.
tf.get_logger().setLevel('ERROR')

# --- Settings a reader may want to tune --------------------------------------
DS_PATH = "../data/_all_data.csv"  # dataset file; read as JSON if the path contains ".json"
MODEL_PATH = "../bert2"            # directory holding the config, checkpoint and vocabulary
SEQ_LENGTH = 512                   # sequence length passed to the encoder
BATCH_SIZE = 2
EPOCHS = 3
shuffle_buffer_size = 10000
FREQ_LIMIT = 200                   # label-frequency threshold used by the label preprocessing
FREQ_CUT_SYMBOL = "<UNK>"          # replacement label for values below that threshold

# --- Encoder -----------------------------------------------------------------
# Build the encoder configuration from the JSON file stored next to the model.
with open(MODEL_PATH+ "/cubert_config.json") as conf_file:
    config_dict = json.load(conf_file)
bert_config = bert.configs.BertConfig.from_dict(config_dict)

# Create the encoder, mark it non-trainable and restore its weights.
bert_encoder = bert.bert_models.get_transformer_encoder(
    bert_config,
    sequence_length=SEQ_LENGTH,
)
bert_encoder.trainable = False
checkpoint = tf.train.Checkpoint(model=bert_encoder)
# assert_consumed() turns an incomplete restore into an error instead of a silent mismatch.
checkpoint.restore(MODEL_PATH+'/bert1-1').assert_consumed()

# --- Dataset -----------------------------------------------------------------
data = pd.read_json(DS_PATH) if ".json" in DS_PATH else pd.read_csv(DS_PATH)

# --- Tokenizers and special tokens -------------------------------------------
tokenizer = python_tokenizer.PythonTokenizer()
subword_tokenizer = text_encoder.SubwordTextEncoder(MODEL_PATH + "/cuvocab.txt")

CLS = subword_tokenizer.encode_without_tokenizing("[CLS]")
SEP = subword_tokenizer.encode_without_tokenizing("[SEP]")


## Preprocessing args and labels

# The 'arg_types' column holds Python expressions as text; evaluate each cell.
# NOTE(review): eval() executes whatever the dataset file contains. If the file
# is not fully trusted, consider ast.literal_eval -- confirm first that every
# stored value is a plain literal.
data['arg_types'] = data['arg_types'].apply(eval)


In [28]:
# One row per sample, one column per argument position. Positions a row does
# not have come back as nulls from the DataFrame constructor.
df_labels = pd.DataFrame(data['arg_types'].values.tolist())

# Turn the null cells into an explicit string label so every cell can be encoded.
df_labels = df_labels.fillna('NaN')


def _mask_rare_labels(column):
    """Replace values seen fewer than FREQ_LIMIT times in `column` by FREQ_CUT_SYMBOL."""
    occurrences = column.map(column.value_counts())
    return column.mask(occurrences < FREQ_LIMIT, FREQ_CUT_SYMBOL)


# The frequency cut is applied column by column (i.e. per argument position).
df_labels = df_labels.apply(_mask_rare_labels)

# Fit one encoder over every label that is left in the table.
enc = preprocessing.LabelEncoder()
all_types = df_labels.stack().values
enc.fit(all_types)

# One-element array holding the code of the cut symbol. transform() raises
# if FREQ_CUT_SYMBOL never made it into the fitted labels.
FREQ_CUT_ENC = enc.transform([FREQ_CUT_SYMBOL])

# Encode each column and store the per-row list of codes next to the raw data.
df3 = df_labels.apply(enc.transform)
data['labels'] = df3.values.tolist()

In [8]:
def train_test_by_repo(data, split=0.75, random_state=None):
    """Split ``data`` into train and test frames without sharing a repository.

    Repositories are visited in random order and assigned to the training set
    until the training set holds at least ``split * len(data)`` rows; every
    repository visited after that goes to the test set. Whole repositories are
    assigned, so the training share can overshoot ``split``.

    Args:
        data: DataFrame with a ``'repo'`` column.
        split: Target fraction of rows for the training set.
        random_state: Optional seed forwarded to the shuffle, for a
            reproducible split. ``None`` keeps the shuffle unseeded.

    Returns:
        ``(train, test)``: the rows of ``data`` whose repository was assigned
        to the training set and to the test set respectively. Rows with a
        null ``'repo'`` belong to no group and appear in neither frame.
    """
    # size() counts rows per repository regardless of nulls in other columns.
    repo_sizes = data.groupby('repo').size().sample(frac=1, random_state=random_state)

    target_rows = split * len(data)
    train_repos, test_repos = [], []
    assigned_rows = 0
    for repo, n_rows in repo_sizes.items():
        if assigned_rows < target_rows:
            train_repos.append(repo)
            assigned_rows += n_rows
        else:
            test_repos.append(repo)

    train_rows = data.loc[data['repo'].isin(train_repos)]
    test_rows = data.loc[data['repo'].isin(test_repos)]
    return train_rows, test_rows

# Split the loaded rows into train/test frames with the default split of 0.75.
train_ds, test_ds = train_test_by_repo(data)

In [None]:
def transform(code_text):
    """Tokenize ``code_text`` and return it as one flat sequence starting with ``CLS``.

    ``code_to_cubert_sentences`` is called with the module-level ``tokenizer``
    and ``subword_tokenizer``; the sentences it returns are concatenated in
    order and appended to ``CLS``.

    Args:
        code_text: Source code passed as ``code`` to ``code_to_cubert_sentences``.

    Returns:
        A list: the entries of ``CLS`` followed by the entries of every
        returned sentence.
    """
    sentences = code_to_subtokenized_sentences.code_to_cubert_sentences(
        code=code_text,
        initial_tokenizer=tokenizer,
        subword_tokenizer=subword_tokenizer)
    # chain.from_iterable flattens in a single pass; sum(sentences, []) would
    # copy the growing accumulator once per sentence.
    return CLS + list(itertools.chain.from_iterable(sentences))


def process_batch(data_batch):
    """Encode every row of ``data_batch`` into tokens, per-token labels and a length.

    For each row, the ``'body.1'`` column is passed through ``transform``, cut
    to ``SEQ_LENGTH-1`` entries and terminated with ``SEP``. A label vector of
    ``SEQ_LENGTH`` zeros is then filled in: for every (argument name, label)
    pair whose label differs from ``FREQ_CUT_ENC``, each position whose token
    equals the first sub-token of the argument name receives that label.

    Args:
        data_batch: DataFrame with ``'body.1'``, ``'arg_names'`` and
            ``'labels'`` columns.

    Returns:
        Three parallel lists, one entry per row: the token arrays, the label
        vectors and the token-array lengths.
    """
    def encode_row(row):
        token_ids = np.array(transform(row['body.1'])[:SEQ_LENGTH-1]+SEP)
        type_ids = np.zeros(SEQ_LENGTH)
        # NOTE(review): eval() executes the text stored in the column; only
        # safe when the dataset is trusted.
        arg_names = eval(row['arg_names'])
        for arg_name, type_id in zip(arg_names, row['labels']):
            if type_id != FREQ_CUT_ENC:
                name_subtokens = [
                    piece
                    for word in tokenizer.tokenize(arg_name)
                    for piece in subword_tokenizer.encode_without_tokenizing(word)
                ]
                # NOTE(review): only the first sub-token is matched, so any
                # token that shares it is labelled too -- confirm this is intended.
                hits = np.flatnonzero(token_ids == name_subtokens[0])
                type_ids[hits] = type_id
        return token_ids, type_ids, len(token_ids)

    encoded_rows = [encode_row(row) for _, row in data_batch.iterrows()]
    full_sentence = [tokens for tokens, _, _ in encoded_rows]  # token arrays
    ids = [labels for _, labels, _ in encoded_rows]            # per-position labels
    le = [length for _, _, length in encoded_rows]             # token-array lengths
    return full_sentence, ids, le

def create_dataset(dataset):
    """Wrap ``dataset`` in a ``tf.data.Dataset`` that yields fixed-size batches.

    Rows are taken in order, ``BATCH_SIZE`` at a time; a trailing group with
    fewer than ``BATCH_SIZE`` rows is skipped. Each element is a triple
    ``(features, labels, lengths)`` where ``features`` holds
    ``'input_word_ids'``, ``'input_mask'`` and ``'input_type_ids'``.

    Args:
        dataset: DataFrame accepted by ``process_batch``.

    Returns:
        The ``tf.data.Dataset`` built by ``tf.data.Dataset.from_generator``.
    """
    batch_shape = tf.TensorShape([BATCH_SIZE, SEQ_LENGTH])

    def gen():
        # Number of rows that fit into complete batches; the remainder is dropped.
        usable_rows = (len(dataset) // BATCH_SIZE) * BATCH_SIZE
        for start in range(0, usable_rows, BATCH_SIZE):
            data_batch = dataset.iloc[start:start + BATCH_SIZE]
            full_sentence, ids, le = process_batch(data_batch)
            # Pad (with 0) every token array up to SEQ_LENGTH.
            word_ids = tf.ragged.constant(full_sentence).to_tensor(
                default_value=0, shape=[BATCH_SIZE, SEQ_LENGTH])
            ids = tf.convert_to_tensor(ids)
            # NOTE(review): input_mask marks the positions that carry a
            # non-zero label, not the non-padding tokens -- confirm this is
            # what the model expects.
            features = {
                'input_word_ids': word_ids,
                'input_mask': ids > 0,
                'input_type_ids': tf.zeros_like(word_ids),
            }
            yield (features, ids, le)

    output_types = (
        {"input_word_ids": tf.int32, "input_mask": tf.int32, "input_type_ids": tf.int32},
        tf.int32,
        tf.int32,
    )
    output_shapes = (
        {
            "input_word_ids": batch_shape,
            "input_mask": batch_shape,
            "input_type_ids": batch_shape,
        },
        batch_shape,
        None,
    )
    return tf.data.Dataset.from_generator(gen, output_types, output_shapes)

# Build the batched tf.data pipelines for the two splits.
train_dataset = create_dataset(train_ds)
test_dataset = create_dataset(test_ds)


In [None]:
# Number of distinct labels known to the fitted encoder `enc`.
N_CLASSES = len(enc.classes_)
# Bare expression so the notebook shows the value as the cell output.
N_CLASSES

61

In [None]:
# Build the type predictor on top of `bert_encoder`, with one output class per encoded label.
model = tf_model.TypePredictor(bert_encoder, num_classes=N_CLASSES)
# Train for EPOCHS epochs and print whatever tf_model.train returns.
# NOTE(review): report_every=100 presumably controls the progress lines logged below -- confirm in tf_model.
print(tf_model.train(model, train_dataset, test_dataset, epochs=EPOCHS, scorer=precision_recall_fscore_support, learning_rate=0.0003, report_every=100))

loss = 3.9490065574645996, acc = 0.0, top 5 = 0.25, batch=0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


loss = 2.685851172022112, acc = 0.38557807785685166, top 5 = 0.6864953719566211, batch=100
loss = 2.8380062334653404, acc = 0.38600967903835043, top 5 = 0.6829698833920185, batch=200
loss = 2.8046256223901946, acc = 0.36529433444131465, top 5 = 0.7035624760601898, batch=300
loss = 2.791319627110922, acc = 0.353062712318837, top 5 = 0.7055009380906205, batch=400
loss = 2.648143669759867, acc = 0.37243817482190145, top 5 = 0.7324308220958027, batch=500
loss = 2.496346041213827, acc = 0.3986254435057083, top 5 = 0.7558263951066794, batch=600
loss = 2.380954907498227, acc = 0.42236420611854125, top 5 = 0.7634279208120607, batch=700
loss = 2.359141395907849, acc = 0.42525248639953755, top 5 = 0.7708813087989387, batch=800
loss = 2.268474833333325, acc = 0.42948531579926663, top 5 = 0.7915182619898792, batch=900
loss = 2.22192658976281, acc = 0.43088826161937444, top 5 = 0.80153669677802, batch=1000
loss = 2.200266428838736, acc = 0.43477852854141974, top 5 = 0.8106353233760876, batch=1100
l