In [16]:
import ast
import operator
import os
import random
import re
from collections import defaultdict

import gensim.downloader as api
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K
from gensim.models import KeyedVectors
from keras import Model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding,LSTM, Dense, Dropout, Input, Concatenate, TimeDistributed, Layer
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
#from focal_loss import BinaryFocalLoss
from tensorflow_addons.losses import SigmoidFocalCrossEntropy, sigmoid_focal_crossentropy
from tqdm import tqdm

In [17]:
# Show the working directory: every data path in this notebook is relative to it.
working_dir = os.getcwd()
print(working_dir)

/Users/phanvanhung/TagGenerator/ast_model


In [23]:
# Root folder of the dataset, relative to the working directory. The loading
# loop lists it and treats every entry as a per-problem sub-directory.
dataset_path = '../ast_codeforces/'

In [20]:
# Parse the tag file. Each non-empty line is expected to be a Python literal
# mapping whose first item is `problem id -> collection of tags`.
tags = list()        # distinct tags, in order of first appearance
Tags_set = dict()    # problem id -> tags of that problem
problems = []        # problem ids, in file order
with open('../_tags.txt','r') as file:
    content = file.readlines()
for line in content:
    line = line.rstrip()
    if not line:
        # tolerate blank lines (e.g. a trailing newline at the end of the file)
        continue
    # literal_eval accepts literals only, so the file cannot execute code,
    # and the line is parsed once instead of twice.
    problem, problem_tags = next(iter(ast.literal_eval(line).items()))
    problems.append(problem)
    Tags_set[problem] = problem_tags
    for tag in problem_tags:
        if tag not in tags:
            tags.append(tag)

In [21]:
# The distinct tags, then how many there are and how many lines the tag file had.
print(tags)
for sized in (tags, content):
    print(len(sized))

['constructive algorithms', 'dp', 'greedy', 'brute force', 'binary search', 'math', 'data structures', 'implementation', 'number theory', 'two pointers', 'combinatorics', 'bitmasks', 'matrices', 'meet-in-the-middle', 'graphs', 'sortings', 'trees', 'interactive', '2-sat', 'geometry', 'strings', 'dfs and similar', 'dsu', 'divide and conquer', 'shortest paths', 'games', 'hashing', 'string suffix structures', 'flows', 'expression parsing', 'chinese remainder theorem', 'probabilities', 'fft', '*special', 'graph matchings', 'ternary search', 'schedules']
37
2167


In [24]:
# Build one record per line of every file found under each tagged problem
# directory. The first space-delimited field of a line is stored as
# `code_name`; the rest of the line (unchanged, including its line ending)
# is stored as `code_content`.
Dataset = []
for problem in tqdm(os.listdir(dataset_path)):
    # skip macOS metadata and problems without a tag entry
    if problem == '.DS_Store' or problem not in Tags_set:
        continue
    problem_dir = os.path.join(dataset_path, problem)
    for text in os.listdir(problem_dir):
        if text == '.DS_Store':
            continue
        with open(os.path.join(problem_dir, text), 'r') as file:
            for code in file:
                # partition splits on the first space only, which equals the
                # former split(' ') followed by re-joining everything after field 0
                code_name, _, code_content = code.partition(' ')
                Dataset.append({
                    'problem': problem,
                    'code_name': code_name,
                    'code_content': code_content,
                    'tags': Tags_set[problem]
                })

100%|██████████| 2221/2221 [00:02<00:00, 804.79it/s] 


In [25]:
## Splitting dataset depends on problem
# The split is made on problem ids, so all records of one problem end up on the
# same side. `Percent` percent of the problems are sampled for training with a
# fixed seed; the remaining problems form the test side.
Percent = 80
random.seed(43)
train_problems = random.sample(problems, int(Percent / 100 * len(problems)))
# sets give constant-time membership tests; the lists keep their original order
train_lookup = set(train_problems)
test_problems = [x for x in problems if x not in train_lookup]
test_lookup = set(test_problems)
print(len(train_problems), len(test_problems))
train_records = [item for item in Dataset if item['problem'] in train_lookup]
test_records = [item for item in Dataset if item['problem'] in test_lookup]
train_set = pd.DataFrame(data = train_records)
test_set = pd.DataFrame(data = test_records)
print(train_set.shape, test_set.shape)

1733 434
(76913, 4) (19456, 4)


In [26]:
def preprocessing(Dataset):
    """Split each row's ``code_content`` into parallel token / path / value lists.

    ``code_content`` is split on single spaces. Every fragment except the last
    one is split on commas, and its first three fields are appended to that
    row's ``token``, ``path`` and ``value`` lists respectively. The last
    fragment is always skipped.
    NOTE(review): presumably that fragment is the line ending kept when the
    file was read -- confirm against the data files.

    Parameters
    ----------
    Dataset : pandas.DataFrame
        Frame with a ``code_content`` column of strings. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh 0..n-1 index, ``code_content`` removed and the
        list-valued columns ``token``, ``path`` and ``value`` appended.

    Raises
    ------
    IndexError
        If a processed fragment has fewer than three comma-separated fields.
    """
    frame = Dataset.reset_index(drop = True)
    token_col, path_col, value_col = [], [], []
    for text in frame['code_content']:
        # [:-1] drops the final fragment, exactly like iterating range(len - 1)
        fields = [fragment.split(',') for fragment in text.split(' ')[:-1]]
        token_col.append([f[0] for f in fields])
        path_col.append([f[1] for f in fields])
        value_col.append([f[2] for f in fields])
    # one join on the shared 0..n-1 index instead of three separate joins
    extracted = pd.DataFrame({'token': token_col, 'path': path_col, 'value': value_col})
    return frame.drop(columns = 'code_content').join(extracted)
# Replace both splits by their preprocessed versions (train first, then test).
train_set, test_set = (preprocessing(split) for split in (train_set, test_set))

In [27]:
# Sanity check: one path list per training record.
train_set.loc[:, 'path'].shape

(76913,)

In [28]:
# Fixed length of every token / path / value sequence: passed as `maxlen` to
# pad_sequences and used as the length of each model Input.
MAX_SEQUENCE_LENGTH = 500
# Output dimension of the Embedding layers (token/value and path respectively).
EMBEDDING_TOKEN_DIM = 128
EMBEDDING_PATH_DIM = 128

# Vocabulary cap: `num_words` of the Tokenizer and input size of each Embedding layer.
MAX_NB_WORDS = 50000

In [29]:
def tokenize_process(data_set, fit_set=None):
    """Encode the token / path / value columns as padded integer sequences.

    Two vocabularies are used:

    * one shared by ``token`` and ``value``, fitted on both columns before
      either is encoded, because the model looks both inputs up in the same
      embedding layer and therefore needs the same id for the same word;
    * a separate one for ``path``.

    Previously a single tokenizer was re-fitted before each column was encoded.
    Since ``fit_on_texts`` rebuilds the word index from the accumulated counts,
    the three columns were encoded with three different, partly mixed
    vocabularies.
    NOTE(review): ids produced here differ from that earlier encoding, so
    weights trained on the earlier encoding do not match and need retraining.

    Parameters
    ----------
    data_set : pandas.DataFrame
        Frame with list-valued ``token``, ``path`` and ``value`` columns.
    fit_set : pandas.DataFrame, optional
        Frame the vocabularies are fitted on. Defaults to the notebook-level
        ``train_set`` so that every split is encoded with training vocabulary.

    Returns
    -------
    tuple
        ``(x_token, x_path, x_value)``, each padded/truncated to
        ``MAX_SEQUENCE_LENGTH`` columns.
    """
    if fit_set is None:
        fit_set = train_set

    terminal_tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    terminal_tokenizer.fit_on_texts(fit_set['token'].values)
    terminal_tokenizer.fit_on_texts(fit_set['value'].values)

    path_tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    path_tokenizer.fit_on_texts(fit_set['path'].values)

    # encode only after fitting is complete, so ids are stable across columns
    x_token = terminal_tokenizer.texts_to_sequences(data_set['token'].values)
    x_value = terminal_tokenizer.texts_to_sequences(data_set['value'].values)
    x_path = path_tokenizer.texts_to_sequences(data_set['path'].values)

    x_token = pad_sequences(x_token, maxlen=MAX_SEQUENCE_LENGTH)
    x_path = pad_sequences(x_path, maxlen=MAX_SEQUENCE_LENGTH)
    x_value = pad_sequences(x_value, maxlen=MAX_SEQUENCE_LENGTH)
    print('Shape of data tensor:', x_token.shape, x_path.shape, x_value.shape)
    return x_token, x_path, x_value

In [30]:
# Encode the training split first, then the test split.
(
    (x_train_token, x_train_path, x_train_value),
    (x_test_token, x_test_path, x_test_value),
) = (tokenize_process(split) for split in (train_set, test_set))

KeyError: 'valuex'

In [None]:
# Fit the binarizer on the tag collection of every problem, then turn the
# tags of both splits into multi-hot label matrices.
mlb = MultiLabelBinarizer().fit(pd.Series(Tags_set))
y_train, y_test = (mlb.transform(split['tags']) for split in (train_set, test_set))
print(y_train.shape, y_test.shape)

# Predict

In [None]:
# Three aligned integer sequences per sample, each MAX_SEQUENCE_LENGTH long.
token_input = Input(shape = (MAX_SEQUENCE_LENGTH,))
path_input = Input(shape = (MAX_SEQUENCE_LENGTH,))
value_input = Input(shape = (MAX_SEQUENCE_LENGTH,))

embedding_token_layer = Embedding(MAX_NB_WORDS, EMBEDDING_TOKEN_DIM)
embedding_path_layer = Embedding(MAX_NB_WORDS, EMBEDDING_PATH_DIM)

# token and value inputs deliberately go through the same embedding layer
token_embedded = embedding_token_layer(token_input)
path_embedded = embedding_path_layer(path_input)
value_embedded = embedding_token_layer(value_input)

# per position: [token ; path ; value] embedding, then dropout
combined_layer = Concatenate()([token_embedded, path_embedded, value_embedded])
combined_layer = Dropout(0.25)(combined_layer)

# position-wise dense layer that keeps the concatenated width
input_after_dense = TimeDistributed(Dense(EMBEDDING_PATH_DIM + 2 * EMBEDDING_TOKEN_DIM, use_bias = False, activation = 'sigmoid'))(combined_layer)

input_after_LSTM = LSTM(EMBEDDING_PATH_DIM + 2 * EMBEDDING_TOKEN_DIM)(input_after_dense)

# One sigmoid unit per label column. The width is taken from the label matrix
# instead of a hard-coded 37, so it cannot drift from the binarizer's classes.
target = Dense(y_train.shape[1],activation = 'sigmoid')(input_after_LSTM)
model = Model(inputs = [token_input,path_input, value_input], outputs = target)

model.compile(loss = SigmoidFocalCrossEntropy(gamma = 2), optimizer='adam', metrics = ['accuracy'])
model.summary()


# Train for 50 epochs with a weights-only checkpoint callback; the test split
# is passed as validation data.
epochs = 50
checkpoint_dir = 'ast_with_attention.h5'
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_dir,
    save_weights_only = True,
    verbose = 1,
)
inputs = [x_train_token, x_train_path, x_train_value]
validation_inputs = [x_test_token, x_test_path, x_test_value]
history = model.fit(
    inputs,
    y_train,
    epochs=epochs,
    validation_data = (validation_inputs, y_test),
    callbacks=[checkpoint_callback],
)

In [43]:
# Continue training the current model for 10 more epochs, checkpointing the
# weights under a different file name; the test split is the validation data.
epochs = 10
checkpoint_dir = 'ast_nonattention_50.h5'
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_dir,
    save_weights_only = True,
    verbose = 1,
)
inputs = [x_train_token, x_train_path, x_train_value]
validation_inputs = [x_test_token, x_test_path, x_test_value]
history = model.fit(
    inputs,
    y_train,
    epochs=epochs,
    validation_data = (validation_inputs, y_test),
    callbacks=[checkpoint_callback],
)

In [44]:
# Score the model on the test split, then keep its raw per-label outputs.
test_inputs = [x_test_token, x_test_path, x_test_value]
accr = model.evaluate(test_inputs, y_test)
y_pred = model.predict(test_inputs)

In [45]:
# Predict on `inputs` (the training arrays assembled in the training cell) so
# the outputs can be compared with y_train.
y_check = model.predict(inputs)

In [46]:
# Print predicted score next to the true label for the first training sample.
for index, predicted in enumerate(y_check[0]):
    print(predicted, y_train[0][index])

# Calculate Accuracy

In [51]:
# Aggregate the per-record predictions by problem in one pass:
#   problem_sum[p]   - element-wise sum of the predicted scores of p's records
#   problem_count[p] - number of records of p
#   problem_check[p] - label row of p (the last record's row is kept)
# The factories make first-time keys start from zero, so no separate
# initialisation pass is needed.
problem_sum = defaultdict(lambda: np.zeros(y_pred.shape[1:], dtype=float))
problem_count = defaultdict(int)
problem_check = {}
# test_set has a fresh 0..n-1 index, so iterating the column positionally
# visits the same rows as looking up labels 0..len(y_pred)-1
for pos, scores, labels in zip(test_set['problem'], y_pred, y_test):
  problem_sum[pos] = problem_sum[pos] + scores
  problem_check[pos] = labels
  problem_count[pos] += 1

In [52]:
def n_tag(y_pred,y_test,n,right_type_count,wrong_type_count):
  """Score the ``n`` highest-scoring labels of one prediction.

  Parameters
  ----------
  y_pred : sequence of numbers
      One score per label.
  y_test : sequence
      Ground truth per label; a label counts as correct when its entry == 1.
  n : int
      How many top-scoring labels to check. ``n <= 0`` checks none
      (previously ``n == 0`` selected every label because of ``[-0:]``).
  right_type_count, wrong_type_count : mutable sequences of int
      Per-label tallies, incremented in place at the index of every checked
      label that was correct / incorrect.

  Returns
  -------
  tuple
      ``(wrong, right, right_type_count, wrong_type_count)`` where ``wrong``
      and ``right`` count the checked labels of this call only and the two
      tallies are the same objects that were passed in.
  """
  _w = 0
  _r = 0
  if n <= 0:
    return _w,_r,right_type_count, wrong_type_count
  # stable ascending sort: among equal scores the higher index ranks higher
  ranked = sorted(range(len(y_pred)), key=lambda position: y_pred[position])
  index_top = ranked[-n:][::-1]
  for index in index_top:
    if (y_test[index] == 1):
        _r+=1
        right_type_count[index] +=1
    else:
        _w+=1
        wrong_type_count[index] +=1
  return _w,_r,right_type_count, wrong_type_count

In [53]:
def visualize(type_count, labels=None):
    """Draw a horizontal bar chart with one bar per label.

    Parameters
    ----------
    type_count : sequence of numbers
        Bar widths, indexed by label column.
    labels : sequence of str, optional
        Bar labels, in the same order as ``type_count``. Defaults to
        ``mlb.classes_``: the label matrices come from ``mlb.transform``, so
        their columns follow the binarizer's class order, not the
        first-seen order of the ``tags`` list used before.
    """
    if labels is None:
        labels = list(mlb.classes_)
    plt.figure(figsize = (10,10))
    plt.barh(y = labels, width = type_count)
    plt.show()

In [54]:
# For n = 1 .. N-1: take the n highest summed scores of every test problem,
# count how many of those labels are right / wrong overall and per label,
# print the hit rate and plot both per-label tallies.
N = 5
# tally length follows the label matrix instead of a hard-coded 37
n_labels = y_test.shape[1]
for n in range(1,N):
  wrong = 0
  right = 0
  right_type_count = [0] * n_labels
  wrong_type_count = [0] * n_labels
  for problem_id in problem_sum:
    w, r, right_type_count, wrong_type_count = n_tag(problem_sum[problem_id],problem_check[problem_id],n,right_type_count,wrong_type_count)
    wrong+=w
    right+=r
  print(n,'tags and number of wrong and right prediction:',wrong,right)
  print(n,'tags result:',right/(wrong + right) * 100)
  visualize(right_type_count)
  visualize(wrong_type_count)

In [None]:
# Both tables should hold one entry per test problem.
print(*(len(table) for table in (problem_sum, problem_check)))

In [None]:
# Write the current model to a file in the working directory.
model.save('ast_nonattention_10.h5')

In [42]:
# Load previously saved weights into the current model.
# NOTE(review): this cell carries execution count 42, i.e. it was run before
# the training cell numbered 43 although it sits at the end of the notebook.
# On a top-to-bottom run it would overwrite the freshly trained weights after
# evaluation -- confirm where this cell is meant to live.
model.load_weights('../Result/ast_nonattention_40.h5')