In [1]:
import os
import glob
import numpy as np
import pandas as pd

In [2]:
def getNgrams(n, text,allNgrams):
    nGrams = {}
    for i in range(len(text)-n+1):
        current = text[i]
        for j in range(i+1,n+i):
            current += " " + (text[j])
        if current not in allNgrams:
            allNgrams.append(current)
        if current not in nGrams:
            nGrams[current] = 1
        else:
            nGrams[current] += 1
    return nGrams,allNgrams


In [3]:
def getVector(word_list, full_word_list):
    vector = []
    for word in full_word_list:
        if word not in word_list:
            vector.append(0)
        else:
            vector.append(word_list[word])
    return vector

In [4]:
def cosineSimilarity(word_vectors):
    
    for i in range(len(word_vectors)):
        word_vectors[i] = np.array(word_vectors[i])
        
    numerator = word_vectors[0]

    for i in range(1, len(word_vectors)):
        numerator = numerator * word_vectors[i]
    
    numerator = np.sum(numerator)

    denominator = 1

    for i in range(len(word_vectors)):
        denominator *= np.sqrt(np.sum(word_vectors[i] ** 2))

    return numerator / denominator

In [5]:
import javalang

def getTokens (file):
    token_words = []

    tokens = list(javalang.tokenizer.tokenize(file))
    parser = javalang.parser.Parser(tokens)
    
    for i in tokens:
       token_words.append(type(i).__name__)
    
    return token_words

In [6]:
import javalang

def getValues (file):
    token_words = []

    tokens = list(javalang.tokenizer.tokenize(file))
    parser = javalang.parser.Parser(tokens)
    
    for i in tokens:
       token_words.append(i.value)
    
    return token_words

In [7]:
def split_list(lst, chunk_size):
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]

In [8]:
import math
def duplicate(values_min, values_max):
    div = math.ceil(len(values_max) / len(values_min))
    chunk_size =math.ceil( len(values_max) / div)
    values2 = split_list(values_max,chunk_size)

    allNgrams = []
    maxCos = 0
    for i in values2:
        nGrams1,allNgrams = getNgrams(3,values_min,allNgrams)
        nGrams2,allNgrams = getNgrams(3,i,allNgrams)

        vector1 = getVector(nGrams1,allNgrams)
        vector2 = getVector(nGrams2,allNgrams)

        aux = cosineSimilarity([vector1, vector2])

        if(maxCos < aux):
            maxCos = aux

    return maxCos
        

In [9]:
def get_PredictionTokens (file1_string, file2_string):

    allNgrams = []

    token_words1 = getTokens(file1_string)
    token_words2 = getTokens(file2_string)

    maxCos = 0
    if (len(token_words1) / len(token_words2)) > 2:
        maxCos = (duplicate(token_words2,token_words1))

    elif (len(token_words2) / len(token_words1)) > 2:
        maxCos = (duplicate(token_words1,token_words2))

    nGrams1,allNgrams = getNgrams(3,token_words1,allNgrams)
    nGrams2,allNgrams = getNgrams(3,token_words2,allNgrams)

    vector1 = getVector(nGrams1,allNgrams)
    vector2 = getVector(nGrams2,allNgrams)

    cos = cosineSimilarity([vector1, vector2])
    if maxCos > cos:
        return(maxCos)
    else:
        return(cos)
    

In [11]:
def get_PredictionValues (file1_string, file2_string):

    allNgrams = []

    values1 = getValues(file1_string)
    values2 = getValues(file2_string)
    maxCos = 0
    if (len(values1) / len(values2)) > 2:
        maxCos = (duplicate(values2,values1))

    elif (len(values2) / len(values1)) > 2:
        maxCos = (duplicate(values1,values2))

    nGrams1,allNgrams = getNgrams(3,values1,allNgrams)
    nGrams2,allNgrams = getNgrams(3,values2,allNgrams)

    vector1 = getVector(nGrams1,allNgrams)
    vector2 = getVector(nGrams2,allNgrams)

    cos = cosineSimilarity([vector1, vector2])
    if maxCos > cos:
        return(maxCos)
    else:
        return(cos)

In [12]:
import numpy as np

def getWordList(text, unique_word_list):
    
    text = text + " "
    word = ""
    unique_words = unique_word_list
    all_words = []

    for letter in text:
        if letter != " " and letter != '\n':
            if letter.isalpha():
                word += letter.lower()
        else:
            if word not in unique_words:
                unique_words.append(word)
            all_words.append(word)
            word = ""
        
    return all_words, unique_words

def makeMatrix(text_list, unique_word_list):
    unique_word_count = len(unique_word_list)

    matrix = np.zeros(shape = (unique_word_count, unique_word_count))
    
    for i in range(len(text_list[:-1])):
        current_position = unique_word_list.index(text_list[i])
        next_position = unique_word_list.index(text_list[i+1])

        matrix[current_position][next_position] += 1

    for i in range(unique_word_count):
        if matrix[i].sum() != 0:
            matrix[i] = matrix[i] / matrix[i].sum()

    return matrix

def getWordListCode(text, unique_word_list):
    
    text = text + " "
    unique_words = unique_word_list
    word = ""
    all_words = []

    for letter in text:
        if letter != " " and letter != '\n':
            word += letter.lower()
        elif len(word) > 0:
            if word[0].isalpha() and word not in unique_words:
                word = "variable"

            if word not in unique_words:
                unique_words.append(word)

            all_words.append(word)
            word = ""
            
    return all_words, unique_word_list


In [45]:

def markovSImilarity(file1_string, file2_string):
    code1 = file1_string
    code2 = file2_string
    
    code1_list = getValues(code1)
    code2_list = getValues(code2)

    full_word_list_code = list(set(code1_list + code2_list))

    A = makeMatrix(code1_list, full_word_list_code)
    B = makeMatrix(code2_list, full_word_list_code)

    BT = np.transpose(B)
    C = np.matmul(BT, A)
    prod_int = np.trace(C)

    normA = np.sqrt(np.trace(np.matmul(np.transpose(A), A)))
    normB = np.sqrt(np.trace(np.matmul(np.transpose(B), B)))

    cos_ang = prod_int / (normA * normB)

    '''
    print("prod_int: ", prod_int)
    print("normA: ", normA)
    print("normB: ", normB)
    print("cos_ang: ", cos_ang)
    '''

    return cos_ang


In [48]:
df = pd.read_csv('labels.csv')

df['prediction'] = 0
df

data_path = './data'
train_data_path = os.path.join(data_path, 'Train')
java_folder_path = train_data_path + '/*.java'
        
file_pair_list = []

for _, folder_name in enumerate(glob.glob(train_data_path + '/*/')):
    file_pair = []
    for _, file_name in enumerate(glob.glob(folder_name + '/*.java')):
        file_pair.append(file_name)
    
    file_pair_list.append(file_pair)

total = 0
correct = 0
correct_tokens = 0
correct_total = 0
correct_markov = 0

for file_pair in file_pair_list:
    with open(file_pair[0], 'r', encoding = 'utf8') as file1, open(file_pair[1], 'r', encoding = 'utf8') as file2:
        file1_name = os.path.basename(file_pair[0])[:-5]
        file2_name = os.path.basename(file_pair[1])[:-5]

        expected = 1
        current_row = df.loc[(df['sub1'] == file1_name) & (df['sub2'] == file2_name)]

        if len(current_row) > 0 and current_row.iloc[0]['verdict'] == 0:
            expected = 0

        file1_string = file1.read()
        file2_string = file2.read()

        
        prediction_values = get_PredictionValues(file1_string, file2_string)
       
        if prediction_values > 0:
            if (prediction_values >= 0.7 and expected == 1) or (prediction_values < 0.7 and expected == 0):
                correct += 1
            total += 1

        prediction_tokens = get_PredictionTokens(file1_string, file2_string)
        
        if prediction_tokens > 0:
            if (prediction_tokens >= 0.988 and expected == 1) or (prediction_tokens < 0.987 and expected == 0):
                correct_tokens += 1
                
        if ((prediction_values >= 0.7 or prediction_tokens >= 0.988) and expected == 1) or ((prediction_values < 0.7 or prediction_tokens < 0.987) and expected == 0):
            correct_total += 1
        

        prediction_markov = markovSImilarity(file1_string, file2_string)

        if prediction_markov > 0:
            if (prediction_markov >= 0.7 and expected == 1) or (prediction_markov < 0.7 and expected == 0):
                correct_markov += 1
        

print(total)
print(correct/total)
print(correct_tokens/total)
print(correct_total/total)
print(correct_markov/total)

247
0.8137651821862348
