In [1]:
import os
import glob
import numpy as np
import pandas as pd
import javalang

In [2]:
def getNgrams(n, text,allNgrams):
    nGrams = {}
    for i in range(len(text)-n+1):
        current = text[i]
        for j in range(i+1,n+i):
            current += " " + (text[j])
        if current not in allNgrams:
            allNgrams.append(current)
        if current not in nGrams:
            nGrams[current] = 1
        else:
            nGrams[current] += 1
    return nGrams,allNgrams

In [3]:
def getVector(word_list, full_word_list):
    vector = []
    for word in full_word_list:
        if word not in word_list:
            vector.append(0)
        else:
            vector.append(word_list[word])
    return vector

In [4]:
def getTokens (file):
    token_words = []

    tokens = list(javalang.tokenizer.tokenize(file))
    parser = javalang.parser.Parser(tokens)
    
    for i in tokens:
       token_words.append(type(i).__name__)
    
    return token_words

In [5]:
def getValues (file):
    token_words = []

    tokens = list(javalang.tokenizer.tokenize(file))
    parser = javalang.parser.Parser(tokens)
    
    for i in tokens:
       token_words.append(i.value)
    
    return token_words

In [30]:
df = pd.read_csv('../labels.csv')

df['prediction'] = 0
df

data_path = './data'
train_data_path = os.path.join(data_path, 'Train')
java_folder_path = train_data_path + '/*.java'
        
file_pair_list = []

for _, folder_name in enumerate(glob.glob(train_data_path + '/*/')):
    file_pair = []
    for _, file_name in enumerate(glob.glob(folder_name + '/*.java')):
        file_pair.append(file_name)
    
    file_pair_list.append(file_pair)

texts = []
labels = []

for file_pair in file_pair_list:
    with open(file_pair[0], 'r', encoding = 'utf8') as file1, open(file_pair[1], 'r', encoding = 'utf8') as file2:
        file1_name = os.path.basename(file_pair[0])[:-5]
        file2_name = os.path.basename(file_pair[1])[:-5]

        expected = 1
        current_row = df.loc[(df['sub1'] == file1_name) & (df['sub2'] == file2_name)]

        if len(current_row) > 0 and current_row.iloc[0]['verdict'] == 0:
            expected = 0

        file1_string = file1.read()
        file2_string = file2.read()
        
        files_string = ' '.join(getValues(file1_string)) + " " +  ' '.join(getValues(file2_string))

        texts.append(files_string)
        labels.append(expected)

In [25]:
import tensorflow as tf

VOCAB_SIZE = 1000

encoder = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE)

encoder.adapt(texts)

model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=32,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation = 'sigmoid')
])

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_5 (Text  (None, None)              0         
 Vectorization)                                                  
                                                                 
 embedding_5 (Embedding)     (None, None, 32)          32000     
                                                                 
 bidirectional_5 (Bidirecti  (None, 64)                16640     
 onal)                                                           
                                                                 
 dense_10 (Dense)            (None, 64)                4160      
                                                                 
 dense_11 (Dense)            (None, 1)                 65        
                                                                 
Total params: 52865 (206.50 KB)
Trainable params: 5286

In [26]:
history = model.fit(x = texts, y = labels, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [31]:
model.evaluate(x = texts, y = labels)



[0.7442188262939453, 0.7206477522850037]

In [17]:
model.save('./models/plaigarism_detector.keras')

In [20]:
from tensorflow.keras.models import load_model

model2 = load_model('./models/plaigarism_detector.keras')

model2.evaluate(x = texts, y = labels)



[0.7596820592880249, 0.7530364394187927]