In [29]:
import tensorflow as tf
import keras
import keras.backend as K
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import tokenizer_from_json
from keras_preprocessing.sequence import pad_sequences
from keras.layers import Input, Concatenate, Conv2D, Flatten, Dense, Embedding, LSTM
from keras.models import Model

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
from bs4 import BeautifulSoup
import json
import os.path



## Read data using pandas


In [2]:
path1 = "/home/erik/TU/ni/plagiate_labeltool/data/labled/PPR [SoSe21]-9. Hausaufgabe - Pflichttest C-Antworten_plagiate.csv"
path2 = "/home/erik/TU/ni/plagiate_labeltool/data/labled/PPR [SoSe21]-9. Hausaufgabe - Pflichttest C-Antworten_labled.csv"
path3 = "/home/erik/TU/ni/plagiate_labeltool/data/labled/PPR [WS2021]-9. Hausaufgabe - Pflichttest C-Antworten_labled.csv"
df1 = pd.read_csv(path1)
df2 = pd.read_csv(path2)
df3 = pd.read_csv(path3)
df3 = df3.drop('Unnamed: 0', axis=1)

This is what the data looks like:


In [3]:
# df1.head(2)
# df2.head(2)
# df2.dtypes

## Remove Given Code from Student Solution


In [5]:
def get_given_code(file):
    try:
        with open(file) as xmlstr:
            soup = BeautifulSoup(xmlstr, 'xml')
            answerpreload = soup.find('answerpreload').text
            questiontext = soup.find('questiontext').text
            return answerpreload, questiontext
    except FileNotFoundError:
        return "Keine Vorgabedatei im Repo gefunden", "Keine Vorgabedatei im Repo gefunden"


def remove_given_code(code, preload_file_path):
    answerpreload, _ = get_given_code(preload_file_path)
    # remove empty lines
    code = '\n'.join([s for s in code.splitlines() if s.strip() != ''])
    if answerpreload == 'Keine Vorgabedatei im Repo gefunden':
        return code
    # print(answerpreload)
    answerpreload = answerpreload.replace('\tokenizer', '').replace('\r', '')
    answerpreload = '\n'.join([s for s in answerpreload.splitlines() if s.strip() != ''])
    for ap_tmp in answerpreload.splitlines():
        ap_tmp = re.escape(ap_tmp)
        ap_tmp = '^' + ap_tmp + '$'
        # {{ cr_random.f1 }} --> \S*
        ap_tmp = re.sub(
            r"\\{\\{\\\s*\S+\s*\\}\\}", r"\\S*", ap_tmp)
        for code_tmp in code.splitlines():
            # print((re.match(ap_tmp, code_tmp.replace('\tokenizer', '').replace('\r', ''))!=None, ap_tmp, code_tmp))
            if re.match(ap_tmp, code_tmp.replace('\tokenizer', '').replace('\r', '')):
                code = code.replace(code_tmp+'\n', '', 1)
                break
    return code


Concat the DataFrames


In [6]:
df_list = [df1, df2]

def remove_given_code_from_df(df_list):
    for df in df_list:
        semester = df["semester"].values[0]
        ha = df["ha"].values[0]
        prog_lang = df["prog_lang"].values[0]
        task = df["task"].values[0]
        answerpreload_path = f'../data/code_templates/PPR [{semester}]-{ha}. Hausaufgabe - Pflichttest {prog_lang}-Antworten_{task}.xml'
        df['code1'] = [remove_given_code(code, answerpreload_path) for code in df['code1']]
        df['code2'] = [remove_given_code(code, answerpreload_path) for code in df['code2']]

remove_given_code_from_df(df_list)
concat_df = pd.concat(df_list, ignore_index=True)
concat_df = concat_df.drop('Unnamed: 0', axis=1)

## Use WS2021 Dataset


In [7]:
# prüfe ob code von Aufgabe a oder b ist und entferne die jeweilige vorgabe
def remove_given_code_from_df_WS2021(df_list):
    for df in df_list:
        semester = df["semester"].values[0]
        ha = df["ha"].values[0]
        prog_lang = df["prog_lang"].values[0]
        task = df["task"].values[0]
        number = int(re.findall('(\d+)', task)[0])
        task_2 = re.sub('\d+', str(number-1), task)
        answerpreload_path = f'../data/code_templates/PPR [{semester}]-{ha}. Hausaufgabe - Pflichttest {prog_lang}-Antworten_{task}.xml'
        alternative_answerpreload_path = f'../data/code_templates/PPR [{semester}]-{ha}. Hausaufgabe - Pflichttest {prog_lang}-Antworten_{task_2}.xml'
        # möglich das mehr automatisiert zu machen?
        ap = re.escape('char test[11]= "0123456789";')
        df['code1'] = [remove_given_code(code, alternative_answerpreload_path) if re.findall(ap,code) else remove_given_code(code, answerpreload_path) for code in df['code1']]
        df['code2'] = [remove_given_code(code, alternative_answerpreload_path) if re.findall(ap,code) else remove_given_code(code, answerpreload_path) for code in df['code2']]

remove_given_code_from_df_WS2021([df3])


## Split train and test data


### Load splited data


In [24]:
data_train = pd.read_csv('../data/labled/train_data_HA9.csv',sep=',')
data_val = pd.read_csv('../data/labled/val_data_HA9.csv',sep=',')
data_test= pd.read_csv('../data/labled/test_data_HA9.csv',sep=',')
X_train, y_train = data_train[['code1','code2']], data_train[['label']] 
X_val, y_val = data_val[['code1','code2']], data_val[['label']]
X_test, y_test = data_test[['code1','code2']], data_test[['label']]

### Create splited data


In [9]:
# X_temp, X_test, y_temp, y_test = train_test_split(concat_df[['code1', 'code2']], concat_df['label'], test_size=0.2, random_state=42)
# X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42)

### Save splited data


In [15]:
# pd.concat([X_train,y_train], axis=1).to_csv('../data/labled/train_data_HA9.csv',sep=',',index=False)
# pd.concat([X_val,y_val], axis=1).to_csv('../data/labled/val_data_HA9.csv',sep=',',index=False)
# pd.concat([X_test,y_test], axis=1).to_csv('../data/labled/test_data_HA9.csv',sep=',',index=False)

In [30]:
#TODO remove newlines and ect
def striphtml(data): 
    return re.sub('<.*?>', ' ', str(data)) 

def stripunc(data): 
    return re.sub('[^A-Za-z%\._\[\]]+', ' ', str(data), flags=re.MULTILINE|re.DOTALL) 

In [26]:
# csv_path = "/home/erik/TU/ni/plagiate_labeltool/data/labled/PPR [WS2021]-9. Hausaufgabe - Pflichttest C-Antworten_labled.csv"
# df_test_remove = pd.read_csv(csv_path)

# semester = 'WS2021'
# ha = '9'
# prog_language = 'C'
# task = 'Antwort 9'
# answerpreload_path = f'../data/code_templates/PPR [{semester}]-{ha}. Hausaufgabe - Pflichttest {prog_language}-Antworten_{task}.xml'
# # print(df_test_remove['code2'].values[2])
# result = remove_given_code(df_test_remove['code2'].values[2], answerpreload_path)
# # print(result)


## Preprocess the pairs for the models


In [90]:
def train_tokenizer(data, tokenizer): 
    data['code1_strip'] = data[['code1']].apply(lambda x:striphtml(stripunc((x[0]))), axis=1)
    data['code2_strip'] = data[['code2']].apply(lambda x:striphtml(stripunc((x[0]))), axis=1)
    data['code12'] = data[['code1_strip','code2_strip']].apply(lambda x:str(x[0])+" "+str(x[1]), axis=1)
    tokenizer.fit_on_texts(data['code12'].values)
    return tokenizer

def preprocess_student_pairs(data, tokenizer): 
    # answerpreload_path = f'../data/code_templates/PPR [{semester}]-{ha}. Hausaufgabe - Pflichttest {prog_language}-Antworten_{task}.xml'
    # data['code1_strip'] = data[['code1']].apply(lambda x:remove_given_code(x[0]), axis=1)
    # data['code2_strip'] = data[['code2']].apply(lambda x:striphtml(stripunc((x[0]))), axis=1)
    data['code1_strip'] = data[['code1']].apply(lambda x:striphtml(stripunc((x[0]))), axis=1)
    data['code2_strip'] = data[['code2']].apply(lambda x:striphtml(stripunc((x[0]))), axis=1)
    data['code1_tokend'] = data['code1_strip'].apply(lambda x:tokenizer.texts_to_sequences([str(x)]))
    data['code2_tokend'] = data['code2_strip'].apply(lambda x:tokenizer.texts_to_sequences([str(x)]))
    data['code1_padded'] = data['code1_tokend'].apply(lambda x:pad_sequences(x, maxlen=256, padding='post', truncating='post'))
    data['code2_padded'] = data['code2_tokend'].apply(lambda x:pad_sequences(x, maxlen=256, padding='post', truncating='post'))
    data['code12_padded'] = [np.concatenate((x[0], x[1]), axis=None) for x in data[['code1_padded','code2_padded']].values]
    x = [np.asarray(x).astype('int32') for x in data['code12_padded']]
    x = tf.convert_to_tensor(x)
    # x = tf.convert_to_tensor(x)
    x_array = np.array(x)
    return x_array

### Load a Tokenizer

In [87]:
def load_tokenizer(parent_folder):
    with open(parent_folder + 'tokenizer.json') as file:
        content = json.load(file)
        return tokenizer_from_json(content)

In [91]:
#TODO fit_on_text nur für x_train benutzen
#TODO anderen algo benutzen
#TODO Tokenizer -> TextVectorization
# tokenizer = load_tokenizer('../data/model/trained_tokenizer/')
tokenizer = Tokenizer()
tokenizer = train_tokenizer(X_train,tokenizer)
x_train = preprocess_student_pairs(X_train,tokenizer)
x_val = preprocess_student_pairs(X_val,tokenizer)
x_test = preprocess_student_pairs(X_test,tokenizer)
# X_test_df3 = preprocess_student_pairs()


In [92]:
tokenizer.texts_to_sequences(['include stdio.h include stdlib.h typedef struct _String char str unsigned int str_len String String arguments int arg_count char args String arguments int arg_count char args String arguments malloc sizeof String int count arguments[ ].str_len arguments[ ].str malloc sizeof char for int i i arg_count i char arg args[i] int j char c arg[j] while c if c x c X c y c Y c q c Q j c arg[j] continue arguments[ ].str[count] c count j c arg[j] arguments[ ].str_len count arguments[ ].str_len arguments[ ].str malloc sizeof char for int i i arguments[ ].str_len i if i% arguments[ ].str[arguments[ ].str_len] arguments[ ].str[i] arguments[ ].str_len return arguments int main int argc char argv String str arguments argc argv printf x y q und X Y Q aussortiert %s nDer neue String lautet %s str[ ].str str[ ].str return   include stdio.h int main int argc char argv char array [ ] char array [ ] int k for int i i argc i for int j argv[i][j] j if argv[i][j] x argv[i][j] argv[i][j] y argv[i][j] q argv[i][j] X argv[i][j] Y argv[i][j] Q array[k] argv[i][j] k array[k] printf x y q und X Y Q aussortiert %s n array for int i i k i array [i] array[ i] printf Der neue String lautet %s array '])

[[20,
  27,
  17,
  20,
  63,
  17,
  209,
  242,
  11,
  6,
  3,
  210,
  2,
  3,
  44,
  11,
  11,
  43,
  2,
  61,
  21,
  6,
  85,
  11,
  43,
  2,
  61,
  21,
  6,
  85,
  11,
  43,
  86,
  69,
  11,
  2,
  21,
  43,
  3,
  44,
  43,
  3,
  86,
  69,
  6,
  10,
  2,
  1,
  1,
  61,
  21,
  1,
  6,
  61,
  85,
  1,
  2,
  4,
  6,
  40,
  61,
  4,
  36,
  40,
  13,
  40,
  8,
  40,
  8,
  40,
  7,
  40,
  7,
  40,
  9,
  40,
  9,
  4,
  40,
  61,
  4,
  48,
  43,
  3,
  21,
  40,
  21,
  4,
  40,
  61,
  4,
  43,
  3,
  44,
  21,
  43,
  3,
  44,
  43,
  3,
  86,
  69,
  6,
  10,
  2,
  1,
  1,
  43,
  3,
  44,
  1,
  13,
  1,
  43,
  3,
  43,
  3,
  44,
  43,
  3,
  1,
  43,
  3,
  44,
  54,
  43,
  2,
  22,
  2,
  18,
  6,
  5,
  11,
  3,
  43,
  18,
  5,
  12,
  8,
  7,
  9,
  23,
  8,
  7,
  9,
  24,
  14,
  75,
  25,
  11,
  26,
  14,
  3,
  3,
  3,
  3,
  54,
  20,
  27,
  17,
  2,
  22,
  2,
  18,
  6,
  5,
  6,
  34,
  6,
  34,
  2,
  28,
  10,
  2,
  1,
  1,
  18,
  1,
  10

### Save a Tokenizer

In [34]:
def save_tokenizer(tokenizer,parent_folder):
    with open(parent_folder + 'tokenizer.json', 'w', encoding='utf-8') as file:
        file.write(json.dumps(tokenizer.to_json(), ensure_ascii=False))

# save_tokenizer(tokenizer,'../data/model/trained_tokenizer/')

### My Model


In [93]:
class simple_nn():
    def __init__(self) -> None:
        self.model = 0
    
    def create_simple_nn_model(self):
        self.model = tf.keras.Sequential()
        self.model.add(tf.keras.Input(shape=(256*2,), dtype='int32')),
        self.model.add(tf.keras.layers.Dense(256*2, activation='relu')),
        self.model.add(tf.keras.layers.Dense(256*2, activation='relu')),
        self.model.add(tf.keras.layers.Dense(256*2, activation='relu')),
        # self.model.add(tf.keras.layers.Dense(4, activation='relu')),
        self.model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
        self.model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        # self.model.build(input_shape=(1,256))
        # plot_model(self.model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

    def load_nn_model(self):
        path_to_model = '../data/model/new_trained_simple_model'
        from keras.models import load_model
        self.model = load_model(path_to_model)

    def save_nn_model(self):
        path_to_model = '../data/model/new_trained_simple_model'
        if os.path.isfile(path_to_model) is False:
            self.model.save(path_to_model)

### Load our trained model


In [69]:
Model = simple_nn()


### Save our trained model


In [59]:
Model.sav

INFO:tensorflow:Assets written to: ../data/model/new_trained_simple_model/assets


### Create our simple nn model

In [94]:
# Model.load_nn_model()
Model.create_simple_nn_model()

In [95]:
print(Model.model)

<keras.engine.sequential.Sequential object at 0x7f8e9c300be0>


In [96]:
print(x_train.shape)
print(y_train.shape)
print(x_val.shape)
print(y_val.shape)

(189, 512)
(189, 1)
(48, 512)
(48, 1)


In [97]:
Model.model.fit(x_train, y_train.values.reshape(-1,1), batch_size=32, epochs=5, validation_data=(x_val,y_val.values.reshape(-1,1)))


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f8e9c1a84c0>

In [74]:
print(len(X_train['code12_padded']))
print(len(X_train['code12_padded'][0]))
print(X_train['code12_padded'].dtype)
print(X_train['code12_padded'][0].dtype)

# def show_shapes(): # can make yours to take inputs; this'll use local variable values
#     print("Expected: (num_samples, timesteps, channels)")
#     print("Sequences: {}".format(Sequences.shape))
#     print("Targets:   {}".format(Targets.shape))

[print(i.shape, i.dtype) for i in Model.model.inputs]
[print(o.shape, o.dtype) for o in Model.model.outputs]
[print(l.name, l.input_shape, l.dtype) for l in Model.model.layers]


189
512
object
int32
(None, 512) <dtype: 'int32'>
(None, 1) <dtype: 'float32'>
dense_4 (None, 512) float32
dense_5 (None, 512) float32
dense_6 (None, 512) float32
dense_7 (None, 512) float32


[None, None, None, None]

In [20]:
Model.model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 512)               262656    
                                                                 
 dense_1 (Dense)             (None, 512)               262656    
                                                                 
 dense_2 (Dense)             (None, 512)               262656    
                                                                 
 dense_3 (Dense)             (None, 1)                 513       
                                                                 
Total params: 788,481
Trainable params: 788,481
Non-trainable params: 0
_________________________________________________________________


### Evaluate my Model


In [98]:
Model.model.evaluate(x_test, y_test.values)
Model.model.evaluate(x_val, y_val.values)




[14.631500244140625, 0.4791666567325592]

## My first Baseline Model


## Simple Text Classification


In [None]:
#https://www.tensorflow.org/tutorials/keras/text_classification
# model = tf.keras.Sequential([
#   layers.Embedding(max_features + 1, embedding_dim),
#   layers.Dropout(0.2),
#   layers.GlobalAveragePooling1D(),
#   layers.Dropout(0.2),
#   layers.Dense(1)])


### Quora siames model


In [99]:
from keras.regularizers import l2
from keras.models import Sequential
from keras.optimizers import Adam
from keras.layers import Conv2D, ZeroPadding2D, Activation, Input, concatenate
from keras.models import Model

from keras.layers import BatchNormalization
# from tensorflow.keras.layers import (
#     BatchNormalization, SeparableConv2D, MaxPooling2D, Activation, Flatten, Dropout, Dense
# )
from keras.layers.pooling import MaxPooling2D
from keras.layers import concatenate
from keras.layers.core import Lambda, Flatten, Dense
from keras.initializers import glorot_uniform
from keras.layers import Input, Dense, Flatten, GlobalMaxPool2D, GlobalAvgPool2D, Concatenate, Multiply, Dropout, Subtract, Add, Conv2D
from keras import backend as K

def cosine_distance(vests):
    x, y = vests
    x = K.l2_normalize(x, axis=-1)
    y = K.l2_normalize(y, axis=-1)
    return -K.mean(x * y, axis=-1, keepdims=True)

def cos_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0],1)

from sklearn.metrics import roc_auc_score

def auroc(y_true, y_pred):
    return tf.py_func(roc_auc_score, (y_true, y_pred), tf.double)


In [100]:
#https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
embeddings_index = {}
f = open('/home/erik/Downloads/test_pretrained/glove.6B.300d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [101]:
not_present_list = []
vocab_size = len(tokenizer.word_index) + 1
print('Loaded %s word vectors.' % len(embeddings_index))
embedding_matrix = np.zeros((vocab_size, len(embeddings_index['no'])))
for word, i in tokenizer.word_index.items():
    if word in embeddings_index.keys():
        embedding_vector = embeddings_index.get(word)
    else:
        not_present_list.append(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    else:
        embedding_matrix[i] = np.zeros(300)


Loaded 400000 word vectors.


In [49]:
# x_array
len(y_train.values)

189

In [None]:
input_1 = Input(shape=(x_train.shape[1],))
input_2 = Input(shape=(x_train.shape[1],))


common_embed = Embedding(name="synopsis_embedd",input_dim =len(tokenizer.word_index)+1, 
                       output_dim=len(embeddings_index['no']),weights=[embedding_matrix], 
                       input_length=x_train.shape[1],trainable=False) 
lstm_1 = common_embed(input_1)
lstm_2 = common_embed(input_2)


common_lstm = LSTM(64,return_sequences=True, activation="relu")
vector_1 = common_lstm(lstm_1)
vector_1 = Flatten()(vector_1)

vector_2 = common_lstm(lstm_2)
vector_2 = Flatten()(vector_2)


In [None]:

x3 = Subtract()([vector_1, vector_2])
x3 = Multiply()([x3, x3])

x1_ = Multiply()([vector_1, vector_1])
x2_ = Multiply()([vector_2, vector_2])
x4 = Subtract()([x1_, x2_])
    
    #https://stackoverflow.com/a/51003359/10650182
x5 = Lambda(cosine_distance, output_shape=cos_dist_output_shape)([vector_1, vector_2])
    
conc = Concatenate(axis=-1)([x5, x4, x3])

x = Dense(100, activation="relu", name='conc_layer')(conc)
x = Dropout(0.01)(x)
out = Dense(1, activation="sigmoid", name = 'out')(x)

quora_model = Model([input_1, input_2], out)

quora_model.compile(loss="binary_crossentropy", metrics=['acc',auroc], optimizer=Adam(0.00001))

In [None]:
quora_model.fit(x_train, y_train.values.reshape(-1,1), batch_size=32, epochs=5, validation_data=(x_val,y_val.values.reshape(-1,1)))


### Reproduce the binary accuracy


In [None]:
# tf.keras.metrics.BinaryAccuracy()
m = tf.keras.metrics.BinaryAccuracy()
sum = 0
zeros = 0
ones = 0
for p,l in zip(predict, label[9000:15000]):
    m.update_state([[l]],[[p]])
    print(f"p: {p}; l: {l}; accuracy: {m.result().numpy()}")
    if m.result().numpy():
        sum = sum + 1
        if l:
            ones = ones + 1
        else:
            zeros = zeros + 1
    m.reset_state()
print(f"total accuracy: {sum/(15000-9000)}")
print(f"number positive accuracy through zeros (round off): {zeros}")
print(f"number positive accuracy through ones (round up): {ones}")

## InifiniteMonkey


In [None]:
# maybe https://machinelearningmastery.com/prepare-text-data-machine-learning-scikit-learn/
# def build_infinitemonkey_baseline():
    # n grams
    # tf-idf weighting
    # cosine simularity
    # grid search


# def build_infinitemonkey_model():
    # 2x encoder
    # embedding (32)
    # BatchNorm
    # LSTM(128)
    # 2x comparison module
    # Concat the output of the two encoders as input
    # BatchNorm
    # Dense (128)
    # ReLu
    # 1x network
    # Sum the output of the two comparison moduels as input
    # BatchNorm
    # Dense (128)
    # ReLu
    # BatchNorm
    # Dense(1)
    # Sigmoid


### Plot the predictions


In [38]:
# print(predict)
# %matplotlib
# %matplotlib inline
# import matplotlib.pyplot as plt
# plt.figure(figsize = (10,5))
# plt.plot(predict)

# plt.savefig('predictions_of_test_data.jpg')
# step_size = np.arange(0.05,0.95,0.05)
# step_size[1]
# # plt.bar(step_size, np.round(predict,2))
# for i in range(0,10):
#     np.round(predict,2)

Using matplotlib backend: TkAgg


<Figure size 720x360 with 0 Axes>