mount Drive

In [1]:
# Mount Google Drive at /content/drive so the corpus archive and the stop-word
# list stored there can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


import libraries

In [None]:
!pip install hazm

In [184]:
from sklearn.metrics.pairwise import cosine_similarity
import os
import io
from hazm import *
import tqdm
import codecs
import gensim
from gensim.models import Word2Vec
import pandas as pd
import numpy as np

Extract the data archive (RAR)

In [None]:
# Extract the corpus archive stored on Drive; `x` keeps the archived folder
# structure, which the relative corpus paths used below rely on.
!unrar x '/content/drive/MyDrive/Cheating Detection/PersianPlagdet2016-text-alignment-corpus.rar'

read data paths

In [154]:
# Corpus folders to scan: index 0 is the source-document folder, index 1 the
# suspicious-document folder (later cells index this list by position).
folder_path = ['PersianPlagdet2016-text-alignment-train-corpus/src/', 'PersianPlagdet2016-text-alignment-train-corpus/susp/']

# Recursively gather the path of every .txt file under both folders,
# in the order os.walk yields them.
txt_files = [
    os.path.join(current_dir, name)
    for corpus_dir in folder_path
    for current_dir, _subdirs, names in os.walk(corpus_dir)
    for name in names
    if name.endswith(".txt")
]

In [155]:
# Sanity check: number of .txt documents found across both corpus folders.
len(txt_files)

3088

Pre-process

In [156]:
# Raw UTF-8 text of every document, keyed by the document's file path.
txt_collection = {}

for doc_path in txt_files:
    with open(doc_path, mode='r', encoding='utf-8') as handle:
        txt_collection[doc_path] = handle.read()

Normalize + Tokenize

In [169]:
hazm_normalizer = Normalizer()

# Per document path: the list of sentences, and the word tokens of each sentence.
txt_sentences = {}
txt_norm_tokenize = {}

for doc_path in tqdm.tqdm(txt_collection):
    # Normalize the raw text first, then split it into sentences.
    doc_sentences = sent_tokenize(hazm_normalizer.normalize(txt_collection[doc_path]))
    txt_sentences[doc_path] = doc_sentences

    # One token list per sentence, kept in sentence order.
    txt_norm_tokenize[doc_path] = [word_tokenize(sentence) for sentence in doc_sentences]

100%|██████████| 3088/3088 [00:23<00:00, 132.75it/s]


remove stop words + Lemmatize

In [170]:
stop_path = '/content/drive/MyDrive/Cheating Detection/persian_stopwords.txt'

# One stop word per line. Each entry goes through the same hazm normalizer as
# the documents so that it compares equal to the normalized tokens.
# The context manager guarantees the file handle is closed after reading.
with codecs.open(stop_path, 'r', 'utf-8') as stop_file:
    stopwords = [hazm_normalizer.normalize(line.strip()) for line in stop_file.readlines()]

In [171]:
lemmatizer = Lemmatizer()

# tokens_nonstop: per document, the lemmatized sentences that keep at least
#                 3 tokens after stop-word removal (used for matching).
# all_tokens_nonstop: every lemmatized sentence of every document, including
#                     short ones (used as the Word2Vec training corpus).
tokens_nonstop = {}
all_tokens_nonstop = []

# Membership is tested once per token, so use a set instead of scanning the
# stop-word list linearly each time.
stopword_set = set(stopwords)

for file in tqdm.tqdm(txt_norm_tokenize):
    kept_sentences = []
    for token_list in txt_norm_tokenize[file]:
        # The stop-word test is done on the raw token; underscores are stripped
        # only from the tokens that are kept, right before lemmatization.
        lemmas = [lemmatizer.lemmatize(t.replace("_", "")) for t in token_list if t not in stopword_set]
        all_tokens_nonstop.append(lemmas)

        # Very short sentences are excluded from the matching step.
        if len(lemmas) >= 3:
            kept_sentences.append(lemmas)

    tokens_nonstop[file] = kept_sentences

100%|██████████| 3088/3088 [00:11<00:00, 258.93it/s]


Word2Vec

In [174]:
# Train 100-dimensional word vectors (context window of 5) on every
# stop-word-filtered sentence. min_count = 1 keeps every token in the
# vocabulary, so tokens taken from this corpus can always be looked up later.
# NOTE(review): no seed is passed, so the vectors (and the scores derived from
# them) may differ between runs — confirm whether reproducibility is required.
model1 = gensim.models.Word2Vec(all_tokens_nonstop, min_count = 1, vector_size = 100, window = 5)

read pairs

In [175]:
# Column-wise accumulator for the labelled (suspicious, source) document pairs.
pairs_dict = {'suspicious': [], 'source': [], 'Label': []}


# (pairs file, label) for each corpus subset; every pair listed in a file
# receives that file's label.
# NOTE(review): '02-no-obfuscation' is labelled 0, the same as
# '01-no-plagiarism' — confirm this is the intended ground truth.
files = [('/content/PersianPlagdet2016-text-alignment-train-corpus/01-no-plagiarism/pairs', 0),
         ('/content/PersianPlagdet2016-text-alignment-train-corpus/02-no-obfuscation/pairs', 0),
         ('/content/PersianPlagdet2016-text-alignment-train-corpus/03-random-obfuscation/pairs', 1),
         ('/content/PersianPlagdet2016-text-alignment-train-corpus/04-simulated-obfuscation/pairs', 1)]


for pairs_path, pair_label in files:
    # The context manager closes the handle even if reading raises.
    with open(pairs_path, "r", encoding="utf-8") as pairs_file:
        content = pairs_file.read()

    for row in content.split('\n'):
        fields = row.split(' ')
        # Keep only rows made of exactly two space-separated names
        # (suspicious document, source document); anything else, such as the
        # empty string after the final newline, is skipped.
        if len(fields) == 2:
            susp, src = fields
            pairs_dict['suspicious'].append(susp)
            pairs_dict['source'].append(src)
            pairs_dict['Label'].append(pair_label)

In [176]:
# One row per labelled pair, with columns 'suspicious', 'source' and 'Label'.
df = pd.DataFrame(pairs_dict)

In [177]:
# Display the pairs table.
df

Unnamed: 0,suspicious,source,Label
0,suspicious-document00001100017.txt,source-document00001222909.txt,0
1,suspicious-document00001100593.txt,source-document00001183668.txt,0
2,suspicious-document0000110086.txt,source-document00001405402.txt,0
3,suspicious-document00001101968.txt,source-document00001213543.txt,0
4,suspicious-document00001102809.txt,source-document00001320655.txt,0
...,...,...,...
2744,suspicious-document0000187571.txt,source-document00001144833.txt,1
2745,suspicious-document0000187571.txt,source-document000012725.txt,1
2746,suspicious-document0000194397.txt,source-document00001307951.txt,1
2747,suspicious-document0000194397.txt,source-document00001308967.txt,1


In [179]:
def jaccard_similarity(list1, list2):
    """Return the Jaccard similarity of two token collections.

    Both inputs are reduced to sets, so duplicates and order are ignored.
    The result is |intersection| / |union|, a float in [0, 1].

    Args:
        list1: iterable of hashable tokens.
        list2: iterable of hashable tokens.

    Returns:
        The Jaccard index as a float. When both inputs are empty the union is
        empty as well; 0.0 is returned in that case (two empty sentences are
        not treated as a match) instead of raising ZeroDivisionError.
    """
    set1 = set(list1)
    set2 = set(list2)
    union = set1 | set2

    # Guard the division: an empty union means both inputs were empty.
    if not union:
        return 0.0

    return len(set1 & set2) / len(union)

In [180]:
def sent_representation(path):
    """Embed every retained sentence of a document as a mean word vector.

    Args:
        path: key into ``tokens_nonstop`` identifying the document.

    Returns:
        A list with one 1-D numpy array per sentence in
        ``tokens_nonstop[path]`` (same order). Each array is the average of
        the ``model1`` word vectors of the sentence's tokens; a sentence with
        no tokens yields an all-zero vector.

    Raises:
        KeyError: if ``path`` is not in ``tokens_nonstop`` or a token is
            missing from the model vocabulary.
    """
    # Take the dimensionality from the trained model rather than repeating the
    # literal used at training time, so the two can never drift apart.
    dim = model1.wv.vector_size

    sentence_rep = []

    for sent in tokens_nonstop[path]:
        sent_rep = np.zeros(dim)
        for token in sent:
            sent_rep += model1.wv[token]

        # Avoid dividing by zero for an empty sentence.
        if len(sent) != 0:
            sent_rep /= len(sent)

        sentence_rep.append(sent_rep)

    return sentence_rep

In [181]:
def classifier(susp_path, plag_path, cos_threshold=0.3, jaccard_threshold=0.2):
    """Decide whether a suspicious document shares content with a source document.

    A sentence pair counts as a match when the cosine similarity of the two
    sentence embeddings is at least ``cos_threshold`` AND the Jaccard
    similarity of their token lists is at least ``jaccard_threshold``.
    The document pair is flagged as soon as one matching sentence pair exists.

    Args:
        susp_path: key of the suspicious document in ``tokens_nonstop``.
        plag_path: key of the candidate source document in ``tokens_nonstop``.
        cos_threshold: minimum embedding cosine similarity (default 0.3).
        jaccard_threshold: minimum token-set Jaccard similarity (default 0.2).

    Returns:
        1 if at least one sentence pair matches, otherwise 0.
    """
    susp_sent_rep = sent_representation(susp_path)
    plag_sent_rep = sent_representation(plag_path)

    # With no sentences on either side there is nothing to match; returning
    # here also avoids handing an empty array to cosine_similarity.
    if len(susp_sent_rep) == 0 or len(plag_sent_rep) == 0:
        return 0

    # One call yields the full (n_susp_sentences, n_source_sentences) matrix
    # instead of one sklearn call per sentence pair.
    cos_sim = cosine_similarity(np.vstack(susp_sent_rep), np.vstack(plag_sent_rep))

    susp_tokens = tokens_nonstop[susp_path]
    plag_tokens = tokens_nonstop[plag_path]

    # Only pairs passing the cheap embedding filter get the Jaccard check.
    for idx1, idx2 in zip(*np.nonzero(cos_sim >= cos_threshold)):
        if jaccard_similarity(susp_tokens[idx1], plag_tokens[idx2]) >= jaccard_threshold:
            # A single match decides the outcome, so stop immediately.
            return 1

    return 0

In [182]:
# acc counts the pairs whose predicted label equals the ground-truth label;
# h is the 1-based number of pairs processed so far.
acc = 0
h = 0

pair_columns = zip(df['suspicious'], df['source'], df['Label'])

for h, (susp_name, src_name, label) in enumerate(pair_columns, start=1):
    # Progress marker every 150 pairs.
    if h % 150 == 0:
        print(h)

    # folder_path[1] is the suspicious folder, folder_path[0] the source folder.
    susp_path = os.path.join(folder_path[1], susp_name)
    plag_path = os.path.join(folder_path[0], src_name)

    acc += 1 if classifier(susp_path, plag_path) == label else 0

150
300
450
600
750
900
1050
1200
1350
1500
1650
1800
1950
2100
2250
2400
2550
2700


In [199]:
# Fraction of labelled pairs that were classified correctly.
print('Accuracy : ', acc / len(df))

Accuracy :  0.9054201527828302


Find the 5 source documents most similar to a suspicious document

In [193]:
# Path of the suspicious document whose most similar sources are searched for;
# folder_path[1] is the suspicious-document folder.
susp_path = os.path.join(folder_path[1], 'suspicious-document0000194397.txt')

In [194]:
# Paths of all .txt files under the source-document folder, in os.walk order;
# these are the candidates the suspicious document is compared against.
Documents_paths = [
    os.path.join(current_dir, name)
    for current_dir, _subdirs, names in os.walk('PersianPlagdet2016-text-alignment-train-corpus/src/')
    for name in names
    if name.endswith(".txt")
]

In [195]:
# Show the resolved path of the suspicious document.
susp_path

'PersianPlagdet2016-text-alignment-train-corpus/susp/suspicious-document0000194397.txt'

In [196]:
# Sentence embeddings of the suspicious (query) document.
susp_sent_rep = sent_representation(susp_path)

# Sentence embeddings of every candidate source document, keyed by its path,
# computed once up front so the comparison loop does not recompute them.
Doc_sent_rep = {doc_path: sent_representation(doc_path) for doc_path in Documents_paths}

In [197]:
# counter[i] = number of (suspicious sentence, source sentence) pairs that
# match for Documents_paths[i]. A pair matches when the embedding cosine
# similarity is >= 0.3 and the token-set Jaccard similarity is >= 0.2.
counter = []

susp_tokens = tokens_nonstop[susp_path]
# Stack the query embeddings once; None marks a query with no sentences.
susp_matrix = np.vstack(susp_sent_rep) if len(susp_sent_rep) > 0 else None

for path in tqdm.tqdm(Documents_paths):
    c = 0
    doc_reps = Doc_sent_rep[path]

    # Skip the similarity computation when either side has no sentences
    # (cosine_similarity cannot take an empty array).
    if susp_matrix is not None and len(doc_reps) > 0:
        # One call per document gives the whole sentence-by-sentence matrix
        # instead of one sklearn call per sentence pair.
        cos_sim = cosine_similarity(susp_matrix, np.vstack(doc_reps))
        doc_tokens = tokens_nonstop[path]

        # Only pairs that pass the embedding filter get the Jaccard check.
        for idx1, idx2 in zip(*np.nonzero(cos_sim >= 0.3)):
            if jaccard_similarity(susp_tokens[idx1], doc_tokens[idx2]) >= 0.2:
                c += 1

    counter.append(c)

100%|██████████| 1563/1563 [15:35<00:00,  1.67it/s]


In [198]:
# Convert to an array so the counts can be sorted and fancy-indexed.
counter = np.array(counter)

# Indices of the (up to) five documents with the most matching sentence
# pairs, highest count first, and their counts.
top_5_indices = np.argsort(counter)[::-1][:5]
top_5_values = counter[top_5_indices]

# Iterate over what was actually selected, so having fewer than five
# candidate documents does not raise IndexError.
for doc_idx, n_matches in zip(top_5_indices, top_5_values):
    print(' (', Documents_paths[doc_idx], ') -> ', n_matches)

 ( PersianPlagdet2016-text-alignment-train-corpus/src/source-document00001404542.txt ) ->  10
 ( PersianPlagdet2016-text-alignment-train-corpus/src/source-document00001307951.txt ) ->  9
 ( PersianPlagdet2016-text-alignment-train-corpus/src/source-document00001305979.txt ) ->  6
 ( PersianPlagdet2016-text-alignment-train-corpus/src/source-document00001308967.txt ) ->  6
 ( PersianPlagdet2016-text-alignment-train-corpus/src/source-document00001303908.txt ) ->  6
