<a href="https://colab.research.google.com/github/Areeff10/plagiarism-detection-lstm-gru/blob/main/plag_pred_gru_lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Importing Libraries

In [None]:
import pandas as pd
import numpy as numpy
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from keras.models import Sequential
from keras.layers import LSTM,GRU,Dropout,Embedding,Dense
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from string import punctuation
from sklearn.metrics import f1_score
import seaborn as sns

### Loading dataset

In [None]:
# Read the tab-separated sentence-pair file into three named columns;
# lines that pandas cannot parse are skipped instead of raising.
dataset = pd.read_csv(
    '/content/plag dataset.txt',
    sep='\t',
    names=['source_txt', 'plagiarism_txt', 'label'],
    on_bad_lines='skip',
)
# Preview the first five rows.
dataset.head()

Unnamed: 0,source_txt,plagiarism_txt,label
0,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",0
1,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",1
2,Children smiling and waving at camera,There are children present,1
3,Children smiling and waving at camera,The kids are frowning,0
4,A boy is jumping on skateboard in the middle o...,The boy skates down the sidewalk.,0


In [None]:
# (number of rows, number of columns) of the frame loaded above.
dataset.shape

(367373, 3)

In [None]:
# Count missing values per column (`isna` is the same check as `isnull`).
dataset.isna().sum()

Unnamed: 0,0
source_txt,0
plagiarism_txt,4
label,0


In [None]:
# Discard every row that has a missing value in any column.
dataset = dataset.dropna(axis=0, how='any')

In [None]:
# Keep only the first occurrence of each fully identical row.
dataset = dataset.drop_duplicates()

In [None]:
# Fetch the NLTK resources used below: the 'punkt' tokenizer models
# (needed by word_tokenize) and the 'stopwords' corpus.
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

### Data Preprocessing

In [None]:
def token(text):
  """Lower-case ``text`` and return the result of NLTK's ``word_tokenize`` on it."""
  return word_tokenize(text.lower())
# Tokenize both text columns with the same helper.
for column in ('source_txt', 'plagiarism_txt'):
  dataset[column] = dataset[column].map(token)

In [None]:
_DEFAULT_STOP = None  # lazily-built cache of the default stop tokens


def _default_stop():
  """Build once, then reuse, the set of English stop words plus punctuation characters."""
  global _DEFAULT_STOP
  if _DEFAULT_STOP is None:
    _DEFAULT_STOP = frozenset(stopwords.words('english')) | frozenset(punctuation)
  return _DEFAULT_STOP


def preprocess(text, stop=None):
  """Drop stop tokens from a token list and return the rest as one string.

  Args:
    text: iterable of string tokens (e.g. the output of ``token``).
    stop: optional collection of tokens to remove. When omitted, the English
      NLTK stop words plus every character in ``string.punctuation`` are used.

  Returns:
    The surviving tokens joined by single spaces, so that each one remains a
    separate word for the downstream Tokenizer. An empty input yields ''.
  """
  if stop is None:
    # Cached set: avoids reloading the stop-word corpus for every row and
    # gives O(1) membership checks instead of scanning a list.
    stop = _default_stop()
  # Joining with '' would fuse all kept tokens into one unsplittable word.
  return ' '.join(word for word in text if word not in stop)
# Run the stop-token filter over both text columns.
for column in ('source_txt', 'plagiarism_txt'):
  dataset[column] = dataset[column].map(preprocess)

### Converting texts to integer sequences

In [None]:
token=Tokenizer()
# Build each model input as "<source> <candidate>". The space separator matters:
# with an empty separator the last source word and the first candidate word
# would merge into a single, almost always unseen, token.
pair_texts = dataset['source_txt'] + ' ' + dataset['plagiarism_txt']
# Learn the vocabulary, then encode every pair as a list of word indices.
token.fit_on_texts(pair_texts)
input_data=token.texts_to_sequences(pair_texts)
output_data=dataset['label']

In [None]:
# Right-pad every encoded pair with zeros up to the length of the longest one.
input_data = pad_sequences(sequences=input_data, padding='post')

In [None]:
# Inspect the first padded sequence (word indices followed by trailing zeros).
input_data[0]

array([    1,    38,     6,     1,   179,   194,    77,     1,  1244,
          36,   746,     1,    38,     4,    14,     1,  2140,  2278,
          17, 17997,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0], dtype=int32)

In [None]:
# Embedding input dimension: one row per word in the fitted vocabulary, plus
# one extra for index 0, which the padded sequences use as filler.
size = 1 + len(token.word_index)
size

34279

# LSTM

In [None]:
# Two stacked LSTM layers followed by a small dense classifier head.
lstm=Sequential([
  # mask_zero=True makes the recurrent layers skip the zero padding appended by
  # pad_sequences(padding='post'). Without it the final LSTM state is read after
  # a long run of padding steps, which washes out the actual sentence content.
  Embedding(input_dim=size,output_dim=100,mask_zero=True),

  LSTM(units=128,return_sequences=True),   # emits the full sequence for the next LSTM
  LSTM(units=64,return_sequences=False),   # emits only the last (unmasked) state

  Dense(units=128, activation="relu"),
  Dropout(0.2),

  Dense(units=64, activation="relu"),
  Dropout(0.2),

  # Single sigmoid unit: score in [0, 1] for the positive label.
  Dense(units=1,activation='sigmoid'),
])

lstm.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
lstm.summary()

In [None]:
# Hold out 20% of the pairs; the fixed seed makes the split reproducible across runs.
x_train,x_test,y_train,y_test=train_test_split(input_data,output_data,test_size=0.2,random_state=42)

In [None]:
# Train on the training split only. Fitting on the full data would include the
# validation rows and make val_accuracy / val_loss meaningless (data leakage).
lstm.fit(x=x_train,y=y_train,epochs=5,validation_data=(x_test,y_test))

Epoch 1/5
[1m11467/11467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m187s[0m 16ms/step - accuracy: 0.4978 - loss: 0.6934 - val_accuracy: 0.4973 - val_loss: 0.6932
Epoch 2/5
[1m11467/11467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 16ms/step - accuracy: 0.4986 - loss: 0.6932 - val_accuracy: 0.4973 - val_loss: 0.6932
Epoch 3/5
[1m11467/11467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 16ms/step - accuracy: 0.5005 - loss: 0.6932 - val_accuracy: 0.4973 - val_loss: 0.6932
Epoch 4/5
[1m11467/11467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 16ms/step - accuracy: 0.5005 - loss: 0.6932 - val_accuracy: 0.5027 - val_loss: 0.6931
Epoch 5/5
[1m11467/11467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 16ms/step - accuracy: 0.5006 - loss: 0.6932 - val_accuracy: 0.5027 - val_loss: 0.6931


<keras.src.callbacks.history.History at 0x79fda7eba560>

In [None]:
# Predict from the encoded sentence pairs, not from the label vector: passing
# output_data fed the 0/1 labels to the model as if they were token ids.
# The full input_data is used so the result lines up row-for-row with output_data.
pred=lstm.predict(input_data)

[1m11467/11467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 2ms/step


In [None]:
# Binarize the scores in place: values >= 0.5 become 1, everything else 0.
pred[...] = pred >= 0.5

In [None]:
# F1 score of the LSTM predictions, expressed as a percentage.
100 * f1_score(y_true=output_data, y_pred=pred)

66.56568889147495

# GRU

In [None]:
# Two stacked GRU layers followed by a small dense classifier head.
gru=Sequential([
  # mask_zero=True makes the recurrent layers skip the zero padding appended by
  # pad_sequences(padding='post'). Without it the final GRU state is read after
  # a long run of padding steps, which washes out the actual sentence content.
  Embedding(input_dim=size,output_dim=100,mask_zero=True),

  GRU(units=128,return_sequences=True),   # emits the full sequence for the next GRU
  GRU(units=64,return_sequences=False),   # emits only the last (unmasked) state

  Dense(units=128,activation='relu'),
  Dropout(0.2),

  Dense(units=64,activation='relu'),
  Dropout(0.2),

  # Single sigmoid unit: score in [0, 1] for the positive label.
  Dense(units=1,activation='sigmoid'),
])

gru.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

gru.summary()

In [None]:
# Hold out 20% of the pairs; the fixed seed makes the split reproducible across runs.
x_train,x_test,y_train,y_test=train_test_split(input_data,output_data,test_size=0.2,random_state=42)

In [None]:
# Train on the training split only. Fitting on the full data would include the
# validation rows and make val_accuracy / val_loss meaningless (data leakage).
gru.fit(x=x_train,y=y_train,epochs=5,validation_data=(x_test,y_test))

Epoch 1/5
[1m11467/11467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m197s[0m 16ms/step - accuracy: 0.5023 - loss: 0.6934 - val_accuracy: 0.4957 - val_loss: 0.6932
Epoch 2/5
[1m11467/11467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 15ms/step - accuracy: 0.4994 - loss: 0.6932 - val_accuracy: 0.4957 - val_loss: 0.6932
Epoch 3/5
[1m11467/11467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m209s[0m 16ms/step - accuracy: 0.5005 - loss: 0.6932 - val_accuracy: 0.5043 - val_loss: 0.6931
Epoch 4/5
[1m11467/11467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m207s[0m 17ms/step - accuracy: 0.5005 - loss: 0.6932 - val_accuracy: 0.5043 - val_loss: 0.6931
Epoch 5/5
[1m11467/11467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 16ms/step - accuracy: 0.4999 - loss: 0.6932 - val_accuracy: 0.4957 - val_loss: 0.6933


<keras.src.callbacks.history.History at 0x79fda756f160>

In [None]:
# Predict from the encoded sentence pairs, not from the label vector: passing
# output_data fed the 0/1 labels to the model as if they were token ids.
# The full input_data is used so the result lines up row-for-row with output_data.
predt2=gru.predict(input_data)

[1m11467/11467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 2ms/step


In [None]:
# Binarize the scores in place: values >= 0.5 become 1, everything else 0.
predt2[...] = predt2 >= 0.5


In [None]:
# F1 score of the GRU predictions, as a percentage so it is directly
# comparable with the LSTM score reported earlier in the notebook.
f1_score(output_data,predt2)*100

0.6656568889147495

### Prediction using LSTM and GRU

In [None]:
def convert(text):
  """Encode raw text for the trained models, reusing the training vocabulary.

  Args:
    text: a single string, or an iterable of strings (one per sample).

  Returns:
    A 2-D integer array with one row per input text, zero-padded on the right
    to the same width as the training matrix ``input_data``.

  Notes:
    Relies on the notebook globals ``token`` (the Tokenizer fitted on the
    dataset), ``preprocess`` and ``input_data``. Fitting a fresh Tokenizer
    here would assign word indices unrelated to the ones the Embedding layer
    was trained on, and passing a bare string to the Tokenizer would make it
    treat every character as a separate text.
  """
  # Wrap a single string so the Tokenizer sees one text, not one text per character.
  texts = [text] if isinstance(text, str) else list(text)
  # Same cleaning steps as the training data: lower-case, word-tokenize, drop stop tokens.
  cleaned = [preprocess(word_tokenize(entry.lower())) for entry in texts]
  sequences = token.texts_to_sequences(cleaned)
  return pad_sequences(sequences, padding='post', maxlen=input_data.shape[1])

source_text= 'Two blond women are hugging one another.'
plagiarism_txt='There are women showing affection.'

# Join with a space so the last word of the source and the first word of the
# candidate are never fused into one token.
data=convert(source_text+' '+plagiarism_txt)

pred1=lstm.predict(data)
pred2=gru.predict(data)

# Report each model's label for the first row: 1 when the score is >= 0.5, else 0.
for model_name, scores in (('LSTM', pred1), ('GRU', pred2)):
  print("prediction using " + model_name)
  print('1' if scores[0][0]>=0.5 else '0')



[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 145ms/step




[1m1/3[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m0s[0m 247ms/step



[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 141ms/step
prediction using LSTM
1
prediction using GRU
1
