In [None]:
#=========== Import Libraries ===========#
import nltk
nltk.download("popular")
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
import pandas as pd
import string
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, Dense, Embedding, Bidirectional, SpatialDropout1D, Dropout, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
#=========== Load Dataset ===========#
# Location of the CSV file that is read into the `data` DataFrame.
DATASET_PATH = "/content/sample_data/PlagiarismDataset.csv"
data = pd.read_csv(DATASET_PATH)

In [None]:
#=========== Text Preprocessing ===========#
# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

# Stop-word sets keyed by language, filled on first use so the corpus is read
# once rather than once per processed row.
_STOP_WORDS_CACHE = {}


def _get_stop_words(language="english"):
    """Return the cached stop-word set for ``language``, loading it on first use."""
    if language not in _STOP_WORDS_CACHE:
        words = stopwords.words(language)
        # The text has its punctuation removed before filtering, so entries such
        # as "don't" would never match; keep a punctuation-free form as well.
        _STOP_WORDS_CACHE[language] = frozenset(words) | frozenset(
            word.translate(_PUNCTUATION_TABLE) for word in words
        )
    return _STOP_WORDS_CACHE[language]


def preprocess_text(text):
    """Normalise one text: strip punctuation, lowercase, drop English stop words.

    Args:
        text: The raw text. Any non-string value (for example ``None`` or a
            float NaN coming from a DataFrame) is treated as missing.

    Returns:
        The remaining words joined by single spaces, or ``""`` when ``text``
        is not a string.
    """
    if not isinstance(text, str):
        return ""  # Non-string / NaN values become an empty string
    cleaned = text.translate(_PUNCTUATION_TABLE).lower()
    stop_words = _get_stop_words()
    return " ".join(word for word in cleaned.split() if word not in stop_words)

# Build the modelling frame in a single chain. This yields a fresh DataFrame
# instead of assigning columns onto the result of dropna(), which can trigger
# SettingWithCopyWarning, and it skips preprocessing rows that are dropped anyway.
data = (
    data
    # Remove rows with missing labels
    .dropna(subset=["label"])
    .assign(
        # Apply preprocessing to both text columns
        source_text=lambda df: df["source_text"].apply(preprocess_text),
        plagiarized_text=lambda df: df["plagiarized_text"].apply(preprocess_text),
        # Combine source and plagiarized texts (uses the cleaned columns above,
        # because assign evaluates its keyword arguments in order)
        combined_text=lambda df: df["source_text"] + " " + df["plagiarized_text"],
        # Convert labels to integers (if not already binary)
        label=lambda df: df["label"].astype(int),
    )
)

# Check class distribution
print("Class distribution:\n", data["label"].value_counts())

In [None]:
#=========== Tokenization and Padding ===========#
MAX_NUM_WORDS = 10000  # Vocabulary size
MAX_SEQUENCE_LENGTH = 100  # Max tokens per sequence
EMBEDDING_DIM = 100  # Embedding dimension
TEST_SIZE = 0.2  # Fraction of rows held out for testing
SPLIT_SEED = 42  # Seed that makes the train/test split reproducible

y = data["label"]

# Train-Test Split, done on row positions so that the split exists before the
# tokenizer is fitted.
train_idx, test_idx = train_test_split(
    np.arange(len(data)), test_size=TEST_SIZE, random_state=SPLIT_SEED
)

# Fit the vocabulary on the training rows only, so no test-set words leak into
# it; words seen only in the test rows are mapped to the "<OOV>" token.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(data["combined_text"].iloc[train_idx])

sequences = tokenizer.texts_to_sequences(data["combined_text"])
X = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

In [None]:
#=========== Building & Training Model ===========#
def build_and_train_model(model_type, epochs=20, batch_size=32, patience=5,
                          learning_rate=0.001, validation_split=0.1,
                          validation_data=None):
    """Build, compile and train one binary classifier on ``X_train`` / ``y_train``.

    Every architecture starts with an ``Embedding`` layer followed by
    ``SpatialDropout1D(0.2)`` and ends with a single sigmoid unit:

    * ``"LSTM"`` / ``"GRU"``: three stacked bidirectional recurrent layers with
      128, 64 and 32 units (dropout and recurrent dropout of 0.2 each).
    * ``"DNN"``: the embedding output is flattened and passed through dense
      layers of 128, 64 and 32 units with ``Dropout(0.5)`` after the first two.

    The function reads the notebook globals ``MAX_NUM_WORDS``,
    ``EMBEDDING_DIM``, ``MAX_SEQUENCE_LENGTH``, ``X_train`` and ``y_train``.

    NOTE(review): this notebook defines ``build_and_train_model`` in three
    separate cells; Python keeps whichever definition ran last, so the copies
    should stay identical or be reduced to one.

    Args:
        model_type: ``"LSTM"``, ``"GRU"`` or ``"DNN"`` (case-sensitive).
        epochs: Maximum number of training epochs.
        batch_size: Mini-batch size passed to ``model.fit``.
        patience: Epochs without ``val_loss`` improvement before training stops.
        learning_rate: Learning rate of the Adam optimizer.
        validation_split: Fraction of the training data held out for
            validation when ``validation_data`` is not given. Should be
            greater than 0, otherwise there is no ``val_loss`` to monitor.
        validation_data: Optional ``(inputs, targets)`` pair to validate on
            instead of a split of the training data.

    Returns:
        ``(model, history)``: the trained model and the object returned by
        ``model.fit``.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """
    # Fail fast, before any layer is allocated.
    if model_type not in ("LSTM", "GRU", "DNN"):
        raise ValueError("Unknown model type. Choose 'LSTM', 'GRU', or 'DNN'.")

    model = Sequential()
    model.add(Embedding(MAX_NUM_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
    model.add(SpatialDropout1D(0.2))

    if model_type == "DNN":
        model.add(Flatten())
        model.add(Dense(128, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(64, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(32, activation='relu'))
    else:
        # LSTM and GRU share the same stack; only the cell class differs.
        recurrent_layer = LSTM if model_type == "LSTM" else GRU
        # Every layer but the last returns full sequences so the next
        # recurrent layer receives one vector per time step.
        for units, return_sequences in ((128, True), (64, True), (32, False)):
            model.add(Bidirectional(recurrent_layer(
                units, dropout=0.2, recurrent_dropout=0.2,
                return_sequences=return_sequences)))

    model.add(Dense(1, activation='sigmoid'))  # Binary classification
    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy', metrics=['accuracy'])

    # Early stopping to avoid overfitting
    early_stopping = EarlyStopping(monitor='val_loss', patience=patience,
                                   restore_best_weights=True)

    # Early stopping picks the final weights by val_loss, so validating on the
    # test set would tune the model on the data used to evaluate it. Hold out
    # part of the training data instead unless the caller supplies a set.
    fit_kwargs = dict(epochs=epochs, batch_size=batch_size,
                      callbacks=[early_stopping], verbose=1)
    if validation_data is not None:
        fit_kwargs["validation_data"] = validation_data
    else:
        fit_kwargs["validation_split"] = validation_split

    # Train the model and save the history. The labels are passed as a plain
    # array so that validation_split can slice them.
    history = model.fit(X_train, np.asarray(y_train), **fit_kwargs)
    return model, history

In [None]:
#=========== Building & Training Model ===========#
def build_and_train_model(model_type, epochs=20, batch_size=32, patience=5,
                          learning_rate=0.001, validation_split=0.1,
                          validation_data=None):
    """Build, compile and train one binary classifier on ``X_train`` / ``y_train``.

    Every architecture starts with an ``Embedding`` layer followed by
    ``SpatialDropout1D(0.2)`` and ends with a single sigmoid unit:

    * ``"LSTM"`` / ``"GRU"``: three stacked bidirectional recurrent layers with
      128, 64 and 32 units (dropout and recurrent dropout of 0.2 each).
    * ``"DNN"``: the embedding output is flattened and passed through dense
      layers of 128, 64 and 32 units with ``Dropout(0.5)`` after the first two.

    The function reads the notebook globals ``MAX_NUM_WORDS``,
    ``EMBEDDING_DIM``, ``MAX_SEQUENCE_LENGTH``, ``X_train`` and ``y_train``.

    NOTE(review): this notebook defines ``build_and_train_model`` in three
    separate cells; Python keeps whichever definition ran last, so the copies
    should stay identical or be reduced to one.

    Args:
        model_type: ``"LSTM"``, ``"GRU"`` or ``"DNN"`` (case-sensitive).
        epochs: Maximum number of training epochs.
        batch_size: Mini-batch size passed to ``model.fit``.
        patience: Epochs without ``val_loss`` improvement before training stops.
        learning_rate: Learning rate of the Adam optimizer.
        validation_split: Fraction of the training data held out for
            validation when ``validation_data`` is not given. Should be
            greater than 0, otherwise there is no ``val_loss`` to monitor.
        validation_data: Optional ``(inputs, targets)`` pair to validate on
            instead of a split of the training data.

    Returns:
        ``(model, history)``: the trained model and the object returned by
        ``model.fit``.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """
    # Fail fast, before any layer is allocated.
    if model_type not in ("LSTM", "GRU", "DNN"):
        raise ValueError("Unknown model type. Choose 'LSTM', 'GRU', or 'DNN'.")

    model = Sequential()
    model.add(Embedding(MAX_NUM_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
    model.add(SpatialDropout1D(0.2))

    if model_type == "DNN":
        model.add(Flatten())
        model.add(Dense(128, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(64, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(32, activation='relu'))
    else:
        # LSTM and GRU share the same stack; only the cell class differs.
        recurrent_layer = LSTM if model_type == "LSTM" else GRU
        # Every layer but the last returns full sequences so the next
        # recurrent layer receives one vector per time step.
        for units, return_sequences in ((128, True), (64, True), (32, False)):
            model.add(Bidirectional(recurrent_layer(
                units, dropout=0.2, recurrent_dropout=0.2,
                return_sequences=return_sequences)))

    model.add(Dense(1, activation='sigmoid'))  # Binary classification
    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy', metrics=['accuracy'])

    # Early stopping to avoid overfitting
    early_stopping = EarlyStopping(monitor='val_loss', patience=patience,
                                   restore_best_weights=True)

    # Early stopping picks the final weights by val_loss, so validating on the
    # test set would tune the model on the data used to evaluate it. Hold out
    # part of the training data instead unless the caller supplies a set.
    fit_kwargs = dict(epochs=epochs, batch_size=batch_size,
                      callbacks=[early_stopping], verbose=1)
    if validation_data is not None:
        fit_kwargs["validation_data"] = validation_data
    else:
        fit_kwargs["validation_split"] = validation_split

    # Train the model and save the history. The labels are passed as a plain
    # array so that validation_split can slice them.
    history = model.fit(X_train, np.asarray(y_train), **fit_kwargs)
    return model, history

In [None]:
#=========== Building & Training Model ===========#
def build_and_train_model(model_type, epochs=20, batch_size=32, patience=5,
                          learning_rate=0.001, validation_split=0.1,
                          validation_data=None):
    """Build, compile and train one binary classifier on ``X_train`` / ``y_train``.

    Every architecture starts with an ``Embedding`` layer followed by
    ``SpatialDropout1D(0.2)`` and ends with a single sigmoid unit:

    * ``"LSTM"`` / ``"GRU"``: three stacked bidirectional recurrent layers with
      128, 64 and 32 units (dropout and recurrent dropout of 0.2 each).
    * ``"DNN"``: the embedding output is flattened and passed through dense
      layers of 128, 64 and 32 units with ``Dropout(0.5)`` after the first two.

    The function reads the notebook globals ``MAX_NUM_WORDS``,
    ``EMBEDDING_DIM``, ``MAX_SEQUENCE_LENGTH``, ``X_train`` and ``y_train``.

    NOTE(review): this notebook defines ``build_and_train_model`` in three
    separate cells; Python keeps whichever definition ran last, so the copies
    should stay identical or be reduced to one.

    Args:
        model_type: ``"LSTM"``, ``"GRU"`` or ``"DNN"`` (case-sensitive).
        epochs: Maximum number of training epochs.
        batch_size: Mini-batch size passed to ``model.fit``.
        patience: Epochs without ``val_loss`` improvement before training stops.
        learning_rate: Learning rate of the Adam optimizer.
        validation_split: Fraction of the training data held out for
            validation when ``validation_data`` is not given. Should be
            greater than 0, otherwise there is no ``val_loss`` to monitor.
        validation_data: Optional ``(inputs, targets)`` pair to validate on
            instead of a split of the training data.

    Returns:
        ``(model, history)``: the trained model and the object returned by
        ``model.fit``.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """
    # Fail fast, before any layer is allocated.
    if model_type not in ("LSTM", "GRU", "DNN"):
        raise ValueError("Unknown model type. Choose 'LSTM', 'GRU', or 'DNN'.")

    model = Sequential()
    model.add(Embedding(MAX_NUM_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
    model.add(SpatialDropout1D(0.2))

    if model_type == "DNN":
        model.add(Flatten())
        model.add(Dense(128, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(64, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(32, activation='relu'))
    else:
        # LSTM and GRU share the same stack; only the cell class differs.
        recurrent_layer = LSTM if model_type == "LSTM" else GRU
        # Every layer but the last returns full sequences so the next
        # recurrent layer receives one vector per time step.
        for units, return_sequences in ((128, True), (64, True), (32, False)):
            model.add(Bidirectional(recurrent_layer(
                units, dropout=0.2, recurrent_dropout=0.2,
                return_sequences=return_sequences)))

    model.add(Dense(1, activation='sigmoid'))  # Binary classification
    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy', metrics=['accuracy'])

    # Early stopping to avoid overfitting
    early_stopping = EarlyStopping(monitor='val_loss', patience=patience,
                                   restore_best_weights=True)

    # Early stopping picks the final weights by val_loss, so validating on the
    # test set would tune the model on the data used to evaluate it. Hold out
    # part of the training data instead unless the caller supplies a set.
    fit_kwargs = dict(epochs=epochs, batch_size=batch_size,
                      callbacks=[early_stopping], verbose=1)
    if validation_data is not None:
        fit_kwargs["validation_data"] = validation_data
    else:
        fit_kwargs["validation_split"] = validation_split

    # Train the model and save the history. The labels are passed as a plain
    # array so that validation_split can slice them.
    history = model.fit(X_train, np.asarray(y_train), **fit_kwargs)
    return model, history

In [None]:
# For Predicting Custom Input Text (Original vs Plagiarized Text):
def predict_plagiarism(original_text, plagiarized_text, model, tokenizer, X_train, threshold=0.5):
    """Classify an (original, candidate) text pair and print the verdict.

    Both texts are cleaned with ``preprocess_text``, joined with a single
    space, tokenized, padded to ``MAX_SEQUENCE_LENGTH`` and scored once by
    ``model``. The pair is reported as plagiarized when the predicted
    probability reaches ``threshold``.

    Args:
        original_text: The source text.
        plagiarized_text: The text suspected of being copied from the source.
        model: Object whose ``predict`` returns the sigmoid probability for a
            padded batch.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        X_train: Padded training sequences, used only for the similarity score.
        threshold: Probability at or above which the pair counts as plagiarized.

    Returns:
        None. The verdict is printed.
    """
    # Preprocess the original and plagiarized texts
    original_processed = preprocess_text(original_text)
    plagiarized_processed = preprocess_text(plagiarized_text)

    # The notebook's classifiers are trained on "<source> <plagiarized>"
    # strings, so the input is built the same way here; scoring each text on
    # its own would feed the model something it was never trained on.
    pair_text = original_processed + " " + plagiarized_processed
    pair_padded = pad_sequences(tokenizer.texts_to_sequences([pair_text]),
                                maxlen=MAX_SEQUENCE_LENGTH)

    # The sigmoid output is a probability, not a class id: comparing it with
    # exactly 0 practically never matches, so threshold it instead.
    probability = float(np.asarray(model.predict(pair_padded)).ravel()[0])

    if probability < threshold:
        print("The plagiarized text is NOT plagiarized.")
        return

    # Highest cosine similarity between the candidate text and any training row.
    # NOTE(review): this compares padded token-id vectors, which is only a rough
    # proxy for textual similarity - confirm this is the intended score.
    plagiarized_padded = pad_sequences(tokenizer.texts_to_sequences([plagiarized_processed]),
                                       maxlen=MAX_SEQUENCE_LENGTH)
    cosine_similarity_score = cosine_similarity(plagiarized_padded, X_train).max()
    print(f"The plagiarized text is plagiarized with a similarity score of {cosine_similarity_score * 100:.2f}%.")

# Example Usage for Input:
# Choose one of the model types accepted by build_and_train_model, e.g., "LSTM"
MODEL_TYPE = "LSTM"

# No cell in this notebook creates `trained_models`, so on a fresh kernel the
# lookup below would raise NameError. Create the registry if it is missing and
# train the selected model on demand.
if "trained_models" not in globals():
    trained_models = {}
if MODEL_TYPE not in trained_models:
    print(f"No trained {MODEL_TYPE} model found; training one now...")
    trained_models[MODEL_TYPE], _training_history = build_and_train_model(MODEL_TYPE)

original_text = input("Enter the original text: ")
plagiarized_text = input("Enter the plagiarized text: ")

model_to_use = trained_models[MODEL_TYPE]
predict_plagiarism(original_text, plagiarized_text, model_to_use, tokenizer, X_train)