In [None]:
# Install required libraries
!pip install transformers datasets torch scikit-learn sentencepiece faker gradio

# Import Libraries
import torch
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
from datasets import Dataset
import pandas as pd
from faker import Faker
import random
import gradio as gr  #Gradio for UI

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting faker
  Downloading faker-37.1.0-py3-none-any.whl.metadata (15 kB)
Collecting gradio
  Downloading gradio-5.27.0-py3-none-any.whl.metadata (16 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_

In [None]:
# Seed every RNG that is used below *before* anything random happens, so that the
# synthetic dataset and the freshly initialised classifier head are reproducible
# after a kernel restart.
SEED = 42
random.seed(SEED)        # drives random.choice / random.random in dataset generation
torch.manual_seed(SEED)  # the classification head on top of BERT is randomly initialised

# Check device
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Initialize model and tokenizer
MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
model.to(device)

# Initialize Faker
fake = Faker()
Faker.seed(SEED)

# Updated Dataset Generation Function
def generate_sample_dataset(num_samples=100, seed=None):
    """Build a synthetic dataset of text pairs labelled as plagiarised or not.

    Every row pairs an "original" Faker paragraph (``text1``) with a
    "suspicious" text (``text2``) derived according to a randomly chosen
    plagiarism type:

    * ``'none'``           - an unrelated Faker paragraph, ``label`` 0
    * ``'verbatim'``       - the full original or its first 80% of characters
    * ``'paraphrase'``     - the original with "important" -> "crucial" and
      "method" -> "approach"; text without those words comes back unchanged
    * ``'cross_language'`` - the original followed by two extra Faker sentences
    * ``'ai_generated'``   - the original converted to upper or lower case

    All types other than ``'none'`` get ``label`` 1.

    Args:
        num_samples: Number of rows to generate.
        seed: Optional integer. When given, both the ``random`` module and
            Faker are re-seeded with it first, so the call returns the same
            rows every time. ``None`` (the default) leaves the current RNG
            state untouched.

    Returns:
        A ``datasets.Dataset`` with columns ``text1``, ``text2``, ``label``,
        ``plagiarism_type``, ``source_document_id`` and
        ``suspicious_document_id``.
    """
    if seed is not None:
        # Two generators feed this function: `random` picks the plagiarism
        # type and variant, Faker produces the texts and ids.
        random.seed(seed)
        Faker.seed(seed)

    data = []
    plagiarism_types = ['verbatim', 'paraphrase', 'cross_language', 'ai_generated', 'none']

    for _ in range(num_samples):
        original = fake.paragraph(nb_sentences=5) + " " + fake.sentence()
        # Only `paper_id` is written to the output row; the other metadata
        # fields are generated but not exported.
        source_doc = {
            'text': original,
            'metadata': {
                'paper_id': fake.uuid4(),
                'discipline': random.choice(['cs', 'physics', 'biology', 'linguistics']),
                'publication_year': random.randint(2010, 2023)
            }
        }

        # Generate suspicious document
        plag_type = random.choice(plagiarism_types)
        if plag_type == 'none':
            suspicious = fake.paragraph(nb_sentences=5) + " " + fake.sentence()
            label = 0
        else:
            if plag_type == 'verbatim':
                if random.random() > 0.5:
                    suspicious = original  # Full verbatim copy
                else:
                    suspicious = original[:int(len(original)*0.8)]  # 80% verbatim copy
            elif plag_type == 'paraphrase':
                # No-op whenever neither word occurs in the generated paragraph.
                suspicious = original.replace("important", "crucial").replace("method", "approach")
            elif plag_type == 'cross_language':
                suspicious = original + " " + fake.paragraph(nb_sentences=2)
            elif plag_type == 'ai_generated':
                suspicious = original.upper() if random.random() > 0.5 else original.lower()
            label = 1

        data.append({
            'text1': original,
            'text2': suspicious,
            'label': label,
            'plagiarism_type': plag_type,
            'source_document_id': source_doc['metadata']['paper_id'],
            'suspicious_document_id': fake.uuid4()
        })

    return Dataset.from_pandas(pd.DataFrame(data))

# Generate dataset: 100 synthetic (text1, text2, label) rows, returned as a
# `datasets.Dataset`; this is the input to tokenization and training below.
dataset = generate_sample_dataset(100)

# Preprocess function
def preprocess_function(examples):
    """Tokenize a batch of text pairs for the BERT classifier.

    ``examples['text1']`` and ``examples['text2']`` are encoded together as
    sentence pairs, truncated to at most 512 tokens and padded to exactly 512
    so every row has the same length.

    Returns the tokenizer's encoding for the batch.
    """
    return tokenizer(
        examples['text1'],
        examples['text2'],
        max_length=512,
        truncation=True,
        padding='max_length',
    )

# Apply preprocessing
encoded_dataset = dataset.map(preprocess_function, batched=True)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    logging_dir='./logs',
    # 100 examples at batch size 8 for 3 epochs is only a few dozen optimizer
    # steps, fewer than the default logging interval, so log every 10 steps to
    # get rows in the training-loss table.
    logging_steps=10,
    # `no_cuda` is deprecated; `use_cpu` is its replacement.
    use_cpu=not torch.cuda.is_available()
)

# Metrics function
def compute_metrics(p):
    """Compute binary classification metrics for a Trainer evaluation pass.

    Args:
        p: Prediction object exposing ``predictions`` (per-class scores, one
            row per example) and ``label_ids`` (the true labels).

    Returns:
        Dict with ``accuracy``, ``precision``, ``recall`` and ``f1`` for the
        positive class (label 1).
    """
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    # zero_division=0: on a small evaluation set the model may never predict
    # (or never see) the positive class; report 0 for the undefined metric
    # without emitting an UndefinedMetricWarning for each score.
    return {
        'accuracy': accuracy_score(labels, preds),
        'precision': precision_score(labels, preds, zero_division=0),
        'recall': recall_score(labels, preds, zero_division=0),
        'f1': f1_score(labels, preds, zero_division=0)
    }

# Hold out 20% of the examples so the metrics are measured on rows the model
# was not trained on (evaluating on the training set overstates quality).
split_dataset = encoded_dataset.train_test_split(test_size=0.2, seed=42)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset['train'],
    eval_dataset=split_dataset['test'],
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Score the held-out split. The training arguments schedule no evaluation
# during training, so this explicit call is what runs `compute_metrics`.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Plagiarism checker function
def check_plagiarism(text1, text2, threshold=0.8):
    """Score a pair of texts with the fine-tuned BERT classifier.

    Args:
        text1: The original text.
        text2: The suspicious text to compare against ``text1``.
        threshold: Score above which the pair is flagged as plagiarised.

    Returns:
        Dict with:
            ``similarity_score`` - softmax probability of class 1 (plagiarised),
            ``is_plagiarized``   - ``True`` when the score is strictly greater
                                   than ``threshold``,
            ``threshold``        - the threshold that was applied.
    """
    # Training leaves the model in train mode with dropout active; switch to
    # eval mode so the same pair always yields the same score.
    model.eval()

    inputs = tokenizer(text1, text2, return_tensors='pt',
                       max_length=512, truncation=True, padding=True)
    # Send the inputs to wherever the model's weights currently live.
    inputs = inputs.to(model.device)

    with torch.no_grad():
        logits = model(**inputs).logits

    # Index [0][1]: first (only) pair in the batch, probability of label 1.
    similarity = torch.softmax(logits, dim=1)[0][1].item()
    return {
        "similarity_score": similarity,
        "is_plagiarized": similarity > threshold,
        "threshold": threshold
    }

# Gradio Interface function
def plagiarism_interface(text1, text2, threshold):
    """Gradio callback: run the plagiarism check and unpack it for the UI.

    Returns a ``(similarity_score, is_plagiarized)`` tuple, one value per
    output component of the interface.
    """
    verdict = check_plagiarism(text1, text2, threshold)
    score = verdict['similarity_score']
    flagged = verdict['is_plagiarized']
    return score, flagged

# Create Gradio App
# Explicit labels on the text inputs: without them Gradio titles the boxes after
# the callback's parameter names ("text1" / "text2"), which tells the user nothing.
iface = gr.Interface(
    fn=plagiarism_interface,
    inputs=[
        gr.Textbox(lines=5, label="Original Text",
                   placeholder="Enter original text here..."),
        gr.Textbox(lines=5, label="Suspicious Text",
                   placeholder="Enter suspicious text here..."),
        gr.Slider(minimum=0.0, maximum=1.0, step=0.01, value=0.8, label="Threshold")
    ],
    outputs=[
        gr.Number(label="Similarity Score"),
        gr.Label(label="Plagiarized?")
    ],
    title="BERT-Plag: Plagiarism Detector",
    description="Enter two texts to check for plagiarism. Adjust the threshold if needed."
)

iface.launch()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mheydarshan2180[0m ([33mheydarshan2180-chandigarh-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://24dd7c8127e6c09123.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




Test Case 1 (Plagiarized - Full Verbatim Copy)

Original:

"Technology has fundamentally reshaped the way humans interact, communicate, and entertain themselves, building bridges that transcend geography and culture."

Suspicious:

"Technology has fundamentally reshaped the way humans interact, communicate, and entertain themselves, building bridges that transcend geography and culture."

Test Case 2 (Plagiarized - Paraphrase Copy)

Original:

"The economy thrives when innovation is prioritized, allowing new ideas to transform industries and create fresh opportunities."

Suspicious:

"Economic growth happens when creativity is encouraged, letting novel concepts change sectors and open new chances."

Test Case 3 (Plagiarized - Near-Verbatim Copy with Minor Tweaks)

Original:

"Climate change poses an unprecedented threat to biodiversity, ecosystems, and human societies worldwide."

Suspicious:

"Climate change threatens biodiversity, ecosystems, and human communities across the globe."

Test Case 4 (Plagiarized - AI-Generated Style Change)

Original:

"Learning is a lifelong process that stretches far beyond classrooms, textbooks, and lectures."

Suspicious:

"LEARNING IS A LIFELONG PROCESS THAT STRETCHES FAR BEYOND CLASSROOMS, TEXTBOOKS, AND LECTURES."

✅ Non-plagiarized examples:
Test Case 5 (Not Plagiarized)

Original:

"The stars above painted a canvas of light, shimmering like jewels scattered across a vast velvet sky."

Suspicious:

"A quiet river reflected the moon’s soft glow, carrying whispers of the night into distant dreams."

Test Case 6 (Not Plagiarized)

Original:

"A skilled gardener knows patience, nurturing every seed with care until it blossoms into vibrant life."

Suspicious:

"The persistence of a mountain climber mirrors the quiet strength required to conquer life's steepest peaks."

Test Case 7 (Not Plagiarized)

Original:

"History is written not just by victors but by countless voices who refuse to be forgotten."

Suspicious:

"Art preserves the essence of human emotions, giving voice to the silent stories etched in time."

Test Case 8 (Not Plagiarized)

Original:

"Cities are living organisms, growing, evolving, and adapting to the needs and dreams of their inhabitants."

Suspicious:

"Forests breathe life into the planet, offering shelter, nourishment, and balance to the world."