<a href="https://colab.research.google.com/github/DakshSharma755/plagiarism_check-slm-/blob/main/plag_check(self_attention).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("ruvelpereira/mit-plagairism-detection-dataset")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/mit-plagairism-detection-dataset


In [8]:
import pandas as pd

file_path = '/kaggle/input/mit-plagairism-detection-dataset/train_snli.txt'

# Load the dataset
data = []

with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        parts = line.strip().split('\t')
        if len(parts) == 3:
            s1, s2, label = parts
            combined_text = s1 + " " + s2
            data.append({'text': combined_text, 'label': int(label)})

df = pd.DataFrame(data)
print(df.head())


                                                text  label
0  A person on a horse jumps over a broken down a...      0
1  A person on a horse jumps over a broken down a...      1
2  Children smiling and waving at camera There ar...      1
3  Children smiling and waving at camera The kids...      0
4  A boy is jumping on skateboard in the middle o...      0


In [9]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertTokenizerFast
from sklearn.metrics import accuracy_score
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [10]:
from huggingface_hub import login
from google.colab import userdata

hf_token = userdata.get('huggingface')  # This pulls the secret from Colab secrets
login(token=hf_token)


In [11]:
# Step 3: Tokenization
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

class PlagiarismDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.encodings = tokenizer(texts, truncation=True, padding='max_length', max_length=max_len, return_tensors='pt')
        self.labels = torch.tensor(labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': self.labels[idx]
        }

dataset = PlagiarismDataset(df['text'].tolist(), df['label'].tolist(), tokenizer)


In [7]:
# Step 4: Train/Test Split
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)


In [8]:
# Step 5: SLM Model with Self-Attention (lightweight)
class SLMWithSelfAttention(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, num_heads=4, num_classes=2):
        super(SLMWithSelfAttention, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.attention = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
        self.fc = nn.Sequential(
            nn.Linear(embed_dim, 64),
            nn.ReLU(),
            nn.Linear(64, num_classes)
        )

    def forward(self, input_ids, attention_mask):
        x = self.embedding(input_ids)
        attn_output, _ = self.attention(x, x, x, key_padding_mask=~attention_mask.bool())
        pooled = attn_output.mean(dim=1)  # Global average pooling
        return self.fc(pooled)

model = SLMWithSelfAttention(vocab_size=tokenizer.vocab_size).to(device)


In [9]:
# Step 6: Training Setup
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)


In [13]:
# Step 7: Training Loop
for epoch in range(15):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1} - Training Loss: {total_loss / len(train_loader):.4f}")


100%|██████████| 18369/18369 [01:22<00:00, 223.94it/s]


Epoch 1 - Training Loss: 0.4736


100%|██████████| 18369/18369 [01:21<00:00, 224.78it/s]


Epoch 2 - Training Loss: 0.4643


100%|██████████| 18369/18369 [01:21<00:00, 225.61it/s]


Epoch 3 - Training Loss: 0.4563


100%|██████████| 18369/18369 [01:22<00:00, 223.42it/s]


Epoch 4 - Training Loss: 0.4482


100%|██████████| 18369/18369 [01:21<00:00, 224.06it/s]


Epoch 5 - Training Loss: 0.4410


100%|██████████| 18369/18369 [01:21<00:00, 224.21it/s]


Epoch 6 - Training Loss: 0.4340


100%|██████████| 18369/18369 [01:21<00:00, 224.09it/s]


Epoch 7 - Training Loss: 0.4273


100%|██████████| 18369/18369 [01:21<00:00, 224.33it/s]


Epoch 8 - Training Loss: 0.4210


100%|██████████| 18369/18369 [01:22<00:00, 223.95it/s]


Epoch 9 - Training Loss: 0.4150


100%|██████████| 18369/18369 [01:21<00:00, 224.27it/s]


Epoch 10 - Training Loss: 0.4098


100%|██████████| 18369/18369 [01:22<00:00, 223.24it/s]


Epoch 11 - Training Loss: 0.4036


100%|██████████| 18369/18369 [01:21<00:00, 225.50it/s]


Epoch 12 - Training Loss: 0.3978


100%|██████████| 18369/18369 [01:21<00:00, 224.78it/s]


Epoch 13 - Training Loss: 0.3928


100%|██████████| 18369/18369 [01:21<00:00, 224.89it/s]


Epoch 14 - Training Loss: 0.3874


100%|██████████| 18369/18369 [01:21<00:00, 224.86it/s]

Epoch 15 - Training Loss: 0.3823





In [17]:
# Step 8: Evaluation
model.eval()
true_labels = []
pred_labels = []

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask)
        preds = torch.argmax(outputs, dim=1)

        true_labels.extend(labels.cpu().numpy())
        pred_labels.extend(preds.cpu().numpy())

accuracy = accuracy_score(true_labels, pred_labels)
print(f"Validation Accuracy: {accuracy * 100:.2f}%")


Validation Accuracy: 76.78%


In [15]:
# Step 9: Demo Inference
def predict_plagiarism(sentence1, sentence2):
    model.eval()
    combined = sentence1 + " [SEP] " + sentence2
    encoded = tokenizer(combined, return_tensors='pt', padding='max_length', truncation=True, max_length=128)
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    with torch.no_grad():
        output = model(input_ids, attention_mask)
        pred = torch.argmax(output, dim=1).item()

    return "Plagiarised" if pred == 1 else "Not Plagiarised"

# Example usage
print(predict_plagiarism("The cat sat on the mat.", "The feline sat on the carpet."))


Not Plagiarised
