# Import Modules

In [1]:
import numpy as np
import os
import re
import string
import sklearn
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import string
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Load Dataset

In [2]:
# Read the tab-separated training pairs. The file has no header row, so the
# three column names are supplied explicitly.
data = pd.read_csv(
    'train_snli.txt',
    sep='\t',
    header=None,
    names=['source_text', 'plagiarized_text', 'label'],
)

In [3]:
# Preview the first rows to check that the three columns were parsed.
data.head()

Unnamed: 0,source_text,plagiarized_text,label
0,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",0
1,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",1
2,Children smiling and waving at camera,There are children present,1
3,Children smiling and waving at camera,The kids are frowning,0
4,A boy is jumping on skateboard in the middle o...,The boy skates down the sidewalk.,0


In [4]:
# Count rows per label value to inspect the class balance.
data['label'].value_counts()

label
0    183966
1    183407
Name: count, dtype: int64

In [5]:
# Remove rows with a missing value in any column. This mutates `data` in
# place and does not reset the index, so the index keeps gaps where rows
# were dropped.
data.dropna(inplace=True)

In [6]:
# (rows, columns) remaining after dropping incomplete rows.
data.shape

(367369, 3)

# Cleaning

In [7]:
def preprocess_series(series, stop_words=None):
    """Normalize a Series of raw sentences before TF-IDF vectorization.

    Missing values become empty strings, text is lowercased, every ASCII
    punctuation character is deleted, and stop words are dropped.

    Parameters
    ----------
    series : pandas.Series
        Text values; NaN/None entries are allowed.
    stop_words : collection of str, optional
        Lowercase words to remove. Defaults to scikit-learn's
        ``ENGLISH_STOP_WORDS``.

    Returns
    -------
    pandas.Series
        Cleaned text with the same index as ``series``.
    """
    if stop_words is None:
        stop_words = ENGLISH_STOP_WORDS

    # Missing values are replaced with "" (not dropped) so the index is kept.
    cleaned = series.fillna("").str.lower()

    # string.punctuation contains regex metacharacters (\, ], ^, -). Placed
    # unescaped in a character class, "\]" is read as an escaped "]" and the
    # backslash itself is never removed, so escape before building the class.
    punctuation_pattern = f"[{re.escape(string.punctuation)}]"
    cleaned = cleaned.str.replace(punctuation_pattern, "", regex=True)

    # Split on whitespace, drop stop words, and re-join with single spaces.
    return cleaned.apply(
        lambda text: " ".join(
            word for word in text.split() if word not in stop_words
        )
    )


In [8]:
# Clean both text columns with the vectorized preprocess_series helper
# (this replaces an earlier row-wise, swifter-based attempt).
for column in ('source_text', 'plagiarized_text'):
    data[column] = preprocess_series(data[column])

In [9]:
# Join each cleaned pair into one string (source first, then the candidate),
# separated by a single space; this is the text the vectorizer is fit on.
combined_text = data['source_text'] + " " + data['plagiarized_text']

# Vectorization

In [10]:
# Vectorize: build TF-IDF features, keeping at most 5000 terms.
# NOTE(review): the vectorizer is fit on the full corpus here, before the
# train/test split performed later, so vocabulary and IDF statistics are
# influenced by the test rows. Consider fitting on the training split only.
# NOTE(review): stop words were already removed during cleaning, so
# stop_words='english' looks redundant here — confirm before removing.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(combined_text)

In [11]:
# Target vector: the label column (values 0 and 1 in the counts shown above).
y = data['label']

In [12]:
#tfidf_vectorizer = TfidfVectorizer()
#X = tfidf_vectorizer.fit_transform(data["source_text"] + " " + data["plagiarized_text"])

# Training

In [13]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# SLM Models

In [14]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the training features, then
# predict labels for the held-out split.
model = MultinomialNB().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluation: per-class precision/recall/F1, followed by the confusion matrix.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")


Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.59      0.62     36586
           1       0.63      0.70      0.66     36888

    accuracy                           0.64     73474
   macro avg       0.64      0.64      0.64     73474
weighted avg       0.64      0.64      0.64     73474

Confusion Matrix:
[[21404 15182]
 [11087 25801]]


In [15]:
# Overall fraction of held-out rows whose predicted label matches the true one.
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.64247216702507


# Save SLM and Vectorizer

In [16]:
import pickle
# Save the trained model to model.pkl in the working directory; the
# with-block closes the file once the pickle has been written.
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

In [17]:
# Save the fitted TF-IDF vectorizer so inference can rebuild the same
# feature space the model was trained on.
with open("tfidf_vectorizer.pkl", "wb") as vectorizer_file:
    # Write through the handle managed by the with-block. Opening the path a
    # second time left an extra handle that was never explicitly closed.
    pickle.dump(vectorizer, vectorizer_file)

# import pickle

with open('model.pkl', 'rb') as f:
    model = pickle.load(f)

with open('tfidf_vectorizer.pkl', 'rb') as f:
    tfidf_vectorizer = pickle.load(f)
# Load Model and Vectorizer

In [18]:
# Reload the persisted classifier and vectorizer. The with-blocks close each
# file handle deterministically instead of leaving it to garbage collection.
# Only unpickle files you created yourself: pickle.load can execute arbitrary
# code from an untrusted file.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
    tfidf_vectorizer = pickle.load(vectorizer_file)

# Detection System output

In [19]:
def detect(input_text):
    """Classify a single text with the loaded model.

    The text is first cleaned with ``preprocess_series`` so that it is
    normalized the same way as the corpus the vectorizer was fit on
    (lowercasing, punctuation deletion, stop-word removal). It is then
    transformed by ``tfidf_vectorizer`` and passed to ``model.predict``.

    Parameters
    ----------
    input_text : str
        Raw text to check.

    Returns
    -------
    str
        "Plagiarim Detected" when the predicted label equals 1, otherwise
        "No Plagiarism".
    """
    # Mirror the training-time cleaning; without it, tokens such as "don't"
    # are split differently at inference than they were during training.
    cleaned_text = preprocess_series(pd.Series([input_text])).iloc[0]
    vectorized_text = tfidf_vectorizer.transform([cleaned_text])
    result = model.predict(vectorized_text)
    # NOTE(review): "Plagiarim" is misspelled; kept byte-for-byte in case
    # callers compare against this exact string — confirm before correcting.
    return "Plagiarim Detected" if result[0] == 1 else "No Plagiarism"

In [20]:
# Example: expected to be flagged as plagiarized (this sentence appears in
# the dataset preview above with label 1).
input_text = 'A person is outdoors, on a horse.'
detect(input_text)

'Plagiarim Detected'

In [21]:
# Example: expected to be reported as not plagiarized.
input_text = 'woman is filling a suitcase.'
detect(input_text)

'No Plagiarism'

In [22]:
# Example: expected to be reported as not plagiarized.
# NOTE(review): the output recorded with this notebook is a detection, which
# contradicts that expectation — this looks like a false positive.
input_text = 'The city has a lot of people in it'
detect(input_text)

'Plagiarim Detected'