In [None]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import sys
import os
# Fetch the NLTK resources this script relies on before any text processing.
nltk.download('stopwords', quiet=True)
# NOTE(review): nothing visible in this file uses the 'punkt' tokenizer
# (tokenization is done with str.split) — kept in case other cells need it.
nltk.download('punkt', quiet=True)

# Path of the CSV holding the training data; the code below reads the columns
# "source_text", "plagiarized_text" and "label" from it.
data_file = "dataset.csv"

# Stop early with a readable message rather than letting pd.read_csv raise.
if not os.path.exists(data_file):
    print(f"Error: The dataset file '{data_file}' was not found.")
    # Use data_file here as well so both messages stay correct if the path changes.
    print(f"Please ensure '{data_file}' is uploaded and accessible.")
    sys.exit(1)
data = pd.read_csv(data_file)
# Translation table that deletes every character in string.punctuation.
# Built once instead of on every preprocess_text call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

# Lazily populated cache of the NLTK English stop words (see _english_stop_words).
_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it on first use only."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # Loading is deferred to the first call so importing/defining this
        # function never touches the corpus; later calls reuse the same set.
        _STOP_WORDS = frozenset(stopwords.words("english"))
    return _STOP_WORDS


def preprocess_text(text):
    """Normalize a piece of text for vectorization.

    Steps, in order: delete ``string.punctuation`` characters, lowercase,
    split on whitespace, drop English stop words, and re-join with single
    spaces.

    Args:
        text: The value to clean. Anything that is not a ``str`` (for
            example a NaN from an empty CSV cell) is treated as empty.

    Returns:
        The cleaned text, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    # NOTE(review): punctuation is removed before stop-word filtering, so any
    # stop-word entries that contain an apostrophe cannot match here — confirm
    # whether that is intended.
    lowered = text.translate(_PUNCTUATION_TABLE).lower()
    stop_words = _english_stop_words()
    return " ".join(word for word in lowered.split() if word not in stop_words)
# Clean both text columns in place (non-string cells become "").
data["source_text"] = data["source_text"].apply(preprocess_text)
data["plagiarized_text"] = data["plagiarized_text"].apply(preprocess_text)

# Each sample is the cleaned source text joined to the cleaned candidate text.
combined_text = data["source_text"] + " " + data["plagiarized_text"]
y = data["label"]

# Split the text BEFORE vectorizing: fitting TF-IDF on the full dataset would
# let the vocabulary and IDF weights be learned from the held-out rows, which
# leaks test information into training and inflates the reported metrics.
text_train, text_test, y_train, y_test = train_test_split(
    combined_text, y, test_size=0.2, random_state=42
)

tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(text_train)  # learn vocabulary/IDF on train only
X_test = tfidf_vectorizer.transform(text_test)        # reuse the train-fit vocabulary

# Full-dataset feature matrix, kept so the name X stays available to any later
# code; it is produced with the train-fit vectorizer, not refit.
X = tfidf_vectorizer.transform(combined_text)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_rep)
print("\nConfusion Matrix:")
print(cm)

# Persist the fitted model and vectorizer, then read them straight back.
# This is best-effort: on any failure the error is printed and loaded_model /
# loaded_tfidf_vectorizer may be left undefined.
# NOTE: pickle.load executes arbitrary code from the file; it is only applied
# here to the files written just above — never point it at untrusted files.
try:
    with open("model.pkl", 'wb') as file:
        pickle.dump(model, file)
    with open('tfidf_vectorizer.pkl', 'wb') as file:
        pickle.dump(tfidf_vectorizer, file)

    with open('model.pkl', 'rb') as file:
        loaded_model = pickle.load(file)
    with open('tfidf_vectorizer.pkl', 'rb') as file:
        loaded_tfidf_vectorizer = pickle.load(file)
except Exception as e:
    print(f"Error saving/loading models: {e}")
def detect_plagiarism(input_text, trained_model, trained_tfidf_vectorizer, preprocess_func):
    """Classify a single text with a fitted model and vectorizer.

    Args:
        input_text: The raw text to check.
        trained_model: Object with a ``predict`` method; called on the
            vectorized text.
        trained_tfidf_vectorizer: Object with a ``transform`` method; called
            on a one-element list holding the cleaned text.
        preprocess_func: Callable applied to ``input_text`` before
            vectorization.

    Returns:
        ``"Plagiarism Detected"`` when the first predicted label equals 1,
        otherwise ``"No Plagiarism"``.
    """
    cleaned = preprocess_func(input_text)
    # The vectorizer is handed a batch, so wrap the single document in a list.
    features = trained_tfidf_vectorizer.transform([cleaned])
    predicted_label = trained_model.predict(features)[0]
    if predicted_label == 1:
        return "Plagiarism Detected"
    return "No Plagiarism"
# Prefer the artifacts that were pickled and read back; if either name was
# never bound (the save/load step failed), use the in-memory objects instead.
try:
    detection_model, detection_vectorizer = loaded_model, loaded_tfidf_vectorizer
except NameError:
    detection_model, detection_vectorizer = model, tfidf_vectorizer

# Demo inputs run through the classifier below.
sample_text_1 = "This is an original document about machine learning concepts."
sample_text_2 = "Artificial intelligence uses statistical techniques to give computers the ability to learn data without explicitly programmed."
sample_text_3 = "The quick brown fox jumps over the lazy dog."

for sample_number, sample in enumerate((sample_text_1, sample_text_2, sample_text_3), start=1):
    # Every report after the first is preceded by a blank line.
    separator = "" if sample_number == 1 else "\n"
    print(f"{separator}Text {sample_number}: '{sample}'")
    verdict = detect_plagiarism(sample, detection_model, detection_vectorizer, preprocess_text)
    print(f"Result: {verdict}")

Accuracy: 0.8243243243243243

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.86      0.82        35
           1       0.86      0.79      0.83        39

    accuracy                           0.82        74
   macro avg       0.83      0.83      0.82        74
weighted avg       0.83      0.82      0.82        74


Confusion Matrix:
[[30  5]
 [ 8 31]]
Text 1: 'This is an original document about machine learning concepts.'
Result: No Plagiarism

Text 2: 'Artificial intelligence uses statistical techniques to give computers the ability to learn data without explicitly programmed.'
Result: No Plagiarism

Text 3: 'The quick brown fox jumps over the lazy dog.'
Result: No Plagiarism
