In [1]:
!pip install PyPDF2 python-docx scikit-learn ipywidgets

import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import PyPDF2
from docx import Document
from IPython.display import display
from ipywidgets import FileUpload, Output, VBox, Button

# Parallel lists filled by the upload callback: file_names[i] is the name of
# the file whose extracted text is stored in sample_contents[i]
file_names = []
sample_contents = []

# Output widget used by the callbacks as the destination for their messages
output = Output()

def _first_upload(value):
    """Return ``(filename, content_bytes)`` for the first uploaded file, or ``None`` if empty."""
    if not value:
        return None
    if isinstance(value, dict):
        # Mapping keyed by file name, each entry carrying the raw bytes under 'content'
        name = next(iter(value))
        content = value[name]['content']
    else:
        # NOTE(review): ipywidgets 8 documents `value` as a tuple of per-file
        # dicts with 'name' and 'content' (a memoryview) -- confirm against the
        # installed version
        entry = value[0]
        name = entry['name']
        content = entry['content']
    return name, bytes(content)


def process_uploaded_file(change):
    """Extract the text of the newly uploaded file and store it for comparison.

    Registered as an observer on ``file_upload``'s ``value``. On success the
    extracted text is appended to ``sample_contents`` and the file name to
    ``file_names``. Progress and error messages are printed inside ``output``.
    The widget's value is cleared afterwards, whether or not extraction worked.

    Files ending in ``.pdf`` or ``.docx`` (case-insensitive) are parsed with
    PyPDF2 / python-docx; anything else is decoded as UTF-8 text.

    Args:
        change: Change notification passed by ``observe``; unused.
    """
    # Imported here because the file-level imports do not provide BytesIO,
    # which both parsers need in order to read from in-memory bytes.
    from io import BytesIO

    with output:
        upload = _first_upload(file_upload.value)
        if upload is None:
            # Nothing to process, e.g. the notification caused by clearing the widget
            return
        uploaded_filename, uploaded_file = upload
        lowered_name = uploaded_filename.lower()

        if lowered_name.endswith('.pdf'):
            try:
                # Open the PDF file and extract text
                reader = PyPDF2.PdfReader(BytesIO(uploaded_file))
                # Newline between pages so words at page boundaries do not fuse;
                # `or ''` guards against a page yielding no text
                pdf_text = '\n'.join(page.extract_text() or '' for page in reader.pages)

                sample_contents.append(pdf_text)
                file_names.append(uploaded_filename)
                print("PDF file uploaded and processed.")
            except Exception as e:
                print("Error:", e)
                print("Failed to extract text from the PDF.")
        elif lowered_name.endswith('.docx'):
            try:
                # Open the Word file and extract text
                doc = Document(BytesIO(uploaded_file))
                # Newline between paragraphs so adjacent words stay separate tokens
                docx_text = '\n'.join(para.text for para in doc.paragraphs)

                sample_contents.append(docx_text)
                file_names.append(uploaded_filename)
                print("Word file uploaded and processed.")
            except Exception as e:
                print("Error:", e)
                print("Failed to extract text from the Word document.")
        else:
            # If it's not a PDF or Word document, assume it's a text file and read its contents
            try:
                uploaded_contents = uploaded_file.decode('utf-8')  # Decode binary content to text
            except UnicodeDecodeError as e:
                print("Error:", e)
                print("Failed to decode the text file as UTF-8.")
            else:
                sample_contents.append(uploaded_contents)
                file_names.append(uploaded_filename)
                print("Text file uploaded and processed.")

        # Clear the uploaded files so the next upload starts from an empty widget
        if isinstance(file_upload.value, dict):
            file_upload.value.clear()
        else:
            # Tuple-shaped value cannot be mutated in place; the resulting
            # notification is ignored by the empty-value guard above
            file_upload.value = ()

# One single-file upload widget, created once and reused for every upload;
# each change of its value is handed to process_uploaded_file
file_upload = FileUpload(
    accept='.txt,.pdf,.docx',
    multiple=False,
    description='Upload File',
)
file_upload.observe(process_uploaded_file, names='value')

def check_plagiarism(b):
    """Print the pairwise cosine similarity of all uploaded documents.

    The texts in ``sample_contents`` are converted to TF-IDF vectors and
    every unordered pair of documents is reported once, labelled with the
    matching entries of ``file_names``. All messages are printed inside
    ``output``.

    Args:
        b: The argument supplied by ``Button.on_click``; unused.
    """
    with output:
        if len(sample_contents) < 2:
            print("Please upload at least two documents for plagiarism comparison.")
            return

        # Convert the texts to TF-IDF vectors. fit_transform raises ValueError
        # when no document yields a single token (e.g. PDFs without
        # extractable text); report that instead of dumping a traceback.
        try:
            vectors = TfidfVectorizer().fit_transform(sample_contents)
        except ValueError as e:
            print("Error:", e)
            print("Could not compare the documents; they may contain no extractable text.")
            return

        # Calculate cosine similarity between document pairs
        similarity_matrix = cosine_similarity(vectors)

        # Print plagiarism results, visiting each pair (i < j) exactly once
        for i, first_name in enumerate(file_names):
            for j in range(i + 1, len(file_names)):
                similarity_score = similarity_matrix[i][j]
                print(f"Similarity between '{first_name}' and '{file_names[j]}': {similarity_score:.2f}")

# Button whose click handler runs the plagiarism comparison
check_button = Button(description="Check Plagiarism")
check_button.on_click(check_plagiarism)

# Stack upload control, button and message area vertically; being the last
# expression of the cell, the container is what the notebook renders
VBox(children=[file_upload, check_button, output])


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 232.6/232.6 kB 5.3 MB/s eta 0:00:00
Collecting python-docx
  Downloading python_docx-1.1.0-py3-none-any.whl (239 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 239.6/239.6 kB 8.4 MB/s eta 0:00:00
Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.6/1.6 MB 13.0 MB/s eta 0:00:00

VBox(children=(FileUpload(value={}, accept='.txt,.pdf,.docx', description='Upload File'), Button(description='…