**Libraries**

In [None]:
# Install required libraries
!pip install PyPDF2 python-docx nltk scikit-learn

# Import required libraries
import PyPDF2
import docx
import os
from google.colab import files
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from IPython.core.display import display, HTML

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF file, concatenated.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: The extracted text of all pages in page order, with no
        separator inserted between pages.
    """
    with open(pdf_path, 'rb') as file:
        # PdfReader and its `pages` sequence replace the legacy
        # PdfFileReader / numPages / getPage names, which PyPDF2 3.x removed.
        reader = PyPDF2.PdfReader(file)
        # `or ""` keeps the join safe if a page yields no text object at all.
        # The join runs inside the `with` block because pages are read from
        # the still-open file.
        return ''.join(page.extract_text() or "" for page in reader.pages)

def extract_text_from_word(docx_path):
    """Read a Word (.docx) file and return the text of its paragraphs.

    Args:
        docx_path: Location of the .docx file to open.

    Returns:
        str: The text of each paragraph in document order, separated by
        newline characters.
    """
    document = docx.Document(docx_path)
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)


In [None]:
# Upload PDF or Word files for the database
import tempfile  # used only to hand the uploaded bytes to the path-based extractors

uploaded_files = files.upload()

# Extract text from the uploaded files
database_documents = []

for filename, content in uploaded_files.items():
    # Compare the extension case-insensitively so "REPORT.PDF" is not skipped.
    lowered_name = filename.lower()
    if lowered_name.endswith('.pdf'):
        suffix, extractor = '.pdf', extract_text_from_pdf
    elif lowered_name.endswith('.docx'):
        suffix, extractor = '.docx', extract_text_from_word
    else:
        print(f"Skipping unsupported file: {filename}")
        continue

    # Extract from the bytes returned by this upload instead of re-opening
    # `filename` from disk. When a file of the same name already exists, Colab
    # saves the new upload under a de-duplicated name such as "name (3).docx",
    # so opening `filename` may read an older copy.
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as temp_file:
        temp_file.write(content)
    try:
        text = extractor(temp_file.name)
    finally:
        os.remove(temp_file.name)
    database_documents.append(text)

# Display the extracted text for verification
for i, doc in enumerate(database_documents):
    print(f"Document {i+1} content:\n{doc[:500]}...\n")  # Display first 500 characters for brevity


Saving DATABASE1NLP.docx to DATABASE1NLP (3).docx
Document 1 content:
The development of artificial intelligence (AI) has revolutionized various industries. AI enables machines to learn from experience, adjust to new inputs, and perform human-like tasks. It is now widely used in applications such as speech recognition, language translation, and image processing. The potential of AI continues to grow as researchers develop more advanced algorithms and explore new areas of application.



...



In [None]:
# Upload the submitted document
import tempfile  # used only to hand the uploaded bytes to the path-based extractors

submitted_file = files.upload()

# Extract the submitted document text.
# If several supported files are uploaded, the last one wins.
submitted_text = ""
for filename, content in submitted_file.items():
    # Compare the extension case-insensitively so "ESSAY.DOCX" is not skipped.
    lowered_name = filename.lower()
    if lowered_name.endswith('.pdf'):
        suffix, extractor = '.pdf', extract_text_from_pdf
    elif lowered_name.endswith('.docx'):
        suffix, extractor = '.docx', extract_text_from_word
    else:
        print(f"Skipping unsupported file: {filename}")
        continue

    # Extract from the bytes returned by this upload instead of re-opening
    # `filename` from disk. When a file of the same name already exists, Colab
    # saves the new upload under a de-duplicated name such as "name (5).docx",
    # so opening `filename` may read an older copy.
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as temp_file:
        temp_file.write(content)
    try:
        submitted_text = extractor(temp_file.name)
    finally:
        os.remove(temp_file.name)

if not submitted_text.strip():
    # Make the empty case visible here rather than as a confusing error later.
    print("Warning: no text was extracted from the submitted document.")

# Display the submitted text for verification
print("Submitted Document content:\n", submitted_text[:500], "...\n")  # Display first 500 characters for brevity



Saving DOCUMENTNLP.docx to DOCUMENTNLP (5).docx
Submitted Document content:
 The advancement of artificial intelligence has greatly impacted multiple sectors. AI allows machines to learn from their experiences, adapt to new data, and carry out tasks that typically require human intelligence. This technology is now employed in various fields, including speech recognition, language translation, and image processing. As researchers continue to develop sophisticated algorithms, the potential applications of AI are expanding.


 ...



In [None]:
def preprocess_text(text):
    """Normalise text before similarity scoring.

    The text is tokenised, tokens are lower-cased, English stop words are
    dropped, and the remaining tokens are lemmatised.

    Args:
        text: Raw document text.

    Returns:
        str: The processed tokens joined by single spaces.
    """
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    # Lower-case BEFORE lemmatising: the WordNet lookup is case-sensitive, so
    # a capitalised token such as "Machines" would otherwise pass through
    # unchanged while "machines" is reduced to "machine".
    tokens = (word.lower() for word in word_tokenize(text))
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )


In [None]:
def calculate_similarity(doc1, doc2):
    """Score how alike two documents are with TF-IDF and cosine similarity.

    Args:
        doc1: First document text.
        doc2: Second document text.

    Returns:
        The entry of the pairwise cosine-similarity matrix that compares
        doc1 with doc2.
    """
    corpus = [doc1, doc2]
    vectors = TfidfVectorizer().fit_transform(corpus)
    pairwise = cosine_similarity(vectors)
    # Row 0, column 1 is doc1 measured against doc2.
    return pairwise[0, 1]


In [None]:
def detect_plagiarism(submitted_doc, database_docs):
    """Compare one submitted document against each reference document.

    Args:
        submitted_doc: Text of the document being checked. It is passed
            through preprocess_text here, so callers supply the raw text.
        database_docs: Iterable of reference document texts; each one is
            also passed through preprocess_text before scoring.

    Returns:
        list: One similarity score per entry of database_docs, in the same
        order.
    """
    reference = preprocess_text(submitted_doc)
    return [
        calculate_similarity(reference, preprocess_text(candidate))
        for candidate in database_docs
    ]

# Preprocess the submitted document (kept available for inspection)
preprocessed_submitted_text = preprocess_text(submitted_text)

# Detect plagiarism.
# Pass the RAW submitted text: detect_plagiarism preprocesses it internally, so
# handing over already-preprocessed text would preprocess the submission twice
# while each database document is preprocessed only once.
similarities = detect_plagiarism(submitted_text, database_documents)

# Display results
for doc_number, similarity in enumerate(similarities, start=1):
    print(f"Similarity with Document {doc_number}: {similarity*100:.2f}%")


Similarity with Document 1: 45.61%


In [None]:
import difflib

def highlight_similarities(doc1, doc2, min_length=1):
    """Return the stretches of doc1 that also occur in doc2.

    The two strings are aligned character by character with
    difflib.SequenceMatcher and each matching block is reported.

    Note: a later cell defines another function with this same name that
    takes a list of documents; once that cell has run, this version is
    shadowed.

    Args:
        doc1: Text whose shared passages are reported (phrases are sliced
            from this string).
        doc2: Text to compare against.
        min_length: Shortest match, in characters, worth reporting. The
            default of 1 keeps every non-empty match.

    Returns:
        list[str]: Matching substrings of doc1, in the order they occur.
    """
    # autojunk=False: by default SequenceMatcher treats any item that makes up
    # more than 1% of a sequence of 200+ items as junk. For character-level
    # text that covers almost every letter, so long shared passages go
    # unmatched. Disabling it costs more time on long inputs.
    seq_matcher = difflib.SequenceMatcher(None, doc1, doc2, autojunk=False)
    shortest = max(min_length, 1)  # never report the empty sentinel block
    return [
        doc1[match.a:match.a + match.size]
        for match in seq_matcher.get_matching_blocks()
        if match.size >= shortest
    ]

# Example: list the passages the submitted text shares with the first database document
first_database_document = database_documents[0]
highlighted_sections = highlight_similarities(submitted_text, first_database_document)
print("Plagiarized Sections:", highlighted_sections)


Plagiarized Sections: ['The ', 'v', 'AI', '.', ' AI ', ' exp', '.\n\n\n']


In [None]:
# Highlight plagiarized sections
def highlight_similarities(submitted_text, database_docs, min_match_length=1):
    """Render the submitted text as HTML with shared passages in <mark> tags.

    Each database document is aligned with the submitted text character by
    character using difflib.SequenceMatcher. The matched spans from all
    documents are merged with merge_ranges and wrapped in <mark>...</mark>.

    Args:
        submitted_text: Text of the document being checked.
        database_docs: Iterable of reference document texts.
        min_match_length: Shortest match, in characters, worth highlighting.
            The default of 1 highlights every non-empty match.

    Returns:
        str: HTML for the whole submitted text. Text segments are
        HTML-escaped; matched spans are wrapped in <mark> tags.
    """
    import html  # local import: only this function needs it

    shortest = max(min_match_length, 1)  # never keep the empty sentinel block

    # Collect every (start, end) span of submitted_text matched by any document.
    highlighted_ranges = []
    for doc in database_docs:
        # autojunk=False: the default heuristic treats any item that makes up
        # more than 1% of a sequence of 200+ items as junk. For character-level
        # text that covers almost every letter, so long shared passages go
        # unmatched. Disabling it costs more time on long inputs.
        seq_matcher = difflib.SequenceMatcher(None, submitted_text, doc, autojunk=False)
        highlighted_ranges.extend(
            (match.a, match.a + match.size)
            for match in seq_matcher.get_matching_blocks()
            if match.size >= shortest
        )

    # Remove duplicate and overlapping ranges
    highlighted_ranges = merge_ranges(highlighted_ranges)

    # Escape every text segment so characters such as "<" or "&" in the
    # document are shown literally instead of being interpreted as markup.
    pieces = []
    last_end = 0
    for start, end in highlighted_ranges:
        pieces.append(html.escape(submitted_text[last_end:start], quote=False))
        marked = html.escape(submitted_text[start:end], quote=False)
        pieces.append(f'<mark>{marked}</mark>')
        last_end = end
    pieces.append(html.escape(submitted_text[last_end:], quote=False))

    return ''.join(pieces)

def merge_ranges(ranges):
    """Collapse overlapping or touching (start, end) ranges.

    Args:
        ranges: Sequence of (start, end) pairs in any order. It is not
            modified.

    Returns:
        list[tuple]: Ranges sorted by start position, with every pair whose
        start is less than or equal to the end of the preceding range folded
        into that range. An empty input gives an empty list.
    """
    if not ranges:
        return []

    # sorted() rather than list.sort(): leave the caller's list untouched.
    ordered = sorted(ranges, key=lambda x: x[0])

    merged_ranges = []
    current_start, current_end = ordered[0]

    for start, end in ordered[1:]:
        if start <= current_end:
            # Overlaps or touches the current range: extend it.
            current_end = max(current_end, end)
        else:
            merged_ranges.append((current_start, current_end))
            current_start, current_end = start, end

    # Flush the range still being built.
    merged_ranges.append((current_start, current_end))

    return merged_ranges

In [None]:
# Highlight plagiarized sections.
# highlight_similarities works on the raw submitted text, so no preprocessing
# is needed in this cell.
highlighted_submitted_text = highlight_similarities(submitted_text, database_documents)

# Display highlighted results
display(HTML(f"<p>{highlighted_submitted_text}</p>"))