In [31]:
# Import necessary modules!
import os  # Module for interacting with the operating system
from sklearn.feature_extraction.text import TfidfVectorizer  # Module for text vectorization using TF-IDF
from sklearn.metrics.pairwise import cosine_similarity  # Module for calculating cosine similarity

In [32]:
# Collect the .txt files in the current working directory.
# os.listdir() returns entries in arbitrary order, so the list is sorted to keep
# the file/vector pairing reproducible between runs. os.path.isfile() skips any
# directory whose name happens to end in '.txt', which open() could not read.
student_files = sorted(
    doc for doc in os.listdir()
    if doc.endswith('.txt') and os.path.isfile(doc)
)

In [33]:
# Read the contents of each student's text file, in the same order as
# student_files so that index i of both lists refers to the same student.
student_notes = []
for _file in student_files:
    # The with-block closes each file as soon as it has been read instead of
    # leaving the handle open until the garbage collector reclaims it.
    with open(_file, encoding='utf-8') as note_file:
        student_notes.append(note_file.read())

In [34]:
# Turn a collection of raw documents into TF-IDF feature vectors
def vectorize(Text):
    """Fit a fresh TfidfVectorizer on ``Text`` and return the result as an array.

    Args:
        Text: the documents to vectorize; passed unchanged to ``fit_transform``.

    Returns:
        The output of ``fit_transform(Text)`` converted with ``.toarray()``.
    """
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(Text)
    return tfidf_matrix.toarray()

In [35]:
# Pairwise cosine similarity for two document vectors
def similarity(doc1, doc2):
    """Return ``cosine_similarity`` applied to the two vectors stacked as rows.

    The whole matrix produced by ``cosine_similarity`` is returned; callers
    index into it (for example ``[0][1]``) to pull out a single score.
    """
    stacked = [doc1, doc2]
    return cosine_similarity(stacked)

In [36]:
# Build TF-IDF vectors for all notes, then pair each vector with its file name
vectors = vectorize(student_notes)
s_vectors = [(file_name, vector) for file_name, vector in zip(student_files, vectors)]
# Shared set that check_plagiarism() fills with (file, file, score) tuples
plagiarism_results = set()


In [37]:
# Function to check plagiarism among the student notes
def check_plagiarism():
    """Score every unordered pair of student documents by cosine similarity.

    Reads the module-level ``s_vectors`` list of ``(file_name, vector)`` tuples
    and adds one ``(name_1, name_2, score)`` tuple per pair to the module-level
    ``plagiarism_results`` set, with the two file names in sorted order.

    Because ``plagiarism_results`` lives at module level, entries added by
    earlier calls remain in it; re-create the set first for a clean run.

    Returns:
        The module-level ``plagiarism_results`` set (the same object that is
        mutated in place).
    """
    for position, (student_a, text_vector_a) in enumerate(s_vectors):
        # Compare only against entries after the current one. Each unordered
        # pair is then scored exactly once, rather than twice with the set
        # having to collapse two float scores that must compare equal. This
        # also removes the list copy/index/delete, which had to compare tuples
        # containing NumPy arrays.
        for student_b, text_vector_b in s_vectors[position + 1:]:
            # similarity() returns a matrix; [0][1] is the score between the
            # first and second vector passed in.
            sim_score = similarity(text_vector_a, text_vector_b)[0][1]
            # Sort the two file names so a pair is always reported in the same
            # order regardless of its position in s_vectors.
            student_pair = sorted((student_a, student_b))
            plagiarism_results.add((student_pair[0], student_pair[1], sim_score))
    return plagiarism_results

In [38]:
# Run the comparison, then print every (file, file, score) entry it returned
similarity_results = check_plagiarism()
for data in similarity_results:
    print("Similarity data:\n", data)


Similarity data:
 ('Arthur.txt', 'Clark.txt', np.float64(0.5430431121089816))
Similarity data:
 ('Ben.txt', 'Clark.txt', np.float64(0.408904884400347))
Similarity data:
 ('Arthur.txt', 'Ben.txt', np.float64(0.4595329317649596))
