In [1]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
student_files = [doc for doc in os.listdir() if doc.endswith('.txt')]

In [4]:
student_files

['Sample 1.txt', 'Sample 2.txt', 'sample 3.txt', 'sample 4.txt']

In [5]:
student_notes = [open(_file,encoding = 'utf-8').read() for _file in student_files]

In [6]:
student_notes

['In a quaint little town nestled amidst rolling hills and lush greenery, life unfolds at a tranquil pace. The scent of freshly bloomed flowers drifts through the air, dancing playfully with the gentle breeze. Friendly faces line the streets, exchanging warm smiles and heartfelt greetings, creating an atmosphere of belonging and unity. With quaint shops and cozy cafes dotting the landscape, the town exudes a sense of nostalgia, where time seems to slow down, inviting residents and visitors alike to savor each moment. As the sun sets, casting a golden glow over the horizon, the community gathers in the central square, embracing the joy of togetherness and celebrating the simple pleasures that make this charming town a true haven of contentment.',
 'My name is Bhushan and I am 4th year IT engineering student. ',
 'In a quaint little town nestled amidst rolling hills and lush greenery, life unfolds at a tranquil pace. The scent of freshly bloomed flowers drifts through the air, dancing pl

In [8]:
def vectorize(text):
    return TfidfVectorizer().fit_transform(text).toarray()

In [9]:
def similarity(doc1,doc2):
    return cosine_similarity([doc1,doc2])

In [10]:
vectors = vectorize(student_notes)

In [11]:
vectors

array([[0.        , 0.        , 0.05854348, 0.05854348, 0.        ,
        0.05854348, 0.05854348, 0.23249598, 0.05854348, 0.05854348,
        0.05854348, 0.05854348, 0.        , 0.05854348, 0.05854348,
        0.05854348, 0.05854348, 0.05854348, 0.05854348, 0.05854348,
        0.05854348, 0.05854348, 0.05854348, 0.05854348, 0.05854348,
        0.05854348, 0.05854348, 0.05854348, 0.05854348, 0.05854348,
        0.        , 0.        , 0.05854348, 0.05854348, 0.05854348,
        0.05854348, 0.05854348, 0.05854348, 0.05854348, 0.05854348,
        0.05854348, 0.05854348, 0.05854348, 0.05854348, 0.05854348,
        0.05854348, 0.05854348, 0.05854348, 0.11708696, 0.05854348,
        0.        , 0.        , 0.05854348, 0.05854348, 0.        ,
        0.05854348, 0.05854348, 0.05854348, 0.05854348, 0.        ,
        0.05854348, 0.05854348, 0.        , 0.        , 0.        ,
        0.05854348, 0.05854348, 0.29271741, 0.        , 0.05854348,
        0.05854348, 0.05854348, 0.05854348, 0.11

In [12]:
s_vectors = list(zip(student_files,vectors))

In [13]:
s_vectors

[('Sample 1.txt',
  array([0.        , 0.        , 0.05854348, 0.05854348, 0.        ,
         0.05854348, 0.05854348, 0.23249598, 0.05854348, 0.05854348,
         0.05854348, 0.05854348, 0.        , 0.05854348, 0.05854348,
         0.05854348, 0.05854348, 0.05854348, 0.05854348, 0.05854348,
         0.05854348, 0.05854348, 0.05854348, 0.05854348, 0.05854348,
         0.05854348, 0.05854348, 0.05854348, 0.05854348, 0.05854348,
         0.        , 0.        , 0.05854348, 0.05854348, 0.05854348,
         0.05854348, 0.05854348, 0.05854348, 0.05854348, 0.05854348,
         0.05854348, 0.05854348, 0.05854348, 0.05854348, 0.05854348,
         0.05854348, 0.05854348, 0.05854348, 0.11708696, 0.05854348,
         0.        , 0.        , 0.05854348, 0.05854348, 0.        ,
         0.05854348, 0.05854348, 0.05854348, 0.05854348, 0.        ,
         0.05854348, 0.05854348, 0.        , 0.        , 0.        ,
         0.05854348, 0.05854348, 0.29271741, 0.        , 0.05854348,
         0.05854

In [38]:
plagarism_results = set()

In [39]:
def check_plagiarism():
    global s_vectors
    for student_a,text_vector_a in s_vectors:
        new_vectors = s_vectors.copy()
        current_index = new_vectors.index((student_a,text_vector_a))
        #print(current_index)
        for student_b,text_vector_b in new_vectors:
            sim_score = similarity(text_vector_a,text_vector_b)[0][1]
            student_pair = sorted((student_a,student_b))
            #print(similarity(text_vector_a,text_vector_b))
            #print(student_pair)
            score = (student_pair[0],student_pair[1],sim_score)
            plagarism_results.add(score)
            
    return plagarism_results


In [40]:
for data in check_plagiarism():
    print(data)

('sample 3.txt', 'sample 4.txt', 0.06618774467519122)
('Sample 1.txt', 'sample 3.txt', 1.0000000000000004)
('Sample 2.txt', 'sample 4.txt', 0.33664807941867536)
('Sample 2.txt', 'sample 3.txt', 0.040995340205619554)
('sample 3.txt', 'sample 3.txt', 1.0000000000000004)
('sample 4.txt', 'sample 4.txt', 1.0000000000000002)
('Sample 1.txt', 'Sample 1.txt', 1.0000000000000004)
('Sample 2.txt', 'Sample 2.txt', 1.0)
('Sample 1.txt', 'sample 4.txt', 0.06618774467519122)
('Sample 1.txt', 'Sample 2.txt', 0.040995340205619554)


In [41]:
## Here sample 1.txt and sample 3.txt has same text in it, hence the plagiarism 
## score is 100%.