# Libraries

In [1]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
# Fetch the NLTK data packages the notebook relies on: the stop-word corpus,
# the WordNet corpus, the punkt tokenizer data and the averaged-perceptron
# part-of-speech tagger.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger') 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

# Functions

In [2]:
def is_noun(pos):
    """Return True when a part-of-speech tag starts with ``'NN'``.

    Args:
        pos: A part-of-speech tag string such as ``'NN'``, ``'NNS'`` or ``'VB'``.

    Returns:
        bool: True if the first two characters of ``pos`` are ``'NN'``.
    """
    # Only the two-character prefix is compared, so every tag beginning
    # with 'NN' (e.g. 'NNS', 'NNP') is accepted.
    return pos[:2] == 'NN'

In [3]:
def noun_extraction(text, language=None):
    """Build, per sentence, a list of noun stems plus WordNet lemma names.

    The text is split into sentences; each sentence is tokenized, stripped of
    stop words and non-alphanumeric tokens, and POS-tagged. For every token
    tagged as a noun the Porter stem is recorded, followed by the names of all
    lemmas of every WordNet synset of the (unstemmed) word, so that
    alternative wordings can still be matched.

    Args:
        text: The document to analyse, as a single string.
        language: Optional stop-word list identifier forwarded to
            ``stopwords.words``. The default ``None`` keeps the original
            behaviour of calling ``stopwords.words()`` with no fileid.

    Returns:
        list[list[str]]: One list per sentence, in document order. Entries may
        repeat because they are appended without de-duplication.
    """
    porter = PorterStemmer()
    # Load the stop-word list once per call and keep it as a set: the
    # original re-read it for every token, which dominated the running time.
    stop_words = set(stopwords.words(language))
    noun_list = []
    for sen in nltk.tokenize.sent_tokenize(text):
        # Drop stop words (case-sensitive match, as before) and any token
        # that is not purely alphanumeric (punctuation, hyphenated forms, ...).
        tokens = [
            word for word in nltk.tokenize.word_tokenize(sen)
            if word not in stop_words and word.isalnum()
        ]
        nouns = [word for (word, pos) in nltk.pos_tag(tokens) if is_noun(pos)]
        sen_nouns_synonyms = []
        for word in nouns:
            # Stem of the noun itself.
            sen_nouns_synonyms.append(porter.stem(word))
            # Synonyms are looked up for the unstemmed word.
            for syn in wordnet.synsets(word):
                sen_nouns_synonyms.extend(lemma.name() for lemma in syn.lemmas())
        noun_list.append(sen_nouns_synonyms)
    return noun_list

In [4]:
def plag_percentage(orignal,target):
    """Print the overall term overlap between two documents.

    The per-sentence term lists of each document are flattened into one set
    per document. The reported percentage is
    ``len(intersection) / len(union) * 100`` of those two sets.

    Args:
        orignal: Iterable of per-sentence term lists for the source document.
        target: Iterable of per-sentence term lists for the suspect document.

    Returns:
        None. The percentage and the shared terms are printed.
    """
    ds_1 = set().union(*orignal)
    ds_2 = set().union(*target)
    shared = ds_1 & ds_2
    combined = ds_1 | ds_2
    # Two documents without any extracted terms have nothing in common;
    # report 0.0 instead of dividing by zero.
    percentage = len(shared) / len(combined) * 100 if combined else 0.0
    print("percentage of plagiarism found", percentage)
    print()
    print("The copied concepts are")
    print(list(shared))

In [5]:
#function for localization of copied content within the document
def localization(sen_list,copied_content,tolerance):
    """Locate the sentences whose terms overlap most with the copied content.

    For each sentence the overlap with ``copied_content`` is computed as
    ``len(intersection) / len(union) * 100`` and printed. Sentences whose
    value is strictly greater than ``tolerance`` are reported.

    Args:
        sen_list: Sequence of per-sentence term lists, in document order.
        copied_content: Iterable of terms shared by both documents.
        tolerance: Threshold in percent; a sentence is flagged only when its
            overlap exceeds this value.

    Returns:
        list[int]: 1-based positions of the flagged sentences.
    """
    copied_set = set(copied_content)  # loop-invariant, build it once
    copied_sen = []
    print("plagiarism percentage of each sentence from start")
    # enumerate gives the true position of each sentence; looking it up with
    # list.index would return the first equal sentence when term lists repeat.
    for position, lists in enumerate(sen_list, start=1):
        sentence_set = set(lists)
        combined = copied_set | sentence_set
        if combined:
            plag_percentage1 = len(copied_set & sentence_set) / len(combined) * 100
        else:
            # Empty sentence and no copied content: no overlap to measure.
            plag_percentage1 = 0.0
        print(" ", plag_percentage1)
        if plag_percentage1 > tolerance:
            copied_sen.append(position)
    return copied_sen

In [6]:
def return_plag_section(sen_list_orignal,sen_list_target,tolerance):
    """Print the sentence numbers flagged as copied in both documents.

    The terms common to the two documents are collected, each document is
    scanned with ``localization`` against that common set, and the sentence
    numbers flagged in both scans are printed.

    Args:
        sen_list_orignal: Per-sentence term lists of the source document.
        sen_list_target: Per-sentence term lists of the suspect document.
        tolerance: Threshold passed through to ``localization``.

    Returns:
        None. The result is printed.
    """
    original_terms = set().union(*sen_list_orignal)
    target_terms = set().union(*sen_list_target)
    copied_content = list(original_terms & target_terms)

    # Scan the source document first, then the suspect one.
    flagged_original = localization(sen_list_orignal, copied_content, tolerance)
    flagged_target = localization(sen_list_target, copied_content, tolerance)

    common_positions = set(flagged_original) & set(flagged_target)
    print("The most copied sentences are", list(common_positions))

# Interface

### Documents

In [None]:
# First sample source passage. This cell has no execution count, and a later
# cell assigns original_document again, so this value is superseded.
original_document="The legal system is made up of civil courts, criminal courts and specialty courts such as family law courts and bankruptcy court. Each court has its own jurisdiction, which refers to the cases that the court is allowed to hear. In some instances, a case can only be heard in one type of court. For example, a bankruptcy case must be heard in a bankruptcy court. In other instances, there may be several potential courts with jurisdiction. For example, a federal criminal court and a state criminal court would each have jurisdiction over a crime that is a federal drug offense but that is also an offense on the state level." 

In [None]:
# First sample suspect passage. This cell has no execution count, and a later
# cell assigns plagerised_document again, so this value is superseded.
plagerised_document="The legal system is comprised of criminal and civil courts and specialty courts like bankruptcy and family law courts. Every one of the courts is vested with its own jurisdiction. Jurisdiction means the types of cases each court is permitted to rule on. Sometimes, only one type of court can hear a particular case. For instance, bankruptcy cases an be ruled on only in bankruptcy court. In other situations, it is possible for more than one court to have jurisdiction. For instance, both a state and federal criminal court could have authority over a criminal case that is illegal under federal and state drug laws."

In [7]:
# Second sample source passage; this assignment is the one the feature
# extraction below operates on.
original_document="In ages which have no record these islands were the home of millions of happy birds. the resort of a hundred times more millions of fishes, of sea lions, and other creatures whose names are not so common; the marine residence, in fact, of innumerable creatures predestined from the creation of the world to lay up a store of wealth for the British farmer. and a store of quite another sort for an immaculate Republican government"

In [8]:
# Second sample suspect passage; this assignment is the one the feature
# extraction below operates on.
plagerised_document="Long ago, when there was no written history, these islands were the home of millions of happy birds; the resort of a hundred times more millions of fishes, sea lions, and other creatures. Here lived innumerable creatures predestined from the creation of the world to lay up a store of wealth for the British farmer, and a store of quite another sort for an immaculate Republican government."

In [None]:
# Echo the current source text to confirm which sample is active.
original_document

'In ages which have no record these islands were the home of millions of happy birds. the resort of a hundred times more millions of fishes, of sea lions, and other creatures whose names are not so common; the marine residence, in fact, of innumerable creatures predestined from the creation of the world to lay up a store of wealth for the British farmer. and a store of quite another sort for an immaculate Republican government'

In [None]:
# Echo the current suspect text to confirm which sample is active.
plagerised_document

'Long ago, when there was no written history, these islands were the home of millions of happy birds; the resort of a hundred times more millions of fishes, sea lions, and other creatures. Here lived innumerable creatures predestined from the creation of the world to lay up a store of wealth for the British farmer, and a store of quite another sort for an immaculate Republican government.'

### Evaluation/Result

In [9]:
# Per-sentence noun stems and WordNet lemma names for the source text.
ori_feature=noun_extraction(original_document)

In [10]:
# Per-sentence noun stems and WordNet lemma names for the suspect text.
plag_feature=noun_extraction(plagerised_document)

In [18]:
# Document-level overlap: prints the percentage and the shared terms.
# NOTE(review): this cell's execution count (18) is higher than the next
# cell's (16), so the cells were run out of order; re-run top to bottom.
plag_percentage(ori_feature,plag_feature)

percentage of plagiarism found 73.00380228136882

The copied concepts are
['hiss', 'populace', 'meter', 'authorities', 'wealth', 'clip', 'public', 'animal', 'prison_term', 'humankind', 'household', 'recur', 'sentence', 'impeccable', 'government', 'recourse', 'earthly_concern', 'razzing', 'repair', '1000000', 'snort', 'Panthera_leo', 'human_beings', 'granger', 'introduction', 'computer_memory', 'class', 'initiation', 'spick-and-span', 'national', 'house', 'screen', 'government_activity', 'husbandman', 'million', 'Bronx_cheer', 'stack_away', 'conception', 'man', 'politics', 'dame', 'creature', 'fall_back', 'chick', 'place', 'mankind', 'shuttle', 'creative_activity', 'global', 'Fannie_Farmer', 'farmer', 'bird', 'spick', 'haunt', 'fowl', 'wealthiness', 'habitation', 'existence', 'cosmos', 'boo', 'multiplication', 'form', 'razz', 'fourth_dimension', 'menage', 'wight', 'origination', 'human_race', 'administration', 'domicile', 'worldly_concern', 'macrocosm', 'fund', 'lion', 'shop', 'time', '

In [16]:
# Sentence-level localisation. 30 is the tolerance in percent: a sentence is
# flagged only when its overlap with the shared terms exceeds it.
return_plag_section(ori_feature,plag_feature,30)

plagiarism percentage of each sentence from start
  21.428571428571427
  45.13274336283185
  27.083333333333332
plagiarism percentage of each sentence from start
  46.7005076142132
  57.291666666666664
The most copied sentences are [2]
