In [1]:
# import libraries
import pandas as pd
import numpy as np
import os
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import matplotlib.pyplot as plt

In [2]:
def get_vocab_and_counts(source_text , answer_text, n):
    """Build the shared n-gram vocabulary and the per-text n-gram counts.

    Args:
        source_text: The reference text.
        answer_text: The text being compared against the reference.
        n: N-gram size; only word n-grams of exactly this length are counted.

    Returns:
        A ``(vocab, counts)`` tuple. ``vocab`` is the fitted vectorizer's
        ``vocabulary_`` attribute. ``counts`` is a dense array with two rows:
        row 0 holds the counts for ``answer_text`` and row 1 the counts for
        ``source_text``.
    """
    counts_ngram = CountVectorizer(analyzer='word', ngram_range=(n,n))
    # fit_transform learns the vocabulary and produces the counts in one pass,
    # so a separate fit() over the same two documents is unnecessary.
    counts = counts_ngram.fit_transform([answer_text, source_text])

    return counts_ngram.vocabulary_, counts.toarray()

In [3]:
def calculate_containment(n,source_text , answer_text):
    """Compute the n-gram containment of the answer text in the source text.

    Containment is the sum, over every n-gram, of the smaller of its two
    counts (answer vs. source), divided by the total number of n-grams in
    the answer.

    Args:
        n: N-gram size.
        source_text: The reference text.
        answer_text: The text being checked against the reference.

    Returns:
        The containment value as a float. Returns 0.0 when the answer
        contributes no n-grams of size ``n`` (instead of dividing 0 by 0).
    """
    _, ngram_counts = get_vocab_and_counts(answer_text = answer_text, source_text = source_text, n = n)

    # Row 0 of the counts belongs to the answer text; its sum is the normalizer.
    count_ngram_A = np.sum(ngram_counts[0])
    if count_ngram_A == 0:
        # The answer has no n-grams of this size: nothing can be "contained".
        return 0.0

    # Column-wise minimum = how often each n-gram occurs in BOTH texts.
    intersection = np.sum(np.amin(ngram_counts, axis = 0))

    return intersection / count_ngram_A

In [4]:
def lcs_norm_word(source_text , answer_text):
    """Compute the word-level longest common subsequence, normalized by answer length.

    Both texts are split on whitespace and compared word by word with the
    standard dynamic-programming LCS table.

    Args:
        source_text: The reference text.
        answer_text: The text being checked against the reference.

    Returns:
        LCS length divided by the number of words in ``answer_text``.
        Returns 0.0 when ``answer_text`` contains no words, avoiding a
        division by zero.
    """
    answer_words = answer_text.split()
    source_words = source_text.split()

    n = len(answer_words)
    m = len(source_words)

    # An empty answer has no words to match; guard the final division.
    if n == 0:
        return 0.0

    # Row/column 0 stay zero and act as the "empty prefix" base case.
    lcs_matrix = np.zeros((n+1, m+1))

    for i, answer_word in enumerate(answer_words, start=1):
        for j, source_word in enumerate(source_words, start=1):
            if answer_word == source_word:
                # Matching words extend the subsequence found for both shorter prefixes.
                lcs_matrix[i][j] = lcs_matrix[i-1][j-1] + 1
            else:
                # Otherwise carry over the best result with one word dropped from either side.
                lcs_matrix[i][j] = max(lcs_matrix[i][j-1], lcs_matrix[i-1][j])

    return lcs_matrix[n][m] / n
            

In [5]:
def create_containment_features( n, source_text , answer_text, column_name=None):
    """Compute the n-gram containment feature for one source/answer pair.

    Args:
        n: N-gram size.
        source_text: The reference text.
        answer_text: The text being checked against the reference.
        column_name: Accepted for backward compatibility; it does not affect
            the result.

    Returns:
        A one-element list holding the containment value.
    """
    c = calculate_containment( n, answer_text=answer_text, source_text = source_text)
    print(str(n)+'-gram containment features created!')
    return [c]


In [6]:
def create_lcs_features(source_text , answer_text):
    """Compute the normalized word-level LCS feature for one source/answer pair.

    Args:
        source_text: The reference text.
        answer_text: The text being checked against the reference.

    Returns:
        A one-element list holding the value returned by ``lcs_norm_word``.
    """
    score = lcs_norm_word(source_text=source_text, answer_text=answer_text)
    print('LCS features created!')
    return [score]

## Extract Containment and LCS features

In [7]:
def extract_containment_and_lcs_features(source_text, answer_text):
    """Build a one-row DataFrame of containment and LCS features for a text pair.

    Args:
        source_text: The reference text.
        answer_text: The text being checked against the reference.

    Returns:
        A single-row DataFrame with columns ``c_1`` .. ``c_7`` (n-gram
        containment for n = 1..7) followed by ``lcs_word``.
    """
    gram_sizes = range(1,8)

    # Column order: one containment column per n-gram size, then the LCS column.
    features_list = ['c_'+str(size) for size in gram_sizes]
    features_list.append('lcs_word')

    # One row, one slot per feature.
    feature_row = np.zeros((1, len(features_list)))

    for col, size in enumerate(gram_sizes):
        feature_row[0, col] = np.squeeze(
            create_containment_features(source_text=source_text, answer_text=answer_text, n=size)
        )

    # The LCS value goes in the last slot, right after the containment columns.
    feature_row[0, len(gram_sizes)] = np.squeeze(
        create_lcs_features(answer_text=answer_text, source_text=source_text)
    )

    features_df = pd.DataFrame(feature_row, columns=features_list)

    # Report which columns were produced.
    print()
    print('Features: ', features_list)
    print()

    return features_df

## Load saved model

In [8]:
# Load the pretrained classifier that the prediction cell calls predict() on.
# NOTE(review): joblib.load deserializes via pickle, which can execute arbitrary
# code; only load model files that come from a trusted source.
# The path is relative, so it resolves against the notebook's working directory.
from joblib import load
clf_en = load('pretrained model/saved_model_v1.joblib')

# Add Your Data

Test 1:

In [9]:
# Test 1 reference text: the source passage that answer_text_1 is compared against.
source_text_1 = """
In object-oriented programming, inheritance is a way to form new classes (instances of which are called objects) using classes that have already been defined. The inheritance concept was invented in 1967 for Simula.

The new classes, known as derived classes, take over (or inherit) attributes and behavior of the pre-existing classes, which are referred to as base classes (or ancestor classes). It is intended to help reuse existing code with little or no modification.

Inheritance provides the support for representation by categorization in computer languages. Categorization is a powerful mechanism number of information processing, crucial to human learning by means of generalization (what is known about specific entities is applied to a wider group given a belongs relation can be established) and cognitive economy (less information needs to be stored about each specific entity, only its particularities).

Inheritance is also sometimes called generalization, because the is-a relationships represent a hierarchy between classes of objects. For instance, a "fruit" is a generalization of "apple", "orange", "mango" and many others. One can consider fruit to be an abstraction of apple, orange, etc. Conversely, since apples are fruit (i.e., an apple is-a fruit), apples may naturally inherit all the properties common to all fruit, such as being a fleshy container for the seed of a plant.

An advantage of inheritance is that modules with sufficiently similar interfaces can share a lot of code, reducing the complexity of the program. Inheritance therefore has another view, a dual, called polymorphism, which describes many pieces of code being controlled by shared control code.
Inheritance is typically accomplished either by overriding (replacing) one or more methods exposed by ancestor, or by adding new methods to those exposed by an ancestor.

Complex inheritance, or inheritance used within a design that is not sufficiently mature, may lead to the Yo-yo problem.
"""

In [10]:
# Test 1 candidate answer: a near-verbatim copy of source_text_1 with a few
# words cut from the openings of several paragraphs (the truncations are intentional).
answer_text_1 ="""
In object-oriented programming, inheritance is a way to form new classes (instances of which are called objects) using classes that have already been defined. The inheritance concept was invented in 1967 for Simula.

The new classeit) attributes and behavior of the pre-existing classes, which are referred to as base classes (or ancestor classes). It is intended to help reuse existing code with little or no modification.

Inheritance prization in computer languages. Categorization is a powerful mechanism number of information processing, crucial to human learning by means of generalization (what is known about specific entities is applied to a wider group given a belongs relation can be established) and cognitive economy (less information needs to be stored about each specific entity, only its particularities).

Inheritance ishe is-a relationships represent a hierarchy between classes of objects. For instance, a "fruit" is a generalization of "apple", "orange", "mango" and many others. One can consider fruit to be an abstraction of apple, orange, etc. Conversely, since apples are fruit (i.e., an apple is-a fruit), apples may naturally inherit all the properties common to all fruit, such as being a fleshy container for the seed of a plant.

An advantage osimilar interfaces can share a lot of code, reducing the complexity of the program. Inheritance therefore has another view, a dual, called polymorphism, which describes many pieces of code being controlled by shared control code.
Inheritance is typically accomplished either by overriding (replacing) one or more methods exposed by ancestor, or by adding new methods to those exposed by an ancestor.

Complex inheritance, or inheritance used within a design that is not sufficiently mature, may lead to the Yo-yo problem.
"""

Test 2:

In [11]:
# Test 2 reference text: the source passage that answer_text_2 is compared against.
source_text_2 = """
Vector space model (or term vector model) is an algebraic model for representing text documents (and any objects, in general) as vectors of identifiers, such as, for example, index terms. It is used in information filtering, information retrieval, indexing and relevancy rankings. Its first use was in the SMART Information Retrieval System.
A document is represented as a vector. Each dimension corresponds to a separate term. If a term occurs in the document, its value in the vector is non-zero. Several different ways of computing these values, also known as (term) weights, have been developed. One of the best known schemes is tf-idf weighting (see the example below).
The definition of term depends on the application. Typically terms are single words, keywords, or longer phrases. If the words are chosen to be the terms, the dimensionality of the vector is the number of words in the vocabulary (the number of distinct words occurring in the corpus).
The vector space model has the following limitations:
   1. Long documents are poorly represented because they have poor similarity values (a small scalar product and a large dimensionality)
   2. Search keywords must precisely match document terms; word substrings might result in a "false positive match"
   3. Semantic sensitivity; documents with similar context but different term vocabulary won't be associated, resulting in a "false negative match".
   4. The order in which the terms appear in the document is lost in the vector space representation.

"""

In [12]:
# Test 2 candidate answer: a reworded and reordered version of source_text_2
# that still reuses several of its sentences verbatim.
answer_text_2 = """
An algebraic model for representing text documents and any objects in general is known by the name Vector space model. It represents these as vectors of identifiers, index terms are one illustration of these. The Vector Space model was first used in the SMART Information Retrieval System, and it is utilised variously in indexing, information filtering, indexing and information retrieval.

A document has representation as a vector. Every dimension is precisely related to a separate term. The way in which term is defined depends entirely on the application: typically ‘terms’ are either single words, keywords or longer phrases. The dimensionality of the vector is the number of words in the vocabulary, if it is the words that are chose to be the terms. So the same rule applies with keywords and indeed longer phrases.

If a term occurs in the document, its value in the vector is non-zero. Several different ways of computing these values, additionally known as (term) weights, have been developed. One of the most famous schemes is tf-idf weighting. 

"""

Test 3:

In [13]:
# Test 3 reference text: a single lowercase sentence without punctuation.
source_text_3 = "pagerank is a link analysis algorithm used by the google internet search engine that assigns a numerical weighting to each element of a hyperlinked set of documents"

In [14]:
# Test 3 candidate answer: reuses long word runs from source_text_3 with some rewording.
answer_text_3 = "i think pagerank is a link analysis algorithm used by google that uses a system of weights attached to each element of a hyperlinked set of documents"

In [15]:
# Build one single-row feature frame per (source, answer) test pair, in order 1, 2, 3.
extracted_features_1, extracted_features_2, extracted_features_3 = (
    extract_containment_and_lcs_features(source, answer)
    for source, answer in (
        (source_text_1, answer_text_1),
        (source_text_2, answer_text_2),
        (source_text_3, answer_text_3),
    )
)

1-gram containment features created!
2-gram containment features created!
3-gram containment features created!
4-gram containment features created!
5-gram containment features created!
6-gram containment features created!
7-gram containment features created!
LCS features created!

Features:  ['c_1', 'c_2', 'c_3', 'c_4', 'c_5', 'c_6', 'c_7', 'lcs_word']

1-gram containment features created!
2-gram containment features created!
3-gram containment features created!
4-gram containment features created!
5-gram containment features created!
6-gram containment features created!
7-gram containment features created!
LCS features created!

Features:  ['c_1', 'c_2', 'c_3', 'c_4', 'c_5', 'c_6', 'c_7', 'lcs_word']

1-gram containment features created!
2-gram containment features created!
3-gram containment features created!
4-gram containment features created!
5-gram containment features created!
6-gram containment features created!
7-gram containment features created!
LCS features created!

Features:  ['c_1', 'c_2', 'c_3', 'c_4', 'c_5', 'c_6', 'c_7', 'lcs_word']

In [16]:
# Show the feature row for test 1; uncomment one of the lines below to inspect test 2 or 3 instead.
print(extracted_features_1)
# print(extracted_features_2)
# print(extracted_features_3)

        c_1       c_2       c_3       c_4       c_5       c_6       c_7  \
0  0.984906  0.969697  0.954373  0.938931  0.923372  0.907692  0.891892   

   lcs_word  
0  0.985455  


In [17]:
# Columns passed to the classifier; every name must exist in the extracted feature frame.
# NOTE(review): clf_en was presumably trained on a fixed feature set and order —
# changing this list may make predict() fail or mislead; confirm against the training code.
selected_features = ['c_1', 'c_6', 'lcs_word']  # select your own features if you like 

# Pick which test pair to classify: test 1 is active, tests 2 and 3 are the commented alternatives.
selected_features_df = extracted_features_1[selected_features]
# selected_features_df = extracted_features_2[selected_features]
# selected_features_df = extracted_features_3[selected_features]

print(selected_features_df)

        c_1       c_6  lcs_word
0  0.984906  0.907692  0.985455


## Test the model on your text

In [18]:
# Classify the selected feature row; per the messages below, label 0 is the "not plagiarised" class.
res = clf_en.predict(selected_features_df)
if res == 0:
    print("Good for you, It's not plagiarismed")
else:
    print("Your text is plagiarismed, I'll call 911")

Your text is plagiarismed, I'll call 911


