In [5]:
# import libraries
import pandas as pd
import numpy as np
import os
import sklearn
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Location of the CSV that lists every answer file with its task and category
csv_file = 'data/file_information.csv'
# Load the listing into a DataFrame (columns shown in the output: File, Task, Category)
plagiarism_df = pd.read_csv(csv_file)

# Preview the first rows; as the last expression this is rendered as the cell output
plagiarism_df.head()

Unnamed: 0,File,Task,Category
0,g0pA_taska.txt,a,non
1,g0pA_taskb.txt,b,cut
2,g0pA_taskc.txt,c,light
3,g0pA_taskd.txt,d,heavy
4,g0pA_taske.txt,e,non


* Convert all `Category` labels to numerical labels according to the following rules (a higher value indicates a higher degree of plagiarism):
    * 0 = `non`
    * 1 = `heavy`
    * 2 = `light`
    * 3 = `cut`
    * -1 = `orig`, this is a special value that indicates an original file.
* For the new `Class` column
    * Any answer text that is not plagiarized (`non`) should have the class label `0`. 
    * Any plagiarized answer texts should have the class label `1`. 
    * And any `orig` texts will have a special label `-1`. 


In [3]:
# Read in a csv file and return a transformed dataframe
def numerical_dataframe(csv_file='data/file_information.csv'):
    """Load the file-information CSV and encode the plagiarism labels as numbers.

    The `Category` column is recoded as non=0, heavy=1, light=2, cut=3, orig=-1,
    and a new `Class` column is added: 0 for non-plagiarized answers, 1 for any
    plagiarized answer and -1 for original source texts.

    :param csv_file: Path to a CSV file containing at least a `Category` column.
    :return: A DataFrame with the numerical `Category` column and the new `Class` column.
    """
    df = pd.read_csv(csv_file)

    numerical_categories = {'non': 0, 'heavy': 1, 'light': 2, 'cut': 3, 'orig': -1}

    # Recode ONLY the `Category` column. A frame-wide replace would also rewrite
    # any other cell (e.g. a file or task name) that happens to equal one of the labels.
    # Labels missing from the mapping are passed through unchanged.
    df['Category'] = df['Category'].map(
        lambda label: numerical_categories.get(label, label)
    )

    # Anything that is not `non` (0) counts as plagiarized (1) ...
    df['Class'] = np.where(df['Category'] != 0, 1, 0)
    # ... except original source texts, which keep the special marker -1.
    df['Class'] = np.where(df['Category'] == -1, -1, df['Class'])

    return df

In [4]:
# Build `transformed_df`: `Category` is recoded to numbers and a `Class` column is added
transformed_df = numerical_dataframe(csv_file ='data/file_information.csv')

# Sanity check: every plagiarized category (1, 2, 3) should show Class = 1,
# and category 0 (`non`) should show Class = 0
transformed_df.head(10)

Unnamed: 0,File,Task,Category,Class
0,g0pA_taska.txt,a,0,0
1,g0pA_taskb.txt,b,3,1
2,g0pA_taskc.txt,c,2,1
3,g0pA_taskd.txt,d,1,1
4,g0pA_taske.txt,e,0,0
5,g0pB_taska.txt,a,0,0
6,g0pB_taskb.txt,b,0,0
7,g0pB_taskc.txt,c,3,1
8,g0pB_taskd.txt,d,2,1
9,g0pB_taske.txt,e,1,1


In [5]:
# `helpers` is a project-local module; its implementation is not shown in this notebook
import helpers 

# create a text column 
# (the output below shows a new `Text` column holding each file's processed text;
#  how the text is read and cleaned is defined inside `helpers` - TODO confirm there)
text_df = helpers.create_text_column(transformed_df)
text_df.head()

Unnamed: 0,File,Task,Category,Class,Text
0,g0pA_taska.txt,a,0,0,inheritance is a basic concept of object orien...
1,g0pA_taskb.txt,b,3,1,pagerank is a link analysis algorithm used by ...
2,g0pA_taskc.txt,c,2,1,the vector space model also called term vector...
3,g0pA_taskd.txt,d,1,1,bayes theorem was names after rev thomas bayes...
4,g0pA_taske.txt,e,0,0,dynamic programming is an algorithm design tec...


In [6]:
# Position of the row whose processed text is printed; change it to inspect another file.
# 0 is the row whose text is shown in the recorded output.
row_idx = 0

# Index with `row_idx` (not a hard-coded 0) so that editing the value above takes effect
sample_text = text_df.iloc[row_idx]['Text']

print('Sample processed text:\n\n', sample_text)

Sample processed text:

 inheritance is a basic concept of object oriented programming where the basic idea is to create new classes that add extra detail to existing classes this is done by allowing the new classes to reuse the methods and variables of the existing classes and new methods and classes are added to specialise the new class inheritance models the is kind of relationship between entities or objects  for example postgraduates and undergraduates are both kinds of student this kind of relationship can be visualised as a tree structure where student would be the more general root node and both postgraduate and undergraduate would be more specialised extensions of the student node or the child nodes  in this relationship student would be known as the superclass or parent class whereas  postgraduate would be known as the subclass or child class because the postgraduate class extends the student class  inheritance can occur on several layers where if visualised would display a l

## Split data into training and test sets

The next cell will add a `Datatype` column to a given DataFrame to indicate if the record is: 
* `train` - Training data, for model training.
* `test` - Testing data, for model evaluation.
* `orig` - The task's original answer from Wikipedia.


In [7]:
# Seed forwarded to the train/test split helper so the split is repeatable
random_seed = 1

# NOTE: `helpers` was already imported in an earlier cell; this re-import is a no-op
import helpers

# pass in `text_df` from above to create a complete dataframe, with all the information you need
# (the output shows an added `Datatype` column with values train / test / orig)
complete_df = helpers.train_test_dataframe(text_df, random_seed=random_seed)

# check results: rows 90-99 include both answer files and the `orig` source files
complete_df[90:100]

Unnamed: 0,File,Task,Category,Class,Text,Datatype
90,g4pE_taska.txt,a,1,1,object oriented programming is a style of prog...,train
91,g4pE_taskb.txt,b,2,1,pagerankalgorithm is also known as link analys...,train
92,g4pE_taskc.txt,c,3,1,the definition of term depends on the applicat...,train
93,g4pE_taskd.txt,d,0,0,bayes theorem or bayes rule or something cal...,train
94,g4pE_taske.txt,e,0,0,dynamic programming is a method for efficient...,test
95,orig_taska.txt,a,-1,-1,in object oriented programming inheritance is ...,orig
96,orig_taskb.txt,b,-1,-1,pagerank is a link analysis algorithm used by ...,orig
97,orig_taskc.txt,c,-1,-1,vector space model or term vector model is an ...,orig
98,orig_taskd.txt,d,-1,-1,in probability theory bayes theorem often call...,orig
99,orig_taske.txt,e,-1,-1,in mathematics and computer science dynamic pr...,orig


# Determining Plagiarism

### Containment calculation

>$$ \frac{\sum{count(\text{ngram}_{A}) \cap count(\text{ngram}_{S})}}{\sum{count(\text{ngram}_{A})}} $$
    

In [8]:
# helper functions
def get_answer_and_source_text(df, answer_filename):
    """Return the text of an answer file and the source text for the same task.

    :param df: DataFrame with `File`, `Task`, `Category` and `Text` columns.
    :param answer_filename: Value of `File` identifying the answer to look up.
    :return: Tuple `(answer_text, source_text)`; the source is the first row
        of the same task whose `Category` is -1.
    """
    # first row whose file name matches the requested answer
    answer = df.loc[df['File'] == answer_filename].iloc[0]
    answer_text, task = answer['Text'], answer['Task']

    # source rows are marked with Category == -1; pick the one for the same task
    is_source_for_task = (df['Task'] == task) & (df['Category'] == -1)
    source_text = df.loc[is_source_for_task].iloc[0]['Text']

    return answer_text, source_text


In [9]:
def get_vocab_and_counts(answer_text, source_text, n):
    """Count the word n-grams of an answer text and a source text.

    :param answer_text: Text of the answer being checked.
    :param source_text: Text of the source the answer is compared against.
    :param n: Size of the word n-grams (used as both ends of `ngram_range`).
    :return: Tuple `(vocab, counts)`. `vocab` maps each n-gram to its column
        index; `counts` is a dense 2 x len(vocab) array whose row 0 holds the
        answer's n-gram counts and row 1 the source's.
    """
    counts_ngram = CountVectorizer(analyzer='word', ngram_range=(n, n))

    # A single fit_transform both learns the vocabulary and produces the counts;
    # fitting separately beforehand would tokenize both documents twice.
    counts = counts_ngram.fit_transform([answer_text, source_text])
    vocab = counts_ngram.vocabulary_

    return vocab, counts.toarray()

In [10]:
def calculate_containment(df, n, answer_filename):
    """Compute the n-gram containment of an answer text in its source text.

    Containment is the sum of the per-n-gram minimum counts shared by answer
    and source, divided by the total number of n-grams in the answer.

    :param df: DataFrame with `File`, `Task`, `Category` and `Text` columns.
    :param n: Size of the word n-grams to compare.
    :param answer_filename: Value of `File` identifying the answer text.
    :return: Containment value; 0.0 when the answer contributes no n-grams.
    """
    # get answer and source text
    answer_text, source_text = get_answer_and_source_text(df, answer_filename)

    # count the occurrences of each n-gram (the vocabulary itself is not needed here)
    _, ngram_counts = get_vocab_and_counts(answer_text, source_text, n)

    # normalizer: total number of n-grams in the answer (row 0)
    count_ngram_A = np.sum(ngram_counts[0])
    if count_ngram_A == 0:
        # Nothing in the answer to compare; avoid a 0/0 division that would yield nan.
        return 0.0

    # intersection of counts: column-wise minimum over the two rows, then summed
    intersection = np.sum(np.amin(ngram_counts, axis=0))

    return intersection / count_ngram_A

### Test cell

In [11]:
# n-gram size used for this spot check
n = 3

# only the first few files are inspected
test_indices = range(5)

# level of plagiarism (Category) recorded for each inspected file
category_vals = [complete_df.loc[i, 'Category'] for i in test_indices]

# containment of each inspected file with respect to its task's source text
containment_vals = [
    calculate_containment(complete_df, n, complete_df.loc[i, 'File'])
    for i in test_indices
]

print('Original category values: \n', category_vals)
print()
print(str(n)+'-gram containment values: \n', containment_vals)

Original category values: 
 [0, 3, 2, 1, 0]

3-gram containment values: 
 [0.009345794392523364, 0.9641025641025641, 0.6136363636363636, 0.15675675675675677, 0.031746031746031744]


## Calculate LCS (Longest Common Subsequence) with Dynamic Programming

In [12]:
# Compute the normalized LCS given an answer text and a source text
def lcs_norm_word(answer_text, source_text):
    """Return the word-level longest common subsequence, normalized by answer length.

    Both texts are split on whitespace; the length of the longest common
    subsequence of words is divided by the number of words in the answer.

    :param answer_text: Text of the answer being checked.
    :param source_text: Text of the source the answer is compared against.
    :return: LCS length / number of answer words, in [0, 1]; 0.0 when the
        answer contains no words.
    """
    answer_words = answer_text.split()
    source_words = source_text.split()

    n = len(answer_words)
    m = len(source_words)

    # An empty answer shares nothing with the source; returning early also
    # avoids dividing by zero in the normalization below.
    if n == 0:
        return 0.0

    # lcs_matrix[i, j] = LCS length of the first i answer words and first j source words.
    # Row 0 and column 0 stay zero (empty prefix).
    lcs_matrix = np.zeros((n + 1, m + 1), dtype=int)

    for i, answer_word in enumerate(answer_words, start=1):
        for j, source_word in enumerate(source_words, start=1):
            if answer_word == source_word:
                # matching words extend the subsequence found for both shorter prefixes
                lcs_matrix[i, j] = lcs_matrix[i - 1, j - 1] + 1
            else:
                # otherwise carry over the best result from dropping one word on either side
                lcs_matrix[i, j] = max(lcs_matrix[i, j - 1], lcs_matrix[i - 1, j])

    return lcs_matrix[n, m] / n
            

### Test cell

In [13]:
# spot-check the normalized LCS on the first few files
test_indices = range(5)

category_vals = []
lcs_norm_vals = []

# source texts are the rows with Class = -1
is_source = complete_df['Class'] == -1

for i in test_indices:
    category_vals.append(complete_df.loc[i, 'Category'])

    # locate the source text that belongs to the same task as this answer
    same_task = complete_df['Task'] == complete_df.loc[i, 'Task']
    source_text = complete_df.loc[is_source & same_task, 'Text'].values[0]

    # normalized LCS between the answer and its source
    lcs_norm_vals.append(lcs_norm_word(complete_df.loc[i, 'Text'], source_text))

# print out result, does it make sense?
print('Original category values: \n', category_vals)
print()
print('Normalized LCS values: \n', lcs_norm_vals)

Original category values: 
 [0, 3, 2, 1, 0]

Normalized LCS values: 
 [0.1917808219178082, 0.8207547169811321, 0.8464912280701754, 0.3160621761658031, 0.24257425742574257]


## Containment features

In [14]:
def create_containment_features(df, n, column_name=None):
    """Compute the n-gram containment feature for every row of `df`.

    :param df: DataFrame with `File`, `Task`, `Category` and `Text` columns.
    :param n: Size of the word n-grams used for containment.
    :param column_name: Accepted for backward compatibility; it does not
        affect the returned values.
    :return: List with one entry per row, in row order: the containment value
        for answer files, and -1 for original source texts (Category of -1).
    """
    containment_values = []

    # Iterate over column values positionally rather than via index labels,
    # so the result does not depend on the index being unique.
    for file, category in zip(df['File'], df['Category']):
        if category > -1:
            # answer file: compare against the source text of its task
            containment_values.append(calculate_containment(df, n, file))
        else:
            # original source text: no feature to compute, use the -1 marker
            containment_values.append(-1)

    print(str(n)+'-gram containment features created!')
    return containment_values


## LCS features

In [15]:
def create_lcs_features(df, column_name='lcs_word'):
    """Compute the normalized word-level LCS feature for every row of `df`.

    :param df: DataFrame with `Task`, `Category`, `Class` and `Text` columns.
    :param column_name: Accepted for backward compatibility; it does not
        affect the returned values.
    :return: List with one entry per row, in row order: the normalized LCS
        between an answer and the source text of its task, and -1 for
        original source texts (Category of -1).
    """
    lcs_values = []

    # Source texts have Class = -1; filter them once instead of once per row.
    orig_rows = df[df['Class'] == -1]
    # Source text per task, filled in the first time a task is encountered.
    source_by_task = {}

    # Iterate over column values positionally rather than via index labels,
    # so the result does not depend on the index being unique.
    for category, task, answer_text in zip(df['Category'], df['Task'], df['Text']):
        if category > -1:
            if task not in source_by_task:
                source_by_task[task] = orig_rows.loc[orig_rows['Task'] == task, 'Text'].values[0]

            lcs_values.append(lcs_norm_word(answer_text, source_by_task[task]))
        else:
            # original source text: no feature to compute, use the -1 marker
            lcs_values.append(-1)

    print('LCS features created!')
    return lcs_values
    

## Extract Containment and LCS features

In [16]:
# Define an ngram range
# (n-gram sizes 1 through 7, giving seven containment features)
ngram_range = range(1,8)


# Names of the feature columns, appended in the same order the rows of
# `all_features` are filled
features_list = []

# Create features in a features_df
# One row per feature (all containment features plus one LCS feature),
# one column per file in `complete_df`; transposed further below.
all_features = np.zeros((len(ngram_range)+1, len(complete_df)))

# Calculate features for containment for ngrams in range
# `i` tracks the row of `all_features` being filled.
# NOTE: this loop rebinds the notebook-level variable `n` set in an earlier cell.
i=0
for n in ngram_range:
    column_name = 'c_'+str(n)
    features_list.append(column_name)
    # create containment features
    all_features[i]=np.squeeze(create_containment_features(complete_df, n))
    i+=1

# Calculate features for LCS_Norm Words 
# After the loop `i` points at the last row, which is reserved for the LCS feature.
features_list.append('lcs_word')
all_features[i]= np.squeeze(create_lcs_features(complete_df))

# create a features dataframe
# Transpose so that rows are files and columns are the named features.
features_df = pd.DataFrame(np.transpose(all_features), columns=features_list)

# Print all features/columns
print()
print('Features: ', features_list)
print()

1-gram containment features created!
2-gram containment features created!
3-gram containment features created!
4-gram containment features created!
5-gram containment features created!
6-gram containment features created!
7-gram containment features created!
LCS features created!

Features:  ['c_1', 'c_2', 'c_3', 'c_4', 'c_5', 'c_6', 'c_7', 'lcs_word']



In [17]:
# print some results 
features_df.head(100)

Unnamed: 0,c_1,c_2,c_3,c_4,c_5,c_6,c_7,lcs_word
0,0.398148,0.079070,0.009346,0.000000,0.000000,0.000000,0.000000,0.191781
1,1.000000,0.984694,0.964103,0.943299,0.922280,0.901042,0.879581,0.820755
2,0.869369,0.719457,0.613636,0.515982,0.449541,0.382488,0.319444,0.846491
3,0.593583,0.268817,0.156757,0.108696,0.081967,0.060440,0.044199,0.316062
4,0.544503,0.115789,0.031746,0.005319,0.000000,0.000000,0.000000,0.242574
...,...,...,...,...,...,...,...,...
95,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000
96,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000
97,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000
98,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000


## Correlated Features

Many classification models, for example a Naive Bayes classifier, rely on the assumption that features are *not* highly correlated; highly-correlated features may over-inflate the importance of a single feature. 

In [18]:
# Create correlation matrix for just Features to determine different models to test
# Absolute value of the pairwise correlations, rounded to two decimals.
# NOTE(review): `features_df` still contains the rows for the original source texts,
# whose features are all -1, so those rows take part in the correlation as well.
corr_matrix = features_df.corr().abs().round(2)

# display shows all of a dataframe
display(corr_matrix)

Unnamed: 0,c_1,c_2,c_3,c_4,c_5,c_6,c_7,lcs_word
c_1,1.0,0.94,0.9,0.89,0.88,0.87,0.87,0.97
c_2,0.94,1.0,0.99,0.98,0.97,0.96,0.95,0.98
c_3,0.9,0.99,1.0,1.0,0.99,0.98,0.98,0.97
c_4,0.89,0.98,1.0,1.0,1.0,0.99,0.99,0.95
c_5,0.88,0.97,0.99,1.0,1.0,1.0,1.0,0.95
c_6,0.87,0.96,0.98,0.99,1.0,1.0,1.0,0.94
c_7,0.87,0.95,0.98,0.99,1.0,1.0,1.0,0.93
lcs_word,0.97,0.98,0.97,0.95,0.95,0.94,0.93,1.0


We can see that c_1 (1-gram containment) and c_6 (6-gram containment) are among the least correlated pairs of features (0.87), so we will use them in our model alongside the LCS feature.

## Create train-test data

In [19]:
def train_test_data(complete_df, features_df, selected_features):
    """Split the selected features and class labels into train and test arrays.

    :param complete_df: DataFrame with a `Datatype` column ('train'/'test'/other)
        and a `Class` column holding the labels.
    :param features_df: DataFrame of features, row-aligned with `complete_df`.
    :param selected_features: List of column names of `features_df` to keep.
    :return: `(train_x, train_y), (test_x, test_y)` as numpy arrays.
    """
    chosen = features_df[selected_features]

    # row masks taken from the `Datatype` column of the complete dataframe
    is_train = complete_df['Datatype'] == 'train'
    is_test = complete_df['Datatype'] == 'test'

    # features and class labels for each split
    train = (np.array(chosen[is_train]), np.array(complete_df.loc[is_train, 'Class']))
    test = (np.array(chosen[is_test]), np.array(complete_df.loc[is_test, 'Class']))

    return train, test
    

In [20]:
# Select your list of features, this should be column names from features_df
# ex. ['c_1', 'lcs_word']
selected_features = ['c_1', 'c_6', 'lcs_word']


# Split into train/test arrays; rows whose Datatype is neither 'train' nor 'test'
# (e.g. the 'orig' source texts) end up in neither split
(train_x, train_y), (test_x, test_y) = train_test_data(complete_df, features_df, selected_features)

# check that division of samples seems correct
print('Training size: ', len(train_x))
print('Test size: ', len(test_x))
print()
# first ten training rows; columns follow the order of `selected_features`
print('Training df sample: \n', train_x[:10])

Training size:  70
Test size:  25

Training df sample: 
 [[0.39814815 0.         0.19178082]
 [0.86936937 0.38248848 0.84649123]
 [0.59358289 0.06043956 0.31606218]
 [0.54450262 0.         0.24257426]
 [0.32950192 0.         0.16117216]
 [0.59030837 0.         0.30165289]
 [0.75977654 0.1954023  0.48430493]
 [0.51612903 0.         0.27083333]
 [0.44086022 0.         0.22395833]
 [0.97945205 0.74468085 0.9       ]]


---
## Creating Final Data Files


In [21]:
def make_csv(x, y, filename, data_dir):
    """Write labels and features to a header-less CSV file.

    Each output row holds the label from `y` in the first column followed by
    the corresponding feature values from `x`. No header row and no index
    column are written.

    :param x: Feature values (array-like, one row per sample).
    :param y: Class labels (array-like, one entry per sample).
    :param filename: Name of the CSV file to write, e.g. 'train.csv'.
    :param data_dir: Directory to write into; created if it does not exist.
    :return: None. A confirmation message is printed.
    """
    # Create the directory (and any missing parents); a directory that already
    # exists is fine, and there is no gap between checking and creating.
    os.makedirs(data_dir, exist_ok=True)

    # labels first, then features, side by side
    combined = pd.concat([pd.DataFrame(y), pd.DataFrame(x)], axis=1)
    combined.to_csv(os.path.join(data_dir, filename), header=False, index=False)

    # nothing is returned, but a print statement indicates that the function has run
    print('Path created: '+str(data_dir)+'/'+str(filename))

In [22]:
# can change directory, if you want
# (relative path; the directory is created by make_csv when missing)
data_dir = 'plagiarism_data'


# Write the training and test splits as separate header-less CSV files,
# each row being the class label followed by the selected features
make_csv(train_x, train_y, filename='train.csv', data_dir=data_dir)
make_csv(test_x, test_y, filename='test.csv', data_dir=data_dir)

Path created: plagiarism_data/train.csv
Path created: plagiarism_data/test.csv
