## Goal: take articles that Grover classifies as human-written and check whether they have been altered. The program hashes each word of the input and output documents with a hash key function, compares the hash values, and reports whether the output document was altered or not.

**Importing libraries**

In [2]:
import hashlib
from collections import defaultdict
import string
import numpy

**First, we need to be able to parse the input files, whether they are .txt or .pdf**

In [3]:
#function for parsing .txt file
def parse_txt(file_path):
    """Read a UTF-8 encoded text file and return its full contents.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The whole file as a single string.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

**For PDF files, make sure PyPDF2 is installed first: `pip install PyPDF2`**

In [4]:
#function for parsing .pdf file
import PyPDF2

def parse_pdf(file_path):
    """Extract and concatenate the text of every page of a PDF file.

    Uses the PdfReader API (``reader.pages`` / ``page.extract_text()``).
    The older ``PdfFileReader`` / ``numPages`` / ``getPage`` / ``extractText``
    names were deprecated and then removed in PyPDF2 3.0, so code written
    against them fails on a current install.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages, in page order, joined without a separator.
    """
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Extract while the file is still open, and join once at the end
        # instead of growing a string page by page. `or ""` keeps the join
        # safe if a page yields no text.
        page_texts = [page.extract_text() or "" for page in reader.pages]

    return "".join(page_texts)

**Using this if statement we will check if the input is a pdf or txt file**

In [6]:
def identifyFile (file):
    """Parse a file with the parser that matches its extension.

    The extension check is case-insensitive.

    Args:
        file: Path of the file to parse; must end in ``.txt`` or ``.pdf``.

    Returns:
        The text content of the file as a string.

    Raises:
        ValueError: If the path ends in neither ``.txt`` nor ``.pdf``.
            Failing here gives a clear message instead of returning None
            and breaking later when the text is preprocessed.
    """
    lowered = file.lower()
    if lowered.endswith('.txt'):
        return parse_txt(file) #parses txt
    if lowered.endswith('.pdf'):
        return parse_pdf(file) #parses pdf
    raise ValueError(
        f"Unsupported file type: {file!r} (expected a .txt or .pdf file)"
    )

**Now that the parsing functions are defined, we can preprocess the text**

In [7]:
def preprocess(text):
    """Normalise text before hashing; currently this only lower-cases it.

    Args:
        text: The raw text to normalise.

    Returns:
        The lower-cased text.
    """
    # Placeholder for further preprocessing steps.
    return text.lower()

In [8]:
#Generates the SHA-256 hash key for a single word
def generate_hash_key(word):
    """Return the SHA-256 hex digest of a word.

    Args:
        word: The string to hash; it is encoded as UTF-8 first.

    Returns:
        The digest as a 64-character lowercase hexadecimal string.
    """
    encoded = word.encode('utf-8')
    return hashlib.sha256(encoded).hexdigest()

In [9]:
#Builds a hash table that maps each word's hash key to the occurrences of that word
def create_dict(text):
    """Build a hash table of the words in a text.

    The text is preprocessed and split on whitespace. Each word is stored
    under its hash key, so a word that appears several times is listed
    several times under the same key.

    Args:
        text: The raw text to index.

    Returns:
        A defaultdict(list) mapping hash key -> list of word occurrences.
    """
    table = defaultdict(list)
    for token in preprocess(text).split():
        table[generate_hash_key(token)].append(token)
    return table

In [10]:
#Compares a document against the hash table of the original text; returns False if they differ
def verify_doc(document, hash_table):
    """Check whether a document uses the same words as the original text.

    The document is preprocessed and split on whitespace, and each word is
    hashed. The document is reported as authentic only if:

    * every word of the document is present in ``hash_table``, and
    * every word in ``hash_table`` appears in the document the same number
      of times as in the original.

    The second condition catches words that were deleted or duplicated. A
    check that only looks up the document's words would accept any document
    whose words are a subset of the original. Word order is not compared.

    Args:
        document: The raw text of the document to verify.
        hash_table: The table built by ``create_dict`` from the original
            text (hash key -> list of word occurrences).

    Returns:
        True if the document matches the original under the rules above,
        False otherwise.
    """
    seen_counts = {}
    for word in preprocess(document).split():
        hash_key = generate_hash_key(word)
        # Membership test rather than indexing, so a defaultdict table is
        # not grown by the lookup.
        if hash_key not in hash_table:
            return False
        seen_counts[hash_key] = seen_counts.get(hash_key, 0) + 1

    # Every original word must occur exactly as often in the document.
    return all(
        seen_counts.get(hash_key, 0) == len(occurrences)
        for hash_key, occurrences in hash_table.items()
    )

**Now testing the function on articles to parse and detect if altered**

In [11]:
#Expected output: False, because the second file is an altered copy of the first

#original nutrition article found online
input_path = "docs/article.txt"
#version of that article altered with GPT, which Grover classified as written by a human
output_path = "docs/alt_article.txt"

#parse both files; identifyFile picks the parser from the file extension
input_text = identifyFile(input_path)
document = identifyFile(output_path)

#build the hash table from the original text, then check the second document against it
hash_table = create_dict(input_text)
is_authentic = verify_doc(document, hash_table) #returns boolean

print("Is the document authentic?", is_authentic)

Is the document authentic? False


In [12]:
#Expected output: True, because the document is compared with itself

#the same file is used as both the original and the document to verify
input_path = "docs/article.txt"
output_path = "docs/article.txt"

#parse both files; identifyFile picks the parser from the file extension
input_text = identifyFile(input_path)
document = identifyFile(output_path)

#build the hash table from the original text, then check the second document against it
hash_table = create_dict(input_text)
is_authentic = verify_doc(document, hash_table) #returns boolean

print("Is the document authentic?", is_authentic)

Is the document authentic? True


**Now using a different article found online and altered with gpt**

In [16]:
#Expected output: False, because the second file is an altered copy of the first

#original killer whales article found online
input_path = "docs/Article_02"
#version of that article altered with GPT, which Grover classified as written by a human
output_path = "docs/alt_Article02"

#these paths have no .txt extension, so identifyFile would not recognise them;
#call the text parser directly instead
input_text = parse_txt(input_path)
document = parse_txt(output_path)

#build the hash table from the original text, then check the second document against it
hash_table = create_dict(input_text)
is_authentic = verify_doc(document, hash_table) #returns boolean

print("Is the document authentic?", is_authentic)

Is the document authentic? False
