# imports

In [None]:
# ! pip install nltk
# import nltk
# nltk.download('punkt_tab')
# ! pip install -U spacy
# ! python -m spacy download en
# ! python -m spacy download en_core_web_md

In [None]:
from collections import Counter
import math
import heapq
import string
# Translation table for str.translate that deletes every character listed in
# string.punctuation (ASCII punctuation only).
translator = str.maketrans('', '', string.punctuation)

import nltk
from nltk.tokenize import PunktSentenceTokenizer
from nltk.tokenize import word_tokenize
# Sentence splitter shared by the loading cell below. It is constructed with
# no training text.
# NOTE(review): presumably this means Punkt's default parameters are used
# rather than a pre-trained English model — confirm against the NLTK docs.
sent_detector = PunktSentenceTokenizer()

import spacy
# Shared spaCy pipeline used by the similarity cells at the end.
# NOTE(review): the "md" model is presumably required because the small
# models do not ship real word vectors — confirm.
nlp = spacy.load("en_core_web_md")  # make sure to use larger package!

# load in essays

In [None]:
def _load_essay(path):
    """Read one essay and return ``(sentences, vocabulary, word_count)``.

    The file is split into sentences with ``sent_detector``; each sentence is
    lower-cased and stripped of ASCII punctuation via ``translator`` before
    being tokenized with ``word_tokenize``.

    Args:
        path: Location of the plain-text essay.

    Returns:
        A tuple of the cleaned sentence strings (in document order), the set
        of distinct tokens, and the total token count.
    """
    # Explicit UTF-8 so decoding does not depend on the platform default.
    # The context manager closes the file; no separate close() is needed.
    with open(path, encoding='utf-8') as f:
        # Each line read from the file keeps its own trailing newline, so the
        # joined text has "\n \n" between lines. This matches the text the
        # cell has always produced, so the stored sentence strings are
        # unchanged.
        text = ' \n'.join(f)

    sentences = []
    vocabulary = set()
    word_count = 0
    for sentence in sent_detector.tokenize(text.strip()):
        clean = sentence.lower().translate(translator)
        sentences.append(clean)
        words = word_tokenize(clean)
        word_count += len(words)
        vocabulary.update(words)
    return sentences, vocabulary, word_count


human_sentences, human_V, human_word_count = _load_essay('./human.txt')
llm_sentences, llm_V, llm_word_count = _load_essay('./llm.txt')

# Empty containers kept for compatibility; nothing in this cell fills them.
human_vectors = []
llm_vectors = []

# Combined vocabulary: the dimensions of the bag-of-words vectors built later.
V = human_V.union(llm_V)

# Full text of essays
ft_human = ' '.join(human_sentences)
ft_llm = ' '.join(llm_sentences)

# word usage stats

In [None]:
# Headline totals for both essays.
s = "Human wrote {t_h} words in total, with {u_h} unique.\n\nLLM wrote {t_l} words in total, with {u_l} unique."
print(s.format(
    t_h=human_word_count,
    u_h=len(human_V),
    t_l=llm_word_count,
    u_l=len(llm_V),
))

# How many of the rarest words to list per essay.
k = 50

# (essay text, header template) for each report, in print order.
reports = (
    (ft_human, "\n{k} least frequent words used in human essay:"),
    (ft_llm, "\n{k} least frequent words used in LLM essay:"),
)

for essay_text, header in reports:
    frequencies = Counter(word_tokenize(essay_text))
    # (count, word) tuples sort by count first, then alphabetically by word;
    # nsmallest returns them in ascending order and copes with fewer than k.
    rarest = heapq.nsmallest(
        k, ((count, word) for word, count in frequencies.items())
    )
    print(header.format(k=k))
    for count, word in rarest:
        print("\t{w}: {c} time(s)".format(w=word, c=count))


# functions to compute cosine similarity

In [None]:
def vectorize(sentence, V):
    """Turn *sentence* into a log-scaled term-frequency vector over *V*.

    Args:
        sentence: Text to tokenize with ``word_tokenize``.
        V: Iterable of vocabulary terms; its iteration order fixes the
            order of the vector's components.

    Returns:
        A list with one entry per term in *V*: ``1 + ln(count)`` when the
        term occurs in the sentence, ``0`` otherwise.
    """
    term_counts = Counter(word_tokenize(sentence))
    return [
        1 + math.log(term_counts[term]) if term in term_counts else 0
        for term in V
    ]

def norm(v):
    """Return the Euclidean (L2) norm of the numeric sequence *v*."""
    return sum(a * a for a in v) ** 0.5


def cosine(v1, v2):
    """Return the cosine similarity of two equal-length numeric vectors.

    Args:
        v1: First vector (sequence of numbers).
        v2: Second vector (sequence of numbers).

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)``, or ``0.0`` when either vector has
        zero length. A zero vector arises, for example, from a sentence that
        becomes empty once punctuation is stripped; treating it as "no
        similarity" avoids a ZeroDivisionError in the pairwise comparison.
    """
    denominator = norm(v1) * norm(v2)
    if denominator == 0:
        return 0.0

    dot_product = sum(a * b for a, b in zip(v1, v2))
    return dot_product / denominator


# cosine between whole essays

In [None]:
# Bag-of-words vectors for each complete essay over the shared vocabulary V.
human_essay_vector = vectorize(ft_human, V)
llm_essay_vector = vectorize(ft_llm, V)

c = cosine(human_essay_vector, llm_essay_vector)
print("cosine between essays: {c:.4f}".format(c=c))

# up to 5 most similar sentences between essays

In [None]:
# Vectorize every sentence exactly once. Doing it inside the pair loop would
# rebuild the same |V|-length vectors for every (human, LLM) combination.
human_sentence_vectors = [vectorize(sentence, V) for sentence in human_sentences]
llm_sentence_vectors = [vectorize(sentence, V) for sentence in llm_sentences]

# Score each pair as a distance (1 - cosine) so the smallest entries are the
# most similar pairs; [i, j] indexes back into the two sentence lists.
h = []
for i, human_vector in enumerate(human_sentence_vectors):
    for j, llm_vector in enumerate(llm_sentence_vectors):
        sim = cosine(human_vector, llm_vector)
        h.append((1 - sim, [i, j]))

# nsmallest returns at most 5 entries, closest pair first, and copes with
# fewer than 5 pairs.
s = "cosine: {s:.4f}\nHuman: {h}\nLLM: {l}\n"
for distance, (i, j) in heapq.nsmallest(5, h):
    print(s.format(s=1 - distance, h=human_sentences[i], l=llm_sentences[j]))

# SpaCy vector similarity

In [None]:
# Parse both complete essays with the shared spaCy pipeline.
doc1 = nlp(ft_human)
doc2 = nlp(ft_llm)

# Compute the spaCy score from the two parsed documents. The value is stored
# under its own name so the bag-of-words cosine held in `c` is neither
# reported here by mistake nor overwritten.
spacy_similarity = doc1.similarity(doc2)
print("SpaCy similarity between essays: {c:.4f}".format(c=spacy_similarity))

In [None]:
# Run the spaCy pipeline once per sentence. Parsing inside the pair loop
# would re-process every sentence for each pairing, which dominates runtime.
human_docs = [nlp(sentence) for sentence in human_sentences]
llm_docs = [nlp(sentence) for sentence in llm_sentences]

# Score each pair as a distance (1 - similarity) so the smallest entries are
# the most similar pairs; [i, j] indexes back into the two sentence lists.
h = []
for i, human_doc in enumerate(human_docs):
    for j, llm_doc in enumerate(llm_docs):
        sim = human_doc.similarity(llm_doc)
        h.append((1 - sim, [i, j]))

# nsmallest returns at most 5 entries, closest pair first, and copes with
# fewer than 5 pairs.
s = "SpaCy sim: {s:.4f}\nHuman: {h}\nLLM: {l}\n"
for distance, (i, j) in heapq.nsmallest(5, h):
    print(s.format(s=1 - distance, h=human_sentences[i], l=llm_sentences[j]))
