In [21]:
import nltk
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
def get_topical_part(prompt):
    """Pick the sentence of `prompt` that is used as its topic.

    The prompt is split into sentences with NLTK. When it has at least three
    sentences the second one is returned; shorter prompts are returned
    unchanged.
    """
    parts = nltk.sent_tokenize(prompt)
    # Fewer than three sentences: there is no separate topic sentence to pick.
    return parts[1] if len(parts) >= 3 else prompt

def get_similarity(text1, text2):
    """Score how similar two texts are on a 1-5 scale.

    Both texts are whitespace-split and stripped of English stop words, then
    vectorised together with a TF-IDF model fitted on just these two texts.
    The [0, 1] entry of the matrix product of the TF-IDF rows is bucketed
    into a score.

    Note that, despite the name, the return value is the bucketed score and
    not the raw similarity.

    Args:
        text1: First text (e.g. the prompt topic).
        text2: Second text (e.g. the essay).

    Returns:
        int: 1 (similarity < 0.1), 2 (< 0.2), 3 (< 0.4), 4 (< 0.6) or 5.
    """
    # Build the stop-word set once: the list is loop-invariant and a set
    # makes each membership test O(1) instead of a list scan per word.
    stop_words = set(nltk.corpus.stopwords.words('english'))

    def _strip_stop_words(text):
        # Drop stop words (case-insensitively) but keep the original casing.
        return ' '.join(word for word in text.split() if word.lower() not in stop_words)

    text1 = _strip_stop_words(text1)
    text2 = _strip_stop_words(text2)

    # A text with no remaining words shares nothing with the other one, so
    # its similarity is 0 (lowest score). Returning early also avoids the
    # vectoriser's empty-vocabulary error when both texts are empty.
    if not text1 or not text2:
        return 1

    tfidf_matrix = TfidfVectorizer().fit_transform([text1, text2])
    similarity = (tfidf_matrix * tfidf_matrix.T)[0, 1]

    # Assign a score based on the similarity: first upper bound it falls under.
    for score, upper_bound in enumerate((0.1, 0.2, 0.4, 0.6), start=1):
        if similarity < upper_bound:
            return score
    return 5

In [3]:


# `spacy` is used here but not imported by the notebook's visible import
# cell, so import it with the model load to keep this cell runnable on a
# fresh kernel.
import spacy

# Load spaCy's small English pipeline once; tokenize_and_vectorize() reads
# `.vector` from the documents it produces.
# NOTE(review): `en_core_web_sm` is the small package -- confirm it provides
# the word vectors this analysis expects, or switch to a larger model.
nlp = spacy.load('en_core_web_sm')

def tokenize_and_vectorize(inp_str):
    """Return one vector per word token of `inp_str`.

    The text is tokenised with NLTK's word tokenizer; every token is then
    run through the module-level `nlp` pipeline on its own and the resulting
    `.vector` is collected, in token order.
    """
    word_vectors = []
    for token in nltk.word_tokenize(inp_str):
        word_vectors.append(nlp(token).vector)
    return word_vectors

def sent_embedding(user_input):
    """Embed a piece of text as the mean of its token vectors.

    The vectors come from tokenize_and_vectorize(); they are summed into a
    zero-initialised accumulator and divided by the number of tokens.
    Input that yields no tokens raises IndexError, because the accumulator
    is sized from the first vector.
    """
    token_vectors = tokenize_and_vectorize(user_input)

    # Size the accumulator from the first token vector.
    running_sum = np.zeros(len(token_vectors[0]), )
    for token_vector in token_vectors:
        running_sum += token_vector

    return running_sum / len(token_vectors)

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Cosine of the angle between two 1-D vectors.

    Args:
        a: First vector.
        b: Second vector, same length as `a`.

    Returns:
        float: dot(a, b) / (||a|| * ||b||), or 0.0 when either vector has
        zero norm (the cosine is undefined there).
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    # Guard the divisor itself rather than the dot product, so the
    # division-by-zero case is handled where it actually arises.
    if denominator == 0:
        return 0.0

    # Always hand back a plain Python float, matching the annotation; the
    # original returned int 0 on one path and a NumPy scalar on the other.
    return float(np.dot(a, b) / denominator)

In [28]:
def get_essay_embedding(essay):
    """Embed an essay as the mean of its sentence embeddings.

    The essay is split into sentences with NLTK, each sentence is embedded
    with sent_embedding(), and the embeddings are averaged.

    Args:
        essay: Essay text.

    Returns:
        numpy.ndarray: Mean sentence embedding, shaped like a single
        sent_embedding() result.

    Raises:
        IndexError: If the essay contains no sentences.
    """
    # Call the tokenizer through the `nltk` module: the notebook's visible
    # imports only bring in `nltk`, so a bare `sent_tokenize` is undefined.
    sentence_embeddings = [sent_embedding(sentence) for sentence in nltk.sent_tokenize(essay)]

    # Each sentence is embedded exactly once; the first result also provides
    # the shape of the accumulator.
    embedding = np.zeros(sentence_embeddings[0].shape)
    for sentence_embedding in sentence_embeddings:
        embedding += sentence_embedding
    embedding /= len(sentence_embeddings)

    return embedding

In [23]:
# Read the semicolon-separated essay index, then split it by grade.
# Each subset is renumbered from 0 so it can be addressed positionally.
df = pd.read_csv('essays_dataset/index.csv', sep=';')
df_low = df.loc[df['grade'] == 'low'].reset_index(drop=True)
df_high = df.loc[df['grade'] == 'high'].reset_index(drop=True)

In [24]:
# Score every low-grade essay against the topic sentence of its own prompt.
# get_similarity() returns a 1-5 score, so this list holds scores.
low_similarities = []
for _, essay_row in df_low.iterrows():
    topic = get_topical_part(essay_row['prompt'])
    essay_path = 'essays_dataset/essays/' + essay_row['filename']
    with open(essay_path, 'r') as essay_file:
        essay_text = essay_file.read()
    low_similarities.append(get_similarity(topic, essay_text))

In [25]:
# Score every high-grade essay against the topic sentence of its own prompt.
# get_similarity() returns a 1-5 score, so this list holds scores.
high_similarities = []
for _, essay_row in df_high.iterrows():
    topic = get_topical_part(essay_row['prompt'])
    essay_path = 'essays_dataset/essays/' + essay_row['filename']
    with open(essay_path, 'r') as essay_file:
        essay_text = essay_file.read()
    high_similarities.append(get_similarity(topic, essay_text))

In [26]:
# Mean score per grade group: low-grade essays first, high-grade second.
print(np.mean(low_similarities), np.mean(high_similarities), sep='\n')

2.86
3.04


In [27]:
# Control experiment: score each essay against the prompt of the *next* row
# in the index instead of its own prompt.
# NOTE(review): adjacent rows may share a prompt -- confirm the index order
# makes these pairs genuinely mismatched.
wrong_similarities = []
# Derive the pair count from the index instead of hard-coding 99; the last
# row has no successor, hence len(df) - 1 (99 for a 100-row index).
for i in range(len(df) - 1):
    prompt = get_topical_part(df.loc[i + 1, 'prompt'])
    with open('essays_dataset/essays/' + df.loc[i, 'filename'], 'r') as file:
        essay = file.read()
    similarity = get_similarity(prompt, essay)
    wrong_similarities.append(similarity)

In [28]:
# Mean score of the deliberately mismatched prompt/essay pairs.
np.asarray(wrong_similarities).mean()

1.3131313131313131