In [10]:
import spacy
import os
import numpy as np

nlp = spacy.load("en_core_web_sm")

In [2]:
subject_verb_disagreements = {
    "NN" : ["VBP"],
    "NNP" : ["VBP"],
    "PRP" : ["VBP", "VBZ"],
    "NNS" : ["VBZ"],
    "NNPS" : ["VBZ"]
}

In [3]:
def subject_verb_disagree(subject, verb):
    if subject.tag_ in subject_verb_disagreements.keys():
        if verb.tag_ in subject_verb_disagreements[subject.tag_]:
            return True
    return False
        

In [4]:
def count_subject_verb_errors(text):
    doc = nlp(text)
    errors = 0
    
    for sent in doc.sents:
        for token in sent:
            if "VB" in token.tag_:
                subject = None
                for child in token.children:
                    if child.dep_ in ("nsubj", "nsubjpass"):
                        subject = child
                        break
                if subject and subject_verb_disagree(subject, token):
                    errors += 1
    return errors

In [6]:
count_subject_verb_errors("Jessica have 8 years old")

1

In [8]:
mistakes_per_essay = []
for filename in os.listdir(os.path.join("essays_dataset", "essays")):
    with open(os.path.join("essays_dataset", "essays", filename)) as file:
        text = file.read()
        mistakes = count_subject_verb_errors(text)
        mistakes_per_essay.append(mistakes)

In [11]:
def general_scorer_gaussian_assumption(x, mean, stddev, min_score, max_score, reverse=False):
    z_score = (x - mean) / stddev
    z_min, z_max = -3, 3

    score = (z_score - z_min) / (z_max - z_min) * (max_score - min_score) + min_score
    if reverse:
        score = max_score - score + min_score
    return np.clip(score, min_score, max_score)

In [19]:
def score_by_subject_verb_disagreements(num_disagreements, mistakes_list, min_score, max_score):
    if num_disagreements < 2:
        return max_score
    mean = np.mean(mistakes_list)
    stddev = np.std(mistakes_list)
    return general_scorer_gaussian_assumption(num_disagreements, mean, stddev, min_score, max_score, reverse=True)

In [21]:
score_by_subject_verb_disagreements(2, mistakes_per_essay, 1, 5)

3.83083740592032