In [None]:
from nltk.tokenize import word_tokenize
import re
import nltk
import pandas as pd
from nltk.tokenize import PunktSentenceTokenizer
import json

In [None]:
def score1(text, pos, child=False):
  '''
  Identifies pronoun case functor.

  text:  space-separated tokens of one sentence
  pos:   space-separated POS tags, aligned position by position with text
  child: if True the functor carries a weight of 0, otherwise 1

  Returns the weight times the number of tokens tagged PRON whose form is
  one of the case-marked pronouns listed below (comparison is exact, so the
  caller is expected to pass lower-cased text).
  '''
  case_pronouns = ('she', 'her', 'herself', 'he', 'him', 'himself',
                   'they', 'them', 'themselves',
                   'we', 'us', 'ourselves', 'ourself')
  tokens = text.split(" ")
  weight = 0 if child else 1

  hits = 0
  for position, tag in enumerate(pos.split(" ")):
    if tag != "PRON":
      continue
    if tokens[position] in case_pronouns:
      hits += 1

  return hits * weight


In [None]:
def score2(text, pos, child=False):
  '''
  Identifies article functor.

  text:  space-separated tokens of one sentence
  pos:   accepted for a uniform signature; not consulted here
  child: if True each article is worth 5, otherwise 2

  Each of "a", "the" and "an" contributes at most once, no matter how often
  it occurs in the sentence.
  '''
  tokens = text.split(" ")
  weight = 5 if child else 2
  return sum(weight for article in ("a", "the", "an") if article in tokens)

In [None]:
def score3(text, pos, child=False):
  '''
  Identifies contractible copula functor.

  text:  space-separated tokens of one sentence
  pos:   space-separated POS tags, aligned position by position with text
  child: if True each hit is worth 6, otherwise 3

  A hit is a stand-alone "'s" token whose tag is VERB.
  '''
  tags = pos.split(" ")
  weight = 6 if child else 3

  total = 0
  for position, token in enumerate(text.split(" ")):
    # The tag is only looked up for "'s" tokens.
    if token == "'s" and tags[position] == "VERB":
      total += weight

  return total

In [None]:
def score4(text, pos, child=False):
  '''
  Identifies progressive functor (pattern: V+ing).

  text:  space-separated tokens of one sentence
  pos:   space-separated POS tags, aligned position by position with text
  child: if True each hit is worth 2, otherwise 4

  A hit is a token tagged VERB that ends in "ing".
  '''
  if 'VERB' not in pos:
    return 0

  tokens = text.split(" ")
  weight = 2 if child else 4
  verbs = [tokens[i] for i, tag in enumerate(pos.split(" ")) if tag == 'VERB']
  return weight * sum(1 for verb in verbs if verb.endswith('ing'))

In [None]:
def score5(text, pos, child=False):
  '''
  Identifies plural functor (pattern: NP+pl).

  text:  space-separated tokens of one sentence
  pos:   space-separated POS tags, aligned position by position with text
  child: if True each hit is worth 1, otherwise 5

  A hit is a token tagged NOUN, longer than one character, that ends in "s"
  and whose second-to-last letter is not s, e or u. That excludes
  -ss (pass, mess), -es (long plural) and -us (octopus, bus).
  '''
  if 'NOUN' not in pos:
    return 0

  blocked_penultimate = ('s', 'e', 'u')
  tokens = text.split(" ")
  weight = 1 if child else 5

  total = 0
  for position, tag in enumerate(pos.split(" ")):
    if tag != 'NOUN':
      continue
    noun = tokens[position]
    if len(noun) <= 1:
      continue
    if noun.endswith('s') and noun[-2] not in blocked_penultimate:
      total += weight

  return total

In [None]:
def score6(text, pos, child=False):
  '''
  Identifies contractible auxiliary functor (pattern: be + V+ing).

  text:  space-separated tokens of one sentence
  pos:   space-separated POS tags of the same sentence
  child: if True each hit is worth 9, otherwise 6

  A hit is a contracted "'s" or "'re" followed by a single space and a word
  ending in "ing". Hits are only counted when the tag string contains
  "VERB VERB" somewhere; that guard is evaluated once for the whole
  sentence, not once per hit.
  '''
  if 'VERB VERB' not in pos:
    return 0

  # Raw string: "\w" must reach the regex engine untouched. In a plain string
  # it is an invalid escape sequence and triggers a warning at compile time.
  contracted = re.findall(r"(?:'s|'re) \w*ing", text)
  weight = 9 if child else 6
  return weight * len(contracted)

In [None]:
def score7(text, pos, child=False):
  '''
  Identifies past regular functor.
  Pattern: NP/Pron - (have) - V+pst - NP/Pron

  text:  space-separated tokens of one sentence
  pos:   space-separated POS tags, aligned position by position with text
  child: if True each hit is worth 3, otherwise 7

  A hit is a token tagged VERB that ends in "ed".
  '''
  if 'VERB' not in pos:
    return 0

  tokens = text.split(" ")
  weight = 3 if child else 7

  total = 0
  for position, tag in enumerate(pos.split(" ")):
    if tag == 'VERB' and tokens[position].endswith('ed'):
      total += weight

  return total

In [None]:
# Word list for the past irregular functor (used by score8): one verb form per
# line of irregular_verbs.txt, read once when this cell runs.
with open('irregular_verbs.txt', encoding='utf-8') as infile:
  # Drop surrounding whitespace and blank lines. An empty entry (e.g. from a
  # trailing newline) would otherwise match the empty tokens that
  # text.split(" ") produces for consecutive spaces.
  irregular_verbs = [line.strip() for line in infile if line.strip()]

def score8(text, pos, child=False):
  '''
  Identifies past irregular functor.
  Pattern: NP/pron - V+pst - NP/Pron

  text:  space-separated tokens of one sentence
  pos:   space-separated POS tags, aligned position by position with text
  child: if True each hit is worth 4, otherwise 8

  A hit is a token tagged VERB that appears in the module-level
  irregular_verbs list.
  '''
  if 'VERB' not in pos:
    return 0

  tokens = text.split(" ")
  weight = 4 if child else 8
  verbs = (tokens[i] for i, tag in enumerate(pos.split(" ")) if tag == 'VERB')
  return sum(weight for verb in verbs if verb in irregular_verbs)

In [None]:
def score9(text, pos, child=False):
  '''
  Identifies long plural functor (e.g. "houses").

  text:  space-separated tokens of one sentence
  pos:   space-separated POS tags, aligned position by position with text
  child: if True each hit is worth 1, otherwise 9

  A hit is a token tagged NOUN that ends in "es".
  '''
  if 'NOUN' not in pos:
    return 0

  tokens = text.split(" ")
  weight = 1 if child else 9
  nouns = [tokens[i] for i, tag in enumerate(pos.split(" ")) if tag == 'NOUN']
  long_plurals = [noun for noun in nouns if noun.endswith('es')]
  return weight * len(long_plurals)


In [None]:
def score10(text, pos, child=False):
  '''
  Identifies possessive functor (possessive 's).
  Pattern: Det - (Adj) - N + poss - (N)

  text:  space-separated tokens of one sentence
  pos:   space-separated POS tags, aligned position by position with text
  child: if True each hit is worth 7, otherwise 10

  A hit is a token tagged NOUN that is not the last token and is directly
  followed by a "'s" token whose own tag is anything but VERB (a VERB-tagged
  "'s" is a copula or auxiliary, not a possessive marker).
  '''
  if 'NOUN' not in pos:
    return 0

  tags = pos.split(" ")
  tokens = text.split(" ")
  final_position = len(tokens) - 1
  weight = 7 if child else 10

  total = 0
  for position, tag in enumerate(tags):
    if tag != 'NOUN' or position == final_position:
      continue
    follower = position + 1
    if tokens[follower] == "'s" and tags[follower] != 'VERB':
      total += weight

  return total

In [None]:
def score11(text, pos, child=False):
  '''
  Identifies third person functor (3rd person singular).
  Pattern: NP/Pron+sing - V+tns - (Adv)

  text:  space-separated tokens of one sentence
  pos:   space-separated POS tags, aligned position by position with text
  child: if True each hit is worth 8, otherwise 11

  A hit is a token tagged VERB that ends in "s", is scored 0 by both score7
  (past regular) and score8 (past irregular), and does not end in "'s".
  '''
  if 'VERB' not in pos:
    return 0

  tokens = text.split(" ")
  weight = 8 if child else 11

  total = 0
  for position, tag in enumerate(pos.split(" ")):
    if tag != 'VERB':
      continue
    verb = tokens[position]
    if not verb.endswith("s"):
      continue
    # Leave out verbs already claimed by one of the past-tense functors.
    if score8(verb, 'VERB') + score7(verb, 'VERB') != 0:
      continue
    # Contracted "'s" belongs to the copula/auxiliary functors.
    if verb.endswith("'s"):
      continue
    total += weight

  return total

In [None]:
def total_score(sentence, pos_sentence, child=False):
  '''
  Sums the scores of all eleven functor scorers (score1 .. score11) for one
  sentence. The result is meant to be averaged over sentences or tokens.

  sentence:     space-separated tokens; lower-cased here before scoring
  pos_sentence: space-separated POS tags aligned with the tokens
  child:        forwarded unchanged to every scorer
  '''
  lowered = sentence.lower()
  scorers = (score1, score2, score3, score4, score5, score6,
             score7, score8, score9, score10, score11)
  return sum(scorer(lowered, pos_sentence, child=child) for scorer in scorers)

In [None]:
def diff_score(sequence, pos, sent_tokenizer, child=False):
  '''
  Calculates difficulty score for a sequence given sequence, pos and
  sentence tokenizer trained on full dataset.

  sequence:       space-separated tokens, possibly spanning several sentences
  pos:            space-separated POS tags for the whole sequence
  sent_tokenizer: object whose tokenize(sequence) returns a list of sentences
  child:          True if FLA; forwarded to total_score

  Returns (sentence averaged difficulty score, token averaged difficulty
  score). Returns (0, 0) when the tokenizer finds no sentence at all, instead
  of dividing by zero.
  '''
  sentences = sent_tokenizer.tokenize(sequence)
  if not sentences:
    return 0, 0

  tags = pos.split(" ")
  total = 0
  start = 0
  for sent in sentences:
    # Hand each sentence as many tags as it has space-separated tokens,
    # walking through the tag list in order.
    n_tokens = len(sent.split(" "))
    sent_pos = " ".join(tags[start:start + n_tokens])
    start += n_tokens

    if n_tokens == len(sent_pos.split(" ")):
      total += total_score(sent, sent_pos, child=child)
    else:
      # Tags ran out before the sentence did: report it and score it as 0.
      print(sent, sent_pos)

  diff_sent = total / len(sentences)
  diff_tokens = total / len(sequence.split(" "))

  return diff_sent, diff_tokens

In [None]:
def scoring(filepath, child=False, outpath='scored_sequences_postagged.csv'):
  '''
  Takes file with sequences and scores sequences.

  filepath: text file with one "sequence<TAB>pos" pair per line; lines
            without a tab are skipped
  child:    True if FLA; forwarded to diff_score
  outpath:  CSV file the scored dataframe is written to

  Returns dataframe with sequence, pos, sentence averaged difficulty score,
  token averaged difficulty score. Rows with an empty sequence get 0 for
  both scores.
  '''
  with open(filepath) as infile:
    full = infile.read()

  # The sentence tokenizer is built from the full text of the input file.
  sent_tokenizer = PunktSentenceTokenizer(full)

  sequences = []
  posses = []
  for line in full.split("\n"):
    fields = line.split("\t")
    if len(fields) > 1:
      sequences.append(fields[0])
      posses.append(fields[1])

  sent_diff_scores = []
  tokens_diff_scores = []
  for seq, pos in zip(sequences, posses):
    if seq == "":
      sent_diff_scores.append(0)
      tokens_diff_scores.append(0)
    else:
      diff_sent, diff_tokens = diff_score(seq, pos, sent_tokenizer, child=child)
      sent_diff_scores.append(diff_sent)
      tokens_diff_scores.append(diff_tokens)

  # Column name -> list of values (named so it does not shadow builtin dict).
  columns = {'sequence' : sequences,
             'pos' : posses,
             'sentence_score': sent_diff_scores,
             'tokens_score': tokens_diff_scores}

  combined_df = pd.DataFrame(columns)
  combined_df.to_csv(outpath)

  return combined_df

In [None]:
def cleanup(ranking):
  """
  Turns tokenized sentences back into regular sentences.

  For every entry of ranking['sequence'], removes the space in front of a
  punctuation character and in front of "n't". The 'sequence' column of the
  dataframe that was passed in is overwritten, and that same dataframe is
  returned.
  """

  def detokenize(seq):
    # First glue punctuation to the preceding token, then the "n't" clitic.
    glued = re.sub(r" ([!\"#\$%&\'\(\)\*\+,-\./:;<=>\?@\[\\\]\^_`{\|}~])", r"\1", seq)
    return re.sub(r" (n't)", r"\1", glued)

  ranking['sequence'] = [detokenize(seq) for seq in ranking['sequence']]
  return ranking

In [None]:
def ranking(filepath, score='sentence_score', ascending=True, child=False):
  """
  Takes scored sequences, ranks them and saves ranked sequences to file.

  filepath:  CSV of scored sequences; rows containing NaN are dropped
  score:     column to sort by; 'tokens_score' if token averaged. The
             spelling 'token_score' is accepted as an alias when the file
             has no column of that name.
  ascending: False if reversed
  child:     True if FLA; only changes the output file name

  The output file is 'ranked_sequences_postagged' plus the suffixes
  '_reversed', '_token' and '_child' that apply, plus '.csv'.
  Returns the ranked dataframe.
  """

  scored = pd.read_csv(filepath).dropna()

  # Earlier documentation asked for 'token_score', which is not a column the
  # scoring step writes; map it to the real column instead of raising KeyError.
  if score == 'token_score' and score not in scored.columns:
    score = 'tokens_score'

  ranked = cleanup(scored.sort_values(by=[score], ascending=ascending))

  writefile = 'ranked_sequences_postagged'
  if not ascending:
    writefile += "_reversed"
  if score == 'tokens_score':
    writefile += "_token"
  if child:
    writefile += "_child"
  writefile += ".csv"

  ranked.to_csv(writefile)
  return ranked

In [None]:
def random_curriculum(filepath):
  """
  Takes ranked sequences as input, randomly shuffles and saves to file.

  Rows containing NaN are dropped, the remaining rows are put in random
  order, the sequences are de-tokenized with cleanup, and the result is
  written to 'random_curriculum.csv'. Returns the shuffled dataframe.
  """

  # frac = 1 samples every row, i.e. a full shuffle.
  shuffled = pd.read_csv(filepath).dropna().sample(frac = 1)
  shuffled = cleanup(shuffled)

  shuffled.to_csv('random_curriculum.csv')
  return shuffled

In [None]:
def filtered_curriculum(filepath):
  '''
  Takes ranked sequences and deletes any with a difficulty score of 0, saves
  to file.

  filepath: CSV with a 'sentence_score' column

  The result is written next to the input, with "filtered_" prefixed to the
  file name. Returns the filtered dataframe.
  '''
  import os

  sequences = pd.read_csv(filepath)
  filtered = sequences.loc[sequences['sentence_score'] != 0]

  # Prefix the file name only: prefixing the whole path would point into a
  # non-existent "filtered_<dir>" directory whenever filepath has a folder.
  directory, filename = os.path.split(filepath)
  newpath = os.path.join(directory, "filtered_" + filename)
  filtered.to_csv(newpath)

  return filtered

In [None]:
def pipeline(child = False):
  '''
  full pipeline for SLA datasets, child=True for FLA datasets

  Scores 'clean_pred3', then writes the sentence-ranked, reversed and
  token-ranked curricula from the scored file, and finally the random and
  filtered curricula from the sentence-ranked file. All results are written
  to CSV files by the individual steps; nothing is returned.
  '''

  scoring('clean_pred3', child=child)

  # File written by scoring.
  scored_path = 'scored_sequences_postagged.csv'
  ranking(scored_path, child=child)
  ranking(scored_path, ascending=False, child=child)
  ranking(scored_path, score='tokens_score', child=child)

  # File written by the first (ascending, sentence-score) ranking call above;
  # ranking appends '_child' to the name when child is set.
  ranked_path = 'ranked_sequences_postagged' + ('_child' if child else '') + '.csv'

  # These two steps take only a file path; they have no child parameter.
  random_curriculum(ranked_path)
  filtered_curriculum(ranked_path)