In [1]:
# Colab only: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
# Work from the datasets directory; later cells open files by relative name.
# Explicit %cd does not depend on IPython's automagic setting.
%cd /content/drive/MyDrive/babylm/content/training/datasets/

/content/drive/MyDrive/babylm/content/training/datasets


In [3]:
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
import re
import nltk
import pandas as pd
from nltk.tokenize import PunktSentenceTokenizer
import json

In [4]:
def score1(text, pos, child=False):
  '''
  Pronoun case score.

  text and pos are space-separated tokens and tags, aligned by position.
  Every token tagged "PRON" that is one of the listed case-marked pronouns
  adds 1 to the score, or 0 when child=True (so the child score is always 0).
  "you" and "it" are not listed because they are not influenced by case.
  Pattern noted for this feature: pron-(Aux)-(Neg)-V-(Pron); the code only
  checks the tag and the token, not the surrounding pattern.
  '''
  case_marked = ('she', 'her', 'herself', 'he', 'him', 'himself', 'they', 'them', 'themselves', 'we', 'us', 'ourselves', 'ourself')
  tokens = text.split(" ")
  weight = 0 if child else 1

  total = 0
  for position, tag in enumerate(pos.split(" ")):
    if tag == "PRON" and tokens[position] in case_marked:
      total += weight
  return total

In [5]:
def score2(text, pos, child=False):
  '''
  Article score.

  Adds 2 (5 when child=True) for each of the articles "a", "the" and "an"
  that occurs as a token of text. Each article counts at most once, however
  often it is repeated. pos is accepted for a uniform signature but not used.
  '''
  tokens = text.split(" ")
  weight = 5 if child else 2
  return sum(weight for article in ("a", "the", "an") if article in tokens)

In [6]:
def score3(text, pos, child=False):
  '''
  Contractible copula score.

  Every "'s" token whose tag at the same position is "VERB" adds 3
  (6 when child=True).
  '''
  tags = pos.split(" ")
  weight = 6 if child else 3

  total = 0
  for position, token in enumerate(text.split(" ")):
    if token == "'s" and tags[position] == "VERB":
      total += weight
  return total

In [7]:
def score4(text, pos, child=False):
  '''
  Progressive -ing score (V+ing).

  Every token tagged "VERB" that ends in "ing" adds 4 (2 when child=True).
  '''
  tokens = text.split(" ")
  weight = 2 if child else 4

  total = 0
  for position, tag in enumerate(pos.split(" ")):
    if tag == 'VERB' and tokens[position].endswith('ing'):
      total += weight
  return total

In [26]:
def score5(text, pos, child=False):
  '''
  Plural score (NP+pl).

  Every token tagged "NOUN" that is longer than one character, ends in "s"
  and whose second-to-last character is not "s", "e" or "u" adds 5
  (1 when child=True). The exclusions skip -ss (pass, mess), -es (long
  plural) and -us (octopus, bus).
  '''
  excluded_penultimate = ('s', 'e', 'u')
  tokens = text.split(" ")
  weight = 1 if child else 5

  total = 0
  for position, tag in enumerate(pos.split(" ")):
    if tag != 'NOUN':
      continue
    noun = tokens[position]
    if len(noun) > 1 and noun[-1] == 's' and noun[-2] not in excluded_penultimate:
      total += weight
  return total

In [9]:
def score6(text, pos, child=False):
  '''
  Contractible auxiliary score (-be-V+ing).

  Returns 0 unless pos contains two adjacent "VERB" tags. Otherwise counts
  the matches of "'s" or "'re" followed by a single space and a word ending
  in "ing" in text, and returns 6 per match (9 per match when child=True).
  The tag check and the text match are independent: the adjacent VERB tags
  are not required to sit at the position of the matched words.
  '''
  if not re.search('VERB VERB', pos):
    return 0

  # Raw string: "\w" in a plain string literal is an invalid escape sequence.
  auxes = re.findall(r"(('s)|('re)) \w*ing", text)
  weight = 9 if child else 6
  return weight * len(auxes)

In [12]:
def score7(text, pos, child=False):
  '''
  Regular past score.

  Every token tagged "VERB" that ends in "ed" adds 7 (3 when child=True).
  Pattern noted for this feature: NP/Pron - (have) - V+pst - NP/Pron; the
  code only checks the tag and the "ed" ending.
  '''
  tokens = text.split(" ")
  weight = 3 if child else 7

  total = 0
  for position, tag in enumerate(pos.split(" ")):
    if tag == 'VERB' and tokens[position].endswith('ed'):
      total += weight
  return total

In [13]:
# Irregular verb forms consulted by score8, one entry per line of the file.
with open('irregular_verbs.txt') as infile:
  # Strip each line and skip blank ones so a trailing newline or CRLF line
  # endings do not leave '' or 'word\r' entries in the list.
  irregular_verbs = [line.strip() for line in infile.read().split("\n") if line.strip()]

def score8(text, pos, child=False):
  '''
  Irregular past score.

  Every token tagged "VERB" that appears in the module-level list
  irregular_verbs adds 8 (4 when child=True).
  Pattern noted for this feature: NP/pron - V+pst - NP/Pron; the code only
  checks the tag and list membership.
  '''
  tokens = text.split(" ")
  weight = 4 if child else 8

  total = 0
  for position, tag in enumerate(pos.split(" ")):
    if tag == 'VERB' and tokens[position] in irregular_verbs:
      total += weight
  return total

In [14]:
def score9(text, pos, child=False):
  '''
  Long plural score (e.g. "houses").

  Every token tagged "NOUN" that ends in "es" adds 9 (1 when child=True).
  '''
  tokens = text.split(" ")
  weight = 1 if child else 9

  total = 0
  for position, tag in enumerate(pos.split(" ")):
    if tag == 'NOUN' and tokens[position].endswith('es'):
      total += weight
  return total


In [15]:
def score10(text, pos, child=False):
  '''
  Possessive 's score.

  Every token tagged "NOUN" that is not the last token of text and is
  directly followed by an "'s" token whose tag is not "VERB" adds 10
  (7 when child=True).
  Pattern noted for this feature: Det-(Adj)-N+poss-(N); the code only checks
  the noun and the following "'s".
  '''
  tags = pos.split(" ")
  tokens = text.split(" ")
  final_position = len(tokens) - 1
  weight = 7 if child else 10

  total = 0
  for position, tag in enumerate(tags):
    if tag != 'NOUN' or position == final_position:
      continue
    follower = position + 1
    if tokens[follower] == "'s" and tags[follower] != 'VERB':
      total += weight
  return total

In [16]:
def score11(text, pos, child=False):
  '''
  Third person singular score.

  Every token tagged "VERB" adds 11 (8 when child=True) when it ends in "s",
  is scored 0 by both score8 and score7 (i.e. neither counts it as a past
  form), and does not end in "'s".
  Pattern noted for this feature: NP/Pron+sing - V+tns - (Adv); the code
  only checks the verb token itself.
  '''
  tokens = text.split(" ")
  weight = 8 if child else 11

  total = 0
  for position, tag in enumerate(pos.split(" ")):
    if tag != 'VERB':
      continue
    verb = tokens[position]
    if not verb.endswith("s"):
      continue
    past_score = score8(verb, 'VERB') + score7(verb, 'VERB')
    if past_score == 0 and not verb.endswith("'s"):
      total += weight
  return total

In [17]:
def total_score(sentence, pos_sentence, child=False):
  '''
  Sums the scores of score1 .. score11 for one sentence and returns the sum.
  The sentence is lower-cased before it is passed to the scorers;
  pos_sentence and child are passed through unchanged.
  To be averaged over sentences or tokens
  '''

  lowered = sentence.lower()
  scorers = (score1, score2, score3, score4, score5, score6,
             score7, score8, score9, score10, score11)

  return sum(scorer(lowered, pos_sentence, child=child) for scorer in scorers)

In [18]:
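# Sanity checks (disabled: everything below is wrapped in a string literal, so nothing runs).
# Each `sents` list goes with the `sents_scores` / `child_scores` lists right after it.
# The assignments overwrite each other, so with the wrapper removed the final loop
# only checks the last set, against `child_scores` with child=True.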

'''

sents = [("Many teenagers were helping themselves .", "ADT NOUN VERB VERB PRON ."), # 5 8 4 1 / 1 4 2 0
("A lot of actresses ' nieces have toured that art gallery .", "DET NOUN PREP NOUN . NOUN VERB VERB DET NOUN NOUN ."), #  2 9 10 9 7 / 5 1 7 1 3
("It 's herself who Karen criticized .", "DET VERB PRON PRON NOUN VERB ."), # 3 1 7 / 6 0 3
("Rachel was apt to talk to Alicia .", "NOUN VERB ADJ PREP VERB PREP NOUN ."), # 8 / 4
("That adult has brought that purple octopus .", "DET NOUN VERB VERB DET ADJ NOUN"), # 11 8 / 8 4
("Curtis 's boss discussed four sons and Andrew discussed five sick sons .", "NOUN ADT NOUN VERB ADJ NOUN ADT NOUN VERB ADJ ADJ NOUN ."), # 10 7 5 7 5 / 7 3 1 3 1
("Martin did find out what every cashier that should n't drink wore .", "NOUN VERB VERB PREP PRON ADJ NOUN ADT VERB NEG VERB VERB ."), # 8 8
("Edward hid the cats .", "NOUN VERB DET NOUN ."), #  8 2 5 / 4 5 1
("What could Alan discover he has run around .", "PRON VERB NOUN VERB PRON VERB VERB ADV ."), #11 1 / 8
("Those turtles that are boring April could not ever break those couches .", "DET NOUN ADT VERB VERB NOUN VERB NEG ADV VERB DET NOUN ."), # 4 9 9 / 2 1 1
("An actor arrived at at most six lakes .", "DET NOUN VERB PREP PREP ADV ADJ NOUN ."), # 7 2 9 / 5 1 3
("The dress crumples .", "DET NOUN VERB .")] # 2 11 / 5 8

sents_scores = [18, 37, 11, 8, 19, 34, 16, 15, 12, 22, 18, 13]
child_scores = [7, 17, 9, 4, 12, 15, 8, 10, 8, 4, 9, 13]

sents = [("Katherine ca n't help herself .", "NOUN VERB ADT VERB PRON ."), #1 / 0
("Amanda 's respected by some waitresses .", "NOUN VERB VERB PREP ADT NOUN ."), # 3 7 9 / 6 3 0
("A lot of actresses that thought about Alice healed themselves .", "DET NOUN PREP NOUN ADT VERB PREP NOUN VERB PRON ."), # 2 9 8 7 1 / 5 0 4 3 0
("William has declared there to be no guests getting fired .", "NOUN VERB VERB PREP PREP VERB ADT NOUN VERB VERB ."), # 7 7 4 11 5 / 3 3 2 8 1
("Craig explored that grocery store .","NOUN VERB ADT NOUN NOUN ."), # 7 / 3
("Brad passed one big museum and Amanda passed several .", "NOUN VERB ADT ADJ NOUN ADT NOUN VERB ADJ ."), # 7 7 / 3 3
("Joel discovered the vase that Patricia might take .", "NOUN VERB DET NOUN DET NOUN VERB VERB ."), # 7 2 / 3 5
("The forgotten newspaper article was bad .","DET ADJ NOUN NOUN VERB ADJ ."), # 8 2 / 4 5
("Who has Colleen aggravated before kissing Judy ?", "PRON VERB NOUN VERB PREP VERB NOUN ."), #11 7 4 / 8 3 2
("Should Monica ever grin ?", "VERB NOUN ADT VERB ."), # 0 / 0
("There are n't many lights darkening ?", "PREP VERB NEG ADJ NOUN VERB ."), # 4 5 / 2 1
("A sketch of lights does n't appear .", "DET NOUN PREP NOUN VERB NEG VERB .")] # 2 5 11 / 5 1 8

sents_scores = [1, 19, 27, 34, 7, 14, 9, 10, 22, 0, 9, 18]
child_scores = [0, 11, 12, 17, 3, 6, 8, 9, 13, 0, 3, 14]

sents = [("Jeffrey 's sons are insulted by Tina 's supervisor .", "NOUN ADT NOUN VERB VERB PREP NOUN ADT NOUN ."), # 10 5 7 10 / 7 1 3 7
("Nancy could say every guy hides himself .", "NOUN VERB VERB ADJ NOUN VERB PRON ."), # 1 11 / 0 8
("There was bound to be a fish escaping .", "PREP VERB VERB PREP VERB DET NOUN VERB"), # 8 8 4 2 / 4 4 2 5
("Those ladies walk through those oases .", "DET NOUN VERB PREP DET NOUN ."), # 9 9 / 1 1
("Bruce knows that person that Dawn likes that argued about a lot of guys .", "NOUN VERB DET NOUN ADT NOUN VERB ADT VERB PREP DET NOUN PREP NOUN ."), # 11 11 7 2 5 / 8 8 3 5 1
("Who have many women 's touring Spain embarrassed .", "PRON VERB ADJ NOUN ADT VERB NOUN VERB ."), # 10 4 7 / 7 2 3
("Even these trucks have often slowed .", "ADV DET NOUN VERB ADV VERB ."), # 5 7 / 1 3
("Each book 's there disturbing Margaret .", "ADJ NOUN VERB ADT VERB NOUN ."), # 6 4 / 9 2
("This goose is n't bothering him .", "DET NOUN VERB NEG VERB NOUN .")] # 11 4 / 8 2


sents_scores = [32, 12, 22, 18, 36, 21, 12, 10, 15]
child_scores = [18, 8, 15, 2, 25, 12, 4, 11, 10 ]

for (sent, pos), score in zip(sents, child_scores):
  calc = total_score(sent, pos, child=True)
  if score != calc:
    print(sent)
    print(score)
    print(calc)

'''

'\n\nsents = [("Many teenagers were helping themselves .", "ADT NOUN VERB VERB PRON ."), # 5 8 4 1 / 1 4 2 0\n("A lot of actresses \' nieces have toured that art gallery .", "DET NOUN PREP NOUN . NOUN VERB VERB DET NOUN NOUN ."), #  2 9 10 9 7 / 5 1 7 1 3\n("It \'s herself who Karen criticized .", "DET VERB PRON PRON NOUN VERB ."), # 3 1 7 / 6 0 3\n("Rachel was apt to talk to Alicia .", "NOUN VERB ADJ PREP VERB PREP NOUN ."), # 8 / 4\n("That adult has brought that purple octopus .", "DET NOUN VERB VERB DET ADJ NOUN"), # 11 8 / 8 4\n("Curtis \'s boss discussed four sons and Andrew discussed five sick sons .", "NOUN ADT NOUN VERB ADJ NOUN ADT NOUN VERB ADJ ADJ NOUN ."), # 10 7 5 7 5 / 7 3 1 3 1\n("Martin did find out what every cashier that should n\'t drink wore .", "NOUN VERB VERB PREP PRON ADJ NOUN ADT VERB NEG VERB VERB ."), # 8 8\n("Edward hid the cats .", "NOUN VERB DET NOUN ."), #  8 2 5 / 4 5 1\n("What could Alan discover he has run around .", "PRON VERB NOUN VERB PRON VERB VERB 

In [19]:
def diff_score(sequence, pos, sent_tokenizer, child=False):
  '''
  Calculates difficulty score for a sequence given sequence, pos and sentence_tokenizer trained on full dataset
  Returns sentence averaged difficulty score and token averaged difficulty score
  child=True if FLA

  sequence is split into sentences with sent_tokenizer.tokenize; the
  space-separated tags in pos are then cut into consecutive chunks, one per
  sentence, each as long as that sentence's space-separated token count.
  A sentence whose chunk has a different length is printed and contributes
  nothing to the total, but still counts in both denominators.
  Returns (0.0, 0.0) when the tokenizer yields no sentences.
  '''

  sentences = sent_tokenizer.tokenize(sequence)
  if not sentences:
    # Nothing to average over (e.g. the tokenizer returned no sentence for a
    # whitespace-only sequence); avoid dividing by zero below.
    return 0.0, 0.0

  pos_tags = pos.split(" ")

  total = 0
  start = 0
  for sent in sentences:
    n_tokens = len(sent.split(" "))
    # chunk of tags covering this sentence's tokens
    sent_pos = " ".join(pos_tags[start:start + n_tokens])
    start += n_tokens

    if n_tokens == len(sent_pos.split(" ")):
      total += total_score(sent, sent_pos, child=child)
    else:
      # tags ran out or are misaligned: report the sentence and skip it
      print(sent, sent_pos)

  diff_sent = total / len(sentences)
  diff_tokens = total / len(sequence.split(" "))

  return diff_sent, diff_tokens

In [31]:
def scoring(filepath, child=False):
  '''
  Takes file with sequences and scores sequences.
  Returns dataframe with sequence, pos, sentence averaged difficulty score, token averaged difficulty score
  child=True if FLA

  Each line of the file is split on tabs; lines with at least two fields
  contribute field 0 as the sequence and field 1 as its pos tags, other
  lines are ignored. An empty sequence gets a score of 0 for both averages.
  The dataframe is also written to 'scored_sequences.csv'.
  '''

  with open(filepath) as infile:
    full = infile.read()

  # NOTE(review): the tokenizer is built from the whole file content, which
  # includes the tab-separated pos column as well as the sequences.
  sent_tokenizer = PunktSentenceTokenizer(full)

  rows = [line.split("\t") for line in full.split("\n")]
  rows = [fields for fields in rows if len(fields) > 1]
  sequences = [fields[0] for fields in rows]
  posses = [fields[1] for fields in rows]

  sent_diff_scores = []
  tokens_diff_scores = []
  for seq, seq_pos in zip(sequences, posses):
    if seq == "":
      diff_sent, diff_tokens = 0, 0
    else:
      diff_sent, diff_tokens = diff_score(seq, seq_pos, sent_tokenizer, child=child)
    sent_diff_scores.append(diff_sent)
    tokens_diff_scores.append(diff_tokens)

  # one column per list
  columns = {'sequence' : sequences,
             'pos' : posses,
             'sentence_score': sent_diff_scores,
             'tokens_score': tokens_diff_scores}

  combined_df = pd.DataFrame(columns)
  combined_df.to_csv('scored_sequences.csv')

  return combined_df

In [21]:
def cleanup(ranking):
  """
  Turns tokenized sentences back into regular sentences

  For every entry of ranking['sequence'], removes the space in front of a
  punctuation character and then the space in front of "n't". The column is
  replaced on the dataframe that was passed in, and that same dataframe is
  returned.
  """

  space_before_punct = r" ([!\"#\$%&\'\(\)\*\+,-\./:;<=>\?@\[\\\]\^_`{\|}~])"
  space_before_nt = r" (n't)"

  ranking['sequence'] = [
    re.sub(space_before_nt, r"\1", re.sub(space_before_punct, r"\1", seq))
    for seq in ranking['sequence']
  ]
  return ranking

In [22]:
def ranking(filepath, score='sentence_score', ascending=True, child=False):
  """
  Takes scored sequences, ranks them and saves ranked sequences to file.

  Reads the CSV at filepath, drops rows with missing values, sorts by the
  column named by score and passes the result through cleanup.
  score='tokens_score' if token averaged
  ascending=False if reversed
  child=True if FLA (only affects the output file name)

  The output file is 'ranked_sequences' + '_reversed' (if not ascending)
  + '_token' (if score is 'tokens_score') + '_child' (if child) + '.csv'.
  Returns the ranked dataframe.
  """

  scored = pd.read_csv(filepath).dropna()
  ranked = cleanup(scored.sort_values(by=[score], ascending=ascending))

  suffixes = []
  if not ascending:
    suffixes.append("_reversed")
  if score == 'tokens_score':
    suffixes.append("_token")
  if child:
    suffixes.append("_child")

  writefile = 'ranked_sequences' + "".join(suffixes) + ".csv"
  ranked.to_csv(writefile)
  return ranked

In [23]:
def random_curriculum(filepath, random_state=None):
  """
  Takes ranked sequences as input, randomly shuffles and saves to file

  Reads the CSV at filepath, drops rows with missing values, shuffles all
  rows, passes the result through cleanup and writes it to
  'random_curriculum.csv'. Returns the shuffled dataframe.

  random_state is forwarded to DataFrame.sample; pass a fixed value to get
  the same shuffle on every run. The default (None) keeps the shuffle
  unseeded.
  """

  scored = pd.read_csv(filepath).dropna()
  shuffled = cleanup(scored.sample(frac=1, random_state=random_state))

  writefile = 'random_curriculum.csv'
  shuffled.to_csv(writefile)
  return shuffled

In [24]:
def filtered_curriculum(filepath):
  '''
  Takes ranked sequences and deletes any with a difficulty score of 0, saves to file

  Reads the CSV at filepath, keeps the rows whose 'sentence_score' is not 0
  and writes them next to the input file, with "filtered_" prefixed to the
  file name. Returns the filtered dataframe.
  '''
  import os

  sequences = pd.read_csv(filepath)
  filtered = sequences.loc[sequences['sentence_score'] != 0]

  # Prefix the file name only, so a filepath with a directory part
  # ("data/ranked.csv") becomes "data/filtered_ranked.csv".
  directory, filename = os.path.split(filepath)
  newpath = os.path.join(directory, "filtered_" + filename)
  filtered.to_csv(newpath)

  return filtered

In [34]:
# # pipeline

# scoring('clean_pred3')
# curriculum = ranking('scored_sequences.csv')
# reversed = ranking('ranked_sequences.csv', ascending=False)
# token = ranking('ranked_sequences.csv', score='tokens_score')
# random = random_curriculum('ranked_sequences.csv')
# filtered = filtered_curriculum('ranked_sequences.csv')

In [35]:
# # fla pipeline

# scoring('clean_pred3', child=True)
# child = ranking('scored_sequences.csv', score='tokens_score', child=True)
# child = ranking('scored_sequences.csv', child=True)
# filtered = filtered_curriculum('ranked_sequences_child.csv')
# reversed = ranking('ranked_sequences_child.csv', score='tokens_score', ascending=False)