In [None]:
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
import re
import nltk
import pandas as pd
from nltk.tokenize import PunktSentenceTokenizer
import json
from scoring_utils import diff_score


In [None]:
def scoring(filepath, child=False):
  '''
  Score every sequence in a tab-separated file and save the result as CSV.

  Each input line is expected to look like ``sequence<TAB>pos``. Lines without
  a tab are skipped and any fields after the second one are ignored.

  Args:
    filepath: path to the file holding one ``sequence<TAB>pos`` pair per line.
    child: forwarded to ``diff_score`` (True if FLA); also selects the
      ``_child`` suffix of the output file name.

  Returns:
    DataFrame with the columns ``sequence``, ``pos``, ``sentence_score``
    (sentence averaged difficulty score) and ``tokens_score`` (token averaged
    difficulty score).

  Side effects:
    Writes the frame to ``scored_sequences.csv`` (or
    ``scored_sequences_child.csv`` when ``child`` is True) in the current
    working directory.
  '''

  with open(filepath) as infile:
    full = infile.read()

  # The sentence tokenizer is built from the complete text of the input file.
  sent_tokenizer = PunktSentenceTokenizer(full)

  sequences = []
  posses = []
  for line in full.split("\n"):
    fields = line.split("\t")
    if len(fields) > 1:
      sequences.append(fields[0])
      posses.append(fields[1])

  sent_diff_scores = []
  tokens_diff_scores = []
  for seq, pos in zip(sequences, posses):
    if seq == "":
      # An empty sequence is not passed to diff_score; it gets 0 on both scales.
      diff_sent, diff_tokens = 0, 0
    else:
      diff_sent, diff_tokens = diff_score(seq, pos, sent_tokenizer, child=child)
    sent_diff_scores.append(diff_sent)
    tokens_diff_scores.append(diff_tokens)

  # Column name -> list of per-sequence values (avoids shadowing builtin `dict`).
  combined_df = pd.DataFrame({'sequence': sequences,
                              'pos': posses,
                              'sentence_score': sent_diff_scores,
                              'tokens_score': tokens_diff_scores})

  name = 'scored_sequences'
  if child:
    # Underscore keeps the file name in line with the `_child` suffix that
    # ranking() uses and with the path the pipeline reads back.
    name += '_child'
  combined_df.to_csv(name + '.csv')

  return combined_df

In [None]:
def cleanup(ranking):
  """
  Re-attach tokenized punctuation and "n't" to the preceding word.

  Every string in the 'sequence' column has the single space in front of a
  punctuation character removed, followed by the space in front of "n't".
  The column is overwritten on the frame that was passed in, and that same
  frame is returned.
  """

  space_before_punct = re.compile(r" ([!\"#\$%&\'\(\)\*\+,-\./:;<=>\?@\[\\\]\^_`{\|}~])")
  space_before_nt = re.compile(r" (n't)")

  ranking['sequence'] = [
    space_before_nt.sub(r"\1", space_before_punct.sub(r"\1", text))
    for text in ranking['sequence']
  ]
  return ranking

In [None]:
def ranking(filepath, score='sentence_score', ascending=True, child=False):
  """
  Rank scored sequences by a difficulty column and save them to a CSV file.

  Args:
    filepath: CSV file with scored sequences (must contain a 'sequence'
      column and the column named by ``score``).
    score: column to sort by. Use 'tokens_score' for the token averaged
      score; the spelling 'token_score' is accepted as an alias when the
      file has no column of that exact name.
    ascending: sort direction; False if reversed.
    child: True if FLA; only affects the output file name.

  Returns:
    The sorted DataFrame (rows with missing values dropped, 'sequence'
    column passed through ``cleanup``).

  Side effects:
    Writes ``ranked_sequences[_reversed][_token][_child].csv`` to the current
    working directory.
  """

  scored = pd.read_csv(filepath).dropna()

  # The scored files name the token averaged column 'tokens_score'; map the
  # 'token_score' spelling onto it instead of failing with a KeyError.
  column = score
  if column == 'token_score' and column not in scored.columns:
    column = 'tokens_score'

  ranked = cleanup(scored.sort_values(by=[column], ascending=ascending))

  writefile = 'ranked_sequences'
  if not ascending:
    writefile += "_reversed"
  if column == 'tokens_score':
    writefile += "_token"
  if child:
    writefile += "_child"
  writefile += ".csv"

  ranked.to_csv(writefile)
  return ranked

In [None]:
def random_curriculum(filepath, seed=None):
  """
  Shuffle the sequences of a CSV file into random order and save them.

  Args:
    filepath: CSV file with sequences (must contain a 'sequence' column).
    seed: optional value passed to ``DataFrame.sample`` as ``random_state``
      so the shuffle can be reproduced. The default None keeps the shuffle
      unseeded, i.e. different on every call.

  Returns:
    The shuffled DataFrame (rows with missing values dropped, 'sequence'
    column passed through ``cleanup``).

  Side effects:
    Writes ``random_curriculum.csv`` to the current working directory.
  """

  scored = pd.read_csv(filepath).dropna()

  # frac=1 samples every row exactly once, which amounts to a full shuffle.
  shuffled = cleanup(scored.sample(frac=1, random_state=seed))

  shuffled.to_csv('random_curriculum.csv')
  return shuffled

In [None]:

def filtered_curriculum(filepath):
  '''
  Drop every sequence whose sentence score is 0 and save the remainder.

  Reads the CSV at ``filepath``, keeps the rows whose 'sentence_score' differs
  from 0, writes them to "filtered_ranked_sequences.csv" and returns them.
  '''

  all_rows = pd.read_csv(filepath)
  has_score = all_rows['sentence_score'] != 0
  kept = all_rows.loc[has_score]

  kept.to_csv("filtered_ranked_sequences.csv")
  return kept

In [None]:
# pipeline
#
# NOTE(review): scoring() and ranking() save their CSV files in the current
# working directory, while the calls below read them from data/ — confirm the
# intermediate files actually end up there before running this cell.
# NOTE(review): `reversed` and `random` shadow a builtin / stdlib module name;
# kept as-is in case other cells refer to them.

scoring('data/prep_combined.train')
curriculum = ranking('data/scored_sequences.csv')
reversed = ranking('data/scored_sequences.csv', ascending=False)
# The token averaged column written by scoring() is called 'tokens_score'.
token = ranking('data/scored_sequences.csv', score='tokens_score')
random = random_curriculum('data/ranked_sequences.csv')
filtered = filtered_curriculum('data/ranked_sequences.csv')

scoring('data/prep_combined.train', child=True)
child = ranking('data/scored_sequences_child.csv', child=True)