### Global variables sunmmary  

**hints** = original scraped hints with latex and text  
**only_math_hints** = list of only latex expressions (no $$)  

**hint_to_file** = latex hints partitioned into N directories for easier manual     translation process  
**hint_chunks** = same as hint_to_file but here there are subfolders with eg 300 expressions per file since MathCAT can only handle that amount  



### Read hints and extract expessions

In [2]:
# imports
import os
import copy
import re

In [7]:
# global parameters 
CHUNK_SIZE = 2000
INNER_CHUNK_SIZE = 300
STOP_WORD = " nyradhej "

#### Pre Processing

In [None]:
# writes all hint to scapedHints.txt
%run math-to-speech/scripts/hintScraper.py

In [8]:
scraped_hints_path = '../translation-text-files/pre-translation/scraped-hints/scrapedHints.txt'
scraped_pre_proc_path = '../translation-text-files/pre-translation/scraped-hints/scrapedHintsProc.txt'

In [9]:
def replace_words_in_file(read_path, write_path, words_to_remove, words_replacement):
    if len(words_to_remove) != len(words_replacement):
        raise ValueError("words_to_remove and words_replacement must have the same length.")
    
    # Open the file in read mode
    with open(read_path, 'r', encoding='utf-8') as file:
        # Read the contents of the file
        file_contents = file.read()
    
    # Replace each word in words_to_remove with the corresponding word in words_replacement
    for word_to_remove, word_replacement in zip(words_to_remove, words_replacement):
        # Create a regular expression pattern to match the word to remove
        word_boundary = r'\b' + re.escape(word_to_remove) + r'\b'
        # Replace the word
        file_contents = re.sub(word_boundary, word_replacement, file_contents)
    
    # Open the file in write mode and write the updated contents back to the file
    with open(write_path, 'w', encoding='utf-8') as file:
        file.write(file_contents)
    print(f"{words_to_remove} have been replaced by {words_replacement}")


words_to_remove = ['@', ';', '___']
words_replacement = ['', ':', 'blank']
replace_words_in_file(scraped_hints_path, scraped_pre_proc_path, words_to_remove, words_replacement)

['@', ';', '___'] have been replaced by ['', ':', 'blank']


In [10]:
with open(scraped_pre_proc_path, 'r', encoding='utf-8') as file:
    hints = file.readlines()
print(len(hints))

35896


In [11]:
only_math_hints = []
for hint in hints:
    strings_between_dollars = re.findall(r'\$\$(.*?)\$\$', hint.replace('\n', ''))

    if strings_between_dollars == -1: # no math in this hint
        only_math_hints.append(' ')
    else:
        strings_between_dollars
        only_math_hints.append('@'.join(strings_between_dollars))

print(len(only_math_hints), only_math_hints)

35896 ['', '4@x', '4', '', '', '-2y@4y^2', '-2y@3y', '-2y', '', '', 'p@x', 'p', '', '', 'p@x', 'p', '', '', 'p', 'p', '', '', '4x^2@2x^2', '4x^2@-3x', '4x^2', '', '', '4x@3x^2', '4x@-5x', '4x', '', '', '4', '4', '', '', '6@b', '6', '', '', '-3', '-3', '', '', '-5@p', '-5', '', '', '5@x', '5', '', '', '2@x', '2', '', '', '7@y', '7', '', '', '-3', '-3', '', '', '-8', '-8', '', '', '', '', '', '', '', '', '', '', '-b@b', '-b', '', '', '-y@y', '-y', '', '', '-x@x', '-x', '', '', '-p@p', '-p', '', '', '3@y', '3', '', '', '6r@4r', '6r', '', '', 'y@y', 'y', '', '', 'x@x', 'x', '', '', 'd@d', 'd', '', '', '7x@2x', '7x@y', '', '', '5x@x', '5x@4y', '', '', '2p@6p', '2p@r', '', '', '', '', '', '', 'n', 'n+1', 'n+2', '', '', '', '', '11@3', '', '11=3+2b', '', '', '8=2b', '2', '', '', 'n-6=13', '', 'n', '', '', '', '', '', '120=\\frac{2}{3} n', '', '\\frac{120}{\\frac{2}{3}}', '120\\frac{3}{2}', '', '', '', '', '', '', '', '\\frac{16}{\\frac{2}{5}}', '16\\frac{5}{2}', '', '', '', '3', '', '7=3+2n',

### Partition into chunks

In [12]:
def partition_list(only_math_hints):
    """ Partitions to make manual translation easier. Returns list of shape (len(directories), len(files in directory), INNER_CHUNK_SIZE))"""
    all_elements = len(only_math_hints)

    n_lists = all_elements // CHUNK_SIZE
    remaining_elements = all_elements % CHUNK_SIZE

    hint_lists = [only_math_hints[i*CHUNK_SIZE:(i+1)*CHUNK_SIZE] for i in range(n_lists)]  # partition all but last elements
    if remaining_elements > 0:
        hint_lists.append(only_math_hints[n_lists*CHUNK_SIZE:])    # add last of all_elements

    # Print the number of hint_lists and their lengths
    print(f"Number of hint_lists (directories): {len(hint_lists)}")

    hint_to_file = []
    for i, hint_list in enumerate(hint_lists):
        elements = len(hint_list)
        # INNER_CHUNK_SIZE = 300

        n_sub_lists = elements // INNER_CHUNK_SIZE
        remaining_elements = elements % INNER_CHUNK_SIZE

        hint_lists_final = [hint_list[i*INNER_CHUNK_SIZE:(i+1)*INNER_CHUNK_SIZE] for i in range(n_sub_lists)]  # partition all but last elements
        if remaining_elements > 0:
            hint_lists_final.append(hint_list[n_sub_lists*INNER_CHUNK_SIZE:])    # add last elements

        hint_to_file.append(hint_lists_final)

    print(f"Number of files in each directory: {len(hint_lists_final)}")
    
    return hint_to_file, hint_lists

hint_to_file, hint_chunks = partition_list(only_math_hints)

Number of hint_lists (directories): 18
Number of files in each directory: 7


#### Make the files for the text chunks

In [13]:
# makes the directories for the translation files

for i in range(len(hint_to_file)): # 0-18
    directory_name = f"translation-text-files/pre-translation/conversion-math/math-conversion{i}"
    try: 
        os.mkdir(directory_name) 
    except:
        print(f"Throw an error if you have already created this folder because you should not overwrite the first creation \nDelete the conversion-math directory if you intend to rewrite it")
        break

    for c, hint_chunk in enumerate(hint_to_file[i]):
        filename = f"{directory_name}/onlyMath{i}-{c}Pre.txt"
        
        # Pre files where parsed latex is written
        with open(filename, 'w',  encoding='utf-8') as file:
            for math_hint in hint_chunk:
                file.write(math_hint + ';')
            file.write(STOP_WORD)

        # Post files where MathCat tanslation will be written
        empty_filename = f"{directory_name}/onlyMath{i}-{c}Post.txt"
        with open(empty_filename, 'w',  encoding='utf-8') as empty_file:
            pass
    

Throw an error if you have already created this folder because you should not overwrite the first creation 
Delete the conversion-math directory if you intend to rewrite it


### After you have pasted MathCAT translations into the math-conversion directory

Change folder_index depending on what folder you are compiling. For all folders user len(hint_to_file)

In [14]:
# change if you want to translate another number of directories
first_directory = 0
last_directory = len(hint_to_file) 

In [15]:
# run after you have pasted the chunks in folder i from MathCAT

def collect_translations(folder_index, write_filepath):
    """For dir i collect all translation files into one file, eg post-0.txt"""

    if os.path.isfile(write_filepath):
      print(f"File in {write_filepath} already exists, delete it first if you want to rewrite it") # to not write over a file we want to keep
    else:
      with open(write_filepath, 'w',  encoding='utf-8') as empty_file:
        pass

      with open(write_filepath, 'a',  encoding='utf-8') as write_file:              # opens file to append 
        for c in range(len(hint_to_file[folder_index])):                                                    # for text-chunk (c) in dir i                  
          read_filepath = f"{directory_name}/onlyMath{folder_index}-{c}Post.txt"              

          with open(read_filepath, 'r',  encoding='utf-8') as read_file:          # read from subfile
              read_info = read_file.readlines()
              write_file.writelines(read_info)
        
      print(f"File in {write_filepath} have been created with all converted latex")


def mathCAT_to_list(post_filepath):
  """ Revert parsing with @ and ; as list of lists"""
  between_hints = 'semicolon'
  between_expressions = 'at sign'

  with open(post_filepath, 'r') as file:
      math = file.readline()

  math = math.split(between_hints)
  math = [hint.split(between_expressions) for hint in math]
  return math


def get_indexes(folder_index, hint_chunks):
  """ Get hint indexes (row indexes) for current directory"""
  start = CHUNK_SIZE * folder_index
  stop = start + len(hint_chunks[folder_index])
  return (start, stop)
    

def put_math_in_hints(indexes, math):
  """ Insert converted math back into the full hints"""
  hints_done = copy.deepcopy(hints)
  start, stop = indexes
  j = 0
  try:
    for i in range(start, stop):          # for each INNER_CHUNK
        math_replacements = math[j]

        hints_done[i] = hints_done[i].replace('\\n', '')  # sometimes \n was written in the hint, removes it.

        for replacement in math_replacements:
            hints_done[i]  = re.sub(r'\$\$.*?\$\$', replacement, hints_done[i], count=1)
          
        j +=1
  except Exception as e:
    # Throws bad escape error at \\ left in math, corrects to blank
    print(f"\nERROR at {i} for String: {hints_done[i]} \nMath:{replacement}")

    bad_replacement = replacement
    for rep in math_replacements:
      if rep == bad_replacement:
        correction = " "
        hints_done[i]  = re.sub(r'\$\$.*?\$\$', correction, hints_done[i], count=1)
      else:
        hints_done[i]  = re.sub(r'\$\$.*?\$\$', rep, hints_done[i], count=1)
    
  translated_dir = hints_done[start:stop]
  return translated_dir


def check_correctness(indexes, original_hints, translated_hints, post_filepath):
  """ Let you at the first and last translation of directory to see hint has been put back together correctly"""
  start, stop = indexes
  stop_translated = stop - start

  print(f"\nHints {start} to {stop} in {post_filepath[50:]}__________________________________________________________________________________")
  print(f"Check first correctness:\nOriginal:\t{original_hints[start]}\nTranslated:\t{translated_hints[0]}")
  print(f"Check last correctness:\nOriginal:\t{original_hints[stop-1]}\nTranslated:\t{translated_hints[stop_translated-1]}")
    

def write_final_product(translated_hints, finished_file_path):

  with open(finished_file_path, 'w', encoding='utf-8') as file:
    for translated_hint in translated_hints:
      for hint in translated_hint: 
          file.write(hint)


def post_filename(folder_index):
  """ Generates filepath where all files in eg math-conversion0  will be combined"""
  directory_name = f"../translation-text-files/pre-translation/conversion-math/math-conversion{folder_index}"
  collective_filename = f"post-{folder_index}"
  write_filepath = f"{directory_name}/{collective_filename}.txt"
  return write_filepath



In [19]:

def translation_process(first_directory, last_directory):
  """ After pasteing all MathCAT text run this to have final product of translation written to file translatedHints.txt"""
  translated_hints = []

  for folder_index in range(first_directory, last_directory): 

    # Collect all translations from MathCAT
    post_filepath = post_filename(folder_index)
    collect_translations(folder_index, post_filepath)

    # Put translated latex back in full hints
    math = mathCAT_to_list(post_filepath)
    indexes = get_indexes(folder_index, hint_chunks)
    translated_dir = put_math_in_hints(indexes, math)
    translated_hints.append(translated_dir)

    # Check correctness and save
    check_correctness(indexes, hints, translated_hints[folder_index], post_filepath)
  write_final_product(translated_hints, finished_file_path)

finished_file_path = "../finished-translations/translatedHints.txt"
translation_process(first_directory, last_directory)

File in ../translation-text-files/pre-translation/conversion-math/math-conversion0/post-0.txt already exists, delete it first if you want to rewrite it

Hints 0 to 2000 in on-math/math-conversion0/post-0.txt__________________________________________________________________________________
Check first correctness:
Original:	Multiply the outside value with each of the inside parenthesis values

Translated:	Multiply the outside value with each of the inside parenthesis values

Check last correctness:
Original:	What number to the power of $$3$$ is -512?

Translated:	What number to the power of  3  is -512?

File in ../translation-text-files/pre-translation/conversion-math/math-conversion1/post-1.txt already exists, delete it first if you want to rewrite it

Hints 2000 to 4000 in on-math/math-conversion1/post-1.txt__________________________________________________________________________________
Check first correctness:
Original:	$${\left(-8\right)}^3=-512$$, so our answer is $$-8$$.

Trans

### Post processing

In [26]:
def format_latex_for_json(hints):
    """ To render latex expression they are wrapped in $$ and @ is used to split them apart for the same hint"""
    dollar_math = []
    for hint in hints:
        strings_and_dollars = re.findall(r'\$\$(.*?)\$\$', hint.replace('\n', ''))
        if strings_and_dollars == -1: # no math in this hint
            dollar_math.append(' ')
        else:
            dollar_math.append('@'.join(['$$' + string + '$$' for string in strings_and_dollars]))
    
    # save
    math_filename =  "../finished-translations/math.txt"
    with open(math_filename, 'w', encoding='utf-8') as file:
        for math_line in dollar_math:
            file.write(math_line + '\n')


def remove_words_from_file(read_file, write_file, words_to_remove, symbols_to_remove):
    """ For more user friendly spoken version annoying/abundant words are removed"""

    with open(read_file, 'r', encoding='utf-8') as file:
        file_contents = file.read()
    
    # remove all occurances of words from words_to_remove
    word_boundary = r'\b(?:' + '|'.join(map(re.escape, words_to_remove)) + r')\b'
    updated_contents = re.sub(word_boundary, '', file_contents)

    # remove all symbols from symbols_to_remove
    for symbol in symbols_to_remove:
        updated_contents = updated_contents.replace(symbol, '')
    
    # remove stop word (even though it can be )
    stop_word = re.escape(STOP_WORD)
    stop_word = r'(' + stop_word + r')(.)?'
    updated_contents = re.sub(stop_word, '', updated_contents)

    # save 
    with open(write_file, 'w',encoding='utf-8') as file:
        file.write(updated_contents)

# post processing here
write_path = "../finished-translations/translatedHintPost.txt" 
words_to_remove = ['cap', 'open paren', 'close paren', 'end fraction']
symbols_to_remove = [';']
remove_words_from_file(finished_file_path, write_path, words_to_remove, symbols_to_remove)
format_latex_for_json(hints)

In [None]:
# %run math-to-speech/scripts/mtsWriter.py # writes translated hints to content-source 

### Self paced version
Instead of speech: ['This is a hint with number 4 and 5']  
We get: ['This is a hint with number 4, 'and 5']  
To have a paced UI when the VA speeks

Only thing left to solve is the post processing... should be done before parsing hints back together 

In [176]:
# before putting the math in the hints
# where math is extracted in list 

def self_paced_parsing():
    """ Instead of saying full hint string it is parsed for each expression.
    eg. 'What is $$5$$ times $$2$$?' -> ['What is ,' times ', '?'] and ['5','2']
    """
    only_math_hints = []
    only_text_hints = []
    for hint in hints:
        strings_not_between_dollars = re.split(r'\$\$.*?\$\$', hint.replace('\n', ''))#re.findall(r'(?<!\$)\$(.*?)(?<!\$)\$', hint.replace('\n', ''))
        only_text_hints.append(strings_not_between_dollars)
        strings_between_dollars = re.findall(r'\$\$(.*?)\$\$', hint.replace('\n', ''))

        if strings_between_dollars == -1: # if no $$ found
            only_math_hints.append(' ')
        else:
            only_math_hints.append('@'.join(strings_between_dollars))

    return only_math_hints, only_text_hints


def paced_math_in_hints(indexes, math):
  """ Insert converted math back into the full paced hints"""
  hints_done = copy.deepcopy(only_text_hints)
  start, stop = indexes
  j = 0    # index for math

  for i in range(start, stop):          # for each INNER_CHUNK
      math_replacements = math[j]
      for r, expression in enumerate(math_replacements):
            # [ [text + math], [text + math]]
          hints_done[i][r]  = f"{hints_done[i][r]} {expression}"   
      # remove ['.'] occurances
      if '.' in hints_done[i]: hints_done[i].remove('.')
      j +=1

  translated_dir = hints_done[start:stop]
  return translated_dir

def paced_write_final_product(translated_hints, finished_file_path):

  with open(finished_file_path, 'w', encoding='utf-8') as file:
    for translated_hint in translated_hints:
      for hint in translated_hint: 
          file.write(hint.join()) # can't join on @ since varaiabilization uses that... 


def paced_translation_process(first_directory, last_directory):
  """ After pasteing all MathCAT text run this to have final product of translation written to file translatedPacedHints.txt"""
  translated_hints = []

  for folder_index in range(first_directory, last_directory): 
    post_filepath = post_filename(folder_index)

    collect_translations(folder_index, post_filepath)
    math = mathCAT_to_list(post_filepath)
    indexes = get_indexes(folder_index, hint_chunks)
    # here paced changes are implemented
    translated_hints.append(paced_math_in_hints(indexes, math))

    check_correctness(indexes, hints, translated_hints[folder_index], post_filepath)
  # paced_write_final_product(translated_hints, finished_file_path)

finished_file_path = "../finished-translations/translatedPacedHints.txt"
only_math_hints, only_text_hints = self_paced_parsing()
paced_translation_process(first_directory, last_directory)

File in ../translation-text-files/pre-translation/conversion-math/math-conversion0/post-0.txt already exists, delete it first if you want to rewrite it

Hints 0 to 2000 in __________________________________________________________________________________
Check first correctness:
Original:	Multiply the outside value with each of the inside parenthesis values

Translated:	['Multiply the outside value with each of the inside parenthesis values ']
Check last correctness:
Original:	What number to the power of $$3$$ is -512?

Translated:	['What number to the power of   3 ', ' is -512?']
File in ../translation-text-files/pre-translation/conversion-math/math-conversion1/post-1.txt already exists, delete it first if you want to rewrite it

Hints 2000 to 4000 in __________________________________________________________________________________
Check first correctness:
Original:	$${\left(-8\right)}^3=-512$$, so our answer is $$-8$$.

Translated:	[' open paren negative 8 close paren cubed; is equa