### Read hints and extract expessions

In [2]:
# imports
import os
import copy
import re

In [None]:
# global parameters 
CHUNK_SIZE = 2000
INNER_CHUNK_SIZE = 300
STOP_WORD = " nyradhej "

#### Pre Processing

In [None]:
# writes all hint to scapedHints.txt
%run math-to-speech/scripts/hintScraper.py

In [9]:
scraped_hints_path = '../translation-text-files/pre-translation/scraped-hints/scrapedHints.txt'
scraped_pre_proc_path = '../translation-text-files/pre-translation/scraped-hints/scrapedHintsProc.txt'

In [12]:
def replace_words_in_file(read_path, write_path, words_to_remove, words_replacement):
    if len(words_to_remove) != len(words_replacement):
        raise ValueError("words_to_remove and words_replacement must have the same length.")
    
    # Open the file in read mode
    with open(read_path, 'r', encoding='utf-8') as file:
        # Read the contents of the file
        file_contents = file.read()
    
    # Replace each word in words_to_remove with the corresponding word in words_replacement
    for word_to_remove, word_replacement in zip(words_to_remove, words_replacement):
        # Create a regular expression pattern to match the word to remove
        word_boundary = r'\b' + re.escape(word_to_remove) + r'\b'
        # Replace the word
        file_contents = re.sub(word_boundary, word_replacement, file_contents)
    
    # Open the file in write mode and write the updated contents back to the file
    with open(write_path, 'w', encoding='utf-8') as file:
        file.write(file_contents)


words_to_remove = ['@', ';', '___']
words_replacement = ['', ':', 'blank']
replace_words_in_file(scraped_hints_path, scraped_pre_proc_path, words_to_remove, words_replacement)

In [13]:
with open(scraped_pre_proc_path, 'r', encoding='utf-8') as file:
    hints = file.readlines()
file.close()
print(len(hints))

35896


In [14]:
only_math_hints = []
for hint in hints:
    strings_between_dollars = re.findall(r'\$\$(.*?)\$\$', hint.replace('\n', ''))

    if strings_between_dollars == -1: # no math in this hint
        only_math_hints.append(' ')
    else:
        strings_between_dollars
        only_math_hints.append('@'.join(strings_between_dollars))

print(len(only_math_hints), only_math_hints)

35896 ['', '4@x', '4', '', '', '-2y@4y^2', '-2y@3y', '-2y', '', '', 'p@x', 'p', '', '', 'p@x', 'p', '', '', 'p', 'p', '', '', '4x^2@2x^2', '4x^2@-3x', '4x^2', '', '', '4x@3x^2', '4x@-5x', '4x', '', '', '4', '4', '', '', '6@b', '6', '', '', '-3', '-3', '', '', '-5@p', '-5', '', '', '5@x', '5', '', '', '2@x', '2', '', '', '7@y', '7', '', '', '-3', '-3', '', '', '-8', '-8', '', '', '', '', '', '', '', '', '', '', '-b@b', '-b', '', '', '-y@y', '-y', '', '', '-x@x', '-x', '', '', '-p@p', '-p', '', '', '3@y', '3', '', '', '6r@4r', '6r', '', '', 'y@y', 'y', '', '', 'x@x', 'x', '', '', 'd@d', 'd', '', '', '7x@2x', '7x@y', '', '', '5x@x', '5x@4y', '', '', '2p@6p', '2p@r', '', '', '', '', '', '', 'n', 'n+1', 'n+2', '', '', '', '', '11@3', '', '11=3+2b', '', '', '8=2b', '2', '', '', 'n-6=13', '', 'n', '', '', '', '', '', '120=\\frac{2}{3} n', '', '\\frac{120}{\\frac{2}{3}}', '120\\frac{3}{2}', '', '', '', '', '', '', '', '\\frac{16}{\\frac{2}{5}}', '16\\frac{5}{2}', '', '', '', '3', '', '7=3+2n',

### Partition into chunks

In [77]:
def partition_list(only_math_hints):
    # returns list of shape (len(directories), len(files in directory), INNER_CHUNK_SIZE))
    all_elements = len(only_math_hints)

    n_lists = all_elements // CHUNK_SIZE
    remaining_elements = all_elements % CHUNK_SIZE

    hint_lists = [only_math_hints[i*CHUNK_SIZE:(i+1)*CHUNK_SIZE] for i in range(n_lists)]  # partition all but last elements
    if remaining_elements > 0:
        hint_lists.append(only_math_hints[n_lists*CHUNK_SIZE:])    # add last of all_elements

    # Print the number of hint_lists and their lengths
    print(f"Number of hint_lists (directories): {len(hint_lists)}")

    hint_to_file = []
    for i, hint_list in enumerate(hint_lists):
        elements = len(hint_list)
        # INNER_CHUNK_SIZE = 300

        n_sub_lists = elements // INNER_CHUNK_SIZE
        remaining_elements = elements % INNER_CHUNK_SIZE

        hint_lists_final = [hint_list[i*INNER_CHUNK_SIZE:(i+1)*INNER_CHUNK_SIZE] for i in range(n_sub_lists)]  # partition all but last elements
        if remaining_elements > 0:
            hint_lists_final.append(hint_list[n_sub_lists*INNER_CHUNK_SIZE:])    # add last elements

        hint_to_file.append(hint_lists_final)

    print(f"Number of files in each directory: {len(hint_lists_final)}")
    
    return hint_to_file, hint_lists

hint_to_file, hint_chunks = partition_list(only_math_hints)

Number of hint_lists (directories): 18
Number of files in each directory: 7


#### Make the files for the text chunks

In [30]:
# makes the directories for the translation files

for i in range(len(hint_to_file)): # 0-18
    directory_name = f"translation-text-files/pre-translation/conversion-math/math-conversion{i}" # man kan inte skapa nytt directory så detta borde vara fine att inte skriva över existerande
    try: 
        os.mkdir(directory_name) 
    except:
        print(f"Throw an error if you have already created this folder because you should not overwrite the first creation \nDelete the conversion-math directory if you intend to rewrite it")
        break

    for c, hint_chunk in enumerate(hint_to_file[i]):
        filename = f"{directory_name}/onlyMath{i}-{c}Pre.txt"
        
        with open(filename, 'w',  encoding='utf-8') as file:
            for math_hint in hint_chunk:
                file.write(math_hint + ';')
            file.write(STOP_WORD)
        file.close()

        empty_filename = f"{directory_name}/onlyMath{i}-{c}Post.txt"
        with open(empty_filename, 'w',  encoding='utf-8') as empty_file:
            pass
        empty_file.close()
    

Throw an error if you have already created this folder because you should not overwrite the first creation 
Delete the conversion-math directory if you intend to rewrite it


### After you have pasted MathCAT translations into the math-conversion directory

Change folder_index depending on what folder you are compiling

In [131]:
# change if you want to translate another number of directories
first_directory = 0
last_directory = 3 #len(hint_to_file) 

In [176]:
# run after you have pasted the chunks in folder i from MathCAT

def collect_translations(folder_index, write_filepath):

    if os.path.isfile(write_filepath):
      print(f"File in {write_filepath} already exists, delete it first if you want to rewrite it") # to not write over a file we want to keep
    else:
      with open(write_filepath, 'w',  encoding='utf-8') as empty_file:
                  pass
      empty_file.close()

      with open(write_filepath, 'a',  encoding='utf-8') as write_file:              # opens file to append 
        for c in range(len(hint_to_file[folder_index])): # 7 aka the nr of Post / Pre                                                     # for text-chunk (c) in dir i                  
          read_filepath = f"{directory_name}/onlyMath{folder_index}-{c}Post.txt"              

          with open(read_filepath, 'r',  encoding='utf-8') as read_file:          # read from subfile
              read_info = read_file.readlines()
              write_file.writelines(read_info)
          read_file.close()
      write_file.close()
      print(f"File in {write_filepath} have been created with all converted latex")


def mathCAT_to_list(write_filepath):
  with open(write_filepath, 'r') as file:
      math = file.readline()
  file.close()

  math = math.split('semicolon')
  math = [hint.split('at sign') for hint in math]
  return math


def get_indexes(folder_index, hint_chunks):
  start = CHUNK_SIZE * folder_index
  stop = start + len(hint_chunks[folder_index])
  return (start, stop)
    

def put_math_in_hints(indexes, hints, math):
  hints_done = copy.deepcopy(hints)
  start, stop = indexes
  j = 0

  for i in range(start, stop):          # for each INNER_CHUNK
      math_replacements = math[j]
      for replacement in math_replacements:
          hints_done[i]  = re.sub(r'\$\$.*?\$\$', replacement, hints_done[i], count=1)
      j +=1

  translated_dir = hints_done[start:stop]
  return translated_dir


def check_correctness(indexes, original_hints, translated_hints, collective_filename):
    start, stop = indexes
    stop_translated = stop - start

    print(f"\nHints {start} to {stop} in {collective_filename}__________________________________________________________________________________")
    print(f"Check first correctness:\nOriginal:\t{original_hints[start]}\nTranslated:\t{translated_hints[0]}")
    print(f"Check last correctness:\nOriginal:\t{original_hints[stop-1]}\nTranslated:\t{translated_hints[stop_translated-1]}")
    

def write_final_product(translated_hints, finished_file_path):

  with open(finished_file_path, 'w', encoding='utf-8') as file:
    for translated_hint in translated_hints:
      for hint in translated_hint: 
          file.write(hint)
  file.close()


In [177]:
translated_hints = []
finished_file_path = "../finished-translations/translatedHints.txt"

for folder_index in range(first_directory, last_directory): 
  directory_name = f"../translation-text-files/pre-translation/conversion-math/math-conversion{folder_index}"
  collective_filename = f"post-{folder_index}"
  write_filepath = f"{directory_name}/{collective_filename}.txt"

  collect_translations(folder_index, write_filepath)
  math = mathCAT_to_list(write_filepath)
  indexes = get_indexes(folder_index, hint_chunks)
  translated_hints.append(put_math_in_hints(indexes, hints, math))
  check_correctness(indexes, hints, translated_hints[folder_index], collective_filename)
  write_final_product(translated_hints, finished_file_path)

File in ../translation-text-files/pre-translation/conversion-math/math-conversion0/post-0.txt already exists, delete it first if you want to rewrite it

Hints 0 to 2000 in post-0__________________________________________________________________________________
Check first correctness:
Original:	Multiply the outside value with each of the inside parenthesis values

Translated:	Multiply the outside value with each of the inside parenthesis values

Check last correctness:
Original:	What number to the power of $$3$$ is -512?

Translated:	What number to the power of  3  is -512?

File in ../translation-text-files/pre-translation/conversion-math/math-conversion1/post-1.txt already exists, delete it first if you want to rewrite it

Hints 2000 to 4000 in post-1__________________________________________________________________________________
Check first correctness:
Original:	$${\left(-8\right)}^3=-512$$, so our answer is $$-8$$.

Translated:	open paren negative 8 close paren cubed; is equal t

#### Put math back in full hints (not used)

In [None]:
# # all the converted latex-to-speech have been collected in the post-i file and will later be inserted into the full hints
# # here the mts is read 
# # i = 0
# # directory_name = f"math-conversion{folder_index}"
# # collective_filename = f"post-{folder_index}"
# all_math_filepath = write_filepath #f"{directory_name}/{collective_filename}.txt"

# with open(all_math_filepath, 'r') as file:
#     math = file.readline()
# file.close()

# math = math.split('semicolon')
# math = [hint.split('at sign') for hint in math]

# # print(f"Number of hints: {len(math)-1}")
# # print(math)
# len(math), math

(2001,
 [[''],
  [', x ', ' 3 ', ' x, plus 3; '],
  ['; '],
  ['; 0.5 x is equal to; 0.25 of, open paren x plus 3, close paren; '],
  [' x '],
  [' 3 '],
  [', 3 plus 3, is equal to 6; '],
  ['; '],
  ['; '],
  [', x ', ' 1.2 ', ' x; plus 1.2; '],
  ['; '],
  ['; 5 sixths x, is equal to; fraction, 1 over, 2 times, open paren x plus 1.2, close paren, end fraction; '],
  [' x '],
  [' 1.2 '],
  [', 1.8 plus 1.2, is equal to 3; '],
  ['; '],
  ['; '],
  [', x ', ' 4 ', ' x, plus 4; '],
  ['; '],
  ['; 0.75 x is equal to; 0.5 of, open paren x plus 4, close paren; '],
  [' x '],
  [' 4 '],
  [', 8 plus 4, is equal to 12; '],
  ['; '],
  ['; '],
  ['; '],
  [' x; '],
  ['; '],
  [' 2.5 ', ' 2.5, '],
  [', distance is equal to speedtime; '],
  ['; 2 x plus 1.75 of 2.5 x; is equal to 255; '],
  [' x '],
  [' 1.75 '],
  [', 40 times 1.75, is equal to 70; '],
  ['; '],
  ['; '],
  [' x; '],
  ['; '],
  ['; 1.5 ', ' 2.25 ', ' 1.5 ', ' 2.25; '],
  [', distance is equal to speedtime; '],
  ['; 1.5 

In [None]:
# CHUNK_SIZE = 2000
# hints_done = copy.deepcopy(hints)
# start = CHUNK_SIZE * folder_index
# stop = start + len(hint_lists[folder_index]) # -1
# j = 0

# for i in range(start, stop):
#     math_replacements = math[j]
#     for replacement in math_replacements:
#         hints_done[i]  = re.sub(r'\$\$.*?\$\$', replacement, hints_done[i], count=1)
#     j +=1

# translated_dir = hints_done[start:stop]
# print(f"Check scrapedHints {start+1} - {stop}")
# len(translated_dir), translated_dir

Check scrapedHints 20001 - 22000


(2000,
 ["You are solving for the Katie Mae's walking speed.\n",
  "Create a variable to represent the walking speed. Let's call the this speed , x . Since her biking speed is  3  mph faster than her walking speed, her biking speed is  x, plus 3; .\n",
  'Translate the situation into an equation using the variable. Remember that the distance from Katie Mae’s home to her school is the same whether she is walking or riding her bike, and distance equals the product of speed and time. Also, you will need to first convert the minutes into hours.\n',
  'The translated equation is ; 0.5 x is equal to; 0.25 of, open paren x plus 3, close paren; .\n',
  'After solving the equation, what do you get for  x ?\n',
  "Katie Mae's walking speed is  3  mph less than her biking speed.\n",
  'Her biking speed is , 3 plus 3, is equal to 6;  mph.\n',
  'Identify what you are solving for.\n',
  'You are solving for the uphill hiking speed of Suzy.\n',
  "Create a variable to represent the uphill hiking spe

In [None]:
# finished_file = f"finishedHints-{folder_index}.txt"

# with open(finished_file, 'w', encoding='utf-8') as file:
#     for hint in translated_dir: 
#         file.write(hint)
# file.close()

### Collect all files thus far (not used)

In [None]:
# n_folders_thus = 18
# file_names = [f'finishedHints-{finished_index}.txt' for finished_index in range(n_folders_thus)]
# file_names

['finishedHints-0.txt',
 'finishedHints-1.txt',
 'finishedHints-2.txt',
 'finishedHints-3.txt',
 'finishedHints-4.txt',
 'finishedHints-5.txt',
 'finishedHints-6.txt',
 'finishedHints-7.txt',
 'finishedHints-8.txt',
 'finishedHints-9.txt',
 'finishedHints-10.txt',
 'finishedHints-11.txt',
 'finishedHints-12.txt',
 'finishedHints-13.txt',
 'finishedHints-14.txt',
 'finishedHints-15.txt',
 'finishedHints-16.txt',
 'finishedHints-17.txt']

In [None]:
# combine all translated hints
# combined_file_name = 'combined_finishedHints.txt'

# # Open the combined text file in write mode
# with open(combined_file_name, 'w', encoding='utf-8') as combined_file:
#     # Iterate over each file name
#     for file_name in file_names:
#         # Open each file in read mode
#         with open(file_name, 'r', encoding='utf-8') as file:
#             # Read the contents of the file
#             contents = file.read()
#             # Write the contents to the combined file
#             combined_file.write(contents)
#         # Close the file
#         file.close()

# # Close the combined file
# combined_file.close()

# print(f"All files have been combined into {combined_file_name}.")

All files have been combined into combined_finishedHints.txt.


### Post processing

In [154]:
def format_latex_for_json(hints):
    dollar_math = []
    for hint in hints:
        strings_and_dollars = re.findall(r'\$\$(.*?)\$\$', hint.replace('\n', ''))

        if strings_and_dollars == -1: # no math in this hint
            dollar_math.append(' ')
        else:
            dollar_math.append('@'.join(['$$' + string + '$$' for string in strings_and_dollars]))

    # write to math file
    math_filename = '../finished_translations/math.txt'
    with open(math_filename, 'w', encoding='utf-8') as file:
        for math_line in dollar_math:
            file.write(math_line + '\n')
    file.close()


# remove stop word and annoying words

def remove_words_from_file(read_file, write_file, words_to_remove):
    # Open the file in read mode
    with open(read_file, 'r', encoding='utf-8') as file:
        # Read the contents of the file
        file_contents = file.read()
    
    # Create a regular expression pattern to match all words to remove
    word_boundary = r'\b(?:' + '|'.join(map(re.escape, words_to_remove)) + r')\b'
    updated_contents = re.sub(word_boundary, '', file_contents)
    
    # Open the file in write mode and write the updated contents back to the file
    with open(write_file, 'w',encoding='utf-8') as file:
        file.write(updated_contents)


In [155]:
write_path = "../finished-translations/translatedHintsPost.txt"
words_to_remove = ['cap', 'open paren', 'close paren', 'end fraction', ';', STOP_WORD]
remove_words_from_file(finished_file_path, write_path, words_to_remove)
format_latex_for_json(hints)

FileNotFoundError: [Errno 2] No such file or directory: '../finished_translations/math.txt'

In [None]:
# %run math-to-speech/scripts/mtsWriter.py # writes translated hints to content-source 

In [170]:
# before putting the math in the hints
# where math is extracted in list 

collect_translations(folder_index, write_filepath)
math = mathCAT_to_list(write_filepath)

[['Multiply the outside value with each of the inside parenthesis values\n',
  'What is  4  times  x, ?\n',
  'What is  4;  times 3?\n',
  'Combine the values multiplied\n',
  'Multiply the outside value with each of the inside parenthesis values\n',
  'What is ; minus, 2 y  times  4 y squared; ?\n',
  'What is ; minus, 2 y  times  3 y; ?\n',
  'What is ; minus 2 y;  times -5?\n',
  'Combine the values multiplied\n',
  'Multiply the outside value with each of the inside parenthesis values\n',
  'What is  p  times  x, ?\n',
  'What is  p;  times 3?\n',
  'Combine the values multiplied\n',
  'Multiply the outside value with each of the inside parenthesis values\n',
  'What is  p  times  x, ?\n',
  'What is  p;  times 8?\n',
  'Combine the values multiplied\n',
  'Multiply the outside value with each of the inside parenthesis values\n',
  'What is  p  times xa?\n',
  'What is  p;  times 4?\n',
  'Combine the values multiplied\n',
  'Multiply the outside value with each of the inside paren

In [169]:
# for parsed speech to be lists instead of strings

# 
def self_paced_parsing():
    """ Instead of saying full hint string it is parsed for each expression.
    eg. 'What is $$5$$ times $$2$$?' -> ['What is ,' times ', '?'] and ['5','2']
    """
    only_math_hints = []
    only_text_hints = []
    for hint in hints:
        strings_not_between_dollars = re.split(r'\$\$.*?\$\$', hint.replace('\n', ''))#re.findall(r'(?<!\$)\$(.*?)(?<!\$)\$', hint.replace('\n', ''))
        only_text_hints.append(strings_not_between_dollars)
        strings_between_dollars = re.findall(r'\$\$(.*?)\$\$', hint.replace('\n', ''))

        if strings_between_dollars == -1: # if no $$ found
            only_math_hints.append(' ')
        else:
            only_math_hints.append('@'.join(strings_between_dollars))

    print(len(only_math_hints[:10]), only_math_hints[:10])
    print(len(only_text_hints[:10]), only_text_hints[:10])
    return only_math_hints, only_text_hints


def 

only_math_hints, only_text_hints = self_paced_parsing()

10 ['', '4@x', '4', '', '', '-2y@4y^2', '-2y@3y', '-2y', '', '']
10 [['Multiply the outside value with each of the inside parenthesis values'], ['What is ', ' times ', '?'], ['What is ', ' times 3?'], ['Combine the values multiplied'], ['Multiply the outside value with each of the inside parenthesis values'], ['What is ', ' times ', '?'], ['What is ', ' times ', '?'], ['What is ', ' times -5?'], ['Combine the values multiplied'], ['Multiply the outside value with each of the inside parenthesis values']]


### Old writing to files and math

In [None]:
# for expressions to be added in json math attribute
 
dollar_math = []
for hint in hints:
    strings_and_dollars = re.findall(r'\$\$(.*?)\$\$', hint.replace('\n', ''))

    if strings_and_dollars == -1: # no math in this hint
        dollar_math.append(' ')
    else:
        dollar_math.append('@'.join(['$$' + string + '$$' for string in strings_and_dollars]))

print(len(dollar_math), dollar_math)


35896 ['', '$$4$$@$$x$$', '$$4$$', '', '', '$$-2y$$@$$4y^2$$', '$$-2y$$@$$3y$$', '$$-2y$$', '', '', '$$p$$@$$x$$', '$$p$$', '', '', '$$p$$@$$x$$', '$$p$$', '', '', '$$p$$', '$$p$$', '', '', '$$4x^2$$@$$2x^2$$', '$$4x^2$$@$$-3x$$', '$$4x^2$$', '', '', '$$4x$$@$$3x^2$$', '$$4x$$@$$-5x$$', '$$4x$$', '', '', '$$4$$', '$$4$$', '', '', '$$6$$@$$b$$', '$$6$$', '', '', '$$-3$$', '$$-3$$', '', '', '$$-5$$@$$p$$', '$$-5$$', '', '', '$$5$$@$$x$$', '$$5$$', '', '', '$$2$$@$$x$$', '$$2$$', '', '', '$$7$$@$$y$$', '$$7$$', '', '', '$$-3$$', '$$-3$$', '', '', '$$-8$$', '$$-8$$', '', '', '', '', '', '', '', '', '', '', '$$-b$$@$$b$$', '$$-b$$', '', '', '$$-y$$@$$y$$', '$$-y$$', '', '', '$$-x$$@$$x$$', '$$-x$$', '', '', '$$-p$$@$$p$$', '$$-p$$', '', '', '$$3$$@$$y$$', '$$3$$', '', '', '$$6r$$@$$4r$$', '$$6r$$', '', '', '$$y$$@$$y$$', '$$y$$', '', '', '$$x$$@$$x$$', '$$x$$', '', '', '$$d$$@$$d$$', '$$d$$', '', '', '$$7x$$@$$2x$$', '$$7x$$@$$y$$', '', '', '$$5x$$@$$x$$', '$$5x$$@$$4y$$', '', '', '$$2p$$@$

In [None]:
# write to math file
math_filename = '../math.txt'
with open(math_filename, 'w', encoding='utf-8') as file:
    for math_line in dollar_math:
        file.write(math_line + '\n')
file.close()

In [None]:
# remove stop word and annoying words

def remove_words_from_file(file_path, words_to_remove):
    # Open the file in read mode
    with open(file_path, 'r') as file:
        # Read the contents of the file
        file_contents = file.read()
    
    # Create a regular expression pattern to match all words to remove
    word_boundary = r'\b(?:' + '|'.join(map(re.escape, words_to_remove)) + r')\b'
    updated_contents = re.sub(word_boundary, '', file_contents)
    
    # Open the file in write mode and write the updated contents back to the file
    with open(file_path, 'w') as file:
        file.write(updated_contents)

# Example usage
file_path = 'example.txt'
words_to_remove = ['cap', 'open paren', 'close paren', 'end fraction', ';']
remove_words_from_file(file_path, words_to_remove)