In [73]:
import numpy as np
import random
import math

In [74]:
fullAlphabet = "abcdefghijklmnopqrstuvwxyz"

In [75]:
# if neg = False, vals will be from 0-1, else -1 - 1
def map_gen(neg=False, mul=1, dim=2, alphabet=fullAlphabet):
    """Generates matrices to represent symbols."""
    symbols = {}
    for char in alphabet:
        symbols[char] = np.random.rand(dim, dim)
    if neg:
        for char in alphabet:
            with np.nditer(symbols[char], op_flags=['readwrite']) as it:
                for x in it:
                    x[...] = 2 * x
                    if x > 1:
                        x[...] = 1 - x
    if mul != 1:
        for char in alphabet:
            with np.nditer(symbols[char], op_flags=['readwrite']) as it:
                for x in it:
                    x[...] = x * mul
    return symbols

In [76]:
def wordMult(word):
  """Converts a word into its matrix form, by multiplying the symbol matrices."""
  if(len(word) == 1): return alphabet.get(word)
  mp = np.dot(alphabet.get(word[len(word)-2]), alphabet.get(word[len(word)-1]))
  if(len(word) == 2): 
    return mp
  for i in range(len(word)-2):
    mp = np.dot(alphabet.get(word[len(word)-i-3]), mp)
  return mp

def getTrace(arr):
  """Get the trace of a matrix (sum of the diagonal elements)."""
  return np.trace(arr)

def checkDensityValidity(Pl, Pr):
  """Check if the density matrices, Pl and Pr, are valid. The trace of their product must be 1."""
  mp = np.dot(Pl, Pr)
  return (getTrace(mp) == 1)

def phi(M, Pl, Pr):
  """Get the estimated frequency of a word. tr(Pl M Pr M*)"""
  M_cross = np.asmatrix(M).getH() # gets complex conjugate transpose
  mp = np.dot(Pr, M_cross)
  mp = np.dot(M, mp)
  mp = np.dot(Pl, M)
  # used to absolute value, but that was because our symbol matrices had negative values :D
  return getTrace(mp)
    
def grabValuesRandom(iterations, high, target):
  """
  Code to find the best Pl and Pr values, only for two dimensions.
  Iterations is exponential, to the seventh power.
  High determines the max value for the elements of Pl and Pr (excluding the one we modify to make the trace of the multiple one).
  Target is a dictionary of words to their target frequencies.
  """
  worstError = -1
  bestError = 1000
  bestPl = None
  bestPr = None
  bestEstimates = {}

  numWordsInDataset = len(target.keys())  
  
  for a in range(iterations):
    a1 = (random.random()) % high
    for b in range(iterations):
      b1 = (random.random()) % high
      for c in range(iterations):
        c1 = (random.random()) % high
        for d in range(iterations):
          d1 = (random.random()) % high
          for e in range(iterations):
            e1 = (random.random()) % high
            for f in range(iterations):
              f1 = (random.random()) % high
              for g in range(iterations):
                g1 = (random.random()) % high
                
                h=(1-(a1*e1+b1*g1+c1*f1))/d1

                m1 = np.array([[a1, b1],[c1, d1]])
                m2 = np.array([[e1, f1],[g1, h]])

                if checkDensityValidity(m1, m2):
                  totalError = 0
                  estimates = {}
                  for z in target.keys():
                    estimatedFrequency = phi(wordMult(z), m1, m2)
                    totalError += (abs(estimatedFrequency-target[z])/target[z])
                    estimates[z] = estimatedFrequency
                  averageError = totalError / numWordsInDataset * 100 # average percent deviation from the expected values

                  if (averageError < bestError):
                    bestEstimates = estimates
                    bestError = averageError
                    bestPl = m1
                    bestPr = m2 
                  if (averageError > worstError):
                    worstError = averageError

  return bestError, bestEstimates, bestPl, bestPr, worstError

In [143]:
alphabet = "art"
targetFrequencies = {"art": 0.15, "rat": 0.15, "at": 0.25, "a": 0.33, "tar": 0.05}

bestError = 1000000
worstError = -1

bestMap = None
bestEstimates = {}

bestPl = None
bestPr = None

numMaps = 10 # how many mapping are we trying
for i in range(numMaps):
  print(f"Iteration: {i}.")
  alphabet = map_gen(alphabet=alphabet)
  best, d, Pl, Pr, worst = grabValuesRandom(iterations=3, high=1, target=targetFrequencies)
  if (best < bestError):
    bestEstimates = d
    bestError = best
    bestMap = alphabet
    bestPl = Pl
    bestPr = Pr
    print(f"Current best error average: {bestError}%.")
  if (worst > worstError):
    worstError = worst

print("Best map:")
print(bestMap)
print("Best Pl:")
print(bestPl)
print("Best Pr:")
print(bestPr)

for word in bestEstimates:
  print(f"Word: {word}.")
  print(f"Real freq: {targetFrequencies[word]}.")
  print(f"Calculated freq: {bestEstimates[word]}.")
  print(f"Deviance: {abs(targetFrequencies[word]-bestEstimates[word])}.\n")

print(f"Best average error: {bestError}%.")
print(f"Worst average error: {worstError}%.")

Iteration: 0.
Current best error average: 65.83537736492629%.
Iteration: 1.
Iteration: 2.
Iteration: 3.
Iteration: 4.
Iteration: 5.
Current best error average: 50.32670412189505%.
Iteration: 6.
Iteration: 7.
Iteration: 8.
Iteration: 9.
Best map:
{'a': array([[0.01358488, 0.3158794 ],
       [0.9007928 , 0.25554769]]), 'r': array([[0.92443527, 0.72292368],
       [0.46932251, 0.98888383]]), 't': array([[0.18153117, 0.05639102],
       [0.48081237, 0.51886914]])}
Best Pl:
[[0.16280841 0.085191  ]
 [0.41660945 0.00395353]]
Best Pr:
[[3.78699015e-01 1.74131803e-01]
 [4.38996094e-01 2.09534402e+02]]
Word: art.
Real freq: 0.15.
Calculated freq: 0.15722210584545543.
Deviance: 0.007222105845455434.

Word: rat.
Real freq: 0.15.
Calculated freq: 0.20690919706896288.
Deviance: 0.056909197068962886.

Word: at.
Real freq: 0.25.
Calculated freq: 0.11885321097434948.
Deviance: 0.13114678902565052.

Word: a.
Real freq: 0.33.
Calculated freq: 0.21155983537395426.
Deviance: 0.11844016462604576.

Word: t

In [134]:
def grabValuesRandom3d(iterations, dimension, high, target):
  """
  Code to find the best Pl and Pr values, for any dimension.
  Iterations is *not* exponential.
  Dimension determines the dimension of Pl and Pr.
  High determines the max value for the elements of Pl and Pr (excluding the one we modify to make the trace of the multiple one).
  Target is a dictionary of words to their target frequencies.
  """
  worstError = -1
  bestError = -1
  bestPl = None
  bestPr = None
  bestEstimates = {}

  numWordsInDataset = len(target.keys())  

  randNumb = []
  for i in range(iterations):
    for j in range(dimension**2*2-1):
      randNumb.append(random.random()%high)
    currAdd = 1
    arrOfArrM1 = [[0 for x in range(dimension)] for y in range(dimension)]
    arrOfArrM2 = [[0 for x in range(dimension)] for y in range(dimension)]
    for p in range(dimension**2-1):
      arrOfArrM1[math.floor(p/dimension)][p%dimension] = randNumb[p]
      arrOfArrM2[p%dimension][math.floor(p/dimension)] = randNumb[dimension**2-2-p]
      currAdd -= randNumb[p]*randNumb[dimension**2-2-p]
    arrOfArrM1[dimension-1][dimension-1] = randNumb[dimension**2-2]

    h = currAdd/randNumb[dimension**2-2]
    arrOfArrM2[dimension-1][dimension-1] = h

    m1 = np.array(arrOfArrM1)
    m2 = np.array(arrOfArrM2)

    #if checkDensityValidity(m1, m2):
    iterations -= 1
    totalError = 0
    estimates = {}
    for z in target.keys():
      estimatedFrequency = phi(wordMult(z), m1, m2)
      totalError += (abs(estimatedFrequency-target[z])/target[z])
      estimates[z] = estimatedFrequency
    averageError = totalError / numWordsInDataset * 100 # average percent deviation from the expected values

    if bestError == -1:
      bestEstimates = estimates
      bestError = averageError
      bestPl = m1
      bestPr = m2
    elif (averageError < bestError):
      bestEstimates = estimates
      bestError = averageError
      bestPl = m1
      bestPr = m2 
    if (averageError > worstError):
      worstError = averageError
  return bestError, bestEstimates, bestPl, bestPr, worstError

In [154]:
highestDim = 6
bestHighs = [0.1, 0.01, 0.001, 0.001, 0.001]

for dimensionality in range(2, highestDim+1):
  print(f"Dimension: {dimensionality}")

  alphabet = "art"
  targetFrequencies = {"art": 0.15, "rat": 0.15, "at": 0.25, "a": 0.33, "tar": 0.05}

  bestError = -1
  worstError = -1

  bestMap = None
  bestEstimates = {}

  bestPl = None
  bestPr = None

  numMaps = 10 # how many mapping are we trying

  for i in range(100):
    for j in range(1, 10):
      alphabet = map_gen(dim=dimensionality, alphabet=alphabet)
      error, d, Pl, Pr, worst = grabValuesRandom3d(iterations=100, dimension=dimensionality, high=bestHighs[dimensionality-2]*j, target=targetFrequencies)
      if bestError == -1:
        bestEstimates = d
        bestError = error
        bestMap = alphabet
        bestPl = Pl
        bestPr = Pr
      elif (error < bestError):
        bestEstimates = d
        bestError = error
        bestMap = alphabet
        bestPl = Pl
        bestPr = Pr
        #print(f"Current best error average: {bestError}%.")
      if (worst > worstError):
        worstError = worst
  
  print("Best map:")
  print(bestMap)
  print("Best Pl:")
  print(bestPl)
  print("Best Pr:")
  print(bestPr)
  
  """
  for word in bestEstimates:
    print(f"Word: {word}.")
    print(f"Real freq: {targetFrequencies[word]}.")
    print(f"Calculated freq: {bestEstimates[word]}.")
    print(f"Deviance: {abs(targetFrequencies[word]-bestEstimates[word])}.\n")
  """
  print(f"Best average error: {bestError}%.")
  print(f"Worst average error: {worstError}%.")

Dimension: 2
Best map:
{'a': array([[0.44265906, 0.89427508],
       [0.97008444, 0.38983834]]), 'r': array([[0.31247124, 0.18009655],
       [0.0577039 , 0.01922285]]), 't': array([[0.28689228, 0.55748547],
       [0.3461649 , 0.62899954]])}
Best Pl:
[[0.01065418 0.00066964]
 [0.18785826 0.18785826]]
Best Pr:
[[1.87858263e-01 1.06541784e-02]
 [6.69644118e-04 5.30185144e+00]]
Best average error: 27.89128962243323%.
Worst average error: 3168.2112840896752%.
Dimension: 3
Best map:
{'a': array([[0.7835471 , 0.69343891, 0.5511123 ],
       [0.56196088, 0.24247371, 0.11988169],
       [0.4215671 , 0.36875172, 0.21138424]]), 'r': array([[0.97318466, 0.58932299, 0.02687449],
       [0.52426396, 0.70988464, 0.32263833],
       [0.47035084, 0.34319765, 0.01869752]]), 't': array([[0.07947563, 0.38820639, 0.93880372],
       [0.22193657, 0.80387457, 0.10403371],
       [0.05572423, 0.82074555, 0.03819936]])}
Best Pl:
[[0.03071279 0.01558239 0.00772459]
 [0.0168384  0.00255966 0.01612806]
 [0.0355

In [166]:
# parsing scrabble dataset, all valid two letter words

scrabbleDatasetString = '“aa”: 0.01, “ab”: 0.01, “ad”: 0.2, “ae”: 0.01, “ag”: 0.01, “ah”: 0.2, “ai”: 0.05, “al”: 0.01, “am”: 0.8, “an”: 0.7, “ar”: 0.02, “as”: 0.7, “at”: 0.8, “aw”: 0.3, “ax”: 0.1, “ay”: 0.2, “ba”: 0.01, “be”: 0.8, “bi”: 0.3, “bo”: 0.05, “by”: 0.8, “da”: 0.01, “de”: 0.01, “do”: 0.8, “ed”: 0.01, “ef”: 0.01, “eh”: 0.2, “el”: 0.01, “em”: 0.01, “en”: 0.01, “er”: 0.2, “es”: 0.01, “et”: 0.01, “ew”: 0.4, “ex”: 0.3, “fa”: 0.01, “fe”: 0.01, “gi”: 0.01, “go”: 0.6, “ha”: 0.5, “he”: 0.5, “hi”: 0.7, “hm”: 0.5, “ho”: 0.1, “id”: 0.1, “if”: 0.8, “in”: 0.8, “is”: 0.8, “it”: 0.8, “jo”: 0.01, “ka”: 0.01, “ki”: 0.01, “la”: 0.01, “li”: 0.01, “lo”: 0.01, “ma”: 0.01, “me”: 0.8, “mi”: 0.01, “mm”: 0.5, “mo”: 0.01, “mu”: 0.01, “my”: 0.8, “na”: 0.01, “ne”: 0.01, “no”: 0.7, “nu”: 0.01, “od”: 0.01, “oe”: 0.01, “of”: 0.8, “oh”: 0.7, “oi”: 0.3, “ok”: 0.6, “om”: 0.1, “on”: 0.8, “op”: 0.01, “or”: 0.8, “os”: 0.01, “ow”: 0.3, “ox”: 0.2, “oy”: 0.1, “pa”: 0.01, “pe”: 0.01, “pi”: 0.3, “po”: 0.01, “qi”: 0.01, “re”: 0.01, “sh”: 0.1, “si”: 0.01, “so”: 0.6, “ta”: 0.01, “te”: 0.01, “ti”: 0.01, “to”: 0.7, “uh”: 0.3, “um”: 0.05, “un”: 0.01, “up”: 0.5, “us”: 0.4, “ut”: 0.1, “we”: 0.7, “wo”: 0.01, “xi”: 0.01, “xu”: 0.01, “ya”: 0.1, “ye”: 0.1, “yo”: 0.2, “za”: 0.01'

scrabbleDatasetArr = scrabbleDatasetString.split(",")
scrabbleDataset = {}
for wordFreqPairString in scrabbleDatasetArr:
    wordFreqPairString = wordFreqPairString.replace("“", "").replace("”", "").replace(":", "").strip()
    vals = wordFreqPairString.split(" ")
    scrabbleDataset[vals[0]] = float(vals[1])

print(scrabbleDataset)

{'aa': 0.01, 'ab': 0.01, 'ad': 0.2, 'ae': 0.01, 'ag': 0.01, 'ah': 0.2, 'ai': 0.05, 'al': 0.01, 'am': 0.8, 'an': 0.7, 'ar': 0.02, 'as': 0.7, 'at': 0.8, 'aw': 0.3, 'ax': 0.1, 'ay': 0.2, 'ba': 0.01, 'be': 0.8, 'bi': 0.3, 'bo': 0.05, 'by': 0.8, 'da': 0.01, 'de': 0.01, 'do': 0.8, 'ed': 0.01, 'ef': 0.01, 'eh': 0.2, 'el': 0.01, 'em': 0.01, 'en': 0.01, 'er': 0.2, 'es': 0.01, 'et': 0.01, 'ew': 0.4, 'ex': 0.3, 'fa': 0.01, 'fe': 0.01, 'gi': 0.01, 'go': 0.6, 'ha': 0.5, 'he': 0.5, 'hi': 0.7, 'hm': 0.5, 'ho': 0.1, 'id': 0.1, 'if': 0.8, 'in': 0.8, 'is': 0.8, 'it': 0.8, 'jo': 0.01, 'ka': 0.01, 'ki': 0.01, 'la': 0.01, 'li': 0.01, 'lo': 0.01, 'ma': 0.01, 'me': 0.8, 'mi': 0.01, 'mm': 0.5, 'mo': 0.01, 'mu': 0.01, 'my': 0.8, 'na': 0.01, 'ne': 0.01, 'no': 0.7, 'nu': 0.01, 'od': 0.01, 'oe': 0.01, 'of': 0.8, 'oh': 0.7, 'oi': 0.3, 'ok': 0.6, 'om': 0.1, 'on': 0.8, 'op': 0.01, 'or': 0.8, 'os': 0.01, 'ow': 0.3, 'ox': 0.2, 'oy': 0.1, 'pa': 0.01, 'pe': 0.01, 'pi': 0.3, 'po': 0.01, 'qi': 0.01, 're': 0.01, 'sh': 0.1,

In [167]:
highestDim = 6
bestHighs = [0.1, 0.01, 0.001, 0.001, 0.001]

for dimensionality in range(2, highestDim+1):
  print(f"Dimension: {dimensionality}")

  alphabet = fullAlphabet
  targetFrequencies = scrabbleDataset

  bestError = -1
  worstError = -1

  bestMap = None
  bestEstimates = {}

  bestPl = None
  bestPr = None

  numMaps = 10 # how many mapping are we trying

  for i in range(100):
    for j in range(1, 10):
      alphabet = map_gen(dim=dimensionality, alphabet=alphabet)
      error, d, Pl, Pr, worst = grabValuesRandom3d(iterations=100, dimension=dimensionality, high=bestHighs[dimensionality-2]*j, target=targetFrequencies)
      if bestError == -1:
        bestEstimates = d
        bestError = error
        bestMap = alphabet
        bestPl = Pl
        bestPr = Pr
      elif (error < bestError):
        bestEstimates = d
        bestError = error
        bestMap = alphabet
        bestPl = Pl
        bestPr = Pr
        #print(f"Current best error average: {bestError}%.")
      if (worst > worstError):
        worstError = worst
  
  print("Best map:")
  print(bestMap)
  print("Best Pl:")
  print(bestPl)
  print("Best Pr:")
  print(bestPr)
  
  """
  for word in bestEstimates:
    print(f"Word: {word}.")
    print(f"Real freq: {targetFrequencies[word]}.")
    print(f"Calculated freq: {bestEstimates[word]}.")
    print(f"Deviance: {abs(targetFrequencies[word]-bestEstimates[word])}.\n")
  """
  print(f"Best average error: {bestError}%.")
  print(f"Worst average error: {worstError}%.")

Dimension: 2
Best map:
{'a': array([[0.35594822, 0.59416068],
       [0.42827645, 0.63042155]]), 'b': array([[0.32230331, 0.30571392],
       [0.56777151, 0.19988284]]), 'c': array([[0.54845763, 0.17155752],
       [0.44374309, 0.79886645]]), 'd': array([[0.36793386, 0.73358717],
       [0.11887731, 0.23066808]]), 'e': array([[0.16532356, 0.08993601],
       [0.50819257, 0.10169074]]), 'f': array([[0.2976446 , 0.2390113 ],
       [0.73318407, 0.8104369 ]]), 'g': array([[0.97860447, 0.47957395],
       [0.66596275, 0.04228747]]), 'h': array([[0.99470815, 0.08145577],
       [0.56499383, 0.16913417]]), 'i': array([[0.20286935, 0.71335812],
       [0.20765103, 0.2275929 ]]), 'j': array([[0.12735039, 0.7655629 ],
       [0.39959509, 0.59254249]]), 'k': array([[0.44364439, 0.39630424],
       [0.33766315, 0.88289545]]), 'l': array([[0.36747898, 0.78593352],
       [0.24647196, 0.90580734]]), 'm': array([[0.98495718, 0.65869172],
       [0.15454647, 0.81002015]]), 'n': array([[0.52504952, 0.