## Observations

My best human guess is that the middle name is about 6 letters long.
Some other observations include 
- that it probably does not contain any letters that rise above the highlight lines (i.e. d, b, l, f, h), unless they are near the end
- the second letter is likely either an a or an r, as not many other letters could fit in that position

By using a particular font and measuring the average width of each letter, we can approximate how much width each letter contributes. Sadly, T interacts differently with each letter that follows it.
A letter following T sometimes reduces the pixel width by 3-8 pixels. I tried to average out the error, but it varies quite a bit; since there are only 26 values, I could probably measure each one individually to get accurate measurements.

### Read raw

In [1]:
RAW_SURNAMES = './surnamesRaw.csv'
CHAR_JSON = './letters.json' # file with pixel width of letters at 60px, bold, inter-tight after the T as it eats up some pixels
CHAR_2ND_JSON = './secondLetters.json'
CENSUS_T_NAMES = './censusTNames.csv'

In [2]:
import pandas as pd

pd.read_csv(RAW_SURNAMES)

Unnamed: 0,name,rank,count,prop100k,cum_prop100k,pctwhite,pctblack,pctapi,pctaian,pct2prace,pcthispanic
0,SMITH,1,2376206,880.85,880.85,73.35,22.22,0.4,0.85,1.63,1.56
1,JOHNSON,2,1857160,688.44,1569.30,61.55,33.8,0.42,0.91,1.82,1.5
2,WILLIAMS,3,1534042,568.66,2137.96,48.52,46.72,0.37,0.78,2.01,1.6
3,BROWN,4,1380145,511.62,2649.58,60.71,34.54,0.41,0.83,1.86,1.64
4,JONES,5,1362755,505.17,3154.75,57.69,37.73,0.35,0.94,1.85,1.44
...,...,...,...,...,...,...,...,...,...,...,...
151666,YOUSKO,150436,100,0.04,89752.93,99,(S),0,0,0,(S)
151667,ZAITSEV,150436,100,0.04,89753.04,92,(S),0,0,7,(S)
151668,ZALLA,150436,100,0.04,89753.11,99,(S),0,0,0,(S)
151669,ZERBEY,150436,100,0.04,89753.30,99,(S),0,0,0,(S)


### Build freq and edge map

Build freq and edge map, of letters that appear after one another.

In [3]:
alph = ["a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z"]

# safety get
def getChar(arr, idx, default=None):
    if idx >= len(arr):
        return default
    return arr[idx]

freqMap = {}

with open(RAW_SURNAMES) as surnameFile:
    firstline = surnameFile.readline()
#     j = 0
    for line in surnameFile:
#         if j == 500: break
        name = line.strip().split(',')[0].lower()
#         print(name)
        for i in range(len(name)):
            curChar = name[i]
            nextChar = getChar(name, i+1)
            if nextChar:
                innerDict = freqMap.get(curChar, {})
                innerDict.update({ nextChar: innerDict.get(nextChar, 0) + 1 })
                freqMap[curChar] = innerDict
#         j += 1
            
        
        

### Name Construction using DFS

In [6]:
MAX_WIDTH = 203.8 # +/- some width for flexibility, can bound further later on to hone in potentially
PLUS = 3 # pixel error allowed
MINUS = 11 # pixel error allowed

isWithinBound = lambda x: x >= MAX_WIDTH-MINUS and x <= MAX_WIDTH+PLUS
isUnderBound = lambda x: x < MAX_WIDTH-MINUS

def getNameLength(name, map1, map2):
    length = 0
    for i in range(len(name)):
        if i == 1:
            length += map2[name[i]]
        else:
            length += map1[name[i]]
    return length

def extendStack(stack, items, depth):
    if items:
        for item in items:
            stack.append((item, depth))
            
def isMoreThanTwoRepeatChars(word):
    if len(word) < 3: return False
    for i in range(len(word)):
        if word[-1] == word[-2] and word[-1] == word[-3]:
            return True
    return False

def doesNotHaveAllowed(word, charArray, depth):
    if len(word) != depth:
        return False
    return word[depth - 1] not in charArray

def hasForbiddenDoubles(word, charArray):
    if len(word) < 2: return False
    for i in range(len(word)):
        if word[-1] == word[-2] and word[-1] not in charArray:
            return True
    return False

def doublesDontHaveVowelBefore(word):
    vowels = ['a', 'e', 'i', 'o', 'u', 'y']
    if len(word) < 3: return False
    for i in range(len(word)):
        if word[-1] == word[-2] and word[-1] in vowels and word[-3] in vowels:
            return True
        if word[-1] == word[-2] and word[-1] not in vowels and word[-3] not in vowels:
            return True
    return False

def hasInvalidInitConsonantCluster(word):
    clusters = ['thr'] # thr might be only valid english initial consonant cluster that starts with T
    vowels = ['a', 'e', 'i', 'o', 'u', 'y']
    if len(word) != 3: return False
    hasVowel = False
    for c in word:
        if c in vowels: hasVowel = True
    if hasVowel: return False
    return word not in clusters

    

In [7]:
import json
from collections import deque

BLOCKLIST ,MAX_IDX = ['d','l','h','f','t','i','k'], 2
# ALLOWED_AT_2ND, n2 = ['r', 'n', 'u'], 2
ALLOWED_AT_2ND, n2 = ['y', 'r', 'u', 'a', 'w', 'v', 'n'], 2
ALLOWED_AT_3RD, n3 = ['a', 'e', 'i', 'o', 'u', 'y'], 3
ALLOWED_DOUBLES = ['l', 's', 'r', 'e', 'o', 't', 'f', 'n', 'p', 'c']

'''
We will create names that fit within certain bounds, we import the character files because characters 
seem to write closer to things that hang over them i.e. "T" and "f", we are wishful on the f, and only 
provide different spacing for the second character (after the T)

2 character maps lead to much better accuracy, which could be phenomenal or terrible
'''

genNameSet = set()
with open(CHAR_JSON) as charFile:
    with open(CHAR_2ND_JSON) as secondCharFile:
        charMap = json.load(charFile)
        secondCharMap = json.load(secondCharFile)
        
        startLength = charMap['T']
        stack = deque()
        extendStack(stack, freqMap.get('t'), 1)
        curChars = 'T'
        print(stack)
        genNames = open('./genNames.csv', 'w')
        while len(stack) > 0:
            current, idx = stack.pop() # for clarity, as its DFS, idx here is pretty much = to len of the word, or the depth
            curChars = curChars[:idx] # recopy word upto length of current depth - 1, practically the pop back
            curChars += current # add next letter to make 'new' name
            if current in BLOCKLIST and idx <= MAX_IDX: # 2nd characters which would have too much height and likely would not fit, thereby skip
                pass
            elif isMoreThanTwoRepeatChars(curChars): # Skip if more than 2 repeating characters, very unlikely, 
                pass
            elif doesNotHaveAllowed(curChars, ALLOWED_AT_2ND, n2): # Likely characters at 2nd position, skip if otherwise 
                pass
            elif hasForbiddenDoubles(curChars, ALLOWED_DOUBLES): # Skip if contains consecutive characters not present in array, due to unlikelihood
                pass
            elif doublesDontHaveVowelBefore(curChars): # Skip if doubles dont have vowel beforee them, generally invalid
                pass
            elif hasInvalidInitConsonantCluster(curChars):
                pass
            else: # if didnt fall through above, check if good name or requires more letters, do nothing if too long, will pop back at top of loop
                curLength = getNameLength(curChars, charMap, secondCharMap) # get name length in pixels
                if isWithinBound(curLength): # within, potentially good name add to set
                    genNames.write(curChars + '\n')
                    genNameSet.add(curChars.upper())
                elif isUnderBound(curLength): # not long enough, go for another
                    extendStack(stack, freqMap.get(current), idx + 1)

        print(len(genNameSet))
        genNames.close()


deque([('h', 1), ('i', 1), ('a', 1), ('e', 1), ('t', 1), ('c', 1), ('s', 1), ('u', 1), ('o', 1), ('l', 1), ('r', 1), ('k', 1), ('g', 1), ('z', 1), ('m', 1), ('y', 1), ('n', 1), ('f', 1), ('w', 1), ('b', 1), ('j', 1), ('p', 1), ('v', 1), ('q', 1), ('d', 1)])
7821305


## Adjusting

So the above code was my naive attempt to generate a name by building a frequency map of which characters appeared after __T__. But as names can vary incredibly, pretty much every character appears to some extent after __T__, so it wasn't much help. After that I tried to narrow it down by adding some generation rules (I got a little lazy so I didn't go too crazy), to keep names within the bounds described in [Name Construction](#Name-Construction-using-DFS).

Obviously, as you can tell from the results above, that didn't work LOL, as I ended up with an obscene number of results. So below I just intersected them with US Census data, and prayed my friend's Filipino middle name would be present.

For other context, I gave myself restrictions, such as not allowing myself to use Filipino data or naming conventions, which is why I tried to stick to some linguistic rules, but after I decided it would be quicker/easier/riskier to use Census data, I stopped adding rules, cause the intersection pretty much rendered the above pointless.


In [8]:
with open(CENSUS_T_NAMES) as censusTNames:
    with open('./genNames.csv') as genNames:
        censusSet = set()
        genNamesSet = set()
        for name in censusTNames:
            censusSet.add(name.strip().upper())
        for name in genNames:
            genNamesSet.add(name.strip().upper())
        intersection = censusSet.intersection(genNamesSet)
        print(len(intersection))
        print(sorted(intersection))


239
['TABACK', 'TABAHA', 'TABAKA', 'TABANO', 'TABEEK', 'TABONE', 'TABORA', 'TABORN', 'TABRON', 'TABULLO', 'TABUYO', 'TACHELL', 'TACKES', 'TAEGER', 'TAGAMI', 'TAGLIERI', 'TAMAKI', 'TAMASI', 'TAMILIO', 'TAMILLO', 'TAMIMI', 'TAMKIN', 'TAMLYN', 'TAMRAT', 'TANABE', 'TANADA', 'TANAKA', 'TANANA', 'TANASE', 'TANDOC', 'TANEDO', 'TANEGA', 'TANGER', 'TANIELU', 'TANNER', 'TANNEY', 'TANOUE', 'TANOUS', 'TANSEY', 'TANSKY', 'TANTALO', 'TANTILLO', 'TANTON', 'TANZINI', 'TAPAHA', 'TAPELLA', 'TAPKEN', 'TAPPER', 'TARBERT', 'TARGETT', 'TARLTON', 'TARNOFF', 'TARONE', 'TARPLEY', 'TARRANT', 'TARRATS', 'TARROW', 'TARSHIS', 'TARTARO', 'TARZIAN', 'TASAKA', 'TASCHE', 'TASCON', 'TASHIRO', 'TASKEY', 'TASSELL', 'TASSEY', 'TASSLER', 'TASSON', 'TASSOS', 'TAUBER', 'TAUBITZ', 'TAURIAC', 'TAUSCH', 'TAVANA', 'TAVANO', 'TAVEIRA', 'TAVELLA', 'TAVITAS', 'TAWEEL', 'TAWIAH', 'TAWZER', 'TAYNOR', 'TAYRIEN', 'TAYSON', 'TRABERT', 'TRABUE', 'TRAFFAS', 'TRAFTON', 'TRAHAN', 'TRAHEY', 'TRAINER', 'TRAINOR', 'TRAMELL', 'TRANBY', 'TRANTER

### Begin making screenshots

So after this I placed these intersections into a JSON file, and made a quick Next.js project and Selenium script. They're available in another part of the repo.
That part basically displayed the images on the local site and cycled through them with a button. The script took screenshots of the original obscured image below the font-approximated name from the JSON file, which we use below for the calculations.

## Finding out image brightness and map stuff



In [13]:
import numpy as np
import json

imageDir = './results'
jsonFile = './tNamesWLength.json'

from imageio.v2 import imread
from scipy.linalg import norm
from scipy import average

def compare_images(img1, img2):
    diff = img1 - img2  # elementwise for scipy arrays
    m_norm = np.sum(abs(diff))  # Manhattan norm
    z_norm = norm(diff.ravel(), 0)  # Zero norm <- amt of pixels that have any difference
    return (m_norm, z_norm)

### Heatmap Generation

This one below is a quick script to generate a heatmap visual for use in my presentation. get_arr_difs differs slightly from the compare_images func above: we don't sum the differences, since we want to keep them as an array in order to save the image.

In [14]:
from imageio.v2 import imwrite

def get_arr_difs(img1, img2):
    diff = img1 - img2  # elementwise for scipy arrays
    m_norm_arr = abs(diff)  # Manhattan norm
    z_norm_arr = norm(diff.ravel(), 0)  # Zero norm
    return (m_norm_arr, z_norm_arr)

with open(jsonFile) as namesFile:
    manhattanURL = './manhat-heatmap.png'
    zeroURL = './zero-heatmap.png'
    namesData = json.load(namesFile)
    namesArr = namesData['names']
    dataArr = []
    for i in range(0,1):
        name = namesArr[i]
        url_raw = imageDir + '/' + str(name) + '-init.png'
        url_mod = imageDir + '/' + str(name) + '.png'
        
        img1 = imread(url_raw).astype(np.uint8)
        img1_1 = imread(url_mod).astype(np.uint8)
        
        n_m, n_0 = get_arr_difs(img1, img1_1)
        
        
        img_manh = np.array(img1)
        img_zero = np.array(img1)
        for x in range(n_m.shape[0]):
            for y in range(n_m.shape[1]):
                pixel = n_m[x,y]
                if sum(pixel) != 0:
                    manh_dif = 1 + sum(pixel / 255)
                    manh_pixel = img_manh[x,y]
                    new_manh_pix = np.array([int(min(manh_pixel[0] * manh_dif, 255)), int(manh_pixel[1] / manh_dif), int(manh_pixel[2] / manh_dif), manh_pixel[3]])
                    zero_pix = np.array([255, 0, 0, 255])
                    img_manh[x,y] = new_manh_pix
                    img_zero[x,y] = zero_pix

        imwrite(manhattanURL, img_manh)
        imwrite(zeroURL, img_zero)

### Normalize FreqMap (Unused)

In [15]:
import math

counter = 0
maximum =  float('-inf')
minimum = float('inf')
normFreqTMap = {}
for key, value in freqMap['t'].items():
    currLn = math.log(value)
    normFreqTMap[key] = currLn
    counter += currLn
    minimum = min(minimum, currLn)
    maximum = max(maximum, currLn)
print(counter, minimum, maximum)

# min-max normalization
for key, value in normFreqTMap.items():
    normFreqTMap[key] = (normFreqTMap[key] - minimum) / (maximum - minimum)
    
print(normFreqTMap)

153.39853544218698 1.3862943611198906 8.970177815492379
{'h': 0.890690827855914, 'i': 0.9057726603384674, 'a': 0.9354356364527956, 'e': 1.0, 't': 0.9458467796404353, 'c': 0.6435892925816766, 's': 0.76772091921896, 'u': 0.7567348265461551, 'o': 0.9451624568564722, 'l': 0.708929030642237, 'r': 0.8829391871714104, 'k': 0.6085431750906473, 'g': 0.43211960636113655, 'z': 0.8135849720584737, 'm': 0.6329098135089631, 'y': 0.6637365404297018, 'n': 0.5980160541508727, 'f': 0.39501296289393806, 'w': 0.5441980233313833, 'b': 0.41627754964113334, 'j': 0.40758042656578214, 'p': 0.23625883493700264, 'v': 0.32765622846634435, 'q': 0.0, 'd': 0.19832548928598026}


## Compare images

In [16]:
with open(jsonFile) as namesFile:
    namesData = json.load(namesFile)
    namesArr = namesData['names']
    titles = np.array(['name', 'manhat_pp', 'zero_pp', 'manhat_pp_sm', 'zero_pp_sm'])
    dataArr = []
    for name in namesArr:
        url_raw = imageDir + '/' + str(name) + '-init.png'
        url_mod = imageDir + '/' + str(name) + '.png'
        url_raw_sm = imageDir + '/' + str(name) + '-init-sm.png'
        url_mod_sm = imageDir + '/' + str(name) + '-sm.png'
        
        img1 = imread(url_raw).astype(float)
        img1_1 = imread(url_mod).astype(float)
        
        img1_sm = imread(url_raw_sm).astype(float)
        img1_1_sm = imread(url_mod_sm).astype(float)
        
        n_m, n_0 = compare_images(img1, img1_1)
        n_m_sm, n_0_sm = compare_images(img1_sm, img1_1_sm)
        dataArr.append([name, n_m/img1.size, n_0*1.0/img1.size, n_m_sm/img1_sm.size, n_0_sm*1.0/img1_sm.size])
npDataArr = np.array(dataArr)

Quick Helper functions below to normalize the columns

In [8]:
def mean(arr):
    z = 0.0
    for i in arr:
        z += float(i)
    return z / len(arr)


def minMax_normalize_column(npArr):
    newArr = np.array(npArr).astype(float)
    minimum = float(min(newArr))
    maximum = float(max(newArr))
    for i in range(len(npArr)):
        currVal = float(newArr[i])
        newArr[i] = (currVal - minimum) / (maximum - minimum)
    return newArr

# arr = np.array([1.,6.,2.,3.,4.,5.])
# minMax_normalize_column(arr)
# print(arr)

In [17]:
normedArr = np.array(npDataArr)
for i in range(len(titles)):
    if i > 0:
        normedArr[:, i] = minMax_normalize_column(npDataArr[:, i])

df = pd.DataFrame( { titles[x]: normedArr[:, x] for x in range(len(titles))})
df

Unnamed: 0,name,manhat_pp,zero_pp,manhat_pp_sm,zero_pp_sm
0,TREBON,0.368729074594884,0.36151603498542245,0.13425270509216022,0.5247721404453465
1,TRAVIOLI,0.3724253381545463,0.696793002915452,0.28852621501049114,0.7479310573124708
2,TRANTHI,0.12855229677246516,0.7725947521865889,0.19793471576657767,0.6380950279169336
3,TREUTEL,0.046189902236507235,0.4693877551020409,0.15954412023943598,0.5613841502438596
4,TRAVICK,0.20978974152939608,0.5043731778425657,0.1198137186938975,0.6102001633085423
...,...,...,...,...,...
196,TYNDALL,0.254051158430427,0.6929057337220602,0.3722434787600136,0.7601350605786403
197,TAMRAT,0.0012856568903172121,0.09815354713313901,0.08366987479760706,0.3696069560611745
198,TURVILLE,0.2388093264371672,0.7653368761609525,0.04765916352466215,0.5794253304791122
199,TANGER,0.0009240658899155805,0.0719144800777451,0.07187958183571379,0.13075717785183066


### Lookin at info

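* manhat_pp: Lower is better. It's the sum of the absolute differences between the two images' pixel values, divided by the image size (then min-max normalized across names)
* zero_pp: Lower is better. It's the number of pixel values that differ at all between the two images, divided by the image size (then min-max normalized across names)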

### Finding end distance
Here we roughly find the distance from the end of the black stroke in the picture to the green generated name, to find out how far we are from the edge.
- find rightmost greenpixel, rightmost black pixel, find difference in say x position and use that to determine how far from edge
- can start from right, go left, jot down first black or green pixel

In [6]:
# Colors in RGBA
GREEN = [59, 128, 66, 255]
BLACK = [0, 0, 0, 255]
WHITE = [255, 255, 255, 255]
TOLERANCE = 20

greenImageDir = './results-green'

from imageio.v2 import imwrite

def euclideanDistance(rgbArr, boundArr):
    x1 = np.array(rgbArr[:3])
    x2 = np.array(boundArr[:3])
    return ((x1[0] - x2[0]) ** 2) + ((x1[1] - x2[1]) ** 2) + ((x1[2] - x2[2]) ** 2) < TOLERANCE

def findRightMostColors(img):
    currGreen = None
    currBlack = None
    for x in range(img.shape[1]-1, 0, -1):
        for y in range(img.shape[0]-1, 0, -1): 
            if currGreen and currBlack:
                break
            curPix = img[y,x]
            if not currGreen and euclideanDistance(curPix, GREEN):
                currGreen = (y,x)
            if not currBlack and euclideanDistance(curPix, BLACK):
                currBlack = (y,x)
    return currBlack[1] - currGreen[1] # distance from rightmost black to rightmost green

# Quick test to verify it works
# def test():
#     test_writeFile = greenImageDir + '/' + 'CHUMMER' + '.png'
    # green_uri_l = greenImageDir + '/' + 'TYACKE-sm' + '.png'
    # cur_img = imread(green_uri_l).astype(float)
#     test_img = cur_img[:, :len(cur_img[0]) // 2, :] # up-down, left-right, color
#     imwrite(test_writeFile, test_img)     
# test()
    
dataArr = []
titles_dist = np.array(['name',  'dist', 'dist^2', 'dist_sm', 'dist_sm^2'])
with open(jsonFile) as namesFile:
    namesData = json.load(namesFile)
    namesArr = namesData['names']
    for name in namesArr:
        green_big = greenImageDir + '/' + str(name) + '.png'
        green_sm = greenImageDir + '/' + str(name) + '-sm.png'
        cur_img = imread(green_big).astype(float)
        cur_img_sm = imread(green_sm).astype(float)
        dist = findRightMostColors(cur_img)
        dist_sm = findRightMostColors(cur_img_sm)
        item = [name, abs(dist), dist**2, abs(dist_sm), dist_sm**2]
        dataArr.append(item)
    

Next we have to 
- merge pd datatables
- all are 'lower is better', so we can subtract from 1 i.e. (1-x) to get a dif
- can then add together, after multiplying by a set of bias weights or something depending on the variable

In [13]:
titles_dist = np.array(['name',  'dist', 'dist^2', 'dist_sm', 'dist_sm^2'])
normed_dist_arr = np.array(dataArr)
for i in range(len(titles_dist)):
    if i > 0:
        normed_dist_arr[:, i] = minMax_normalize_column(normed_dist_arr[:, i])
arr = np.array(dataArr)
df2 = pd.DataFrame( { titles_dist[x]: normed_dist_arr[:, x] for x in range(len(titles_dist))})
df2

Unnamed: 0,name,dist,dist^2,dist_sm,dist_sm^2
0,TREBON,0.684210526,0.52,0.428571428,0.183673469
1,TRAVIOLI,0.473684210,0.284210526,0.571428571,0.326530612
2,TRANTHI,0.578947368,0.393684210,0.285714285,0.081632653
3,TREUTEL,0.736842105,0.589473684,0.571428571,0.326530612
4,TRAVICK,0.157894736,0.056842105,0.142857142,0.020408163
...,...,...,...,...,...
196,TYNDALL,0.421052631,0.235789473,0.714285714,0.510204081
197,TAMRAT,0.578947368,0.393684210,0.285714285,0.081632653
198,TURVILLE,0.210526315,0.084210526,0.142857142,0.020408163
199,TANGER,0.578947368,0.393684210,0.285714285,0.081632653


In [18]:
frames = [df, df2]

concatted_df = df.join(df2.set_index('name'), on='name')
# concatted_df.sort_values('dist_sm').to_csv('./data.csv')
concatted_df.sort_values('dist_sm')

Unnamed: 0,name,manhat_pp,zero_pp,manhat_pp_sm,zero_pp_sm,dist,dist^2,dist_sm,dist_sm^2
62,TURNIER,0.19433507432703875,0.2623906705539355,0.13957070436518074,0.24756692339946562,0.421052631,0.235789473,0.0,0.0
180,TURKETT,0.4227936252845852,0.4227405247813412,0.2647498112840176,0.6468121731070554,0.368421052,0.191578947,0.0,0.0
77,TRASTER,0.0411276282308826,0.3586005830903791,0.12712081459423238,0.3173040849204418,0.368421052,0.191578947,0.0,0.0
70,TANASE,0.0007633587786258166,0.060252672497570325,0.1282751090100813,0.4341138304680768,0.315789473,0.151578947,0.0,0.0
61,TREMER,0.037190304004285406,0.1049562682215741,0.13003746923427953,0.30510008165427227,0.368421052,0.191578947,0.0,0.0
...,...,...,...,...,...,...,...,...,...
14,TUCHEK,0.3148617745630723,0.4423454825823975,0.42007871511084643,0.5609716670329711,0.0,0.0,0.857142857,0.734693877
30,TROYAN,0.037431364671219824,0.12244897959183652,0.06221236610297374,0.10983602939553713,0.947368421,0.909473684,0.857142857,0.734693877
141,TARBERT,0.3423023870751601,0.5128405575840067,0.5973534854629896,0.6995660258916919,0.105263157,0.033684210,0.857142857,0.734693877
85,TRUFANT,0.6867778563230957,0.5037765558302887,0.24762423333367564,0.8105676075298477,0.210526315,0.084210526,0.857142857,0.734693877


### Finding the results

Here we establish a bias and do some calculations, and add personal preference stuff

I would be lying by omission if I didn't say I modified some weights for increased humor value.

In [94]:
'''
        manhat_pp, zero_pp, manhat_pp_sm, zero_pp_sm, dist, dist^2, dist_sm, dist_sm^2
'''
weights = np.array([.9,       .4,         1.,        .7,    1.,    0.0,     1.1,      0.0])
SECOND_LETTER_BIAS = {'U': 1.1,'Y': 1.4, 'R': 1.1}
LAST_LETTER_BIAS = {'L': .85, 'I': .85, 'T': .85}

def addLetterBias(name, value):
    score = float(value)
    secBias = SECOND_LETTER_BIAS.get(name[1], 1.0)
    lastBias = LAST_LETTER_BIAS.get(name[-1], 1.0)
    return score * secBias * lastBias

resultArr = []
for i in range(len(concatted_df)):
    row = concatted_df.iloc[i].to_numpy()
    vals = row[1:].astype(float)
    highToLow = np.ones(len(vals)) - vals
    resultArr.append([row[0], np.sum(weights * highToLow)])
    
resultArr = np.array(resultArr)

In [95]:
resultTitles = ['name', 'score']
resultDf = pd.DataFrame({ resultTitles[x]: resultArr[:, x] for x in range(len(resultTitles))})
resultDf.sort_values('score', ascending=False)

Unnamed: 0,name,score
70,TANASE,4.327267644762474
61,TREMER,4.312517640715244
91,TANSEY,4.306006364356435
103,TRUSTER,4.250799572467277
189,TURMER,4.230748570583666
...,...,...
112,TRAFFAS,2.216050012749897
71,TRABERT,2.2080257592548955
53,TWOHIG,2.1639179723087625
132,TURIELLO,2.079468314136145


In [96]:
resultDf['biasScore'] = resultDf.apply(lambda row: addLetterBias(row['name'],row['score']), axis=1)
# resultDf.sort_values('biasScore', ascending=False).to_csv('./scores.csv')
resultDf.sort_values('biasScore', ascending=False).head(11)

Unnamed: 0,name,score,biasScore
140,TYACKE,4.037269721342076,5.652178
147,TYRONE,3.4033787728640164,4.76473
61,TREMER,4.312517640715244,4.743769
138,TYSZKO,3.3833333685124307,4.736667
103,TRUSTER,4.250799572467277,4.67588
189,TURMER,4.230748570583666,4.653823
93,TYGRETT,3.905299751785116,4.647307
97,TUNNER,4.206896482563154,4.627586
77,TRASTER,4.201890175317512,4.622079
175,TRESTER,4.184379169187607,4.602817
