In [1]:
# Random line is about Robert

# Prior for a single result line: the chance the member is in the race at all,
# divided evenly across every line of the results.
prob_member_in_race = 0.01
result_count = 1_000

prior_prob = prob_member_in_race / result_count
f"{prior_prob:.5f}"

'0.00001'

In [2]:
# Our Robert leads to "Robert"

# Probability used throughout the notebook for "the line is ours AND the
# expected name actually appears in it".
prob_name_right = 0.6
prob_name_right

0.6

In [3]:
# Someone else leads to "Robert"

import pandas as pd
from pooch import retrieve

def load_name_to_prob():
    """Download the name-frequency table and return it as a dict.

    The TSV is obtained with ``retrieve`` (checked against the MD5 in
    ``known_hash``), parsed by pandas, indexed by its ``name`` column, and the
    ``probability`` column is returned as ``{name: probability}``.
    """
    tsv_path = retrieve(
        "https://github.com/CarlKCarlK/bayesmatch/releases/download/v0.0.1/name_probability.tsv",
        known_hash="md5:cf2b3277a9e88b846c18bed559a4fbea",
        fname="name_probability.tsv",
    )
    by_name = pd.read_csv(tsv_path, sep="\t").set_index("name")
    return by_name.to_dict()["probability"]

# Load the table once at cell level; later cells index `name_to_prob` directly.
name_to_prob = load_name_to_prob()
# Table entry for "ROBERT". Later cells use it as the chance that the name
# shows up on a line that is NOT about our member (a coincidence).
prob_coincidence = name_to_prob["ROBERT"]
prob_coincidence

0.03143

In [4]:
# "Robert" is from Robert

import numpy as np

def logodds(prob):
    """Convert a probability to natural-log odds, ``ln(prob / (1 - prob))``."""
    odds = prob / (1.0 - prob)
    return np.log(odds)

def prob(logodds):
    """Convert natural-log odds back to a probability (logistic function).

    Args:
        logodds: Log-odds value(s); a scalar or a NumPy array.

    Returns:
        ``1 / (1 + exp(-logodds))`` as a NumPy float (or array).
    """
    # exp(logodds) overflows to inf once logodds exceeds roughly 709, which
    # made the former odds / (odds + 1) expression evaluate to inf / inf = nan.
    # log(1 + exp(-x)) computed through logaddexp never overflows, so large
    # positive scores saturate at 1.0 and large negative ones at 0.0.
    return np.exp(-np.logaddexp(0.0, -logodds))

# Express the prior as log-odds "points" so that evidence can simply be added.
prior_points = logodds(prior_prob)
print(f"prior: {prior_points:.2f} points, {prior_prob:.5f} probability")

# Evidence when the name is present: log of (chance the name appears on our
# member's line) over (chance it appears by coincidence).
delta_points = np.log(prob_name_right / prob_coincidence)
print(f"delta: {delta_points:.2f} points")

# Posterior in points is prior plus evidence; prob() converts it back to a
# probability for display.
post_points = prior_points + delta_points
print(f"post: {post_points:.2f} points, {prob(post_points):.5f} probability")

prior: -11.51 points, 0.00001 probability
delta: 2.95 points
post: -8.56 points, 0.00019 probability


In [5]:
# No "Robert", but still from Robert

# Same prior as before (prior_points was computed in an earlier cell).
print(f"prior: {prior_points:.2f} points, {prior_prob:.5f} probability")

# Evidence when the name is absent: ratio of the complementary probabilities,
# i.e. (chance the name is missing on our member's line) over (chance it is
# missing on an unrelated line).
delta_points = np.log((1.0-prob_name_right) / (1.0-prob_coincidence))
print(f"delta: {delta_points:.2f} points")

# The posterior is shown with six decimals here (five elsewhere) because the
# resulting probability is smaller than 0.00001.
post_points = prior_points + delta_points
print(f"post: {post_points:.2f} points, {prob(post_points):.6f} probability")

prior: -11.51 points, 0.00001 probability
delta: -0.88 points
post: -12.40 points, 0.000004 probability


In [6]:
# "Robert" and "Scott" is from Robert Scott.

print(f"prior: {prior_points:.2f} points, {prior_prob:.5f} probability")

# Evidence from the first name being present.
first_name_points = np.log(prob_name_right / name_to_prob["ROBERT"])
print(f"first_name: {first_name_points:.2f} points")

# Evidence from the last name being present; the same prob_name_right is
# reused for it.
last_name_points = np.log(prob_name_right / name_to_prob["SCOTT"])
print(f"last_name: {last_name_points:.2f} points")

# Both pieces of evidence are summed onto the prior in log-odds space.
post_points = prior_points + first_name_points + last_name_points
print(f"post: {post_points:.2f} points, {prob(post_points):.5f} probability")

prior: -11.51 points, 0.00001 probability
first_name: 2.95 points
last_name: 4.70 points
post: -3.86 points, 0.02055 probability


In [7]:
def delta(name, contains):
    """Return the log-odds adjustment for `name` being present or absent.

    Looks `name` up in the module-level `name_to_prob` table and uses the
    module-level `prob_name_right`. When `contains` is truthy the ratio of the
    two probabilities is used; otherwise the ratio of their complements.
    """
    coincidence = name_to_prob[name]
    if not contains:
        return np.log((1.0 - prob_name_right) / (1.0 - coincidence))
    return np.log(prob_name_right / coincidence)

first_name = "CHELLIE"
last_name = "PINGREE"

# One row per combination of "first name seen?" x "last name seen?".
data_list = []
for contains_first_name in [False, True]:
    first_name_points = delta(first_name, contains_first_name)
    for contains_last_name in [False, True]:
        last_name_points = delta(last_name, contains_last_name)
        post_points = prior_points + first_name_points + last_name_points
        data = [
            contains_first_name,
            contains_last_name,
            prior_prob,
            prior_points,
            first_name_points,
            last_name_points,
            post_points,
            prob(post_points),
        ]
        data_list.append(data)

# Column labels are supplied at construction time, in the same order as the
# values placed in each row above.
df = pd.DataFrame(
    data_list,
    columns=[
        f"contains_{first_name}",
        f"contains_{last_name}",
        "prior prob",
        "prior points",
        "first name points",
        "last name points",
        "post points",
        "post prob",
    ],
)
df

Unnamed: 0,contains_CHELLIE,contains_PINGREE,prior prob,prior points,first name points,last name points,post points,post prob
0,False,False,1e-05,-11.512915,-0.91629,-0.916287,-13.345492,2e-06
1,False,True,1e-05,-11.512915,-0.91629,12.023751,-0.405454,0.400003
2,True,False,1e-05,-11.512915,13.368201,-0.916287,0.938998,0.718897
3,True,True,1e-05,-11.512915,13.368201,12.023751,13.879036,0.999999


In [8]:
# "Bob" is from Robert

def delta(name, prob_name_right, contains):
    """Return the log-odds adjustment for `name`, given its own hit probability.

    The coincidence probability comes from the module-level `name_to_prob`
    table; the calculation itself is delegated to `delta_from_coincidence`.
    """
    return delta_from_coincidence(name_to_prob[name], prob_name_right, contains)
    
def delta_from_coincidence(prob_coincidence, prob_name_right, contains):
    """Return the natural-log likelihood ratio for one piece of evidence.

    When `contains` is truthy the result is
    ``ln(prob_name_right / prob_coincidence)``; otherwise it is the log of the
    ratio of the complements, ``ln((1 - prob_name_right) / (1 - prob_coincidence))``.
    """
    if contains:
        numerator, denominator = prob_name_right, prob_coincidence
    else:
        numerator, denominator = 1.0 - prob_name_right, 1.0 - prob_coincidence
    return np.log(numerator / denominator)



print(f"prior: {prior_points:.2f} points, {prior_prob:.5f} probability")

# Score every spelling of the first name and keep the largest delta.
# Each tuple is (name looked up in name_to_prob, probability passed as
# prob_name_right for that spelling, whether the spelling is present).
# The running maximum starts at -inf so the first candidate always replaces
# it; it is written as -np.inf because the np.NINF alias was removed in
# NumPy 2.0 and raises AttributeError there.
first_name_points = -np.inf
for name, prob_name_right_0, contains in [("ROBERT", 0.50, True), ("BOB", 0.05, True), ("ROB", 0.05, False)]:
    some_first_name_points = delta(name, prob_name_right_0, contains)
    print(f"\t{name}: {some_first_name_points:.2f} points")
    first_name_points = np.max([first_name_points, some_first_name_points])
print(f"first_name: {first_name_points:.2f} points")

# The last name uses the notebook-wide prob_name_right.
last_name_points = delta("SCOTT", prob_name_right, contains=True)
print(f"last_name: {last_name_points:.2f} points")

post_points = prior_points + first_name_points + last_name_points
print(f"post: {post_points:.2f} points, {prob(post_points):.5f} probability")

prior: -11.51 points, 0.00001 probability
	ROBERT: 2.77 points
	BOB: 4.51 points
	ROB: -0.05 points
first_name: 4.51 points
last_name: 4.70 points
post: -2.30 points, 0.09083 probability


In [9]:
# "Bellevue" refers to Robert Scott's town.

print(f"prior: {prior_points:.2f} points, {prior_prob:.5f} probability")

# Score every spelling of the first name and keep the largest delta.
# The running maximum starts at -inf so the first candidate always replaces
# it; it is written as -np.inf because the np.NINF alias was removed in
# NumPy 2.0 and raises AttributeError there.
first_name_points = -np.inf
for name, prob_name_right_0, contains in [("ROBERT", 0.50, True), ("BOB", 0.05, True), ("ROB", 0.05, False)]:
    some_first_name_points = delta(name, prob_name_right_0, contains)
    print(f"\t{name}: {some_first_name_points:.2f} points")
    first_name_points = np.max([first_name_points, some_first_name_points])
print(f"first_name: {first_name_points:.2f} points")

last_name_points = delta("SCOTT", prob_name_right, contains=True)
print(f"last_name: {last_name_points:.2f} points")

# Chance of the city appearing by coincidence, given directly as a fraction
# instead of a name_to_prob lookup.
# NOTE(review): the +1 / +2 looks like add-one smoothing of 170 matches out of
# 1592 lines; confirm where these counts come from.
# The variable name keeps its original spelling because a later cell reads it.
city_by_conincidence = (170+1)/(1592+2)
city_name_points = delta_from_coincidence(city_by_conincidence, prob_name_right, contains=True)
print(f"city: {city_name_points:.2f} points")

# The city evidence is added to the name evidence in log-odds space.
post_points = prior_points + first_name_points + last_name_points + city_name_points
print(f"post: {post_points:.2f} points, {prob(post_points):.5f} probability")

prior: -11.51 points, 0.00001 probability
	ROBERT: 2.77 points
	BOB: 4.51 points
	ROB: -0.05 points
first_name: 4.51 points
last_name: 4.70 points
city: 1.72 points
post: -0.58 points, 0.35846 probability


In [10]:
# We expect to see Bellevue, but we don't
# Recompute only the city evidence, this time for the city being absent
# (contains=False selects the ratio of complementary probabilities).
city_name_points = delta_from_coincidence(city_by_conincidence, prob_name_right, contains=False)
print(f"city: {city_name_points:.2f} points")

# first_name_points and last_name_points are the values left behind by the
# previous cell, so that cell must have been run first.
post_points = prior_points + first_name_points + last_name_points + city_name_points
print(f"post: {post_points:.2f} points, {prob(post_points):.5f} probability")

city: -0.80 points
post: -3.11 points, 0.04284 probability
