A notebook to evaluate various estimators of the number of distinct classes in a population.

Intended to supplement the readings *On the Estimation of the Number of Classes in a Population* by Leo A. Goodman and *The Population Frequencies of Species and the Estimation of Population Parameters* by I. J. Good.

In [None]:
from __future__ import division
import numpy as np
from scipy.misc import comb
from numpy import random as np_random
from collections import Counter
from random import sample, choice
from math import factorial, log, exp
import fileinput
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Build a synthetic population in which every element's value is its class
# label, with the labels drawn from a Poisson(l) distribution.
percent_sample = 1   # percentage of the population that will be sampled
pop_size = 10000     # number of elements in the population
l = 10               # Poisson rate parameter
population = list(np_random.poisson(lam=l, size=pop_size))

print("For a {}% sample of a Poisson({}) population of size {}: \n".format(percent_sample, l, pop_size))

In [None]:
# Alternative population: class labels drawn uniformly at random from the
# integers 0..99. Running this cell replaces any `population` built by an
# earlier cell.
pop_size = 10000
class_labels = range(0, 100)  # built once instead of once per draw
population = [choice(class_labels) for draw in range(0, pop_size)]
percent_sample = 1

# Two placeholders, two arguments: the sample percentage and the population size.
print("For a {}% sample of a Uniform (integers 0 through 99) population of size {}: \n".format(percent_sample, pop_size))

In [None]:
# Summarise the full population, draw a simple random sample without
# replacement, and derive the frequency-of-frequencies statistics
# (x_i = number of classes seen exactly i times) used by the cells below.
print("\n The exact population demographics:")
pop_demographic = sorted(Counter(population).items(), key=lambda pair: pair[0])
for label, size in pop_demographic:
    print("    {} items of class {}".format(str(size).ljust(5), label))

print("\n The demographics of the hypergeometric sample population:")
# NOTE: despite the "exactly_10" names, the sampled fraction is percent_sample.
exactly_10_percent_sample = sample(population, int(pop_size*percent_sample/100))
n = len(exactly_10_percent_sample)
exactly_10_demographic_sample_only = Counter(exactly_10_percent_sample)
# A Counter reports 0 for labels it has never seen, so each population class
# gets an entry without scanning .keys() once per label. pop_demographic is
# already sorted by label, hence so is this list.
exactly_10_demographic = [(label, exactly_10_demographic_sample_only[label])
                          for label, size in pop_demographic]
for label, seen in exactly_10_demographic:
    print("    {} items of class {}".format(str(seen).ljust(5), label))
print(" If the statistic x_{i} is the number of classes with i elements in the sample:")
# (times seen in sample, number of classes seen that many times), sorted by times seen
x_counts_exact = sorted(Counter(seen for label, seen in exactly_10_demographic).items(),
                        key=lambda pair: pair[0])
for times_seen, num_classes in x_counts_exact:
    print("    {} classes appear {} times in the sample, x_{} = {}".format(num_classes, times_seen, times_seen, num_classes))
print(" So the observable vector x for this sample is ")
print(" (recall that the number of classes appearing 0 times in the sample is not observable):")
# x_vec[i - 1] holds x_i for i = 1 .. (largest count seen); x_0 is left out.
x_vec = [0] * max(times_seen for times_seen, num_classes in x_counts_exact)
for times_seen, num_classes in x_counts_exact:
    if times_seen != 0:
        x_vec[times_seen - 1] = num_classes
# Zero-pad to length n so that x_padded[i - 1] is defined for every i in 1..n.
x_padded = np.pad(np.array(x_vec),(0,n-len(x_vec)),mode='constant',constant_values=0)
print(x_vec)

In [None]:
# Bar chart of the sample's frequency-of-frequencies vector. The gray bar at 0
# is the (normally unobservable) number of classes missing from the sample.
fig, ax = plt.subplots()
ax.bar(range(1,len(x_vec) + 1), x_vec, .45)
# Look the zero count up by key: x_counts_exact only starts with the 0 entry
# when at least one class was missed, so indexing [0] could plot x_1 instead.
unseen_class_count = dict(x_counts_exact).get(0, 0)
ax.bar([0.05], [unseen_class_count], .45, color = 'gray')
ax.set_xticks([t + .22 for t in range(0,len(x_vec) + 1)])
ax.set_xticklabels(range(0,len(x_vec) + 1))
plt.ylabel("# of classes appearing x times in the sample")
plt.xlabel("x")
#plt.title("Histogram of x")

In [None]:
# Disabled experiment, kept as a bare string literal so none of it executes.
# The code inside would fill a matrix from binomial coefficients, solve the
# linear system eq_matrix * s = x_padded, and sum the solution vector.
# NOTE(review): eq_matrix is n x n but both loops run up to n inclusive, so
# the indexing would go out of bounds if this were re-enabled as written.
'''
eq_matrix = np.zeros((n,n))
for i in xrange(0,n+1):
    for j in xrange(0,n+1):
        eq_matrix[i,j] = comb(i,j)*comb((pop_size - j), (n - i))/comb(pop_size, n)

s_vec = np.linalg.solve(eq_matrix,x_padded)
sum(s_vec)
'''

In [None]:
# Goodman's unbiased estimator of the number of classes:
#     S = sum_i x_i * (1 + (-1)^(i+1) * [(N-n)(N-n+1)...(N-n+i-1)] / [n(n-1)...(n-i+1)])
# with N = pop_size and n = sample size. Each product is accumulated as a sum
# of logs and the quotient is formed by SUBTRACTING the logs before
# exponentiating. The alternating terms grow quickly with i, so the estimate
# can be very unstable when some class appears many times in the sample.
S = 0
for i in range(1, n + 1):
    x_i = x_padded[i - 1]
    if x_i != 0:
        sign = (-1)**(i+1)
        if pop_size > n:
            # log of the i ascending factors (N-n), (N-n+1), ..., (N-n+i-1)
            log_numerator = sum(log(h) for h in range(pop_size - n, pop_size - n + i))
            # log of the i descending factors n, (n-1), ..., (n-i+1)
            log_denominator = sum(log(k) for k in range(n - i + 1, n + 1))
            ratio = exp(log_numerator - log_denominator)
        else:
            # The sample is the whole population: the numerator contains the
            # factor (N-n) = 0, so the correction term vanishes.
            ratio = 0.0
        S += x_i + sign*ratio*x_i
print("The estimated number of total classes in the population is {}".format(S))

In [None]:
# Display the current population size as the cell's output.
pop_size

In [None]:
# Biased estimator S': start from the population size and subtract the number
# of classes seen exactly twice (x_2), scaled by N(N-1) / (n(n-1)).
pair_scale = ((pop_size)*(pop_size - 1))/((n)*(n - 1))
classes_seen_twice = x_padded[2 - 1]
S_1 = pop_size - pair_scale*classes_seen_twice
print("The estimator S' gives {} as the total number of classes in the population".format(S_1))

In [None]:
# S'': scale the number of distinct classes observed in the sample by the
# inverse sampling fraction N / n.
inverse_sampling_fraction = pop_size/n
S_2 = (inverse_sampling_fraction) * sum(x_padded)
print("The estimator S'' gives {} as the total number of classes in the population".format(S_2))

In [None]:
# Second variant labelled S'': simply the number of distinct classes that
# were observed in the sample (the sum of all x_i).
S_3 = sum(x_padded)
s3_message = "The estimator S'' gives {} as the total number of classes in the population"
print(s3_message.format(S_3))

In [None]:
# Estimator from I.J. Good's paper: divide the number of observed classes by
# (1 - x_1 / n), where x_1 is the number of classes seen exactly once.
unseen_share = x_padded[0]/n
S_4 = (1/(1 - unseen_share))*(sum(x_padded))
print("The estimator suggested by I.J. Good gives {} as the number of classes in the population".format(S_4))

In [None]:
# Ground truth for comparison: count the distinct labels in the population.
actual_class_count = len(set(population))
print("Reminder: the actual number of classes in the population is {}".format(actual_class_count))

In [None]:
# Load the real-world population: one entry per line of the input file, with
# surrounding whitespace stripped. Presumably each line is the username of a
# tweet's author, so a user appears once per tweet -- verify against the data.
# The `with` block closes the file handle once reading is done.
with open("craft_users_tweeting_soda_usernames.txt") as usernames_file:
    tweet_pop = [line.strip() for line in usernames_file]
tweet_pop_size = len(tweet_pop)
num_uniq_users_in_pop = len(set(tweet_pop))
# (entry, number of occurrences) pairs, sorted by entry
tweet_pop_demographic = sorted(Counter(tweet_pop).items(), key=lambda x: x[0])

In [None]:
## Sample size, generate a new sample
# tweet_n entries are drawn from tweet_pop without replacement; re-running
# this cell draws a fresh random sample (no seed is set here).
tweet_n = 4000
sample_tweet_pop = sample(tweet_pop, tweet_n)

In [None]:
# Count how often each population entry occurs in the sample (0 for entries
# the sample missed) and derive the frequency-of-frequencies statistics.
sample_tweet_counts = Counter(sample_tweet_pop)
# A Counter reports 0 for keys it has never seen, so every population entry
# gets a row without scanning .keys() once per entry (quadratic in Python 2).
# tweet_pop_demographic is already sorted by entry, hence so is this list.
sample_tweet_pop_demographic = [(user, sample_tweet_counts[user])
                                for user, total in tweet_pop_demographic]

print(" If the statistic x_{i} is the number of classes with i elements in the sample:")
# (times seen in sample, number of classes seen that many times), sorted by times seen
x_counts_tweets = sorted(Counter(seen for user, seen in sample_tweet_pop_demographic).items(),
                         key=lambda pair: pair[0])
for times_seen, num_classes in x_counts_tweets:
    print("    {} classes appear {} times in the sample, x_{} = {}".format(num_classes, times_seen, times_seen, num_classes))
print(" So the observable vector x for this sample is ")
print(" (recall that the number of classes appearing 0 times in the sample is not observable):")
# x_tweets_sample[i - 1] holds x_i for i = 1 .. (largest count seen); x_0 is left out.
x_tweets_sample = [0] * max(times_seen for times_seen, num_classes in x_counts_tweets)
for times_seen, num_classes in x_counts_tweets:
    if times_seen != 0:
        x_tweets_sample[times_seen - 1] = num_classes
# Zero-pad to length tweet_n so x_tweets[i - 1] is defined for every i in 1..tweet_n.
x_tweets = np.pad(np.array(x_tweets_sample),(0,tweet_n-len(x_tweets_sample)),mode='constant',constant_values=0)
print(x_tweets_sample)

In [None]:
# Figure 1: classes per sample count, INCLUDING x_0 (classes the sample missed).
# Each bar is placed at its actual count so it lines up with the tick labels
# even when some counts never occur.
fig, ax = plt.subplots(figsize=(10,6))
ax.bar([t[0] for t in x_counts_tweets], [t[1] for t in x_counts_tweets], .45)
#ax.bar([0.05], [x_counts_exact[0][1]], .45, color = 'gray')
ax.set_xticks([t + .22 for t in range(0,len(x_tweets_sample) + 1)])
ax.set_xticklabels(range(0,len(x_tweets_sample) + 1))
ax.set_ylabel("# of classes appearing x times in the sample")
ax.set_xlabel("x (unobservable, because x_0, the number of unobserved classes, is included)")
# Set the title on the axes; assigning to plt.title would overwrite the
# pyplot function instead of drawing a title.
ax.set_title("Histogram of x")

# Figure 2: the observable part only. x_tweets_sample[0] is x_1, so the bars
# start at position 1.
fig, ax = plt.subplots(figsize=(10,6))
ax.bar(range(1,len(x_tweets_sample) + 1), x_tweets_sample, .45)
#ax.bar([0.05], [x_counts_exact[0][1]], .45, color = 'gray')
ax.set_xticks([t + .22 for t in range(0,len(x_tweets_sample) + 1)])
ax.set_xticklabels(range(0,len(x_tweets_sample) + 1))
ax.set_ylabel("# of classes appearing x times in the sample")
ax.set_xlabel("x observed")
ax.set_title("Histogram of x")

In [None]:
# Goodman's unbiased estimator applied to the tweet sample:
#     S = sum_i x_i * (1 + (-1)^(i+1) * [(N-n)(N-n+1)...(N-n+i-1)] / [n(n-1)...(n-i+1)])
# with N = tweet_pop_size and n = tweet_n. Each product is accumulated as a
# sum of logs and the quotient is formed by SUBTRACTING the logs.
# The loop deliberately stops at the first i with x_i == 0, so larger class
# counts beyond the first gap are not included in this sum.
tweet_S = 0
test = []  # per-i contributions, kept for inspection
for i in range(1, tweet_n + 1):
    x_i = x_tweets[i - 1]
    if x_i != 0:
        sign = (-1)**(i+1)
        if tweet_pop_size > tweet_n:
            # log of the i ascending factors (N-n), (N-n+1), ..., (N-n+i-1)
            log_numerator = sum(log(h) for h in range(tweet_pop_size - tweet_n, tweet_pop_size - tweet_n + i))
            # log of the i descending factors n, (n-1), ..., (n-i+1)
            log_denominator = sum(log(k) for k in range(tweet_n - i + 1, tweet_n + 1))
            ratio = exp(log_numerator - log_denominator)
        else:
            # The sample is the whole population: the factor (N-n) is 0.
            ratio = 0.0
        contribution = x_i + sign*ratio*x_i
        test.append(contribution)
        tweet_S += contribution
    else:
        break
print("The estimated number of total classes in the population is {}".format(tweet_S))

In [None]:
# Biased estimator S' for the tweet data: population size minus the number of
# classes seen exactly twice (x_2), scaled by N(N-1) / (n(n-1)).
tweet_pair_scale = ((tweet_pop_size)*(tweet_pop_size - 1))/((tweet_n)*(tweet_n - 1))
tweet_classes_seen_twice = x_tweets[2 - 1]
tweet_S_1 = tweet_pop_size - tweet_pair_scale*tweet_classes_seen_twice
print("The estimator S' gives {} as the total number of classes in the population".format(tweet_S_1))

In [None]:
# S'' for the tweet data: observed class count scaled by the inverse
# sampling fraction N / n.
tweet_inverse_sampling_fraction = tweet_pop_size/tweet_n
tweet_S_2 = (tweet_inverse_sampling_fraction) * sum(x_tweets)
print("The estimator S'' gives {} as the total number of classes in the population".format(tweet_S_2))

In [None]:
# Second variant labelled S'': the number of distinct classes observed in the
# tweet sample (the sum of all x_i).
tweet_S_3 = sum(x_tweets)
tweet_s3_message = "The estimator S'' gives {} as the total number of classes in the population"
print(tweet_s3_message.format(tweet_S_3))

In [None]:
# Estimator from I.J. Good's paper for the tweet data: observed class count
# divided by (1 - x_1 / n), where x_1 counts classes seen exactly once.
tweet_unseen_share = x_tweets[0]/tweet_n
tweet_S_4 = (1/(1 - tweet_unseen_share))*(sum(x_tweets))
print("The estimator suggested by I.J. Good gives {} as the number of classes in the population".format(tweet_S_4))

In [None]:
# Ground truth for comparison: distinct entries counted when the file was loaded.
tweet_reminder = "REminder: the actual number of classes (unique users) in the population is: {}"
print(tweet_reminder.format(num_uniq_users_in_pop))

In [None]:
# Split the frequency vector at the first position whose value is exactly 1.
# Everything before it is kept for the estimator; every non-zero entry from
# there on is recorded as an "outlier" (class size, number of such classes).
first_cutoff = np.where(x_tweets == 1)[0][0]
x_tweets_ = x_tweets[:first_cutoff]
outliers = [(offset + first_cutoff + 1, num_classes)
            for offset, num_classes in enumerate(x_tweets[first_cutoff:])
            if num_classes != 0]
# assume that these more common classes were sampled pretty evenly
num_outliers_in_sample = sum(size*num_classes for size, num_classes in outliers)
# scale each outlier's sample volume up by the inverse sampling fraction
num_outliers_in_pop = sum(size*num_classes*tweet_pop_size/tweet_n for size, num_classes in outliers)
num_outlier_classes = sum(num_classes for size, num_classes in outliers)

# Population and sample sizes with the outlier classes' elements removed.
tweet_n_ = int(tweet_n - num_outliers_in_sample)
tweet_pop_size_ = int(tweet_pop_size - num_outliers_in_pop)

In [None]:
# Goodman's unbiased estimator on the outlier-trimmed data:
#     S = sum_i x_i * (1 + (-1)^(i+1) * [(N-n)(N-n+1)...(N-n+i-1)] / [n(n-1)...(n-i+1)])
# with N = tweet_pop_size_ and n = tweet_n_. Each product is accumulated as a
# sum of logs and the quotient is formed by SUBTRACTING the logs.
# The loop stops at the first i with x_i == 0; the outlier classes that were
# set aside are added back to the total when printing.
tweet_S = 0
for i in range(1, len(x_tweets_) + 1):
    x_i = x_tweets_[i - 1]
    if x_i != 0:
        sign = (-1)**(i+1)
        if tweet_pop_size_ > tweet_n_:
            # log of the i ascending factors (N-n), (N-n+1), ..., (N-n+i-1)
            log_numerator = sum(log(h) for h in range(tweet_pop_size_ - tweet_n_, tweet_pop_size_ - tweet_n_ + i))
            # log of the i descending factors n, (n-1), ..., (n-i+1)
            log_denominator = sum(log(k) for k in range(tweet_n_ - i + 1, tweet_n_ + 1))
            ratio = exp(log_numerator - log_denominator)
        else:
            # The trimmed sample covers the whole trimmed population: (N-n) is 0.
            ratio = 0.0
        tweet_S += x_i + sign*ratio*x_i
    else:
        break
print("The estimated number of total classes in the population is {}".format(int(tweet_S + num_outlier_classes)))

In [None]:
# Analysis of the classes that *do not* appear in the sample: split the
# population's entries by whether the sample caught them, then tally how many
# entries have each population-level count.
users_not_in_sample = set(user for user, seen in sample_tweet_pop_demographic if seen == 0)
users_not_in_sample_demographic = []
users_in_sample_demographic = []
for entry in tweet_pop_demographic:
    bucket = users_not_in_sample_demographic if entry[0] in users_not_in_sample else users_in_sample_demographic
    bucket.append(entry)
# (population count, number of users with that count) pairs
counts_users_in_sample = Counter(total for user, total in users_in_sample_demographic).items()
counts_users_not_in_sample = Counter(total for user, total in users_not_in_sample_demographic).items()

# Index i - 1 holds the number of users whose population count is i.
x_users_in_sample = [0] * max(total for total, num_users in counts_users_in_sample)
for total, num_users in counts_users_in_sample:
    if total != 0:
        x_users_in_sample[total - 1] = num_users

x_users_not_in_sample = [0] * max(total for total, num_users in counts_users_not_in_sample)
for total, num_users in counts_users_not_in_sample:
    if total != 0:
        x_users_not_in_sample[total - 1] = num_users

print(x_users_in_sample)
print(x_users_not_in_sample)

In [None]:
# Histogram of population-level counts for the classes the sample missed.
# x_users_not_in_sample[0] is the number of classes with population count 1,
# so the bars start at position 1 to line up with the tick labels.
fig, ax = plt.subplots(figsize=(10,6))
ax.bar(range(1,len(x_users_not_in_sample) + 1), x_users_not_in_sample, .45)
ax.set_xticks([t + .22 for t in range(0,len(x_users_not_in_sample) + 1)])
ax.set_xticklabels(range(0,len(x_users_not_in_sample) + 1))
# The plotted data are the classes that never appear in the sample.
ax.set_ylabel("# of classes appearing x times in the pop but never in the sample")
ax.set_xlabel("x ")
# Set the title on the axes; assigning to plt.title would overwrite the
# pyplot function instead of drawing a title.
ax.set_title("Histogram of x")


In [None]:
# Ratio of the first two entries of x_users_not_in_sample: unsampled classes
# with population count 1 versus those with population count 2.
# NOTE(review): needs at least two entries and a non-zero second entry.
x_users_not_in_sample[0]/x_users_not_in_sample[1]