# Comparing collections (Part One)

* Set comparison
* Ordered collections
* Ranked collections
* Collection transformation

In [None]:
import random
import collections
import operator

import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
%matplotlib inline

import count_min

# a 0..99 value ramp and the colormap used to color points by sequence position
c_space = np.linspace(0,99,num=100)
cmap = plt.get_cmap('viridis')

# Set comparison

In [None]:
# two groups of data scientists, written as set literals
a = {'josh', 'fiona', 'scotty', 'skippy'}
b = {'jeff', 'whitney', 'fiona'}

In [None]:
# members common to both sets
a & b

In [None]:
# two overlapping runs of integers: 0..9 and 8..14
c = {n for n in range(10)}
d = {n for n in range(8, 15)}

In [None]:
# integers present in both sets
c & d

In [None]:
# integers present in either set
c | d

# List comparison

## Number sequences

In [None]:
# column 0: an ordered sequence of evenly-spaced integers
# column 1: the same sequence with a draw of normally-distributed noise added to each value
a = np.array([(n, n + np.random.normal()) for n in range(100)])

In [None]:
# plot column 1 against column 0; point color encodes position in the sequence
plt.scatter(*a.T,c=c_space,cmap=cmap)

In [None]:
# correlation matrix with columns treated as variables; the off-diagonal
# elements give the correlation between the two sequences, which will be
# high for small noise
np.corrcoef(a,rowvar=False)

In [None]:
# two similarly-related sequences, but unevenly spaced and with noise scaled by 5
_ = [1,4,5,8,15,45,48,50,55,60,88,89,90,93,99]
b = np.array([(v, v + 5*np.random.normal()) for v in _])

In [None]:
# color ramp is resampled to one value per row of b
plt.scatter(*b.T,c=np.linspace(0,99,num=len(b)),cmap=cmap)

In [None]:
# correlation between the two columns of b
np.corrcoef(b,rowvar=False)

In [None]:
# randomly-ordered sequences with larger noise:
# base values are random.random() scaled by 100; noise is scaled by 10
_ = np.array([100*random.random() for _ in range(100)])
c = np.array([(v, v + 10*np.random.normal()) for v in _])

In [None]:
# point color encodes row order, which here is unrelated to the value
plt.scatter(*c.T,c=c_space,cmap=cmap)

In [None]:
# even with this noise level the correlation coefficient stays relatively large
np.corrcoef(c,rowvar=False)

Try relating the noise scale and sparsity of the sequences to the correlation coefficient.


# Ordinal comparison

e.g. comparing rank

In [None]:
from search.api import Query
import json
import yaml

# Load API credentials from a local YAML file.
# safe_load builds only plain Python objects (and works on PyYAML versions
# where yaml.load requires an explicit Loader); the `with` block closes the
# file handle once the credentials have been read.
with open('/Users/jkolb/.creds.yaml') as creds_file:
    creds = yaml.safe_load(creds_file)

# set up a query to the Gnip Search API
q = Query(creds['username'],
          creds['password'],
          creds['search_endpoint'],
          paged=True,
          hard_max = 1000,
          search_v2 = True
          )

# query parameters
start_date = '2016-08-01T00:00'
end_date = '2016-09-01T00:00'
rule = 'mom'

# get the tweet data
q.execute(rule,start=start_date,end=end_date)
mom_tweets = list(q.get_activity_set())

In [None]:
def token_frequency(tweets,num_top_terms = 30,delta=10**-5,epsilon=0.001,):
    """
    Count lower-cased, whitespace-separated tokens in tweet bodies.

    Each tweet's 'body' value is split on whitespace. Every token is
    tallied twice: exactly, in a dictionary, and approximately, in a
    count-min sketch built from delta, epsilon and k=num_top_terms.

    Returns the tuple (exact_counts, sketch).
    """

    exact_counts = collections.defaultdict(int)
    sketch_config = {'delta': delta, 'epsilon': epsilon, 'k': num_top_terms}
    approx_counts = count_min.Sketch(sketch_config)
    for tweet in tweets:
        for raw_token in tweet['body'].split():
            # lower-case once and feed the same key to both counters
            token = raw_token.lower()
            exact_counts[token] += 1
            approx_counts.update(token,1)
    return (exact_counts,approx_counts)

In [None]:
def display_two_lists(list_1,list_2):
    """Print the two lists side by side, one pair of elements per line.

    Pairing stops at the end of the shorter list.
    """
    paired = zip(list_1,list_2)
    for left,right in paired:
        print(left,right)

In [None]:
# how many of the most frequent terms to keep
num_top_terms = 30
# accuracy parameters handed to the count-min sketch
delta, epsilon = 10**-4, 0.01

In [None]:
# exact counts come from the dictionary, approximate counts from the sketch
mom_terms,mom_sketch = token_frequency(mom_tweets,num_top_terms,delta,epsilon)
# ascending sort by count, flipped with [::-1], then trimmed to the top terms
exact_top_mom_terms = sorted(mom_terms.items(),key = operator.itemgetter(1))[::-1][:num_top_terms]
# the sketch's top_k values are sorted on their first element, flipped,
# and each pair is swapped into (term, count) order
approx_top_mom_terms = [(term,count) for count,term in sorted(mom_sketch.top_k.values(),key = operator.itemgetter(0))[::-1]]

**Kendall's tau coefficient** is a sort of correlation coefficient that is proportional to the difference between the number of _concordant_ pairs and the number of _discordant_ pairs.  

In [None]:
import scipy.stats  # make sure the stats submodule is loaded; `import scipy` alone may not do so

# Kendall's tau compares two rankings of the same items. Handing it the raw
# (term, count) tuples makes scipy flatten them into a single array of strings,
# so instead compare the rank position each shared term holds in the two lists.
# Terms that appear in only one of the lists have no counterpart and are dropped.
exact_rank = {term: rank for rank, (term, _count) in enumerate(exact_top_mom_terms)}
approx_rank = {term: rank for rank, (term, _count) in enumerate(approx_top_mom_terms)}
shared_terms = [term for term in exact_rank if term in approx_rank]
kt_result = scipy.stats.kendalltau([exact_rank[term] for term in shared_terms],
                                   [approx_rank[term] for term in shared_terms])
kt_result.correlation

In [None]:
# exact top terms on the left, sketch-derived top terms on the right
display_two_lists(list_1=exact_top_mom_terms,list_2=approx_top_mom_terms)

A harder problem: how to account for the varying importance of rank?

# Over/under-indexing

In [None]:
# rerun the query over the same date window, this time for the term 'dad'
rule = 'dad'
q.execute(rule,start=start_date,end=end_date)
dad_tweets = [activity for activity in q.get_activity_set()]

In [None]:
# exact and approximate term counts for the 'dad' tweets
dad_terms,dad_sketch = token_frequency(dad_tweets,num_top_terms,delta,epsilon)
# ascending sort by count, flipped with [::-1], then trimmed to the top terms
exact_top_dad_terms = sorted(dad_terms.items(),key = operator.itemgetter(1))[::-1][:num_top_terms]
# the sketch's top_k values are sorted on their first element, flipped,
# and each pair is swapped into (term, count) order
approx_top_dad_terms = [(term,count) for count,term in sorted(dad_sketch.top_k.values(),key = operator.itemgetter(0))[::-1]]

In [None]:
# exact top 'dad' terms on the left, exact top 'mom' terms on the right
display_two_lists(list_1=exact_top_dad_terms,list_2=exact_top_mom_terms)

But we don't really care about rank here. We care about removing the effect of a baseline.

In [None]:
def normalize_by_difference(term_counts,baseline_counts):
    """
    Define a normalized term frequency that subtracts off a baseline count.

    term_counts: mapping of term -> count for the distribution of interest
    baseline_counts: mapping of term -> count for the baseline distribution

    Returns a new dict with one entry per term in term_counts, holding the
    term's count minus its baseline count. Terms absent from the baseline
    keep their original count. Neither input mapping is modified.
    """
    # .get() rather than indexing: when the baseline is a defaultdict,
    # indexing a missing term never raises KeyError and instead inserts a
    # zero-count entry into the caller's baseline as a side effect.
    return {term: count - baseline_counts.get(term, 0)
            for term,count in term_counts.items()}

In [None]:
normalized_results = normalize_by_difference(dad_terms,mom_terms)

# largest differences first: the 20 most "dad"-like terms
sorted(normalized_results.items(), key = operator.itemgetter(1))[::-1][:20]

In [None]:
# the 20 smallest differences, listed in descending order: the mom-like terms
sorted(normalized_results.items(), key = operator.itemgetter(1))[:20][::-1]

But this normalization-by-difference only works if the two term frequency distributions have the same scale of counts. 

In [None]:
def normalize_by_fraction(term_counts,baseline_counts):
    """
    Weight each count by the difference of the term-fractions of the two distributions.

    term_counts: mapping of term -> count for the distribution of interest
    baseline_counts: mapping of term -> count for the baseline distribution

    For every term in term_counts the result is
    count * (count/A - baseline_count/B), where A and B are the total counts
    of term_counts and baseline_counts. A term missing from the baseline has
    a baseline fraction of 0, and a distribution whose total is 0 contributes
    a fraction of 0 rather than raising ZeroDivisionError.

    Returns a new dict; neither input mapping is modified.
    """
    normed_term_counts = {}
    B = sum(baseline_counts.values())
    A = sum(term_counts.values())
    for term,count in term_counts.items():
        # fraction of baseline distribution for this term; .get() avoids
        # inserting missing terms when the baseline is a defaultdict
        b_frac = baseline_counts.get(term, 0)/B if B else 0

        # fraction of primary term frequency distribution for this term
        a_frac = count/A if A else 0

        normed_term_counts[term] = count * (a_frac-b_frac)
    return normed_term_counts

In [None]:
normalized_results = normalize_by_fraction(dad_terms,mom_terms)
# ascending sort by normalized value, flipped so the largest values lead
sorted_list = sorted(normalized_results.items(), key = operator.itemgetter(1))[::-1]
sorted_list[:20]

Well, this sort of works. We could also combine the term-fractions in other ways, e.g. a ratio (a_frac/b_frac), or the relative difference ( [a_frac-b_frac]/a_frac ). 

In the end, we need to think harder about what differences and similarities we want to be able to highlight and ignore. See Part Two...