Code to experiment with calculating calibration.

In [1]:
import pandas as pd

Get topics for news the user clicked on and news user was shown (impressions)

In [2]:
behavior_df = pd.read_table("../data/test_mind_large/behaviors.tsv",
               header='infer',
               usecols=range(5),
               names=[
                   'impression_id', 'user', 'time',
                   'clicked_news', 'impressions'
               ])
behavior_df.fillna('', inplace = True)
behavior_df.head()
# Users may appear multiple times in behavior_df. 
# For the purposes of this experiment, we are treating each row as unique and 
# extracting history and the current impressions. However in the future, 
# all history for a user may need to be considered.

Unnamed: 0,impression_id,user,time,clicked_news,impressions
0,1,U134050,11/15/2019 8:55:22 AM,N12246 N128820 N119226 N4065 N67770 N33446 N10...,N91737-0 N30206-0 N54368-0 N117802-0 N18190-0 ...
1,2,U254959,11/15/2019 11:42:35 AM,N34011 N9375 N67397 N7936 N118985 N109453 N103...,N119999-0 N24958-0 N104054-0 N33901-0 N9250-0 ...
2,3,U499841,11/15/2019 9:08:21 AM,N63858 N26834 N6379 N85484 N15229 N65119 N1047...,N18190-0 N89764-0 N91737-0 N54368-0 N49978-1 N...
3,4,U107107,11/15/2019 5:50:31 AM,N12959 N8085 N18389 N3758 N9740 N90543 N129790...,N122944-1 N18190-0 N55801-0 N59297-0 N128045-0...
4,5,U492344,11/15/2019 5:02:25 AM,N109183 N48453 N85005 N45706 N98923 N46069 N35...,N64785-0 N82503-0 N32993-0 N122944-0 N29160-0 ...


In [3]:
news_df = pd.read_table("../data/test_mind_large/news.tsv",
            header = 'infer', 
            usecols = range(4),
            names = ['news_id', 'topic', 'subtopic', 'title'])
news_df.head()

Unnamed: 0,news_id,topic,subtopic,title
0,N88753,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an..."
1,N23144,health,weightloss,50 Worst Habits For Belly Fat
2,N86255,health,medical,Dispose of unwanted prescription drugs during ...
3,N93187,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...
4,N75236,health,voices,I Was An NBA Wife. Here's How It Affected My M...


In [4]:
user_clicked_topics = behavior_df[['impression_id', 'clicked_news']]
user_clicked_topics['news_id'] = user_clicked_topics['clicked_news'].apply(lambda x: x.split(' '))
user_clicked_topics = user_clicked_topics.explode('news_id')
user_clicked_topics.drop(columns=['clicked_news'], inplace=True)
user_clicked_topics = pd.merge(user_clicked_topics, news_df)
user_clicked_topics = user_clicked_topics.groupby(by='impression_id')['topic'].apply(list).reset_index(name="clicked_topics")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_clicked_topics['news_id'] = user_clicked_topics['clicked_news'].apply(lambda x: x.split(' '))


In [5]:
user_shown_topics = behavior_df[['impression_id', 'impressions']]
user_shown_topics['news_id'] = user_shown_topics['impressions'].apply(lambda x: x.split(' '))
user_shown_topics = user_shown_topics.explode('news_id')
user_shown_topics['news_id'] = user_shown_topics['news_id'].apply(lambda x: x.split("-")[0])
user_shown_topics.drop(columns=['impressions'], inplace=True)
user_shown_topics = pd.merge(user_shown_topics, news_df)
user_shown_topics = user_shown_topics.groupby(by='impression_id')['topic'].apply(list).reset_index(name="shown_topics")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_shown_topics['news_id'] = user_shown_topics['impressions'].apply(lambda x: x.split(' '))


In [6]:
topics_df = pd.merge(user_clicked_topics, user_shown_topics)
topics_df.head()

Unnamed: 0,impression_id,clicked_topics,shown_topics
0,1,"[foodanddrink, health, news, travel, sports, l...","[movies, lifestyle, news, tv, lifestyle, fooda..."
1,2,"[tv, tv, sports, news, sports, news, entertain...","[lifestyle, autos, news, travel, video, entert..."
2,3,"[sports, movies, lifestyle, sports, travel, ne...","[lifestyle, sports, movies, news, sports, spor..."
3,4,"[foodanddrink, lifestyle, foodanddrink, tv, ne...","[foodanddrink, lifestyle, weather, autos, vide..."
4,5,"[news, travel, news, news, movies, tv, news, l...","[lifestyle, health, lifestyle, foodanddrink, s..."


Calculate calibration

In [7]:
# from https://github.com/svrijenhoek/RADio/blob/main/dart/external/discount.py
import math
def harmonic_number(n):
    """Returns an approximate value of n-th harmonic number.
    http://en.wikipedia.org/wiki/Harmonic_number
    """
    # Euler-Mascheroni constant
    gamma = 0.57721566490153286060651209008240243104215933593992
    return gamma + math.log(n) + 0.5 / n - 1. / (12 * n ** 2) + 1. / (120 * n ** 4)

# from https://github.com/svrijenhoek/RADio/blob/main/dart/metrics/calibration.py
def compute_distr(items, adjusted=False):
    """Compute the topic distribution for a given list of topics."""
    n = len(items)
    sum_one_over_ranks = harmonic_number(n)
    count = 0
    distr = {}
    for topic in items:
        count += 1
        topic_freq = distr.get(topic, 0.)
        distr[topic] = topic_freq + 1 * 1 / count / sum_one_over_ranks if adjusted else topic_freq + 1 * 1 / n

    return distr

# from https://github.com/svrijenhoek/RADio/blob/main/dart/external/kl_divergence.py
def opt_merge_max_mappings(dict1, dict2):
    """ Merges two dictionaries based on the largest value in a given mapping.
    Parameters
    ----------
    dict1 : Dict[Any, Comparable]
    dict2 : Dict[Any, Comparable]
    Returns
    -------
    Dict[Any, Comparable]
        The merged dictionary
    """
    # we will iterate over `other` to populate `merged`
    merged, other = (dict1, dict2) if len(dict1) > len(dict2) else (dict2, dict1)
    merged = dict(merged)

    for key in other:
        if key not in merged or other[key] > merged[key]:
            merged[key] = other[key]
    return merged

# from https://github.com/svrijenhoek/RADio/blob/main/dart/external/kl_divergence.py
from scipy.stats import entropy
def compute_kl_divergence(s, q, alpha=0.001):
    """
    KL (p || q), the lower the better.
    alpha is not really a tuning parameter, it's just there to make the
    computation more numerically stable.
    """
    try:
        assert 0.99 <= sum(s.values()) <= 1.01
        assert 0.99 <= sum(q.values()) <= 1.01
    except AssertionError:
        print("Assertion Error")
        pass
    ss = []
    qq = []
    merged_dic = opt_merge_max_mappings(s, q)
    for key in sorted(merged_dic.keys()):
        q_score = q.get(key, 0.)
        s_score = s.get(key, 0.)
        ss.append((1 - alpha) * s_score + alpha * q_score)
        qq.append((1 - alpha) * q_score + alpha * s_score)
    return entropy(ss, qq, base=2)

# from https://github.com/svrijenhoek/RADio/blob/main/dart/metrics/calibration.py
# adapted (one function with adjusted parameter)
# note: the original function returns two divergencies:
# divergence_with_discount (adjusted=True),
# divergence_without_discount(adjusted=False)
# I am not clear on the statistical significance of these differences
# or which one is "better"
def topic_divergence(reading_history, recommendation, adjusted):
    freq_rec = compute_distr(recommendation, adjusted)
    freq_history = compute_distr(reading_history, adjusted)
    return compute_kl_divergence(freq_history, freq_rec)

In [8]:
topics_df['calibration_adjusted'] = topics_df.apply(lambda x: topic_divergence(x['clicked_topics'], x['shown_topics'], True), axis=1)
topics_df['calibration_not_adjusted'] = topics_df.apply(lambda x: topic_divergence(x['clicked_topics'], x['shown_topics'], False), axis=1)
topics_df.head()

Unnamed: 0,impression_id,clicked_topics,shown_topics,calibration_adjusted,calibration_not_adjusted
0,1,"[foodanddrink, health, news, travel, sports, l...","[movies, lifestyle, news, tv, lifestyle, fooda...",3.285972,3.031979
1,2,"[tv, tv, sports, news, sports, news, entertain...","[lifestyle, autos, news, travel, video, entert...",1.737349,0.484733
2,3,"[sports, movies, lifestyle, sports, travel, ne...","[lifestyle, sports, movies, news, sports, spor...",1.496836,2.006798
3,4,"[foodanddrink, lifestyle, foodanddrink, tv, ne...","[foodanddrink, lifestyle, weather, autos, vide...",2.416485,3.351102
4,5,"[news, travel, news, news, movies, tv, news, l...","[lifestyle, health, lifestyle, foodanddrink, s...",1.631904,0.566053


In [10]:
topics_df[['calibration_adjusted', 'calibration_not_adjusted']].agg(['mean', 'median'])

Unnamed: 0,calibration_adjusted,calibration_not_adjusted
mean,2.855803,2.417313
median,2.24115,1.78663
