In [19]:
%pylab inline

import operator
import datetime
from random import randint
import sys
import numpy as np
from tqdm import tqdm
import scipy.spatial
from pyemd import emd
import pandas as pd
from collections import defaultdict
import pickle

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


# Dataset

Crawling plan (by Behrooz): https://docs.google.com/document/d/1D7zdEhEa01CYflIUXHmwzJrmEyGJOD-AQKOluV19C1o/edit

The modified crawling code is in "../crawl".

# Group Discovery

LCM code with instructions: https://github.com/tsudalab/SHIMR/tree/master/code/lcm53

The line used to extract groups:  lcm CfI -l 1 -u 5 csv/pmr.csv 10 csv/groups.csv

# Group Exploration

In [63]:
#some functions on user sets and group sets

def intersect(a, b):
    """Return the elements present in both `a` and `b` as a list.

    Duplicates are dropped and the order of the result is unspecified,
    because the computation goes through Python sets.
    """
    left, right = set(a), set(b)
    common = left & right
    return list(common)

def union(a, b):
    """Return the distinct elements found in `a` or `b` as a list.

    The order of the result is unspecified (set semantics).
    """
    left, right = set(a), set(b)
    merged = left | right
    return list(merged)

def add_to_list(a, b):
    """Concatenate `a` and `b` (with ``+``) and return the result as a new list."""
    combined = a + b
    return list(combined)

def is_inside(small_g, big_g):
    """Tell whether group `small_g` is strictly contained in group `big_g`.

    Reads the module-level `supports_list` and `users_list`. The group counts as
    inside when its support is strictly smaller and merging both member lists
    yields exactly `supports_list[big_g]` distinct users.
    """
    # A group with at least as many members can never be strictly inside.
    if supports_list[small_g] >= supports_list[big_g]:
        return False
    merged_members = union(users_list[small_g], users_list[big_g])
    return len(merged_members) == supports_list[big_g]

def jaccard_sim(a,b):
    """Jaccard similarity |A ∩ B| / |A ∪ B| of two iterables of hashable items.

    Both inputs are converted to sets first, so duplicates are ignored.
    Returns 0.0 when both inputs are empty instead of raising
    ZeroDivisionError (same "empty means 0" convention as `diversity`).
    """
    set_a = set(a)
    set_b = set(b)
    union_size = len(set_a | set_b)
    if union_size == 0:
        return 0.0
    return len(set_a & set_b)/union_size

def coverage(seed_group, groups):
    """Fraction of the seed group's members that appear in at least one of `groups`.

    Reads the module-level `users_list` and `supports_list`; the denominator is
    `supports_list[seed_group]`.
    """
    covered_users = set()
    for g in groups:
        covered_users |= set(users_list[g])
    seed_members = set(users_list[seed_group])
    return len(covered_users & seed_members) / supports_list[seed_group]

def diversity(groups):
    """Ratio of distinct users to total memberships over `groups`.

    1.0 means no user belongs to more than one of the groups; 0 is returned
    when the groups contain no users at all. Reads the module-level `users_list`.
    """
    total_memberships = 0
    distinct_users = set()
    for g in groups:
        members = users_list[g]
        total_memberships += len(members)
        distinct_users.update(members)
    if total_memberships > 0:
        return len(distinct_users) / total_memberships
    return 0

def unique_users(groups):
    """Number of distinct users across `groups` (0 when there are none).

    Reads the module-level `users_list`.
    """
    distinct_users = set()
    for g in groups:
        distinct_users.update(users_list[g])
    return len(distinct_users)

def descr_diversity(groups):
    """Ratio of distinct description items to total items over `groups`.

    Reads the module-level `items_list`. Returns 0 when the groups carry no
    items at all.
    """
    total_items = 0
    distinct_items = set()
    for g in groups:
        description = items_list[g]
        total_items += len(description)
        distinct_items.update(description)
    if total_items > 0:
        return len(distinct_items) / total_items
    return 0
    

def replace(groups, g, new_g):
    """Return a copy of `groups` with the first `g` removed and `new_g` appended.

    The input list is left untouched. Raises ValueError when `g` is absent.
    """
    swapped = groups[:]
    swapped.remove(g)
    return swapped + [new_g]


def fast_jaccard_sim(a,b):
    """Jaccard similarity of two objects that already support ``&`` and ``|`` (e.g. sets).

    Unlike `jaccard_sim`, no conversion to sets is performed. Returns 0.0 when
    the union is empty instead of raising ZeroDivisionError.
    """
    union_size = len(a | b)
    if union_size == 0:
        return 0.0
    return len(a & b)/union_size

In [94]:
def read_groups_data(data, item_names=None, conference_id_threshold=30000):
    """Read LCM output and build dictionaries with group characteristics.

    The file is read as pairs of lines: an item line ``id id ... (support)``
    followed by a line with the ids of the group's users. Groups are numbered
    from 1 in file order; group 0 is added as the group of all users with no
    items.

    Parameters
    ----------
    data : str
        Path of the LCM output file.
    item_names : dict, optional
        Mapping item id -> item name. Defaults to the module-level `items_ids`.
    conference_id_threshold : int, optional
        Items whose id is strictly greater than this value are also recorded
        in `conference_list`.

    Returns
    -------
    (users_list, items_list, conference_list, supports_list, items_count)
        Five dicts keyed by group number.

    Raises
    ------
    ValueError
        If an item line is not followed by a non-empty line of user ids.
    """
    if item_names is None:
        item_names = items_ids

    users_list = {}
    conference_list = {}
    items_list = {}
    supports_list = {}
    items_count = {}

    group_cnt = 1
    max_user_id = 0

    with open(data, 'r') as fin:
        while True:
            line1 = fin.readline()
            if not line1:
                break
            if not line1.strip():
                # Tolerate blank lines (typically a trailing one) instead of crashing.
                continue

            header, _, support_part = line1.partition(' (')
            # split() without argument copes with repeated or leading/trailing blanks.
            ids = [int(id_) for id_ in header.split()]
            items_list[group_cnt] = [item_names[id_] for id_ in ids]
            conference_list[group_cnt] = [item_names[id_] for id_ in ids
                                          if id_ > conference_id_threshold]
            items_count[group_cnt] = len(ids)
            supports_list[group_cnt] = int(support_part.split(')')[0])

            # Do not slice off the first/last character: the last line of the
            # file may lack its newline, which used to corrupt the last user id.
            members = [int(id_) for id_ in fin.readline().split()]
            if not members:
                raise ValueError('group %d in %s has no user line' % (group_cnt, data))
            users_list[group_cnt] = members
            max_user_id = max(max_user_id, max(members))

            group_cnt += 1

    # Group 0: every user id from 0 up to the largest one seen, no items.
    users_list[0] = range(0, max_user_id+1)
    items_list[0] = []
    supports_list[0] = max_user_id+1
    conference_list[0] = []
    items_count[0] = 0

    return users_list, items_list, conference_list, supports_list, items_count

def get_inv_index(items_list, items_ids):
    """Build the inverted index item name -> list of groups described by it.

    Every name occurring as a value of `items_ids` gets an entry (possibly an
    empty list). An item of `items_list` that is not among those names raises
    KeyError.
    """
    items_inv_index = {name: [] for name in items_ids.values()}

    for group, group_items in items_list.items():
        for name in group_items:
            items_inv_index[name].append(group)

    return items_inv_index

# NOTE: disabled helper kept as a plain string literal; it defines nothing at
# run time (the cell merely echoes the string as its output).
'''def get_groups_by_n_items(items_list):
    items_count = {}
    max_l = 0
    for group, items in items_list.items():
        l = len(items)
        if l in items_count:
            items_count[l].append(group)
        else:
            items_count[l] = [group]
            if l>max_l:
                max_l = l
    
    for l in range(100):
        if l not in items_count:
            items_count[l] = []
        
    return items_count'''

'def get_groups_by_n_items(items_list):\n    items_count = {}\n    max_l = 0\n    for group, items in items_list.items():\n        l = len(items)\n        if l in items_count:\n            items_count[l].append(group)\n        else:\n            items_count[l] = [group]\n            if l>max_l:\n                max_l = l\n    \n    for l in range(100):\n        if l not in items_count:\n            items_count[l] = []\n        \n    return items_count'

## Load groups data

In [95]:
%%time

# Item id -> item name; conference and topic mappings are merged into one dict.
items_ids = {}

for mapping_path in ('csv/conference_mapping.csv', 'csv/topic_mapping.csv'):
    with open(mapping_path, 'r') as mapping:
        mapping.readline()  # skip the first (header) line
        for line in mapping:
            split_ = line.split(',')
            # Strip only the newline: slicing with [:-1] cut a real character
            # when the line had no trailing newline or had extra columns.
            items_ids[int(split_[0])] = split_[1].rstrip('\n')

# First column -> second column of user_id_mapping.csv, both parsed as integers.
user_id_mapping = {}

with open("csv/user_id_mapping.csv") as f:
    for line in f:
        parts = line.split(',')
        user_id_mapping[int(parts[0])] = int(parts[1])

data = "csv/groups.csv"
users_list, items_list, conference_list, supports_list, items_count = read_groups_data(data)
groups_by_item = get_inv_index(items_list,items_ids)
#groups_by_n_items = get_groups_by_n_items(items_list)
# Group ids are 0..N-1, so a group's id doubles as its position in this list.
all_groups = list(range(len(items_list)))

CPU times: user 1.06 s, sys: 51.2 ms, total: 1.11 s
Wall time: 1.17 s


## Extract rating distributions

In [96]:
%%time
# pb_by_user_id[user][conference] -> list of the integer values found in the
# fourth CSV column (compared against years when the distributions are built).
pb_by_user_id = {}
with open('csv/publications.csv') as fin:
    fin.readline()  # skip the first (header) line

    for line in tqdm(fin):
        parts = line.split(',')
        raw_id = int(parts[0])
        # Rows of authors absent from the id mapping are ignored.
        if raw_id not in user_id_mapping:
            continue
        id_ = user_id_mapping[raw_id]
        r = int(parts[3])
        conf = parts[2].strip(' ')
        pb_by_user_id.setdefault(id_, {}).setdefault(conf, []).append(r)

744515it [00:02, 271896.85it/s]

CPU times: user 2.27 s, sys: 145 ms, total: 2.42 s
Wall time: 2.79 s





In [97]:
# Inclusive upper year of the first four bins; later years fall into a fifth bin.
YEAR_BIN_UPPER_BOUNDS = (2005, 2010, 2013, 2016)


def year_bin(year):
    """Return the index (0-4) of the bin that `year` falls into."""
    return sum(year > bound for bound in YEAR_BIN_UPPER_BOUNDS)


# distributions_list[g]: normalised 5-bin histogram of the publication years
# of the members of group g.
distributions_list = {}

for g in tqdm(all_groups):
    curr_distr = [0]*(len(YEAR_BIN_UPPER_BOUNDS) + 1)
    group_confs = conference_list[g]  # loop-invariant, looked up once per group
    for id_ in users_list[g]:
        user_pubs = pb_by_user_id[id_]
        # Groups described by conferences only count publications in those
        # conferences; other groups count every publication of the user.
        confs = group_confs if len(group_confs) > 0 else user_pubs
        for conf in confs:
            for r in user_pubs[conf]:
                curr_distr[year_bin(r)] += 1

    num_pb = sum(curr_distr)
    distributions_list[g] = np.array([v/num_pb for v in curr_distr])

100%|██████████| 33302/33302 [00:09<00:00, 3419.40it/s]


## Offline: find relevant groups (run only once)

This part is time-consuming. The Jaccard and EMD similarity indexes, as well as the index of groups nested inside each group, are precomputed and saved to disk.

#### Jaccard

In [27]:
num_users = len(users_list[0])
# group id -> ids of the groups considered similar to it
inv_index = defaultdict(list)

# Member sets in the order of all_groups, built once for the pairwise comparison.
users_by_group_as_list = [set(users_list[g]) for g in all_groups]

In [38]:
# Minimum Jaccard similarity (exclusive) for two groups to be linked.
threshold = 0.1

for i in tqdm(all_groups[:]):
    # Use the precomputed member sets: converting both lists to sets again for
    # every pair dominated the run time.
    a = users_by_group_as_list[i]
    s = np.array([fast_jaccard_sim(a, g) for g in users_by_group_as_list[i+1:]])
    # Positions in `s` are offsets from i+1; shift them back to group ids.
    groups = np.flatnonzero(s > threshold) + i + 1
    inv_index[i].extend(groups)
    # The relation is symmetric: record i for each matching later group too.
    for g in groups:
        inv_index[g].append(i)

100%|██████████| 33302/33302 [20:58<00:00, 26.45it/s] 


In [39]:
# Drop duplicate neighbours (set round-trip, so the order is not preserved).
for key, neighbours in tqdm(inv_index.items()):
    inv_index[key] = list(set(neighbours))

100%|██████████| 33302/33302 [00:16<00:00, 2006.11it/s]


In [40]:
# One line per group, in ascending key order: "<group>:<neighbour> <neighbour> ..."
with open('csv/inverted_index_jaccard.csv', 'w') as fout:
    for key in tqdm(sorted(inv_index)):
        neighbours = ' '.join(map(str, inv_index[key]))
        fout.write(str(key) + ':' + neighbours + '\n')

100%|██████████| 33302/33302 [02:23<00:00, 232.33it/s] 


In [28]:
# Unbind the in-memory index; it has been written to csv/inverted_index_jaccard.csv.
del inv_index

#### Groups inside 

In [39]:
# group id -> ids of the groups related to it by the is_inside() test
inside_index = defaultdict(list)

In [None]:
for i in tqdm(all_groups[:]):
    # Only later groups are tested for being inside group i.
    # NOTE(review): the reverse direction (i inside a later group) is never
    # checked; confirm the group ordering makes that unnecessary.
    s = np.array([is_inside(g, i) for g in all_groups[i+1:]])
    # Positions in `s` are offsets from i+1; shift them back to group ids.
    groups = np.flatnonzero(s > 0) + i + 1
    inside_index[i].extend(groups)
    # The pair is recorded under both groups.
    for g in groups:
        inside_index[g].append(i)

In [None]:
# Drop duplicate entries (set round-trip, so the order is not preserved).
for key, related in tqdm(inside_index.items()):
    inside_index[key] = list(set(related))

In [None]:
# One line per group, in ascending key order: "<group>:<related> <related> ..."
with open('csv/inverted_index_inside.csv', 'w') as fout:
    for key in tqdm(sorted(inside_index)):
        related = ' '.join(map(str, inside_index[key]))
        fout.write(str(key) + ':' + related + '\n')

In [41]:
# Unbind the in-memory index; the loading section reads it back from
# csv/inverted_index_inside.csv.
del inside_index

#### EMD

In [54]:
# group id -> ids of the groups whose distribution is within the EMD threshold
emd_inv_index = defaultdict(list)

In [55]:
# Distributions in the order of all_groups, so they can be sliced by position.
distr_by_group_as_list = [distributions_list[g] for g in all_groups]

In [56]:
# Ground distance between the 5 histogram bins: 0.75 * |i - j|, i.e. 0 on the
# diagonal and 3.0 between the two extreme bins (float64, as a 5x5 matrix).
distance_matrix = 0.75 * np.abs(np.subtract.outer(np.arange(5), np.arange(5))).astype('d')

In [64]:
# Maximum EMD (exclusive) for two groups to be linked.
threshold = 0.05

for i in tqdm(all_groups[:]):
    a = distributions_list[i]
    s = np.array([emd(a, g, distance_matrix) for g in distr_by_group_as_list[i+1:]])
    # Positions in `s` are offsets from i+1; shift them back to group ids.
    groups = np.flatnonzero(s < threshold) + i + 1
    emd_inv_index[i].extend(groups)
    # The relation is symmetric: record i for each matching later group too.
    for g in groups:
        emd_inv_index[g].append(i)

100%|██████████| 33302/33302 [9:08:08<00:00,  1.01it/s]   


In [65]:
# Drop duplicate neighbours (set round-trip, so the order is not preserved).
for key, neighbours in tqdm(emd_inv_index.items()):
    emd_inv_index[key] = list(set(neighbours))

100%|██████████| 33302/33302 [00:07<00:00, 4405.71it/s]


In [66]:
# One line per group, in ascending key order: "<group>:<neighbour> <neighbour> ..."
with open('csv/inverted_index_emd.csv', 'w') as fout:
    for key in tqdm(sorted(emd_inv_index)):
        neighbours = ' '.join(map(str, emd_inv_index[key]))
        fout.write(str(key) + ':' + neighbours + '\n')

100%|██████████| 33302/33302 [01:45<00:00, 315.49it/s] 


## Offline: Topics 

Topics for all the groups are precomputed in processing.ipynb.

---

## If already precomputed: load relevant groups

In [98]:
# Ground distance between the 5 histogram bins: 0.75 * |i - j|, i.e. 0 on the
# diagonal and 3.0 between the two extreme bins (float64, as a 5x5 matrix).
distance_matrix = 0.75 * np.abs(np.subtract.outer(np.arange(5), np.arange(5))).astype('d')

In [99]:
def load_inverted_index(path):
    """Read an index file written as ``<group>:<id> <id> ...`` (one group per line).

    Returns a dict mapping each integer group id to its list of integer ids.
    """
    index = {}
    with open(path) as fin:
        for line in tqdm(fin):
            parts = line.split(':')
            index[int(parts[0])] = [int(g) for g in parts[1].split()]
    return index


# The three precomputed indexes share one file format, so one loader serves all.
inside_index = load_inverted_index('csv/inverted_index_inside.csv')
inverted_index_jaccard = load_inverted_index('csv/inverted_index_jaccard.csv')
inverted_index_emd = load_inverted_index('csv/inverted_index_emd.csv')

33302it [00:00, 48289.18it/s] 
33302it [00:08, 3833.75it/s]
33302it [00:06, 4898.63it/s]


In [100]:
# Number of topics; must match the value used during topic extraction.
num_topics = 10
# NOTE(review): vectors are appended in file order and later indexed by group
# id, which presumably relies on topics.csv listing groups 0..N-1 in order.
topics_list = []

with open('csv/topics.csv') as fin:
    fin.readline()  # skip the first (header) line
    for line in tqdm(fin):
        parts = line.split('"')
        group_id = int(parts[0][:-1])
        # SECURITY: eval() executes whatever the file contains; only use with a
        # trusted topics.csv (ast.literal_eval would suit plain literals).
        raw_topics = eval(parts[1])
        topics = [0.]*num_topics
        for topic in raw_topics:
            val = topic[1]
            # Weights that are not above 0.2 are zeroed out.
            topics[topic[0]] = val*int(val>0.2)
        topics = np.array(topics)
        topics_list.append(topics)

33302it [00:03, 11083.38it/s]


### Exploration operators

In [101]:
# getNext() relevant groups

def get_similar_within(groups_space, seed_group):
    """Return the groups of `groups_space` listed for `seed_group` in `inside_index`.

    `groups_space` is not modified; `intersect` only reads its arguments, so
    the former defensive copy of the whole space was dropped.
    """
    return intersect(groups_space, inside_index[seed_group])

def get_similar_around(groups_space, seed_group):
    """Return the groups of `groups_space` listed for `seed_group` in `inverted_index_jaccard`.

    `groups_space` is not modified; `intersect` only reads its arguments, so
    the former defensive copy of the whole space was dropped.
    """
    return intersect(groups_space, inverted_index_jaccard[seed_group])

def get_similar_emd(groups_space, seed_group):
    """Return the groups of `groups_space` listed for `seed_group` in `inverted_index_emd`.

    `groups_space` is not modified; `intersect` only reads its arguments, so
    the former defensive copy of the whole space was dropped.
    """
    return intersect(groups_space, inverted_index_emd[seed_group])

In [102]:
# Default time budget of the greedy optimisers, in milliseconds (they
# accumulate elapsed milliseconds and stop once this limit is reached).
T = 100

def by_example_around(seed_group,k,lowest_sim=0.2,time_limit=T,all_groups_space=all_groups.copy()):
    """Groups Jaccard-similar to `seed_group`, cut down to `k` by greedy diversity.

    `k == -1` returns every candidate. `lowest_sim` is accepted but not used:
    the similarity threshold was fixed when the index was built. The default
    `all_groups_space` is a snapshot of `all_groups` taken at definition time.
    """
    candidates = get_similar_around(all_groups_space[:], seed_group)

    if k == -1 or len(candidates) <= k:
        return candidates
    return greedy_max_diversity(candidates, k, time_limit)

def by_example_within(seed_group,k,lowest_sim=0.2,time_limit=T,all_groups_space=all_groups.copy()):
    """Groups listed for `seed_group` in the inside index, cut down to `k` by greedy coverage.

    `k == -1` returns every candidate. `lowest_sim` is accepted but not used.
    The default `all_groups_space` is a snapshot of `all_groups` taken at
    definition time.
    """
    candidates = get_similar_within(all_groups_space[:], seed_group)

    if k == -1 or len(candidates) <= k:
        return candidates
    return greedy_max_coverage(candidates, seed_group, k, time_limit)

def by_distribution(seed_group,k,lowest_sim=0.0,time_limit=T,all_groups_space=all_groups.copy()):
    """Groups listed for `seed_group` in the EMD index, cut down to `k` by greedy diversity.

    `k == -1` returns every candidate. `lowest_sim` is accepted but not used.
    The default `all_groups_space` is a snapshot of `all_groups` taken at
    definition time.
    """
    candidates = get_similar_emd(all_groups_space[:], seed_group)

    if k == -1 or len(candidates) <= k:
        return candidates
    return greedy_max_diversity(candidates, k, time_limit)

def by_topic(seed_group, k, time_limit = T, cosine_sim_threshold = 0.1):
    """Groups whose topic vector scores above the threshold against the seed's.

    The score is the plain dot product of the two topic vectors; it equals the
    cosine similarity only for unit-length vectors (NOTE(review): confirm
    whether the vectors are normalised). The seed itself is excluded and
    `k == -1` returns every candidate. Larger results are cut down to `k` by
    greedy description diversity.
    """
    seed_topics = topics_list[seed_group]
    scores = np.array([seed_topics.dot(other) for other in topics_list])
    groups = list(np.flatnonzero(scores > cosine_sim_threshold))

    if seed_group in groups:
        groups.remove(seed_group)

    if k == -1 or len(groups) <= k:
        return groups
    return greedy_max_descr_diversity(groups, k, time_limit)



# Facet name -> the item values belonging to that facet. by_facet() removes a
# facet's values from the seed description and tries each value in turn.
facets = {
          'gender': ['male', 'female'], 
          'seniority': [ 'starting', 'junior', 'senior', 'highly senior', 'confirmed'],
          'productivity': ['active', 'very active', 'productive', 'very productive', 'prolific'],
          'publications': ['very few', 'few', 'fair', 'high', 'very high'],
          'country' : ['North America', 'UK/Ireland', 'Europe', 'Asia', 'Australia',
                        'South America', 'Middle East', 'other country'] 
        }


def by_facet(seed_group, k, facet, groups_space=all_groups, time_limit = T):
    """Return, for each value of `facet`, the best-supported matching group.

    A group matches a value when its description holds that value together
    with every seed item that is not itself a value of the facet. At most one
    group per value is kept (the first one with the largest support). When
    more than `k` groups result (and `k != -1`) they are reduced once by
    `greedy_max_coverage`.

    Raises KeyError when `facet` is not a key of `facets`.
    """
    groups = []
    facet_values = facets[facet]
    # Seed description without any value of the facet being varied.
    items = set(items_list[seed_group]) - set(facet_values)

    for value in facet_values:
        value_groups = groups_space
        for attribute in union(items, [value]):
            value_groups = intersect(value_groups, groups_by_item[attribute])
        if len(value_groups) > 0:
            # max() returns the first maximal element, as the former stable
            # descending sort followed by [0] did.
            groups.append(max(value_groups, key=lambda g: supports_list[g]))

    # Trim once, after all values were examined. This used to sit inside the
    # loop, spending a full time budget each time the list outgrew k.
    if (k != -1) and (len(groups) > k):
        groups = greedy_max_coverage(groups,seed_group,k,time_limit)

    return groups

def by_gender(seed_group, k):
    """Shortcut for `by_facet` on the 'gender' facet with default space and time limit."""
    return by_facet(seed_group, k, 'gender')

def by_country(seed_group, k):
    """Shortcut for `by_facet` on the 'country' facet with default space and time limit."""
    return by_facet(seed_group, k, 'country')

def by_seniority(seed_group, k):
    """Shortcut for `by_facet` on the 'seniority' facet with default space and time limit."""
    return by_facet(seed_group, k, 'seniority')

def by_productivity(seed_group, k):
    """Shortcut for `by_facet` on the 'productivity' facet with default space and time limit."""
    return by_facet(seed_group, k, 'productivity')

def by_conference(seed_group, k, groups_space=all_groups, time_limit = T):
    """Groups of `groups_space` whose description contains every conference of the seed.

    Returns an empty list when the seed has no conference. When more than `k`
    groups qualify (and `k != -1`) they are cut down by greedy diversity.
    """
    confs = conference_list[seed_group]
    groups = groups_space[:] if len(confs) > 0 else []
    for conf in confs:
        groups = intersect(groups, groups_by_item[conf])

    if k == -1 or len(groups) <= k:
        return groups
    return greedy_max_diversity(groups, k, time_limit)
    

In [103]:
#Greedy quality optimization

def greedy_local_search(records, k, time_limit, score, stop_visiting_once = False):
    """Randomised swap search shared by the ``greedy_max_*`` functions.

    Starts from the first `k` records and walks through the remaining ones.
    At each step one randomly chosen member of the current selection is
    replaced by the visited record, and the swap is kept when
    `score(selection)` does not decrease.

    Parameters
    ----------
    records : list
        Candidate records; not modified.
    k : int
        Size of the selection.
    time_limit : float
        Budget in milliseconds of accumulated iteration time.
    score : callable
        Maps a list of records to a comparable quality value.
    stop_visiting_once : bool
        If True, stop after one pass over the candidates; otherwise keep
        cycling through them until the time budget is used up.

    Returns
    -------
    list
        The selected records: `[]` when `k == 0`, a copy of `records` when
        there are at most `k` of them.

    Raises
    ------
    ValueError
        If `k` is negative.
    """
    if k < 0:
        raise ValueError("k must be non-negative")
    # Degenerate cases used to fail inside randint() / records[pointer].
    if k == 0:
        return []
    if len(records) <= k:
        return records[:]

    current_records = records[:k]
    current_score = score(current_records)
    total_time = 0.0
    pointer = k-1

    while total_time < time_limit:
        pointer += 1
        begin_time = datetime.datetime.now()

        # Tentatively drop a random member of the current selection.
        candidate = current_records[:]
        candidate.pop(randint(0,k-1))

        if records[pointer] not in candidate:
            candidate.append(records[pointer])
            candidate_score = score(candidate)
            if candidate_score >= current_score:
                current_records = candidate
                current_score = candidate_score

        # total_seconds() covers the whole interval; .microseconds is only the
        # sub-second component and lost every full second of long iterations.
        elapsed = datetime.datetime.now() - begin_time
        total_time += elapsed.total_seconds() * 1000.0

        if pointer >= len(records)-1:
            if stop_visiting_once:
                break
            pointer = k-1  # start another pass over the remaining candidates

    return current_records


def greedy_max_diversity(records, k, time_limit, stop_visiting_once = False):
    """Select `k` of `records`, greedily maximising `diversity`.

    See `greedy_local_search` for the algorithm and for `time_limit`
    (milliseconds) and `stop_visiting_once`.
    """
    return greedy_local_search(records, k, time_limit, diversity, stop_visiting_once)


def greedy_max_coverage(records, seed_group, k, time_limit, stop_visiting_once = False):
    """Select `k` of `records`, greedily maximising `coverage` of `seed_group`.

    See `greedy_local_search` for the algorithm and for `time_limit`
    (milliseconds) and `stop_visiting_once`.
    """
    return greedy_local_search(records, k, time_limit,
                               lambda selection: coverage(seed_group, selection),
                               stop_visiting_once)


def greedy_max_descr_diversity(records, k, time_limit, stop_visiting_once = False):
    """Select `k` of `records`, greedily maximising `descr_diversity`.

    See `greedy_local_search` for the algorithm and for `time_limit`
    (milliseconds) and `stop_visiting_once`.
    """
    return greedy_local_search(records, k, time_limit, descr_diversity, stop_visiting_once)

In [128]:
def print_groups(current_records):
    """Print one numbered line per group: id, description items and member count.

    Reads the module-level `items_list` and `supports_list`. Prints nothing
    for an empty list.
    """
    for rank, group in enumerate(current_records, start=1):
        print(f"{rank}. G{group}: {items_list[group]}({supports_list[group]} members)")

---

#### Exploration function

In [138]:
def Explore(groups_space, exploration_type, **kwargs):
    """Run one exploration step, print the result and return the groups found.

    Parameters
    ----------
    groups_space : list
        Group ids to explore (copied before use). Not used by 'by-topic'.
    exploration_type : str
        One of 'by-facet', 'by-example-around', 'by-example-within',
        'by-distribution', 'by-topic', 'by-conference'.
    **kwargs
        seed_group : required, except for 'by-facet' where it defaults to 5.
        groups_num : number of groups to return (default 5).
        time_limit : budget of the greedy optimiser (default 1000; for
            'by-topic' the default of `by_topic` applies when omitted).
        lowest_acceptable_similarity : forwarded to the by-example and
            by-distribution operators (default 0.2).
        facet : facet name, 'by-facet' only.

    Returns
    -------
    list
        The selected group ids, or -1 when `exploration_type` is unknown.
    """
    known_types = ('by-facet', 'by-example-around', 'by-example-within',
                   'by-distribution', 'by-topic', 'by-conference')
    if exploration_type not in known_types:
        print('Exploration type not recognised!')
        return -1

    if exploration_type == 'by-facet':
        seed_group = kwargs.get('seed_group', 5)
    else:
        seed_group = kwargs['seed_group']
    k = kwargs.get('groups_num', 5)
    time_limit = kwargs.get('time_limit', 1000)
    lowest_acceptable_similarity = kwargs.get('lowest_acceptable_similarity', 0.2)

    print('seed: '+str(items_list[seed_group]))

    if exploration_type == 'by-facet':
        groups = by_facet(seed_group,k,kwargs.get('facet'),groups_space.copy(),time_limit)
    elif exploration_type == 'by-example-around':
        groups = by_example_around(seed_group,k,lowest_acceptable_similarity,time_limit,groups_space.copy())
    elif exploration_type == 'by-example-within':
        groups = by_example_within(seed_group,k,lowest_acceptable_similarity,time_limit,groups_space.copy())
    elif exploration_type == 'by-distribution':
        groups = by_distribution(seed_group,k,lowest_acceptable_similarity,time_limit,groups_space.copy())
    elif exploration_type == 'by-topic':
        # An explicit time_limit used to be silently dropped here; forward it,
        # keeping by_topic's own default when the caller gave none.
        if 'time_limit' in kwargs:
            groups = by_topic(seed_group,k,kwargs['time_limit'])
        else:
            groups = by_topic(seed_group,k)
    else:  # 'by-conference'
        groups = by_conference(seed_group,k,groups_space.copy(),time_limit)

    print_groups(groups)
    print("- diversity: " + str(diversity(groups)) + " (1.0 being the most diverse)")
    print('- coverage: ', coverage(seed_group,groups))

    return groups

In [146]:
# Demo: 'by-example-within' from seed group 20, asking for 5 groups.
Explore(all_groups, 'by-example-within', seed_group = 20, groups_num = 5, time_limit=200)

seed: ['CIKM', 'PVLDB']
1. G22: ['CIKM', 'PVLDB', 'male'](113 members)
2. G12901: ['female', 'CIKM', 'PVLDB'](22 members)
3. G1627: ['Lecture Notes in Computer Science', 'CIKM', 'ICDE', 'PVLDB', 'male'](27 members)
4. G11868: ['ICDM', 'IEEE Trans. Knowl. Data Eng.', 'CIKM', 'PVLDB', 'CoRR'](21 members)
5. G11869: ['ICDM', 'IEEE Trans. Knowl. Data Eng.', 'CIKM', 'PVLDB', 'male'](21 members)
- diversity: 0.6617647058823529 (1.0 being the most diverse)
- coverage:  1.0


[22, 12901, 1627, 11868, 11869]

In [147]:
# Demo: 'by-example-around' from seed group 20, asking for 5 groups.
Explore(all_groups, 'by-example-around', seed_group = 20, groups_num = 5, time_limit=200)

seed: ['CIKM', 'PVLDB']
1. G39: ['SIGMOD Conference', 'PVLDB', 'male', 'CoRR'](192 members)
2. G16: ['CIKM'](500 members)
3. G25250: ['IEEE Data Eng. Bull.', 'SIGMOD Conference', 'CIKM', 'ICDE', 'PVLDB'](27 members)
4. G15651: ['VLDB J.', 'IEEE Trans. Knowl. Data Eng.', 'prolific', 'CIKM', 'PVLDB'](27 members)
5. G5203: ['EDBT', 'SIGMOD Conference', 'CIKM', 'PVLDB', 'CoRR'](27 members)
- diversity: 0.8240620957309185 (1.0 being the most diverse)
- coverage:  1.0


[39, 16, 25250, 15651, 5203]

In [148]:
# Demo: 'by-topic' from seed group 20, asking for 5 groups.
Explore(all_groups, 'by-topic', seed_group = 20, groups_num = 5, time_limit=200)

seed: ['CIKM', 'PVLDB']
1. G33293: ['GRADES'](10 members)
2. G9495: ['BigData', 'EDBT', 'very productive', 'ICDE', 'PVLDB'](10 members)
3. G13322: ['female', 'highly senior', 'Encyclopedia of Database Systems (2nd ed.)'](18 members)
4. G13610: ['productive', 'WWW (Companion Volume)', 'North America'](10 members)
5. G14392: ['senior', 'IEEE Trans. Knowl. Data Eng.', 'SIGMOD Conference', 'CoRR'](15 members)
- diversity: 1.0 (1.0 being the most diverse)
- coverage:  0.0962962962962963


[33293, 9495, 13322, 13610, 14392]

In [149]:
# Demo: 'by-distribution' from seed group 20, asking for 5 groups.
Explore(all_groups, 'by-distribution', seed_group = 20, groups_num = 5, time_limit=1000)

seed: ['CIKM', 'PVLDB']
1. G22940: ['Asia', 'EDBT', 'IEEE Trans. Knowl. Data Eng.', 'CIKM'](10 members)
2. G8113: ['highly senior', 'KDD', 'North America', 'SIGMOD Conference', 'male'](10 members)
3. G32718: ['Australia', 'Lecture Notes in Computer Science', 'confirmed'](10 members)
4. G4582: ['KDD', 'Europe'](38 members)
5. G22039: ['Inf. Sci.', 'SIGIR', 'CIKM'](15 members)
- diversity: 1.0 (1.0 being the most diverse)
- coverage:  0.17037037037037037


[22940, 8113, 32718, 4582, 22039]

In [151]:
# Demo: 'by-facet' on the 'gender' facet from seed group 20.
Explore(all_groups, 'by-facet',  facet = 'gender', seed_group = 20, groups_num = 5)

seed: ['CIKM', 'PVLDB']
1. G22: ['CIKM', 'PVLDB', 'male'](113 members)
2. G12901: ['female', 'CIKM', 'PVLDB'](22 members)
- diversity: 1.0 (1.0 being the most diverse)
- coverage:  1.0


[22, 12901]

In [152]:
# Demo: 'by-conference' from seed group 10, asking for 5 groups.
Explore(all_groups, 'by-conference', seed_group = 10, groups_num = 5, lowest_acceptable_similarity = 0.0,
        time_limit=200)

seed: ['ICDE', 'male']
1. G25936: ['IEEE Data Eng. Bull.', 'senior', 'ICDE', 'PVLDB', 'CoRR'](10 members)
2. G9221: ['BigData', 'very productive', 'North America', 'ICDE'](11 members)
3. G1083: ['Europe', 'prolific', 'Encyclopedia of Database Systems (2nd ed.)', 'SIGMOD Conference', 'ICDE'](14 members)
4. G32817: ['Australia', 'VLDB J.', 'ICDE', 'PVLDB', 'CoRR'](15 members)
5. G1357: ['IEEE Trans. Knowl. Data Eng.', 'North America', 'Encyclopedia of Database Systems (2nd ed.)', 'confirmed', 'ICDE'](15 members)
- diversity: 1.0 (1.0 being the most diverse)
- coverage:  0.16666666666666666


[25936, 9221, 1083, 32817, 1357]

---

## Learning exploration strategy

#### Use case: PCs gathering 

In [162]:
# Author name variant -> author code. The first author registering a variant
# keeps it, so ambiguous short names resolve to the earliest row of the file.
author_ids = {}

with open("csv/authors_unique.csv") as f_authors:
    f_authors.readline()  # skip the first (header) line
    for line in f_authors:
        parts = line.strip().split(",")
        author_code = parts[0]
        raw_name = parts[1].strip().split(' ')

        # Variants: "first last", plus the first three and first four tokens
        # for longer names.
        name_variants = [raw_name[0] + ' ' + raw_name[-1]]
        if len(raw_name) > 2:
            name_variants.append(' '.join(raw_name[:3]))
        if len(raw_name) > 3:
            name_variants.append(' '.join(raw_name[:4]))

        for author_name in name_variants:
            # Test membership on the dict itself: building list(keys()) for
            # every row made this loop quadratic in the number of authors.
            if author_name not in author_ids:
                author_ids[author_name] = int(author_code)
                

In [164]:
# Names to resolve against author_ids and write to csv/WebDB2017PC.csv.
PC_raw = ['Fei Chiang','Valter Crescenzi','Daniel Deutch','Irini Fundulaki','Luis Galarraga',
          'Melanie Herschel','Aidan Hogan','H. V. Jagadish','Yaron Kanza',
          'Ioana Manolescu','Beng Chin Ooi','Paolo Papotti','Rachel Pottinger','Simon Razniewski', 
          'Sudip Roy','Marc Spaniol','Danai Symeonidou','Saravanan Thirumuruganathan','Yannis Velegrakis']

# One "<author code>,<name>" line per resolved member; unresolved names are
# reported on stdout and left out of the file.
with open("csv/WebDB2017PC.csv", 'w') as PC:
    for full_name in PC_raw:
        tokens = full_name.split(' ')
        short_name = tokens[0] + ' ' + tokens[-1]
        # The name as given is only looked up when it has at most two tokens;
        # otherwise (or if unknown) the "first last" form is used.
        use_full = len(tokens) <= 2 and full_name in author_ids
        name = full_name if use_full else short_name
        if name in author_ids:
            PC.write(str(author_ids[name])+','+name+'\n')
        else:
            print(name + " not found in authors!")


Luis Galarraga not found in authors!
Danai Symeonidou not found in authors!


In [165]:
# Names to resolve against author_ids and write to csv/WebDB2018PC.csv.
PC_raw = ['Matthias Boehm', 'Angela Bonifati','Fei Chiang', 'Xu Chu', 
            'Valter Crescenzi', 'Marina Danilevsky','Luis Galarraga', 'Wolfgang Gatterbauer',
            'Wendy H. Wang', 'Yaron Kanza', 'Guoliang Li', 'Jignesh M. Patel', 'Ioana Manolescu',
            'Amelie Marian', 'Felix Naumann', 'Kun Qian', 'Theodoros Rekatsinas', 
            'Sudip Roy', 'Semih Salihoglu', 'Vasilis Vassalos', 'Yannis Velegrakis', 'Jiannan Wang', 
            'Jun Yang']

# One "<author code>,<name>" line per resolved member; unresolved names are
# reported on stdout and left out of the file.
with open("csv/WebDB2018PC.csv", 'w') as PC:
    for full_name in PC_raw:
        tokens = full_name.split(' ')
        short_name = tokens[0] + ' ' + tokens[-1]
        # The name as given is only looked up when it has at most two tokens;
        # otherwise (or if unknown) the "first last" form is used.
        use_full = len(tokens) <= 2 and full_name in author_ids
        name = full_name if use_full else short_name
        if name in author_ids:
            PC.write(str(author_ids[name])+','+name+'\n')
        else:
            print(name + " not found in authors!")

# Unbind the name lookup and the raw list once both PC files are written.
# NOTE: re-running the PC cells afterwards needs author_ids to be rebuilt first.
del author_ids
del PC_raw

Luis Galarraga not found in authors!


#### Load PCs

In [166]:
# Each PC file row starts with an author code; translate it through
# user_id_mapping (KeyError if the code has no mapping).
with open("csv/WebDB2017PC.csv") as inp:
    PC = [user_id_mapping[int(row.split(',')[0])] for row in inp]

with open("csv/WebDB2018PC.csv") as inp:
    testPC = [user_id_mapping[int(row.split(',')[0])] for row in inp]

#### Environment

In [186]:
class Environment:
    """Group-exploration environment used by the SARSA agent.

    The environment keeps a `display` (a collection of group ids), picks one of
    the displayed groups as the current `seed`, and applies an exploration
    action to that seed to obtain the next display.  The reward of a step is
    the number of not-yet-seen target users contained in the newly chosen seed.

    Relies on notebook-level names: `users_list`, `supports_list`,
    `items_list`, `conference_list`, `distributions_list`, `distance_matrix`,
    `facets`, `all_groups`, `emd`, the `by_*` exploration functions and the
    helpers `union`, `intersect`, `diversity` and `coverage`.

    Parameters
    ----------
    init_display : display used by `__init__` and restored by `reset()`.
    targets      : list of user ids the agent should discover.
    k            : number of groups requested from every exploration action.
    """

    def __init__(self, init_display, targets, k):
        # Minimum share of targets a seed must contain to yield any reward.
        self.C = 0.1
        self.targets = targets
        self.k = k

        self.display_history = []
        self.seed_history = []
        self.action_history = []
        self.reward_history = [0]
        self.seen_targets = []
        self.seen_groups = []
        # Per-target counter, aligned with `targets`.
        self.targets_visits = [0]*len(targets)

        self.init_display = init_display
        self.display = init_display
        self.seed = self.choose_seed()
        self.state = self.get_state()
        # Position in this list == action id used by the policy (2 is undo).
        self.actions = [by_example_around, by_example_within, self.undo, by_gender, by_country,
                        by_seniority, by_productivity, by_distribution, by_topic, by_conference]

    def undo(self, seed, k):
        """Return an earlier display and trim `display_history`.

        `seed` and `k` are ignored; they exist so that undo has the same
        signature as the other exploration actions.  `step()` appends the
        current display before calling the action, so the last history entry
        is the display being left and the one before it is the display to go
        back to (when only one entry exists, that entry is returned).
        """
        new_display = self.display_history[max(-2, -len(self.display_history))]
        self.display_history = self.display_history[:max(-2, -len(self.display_history)+1)]
        return new_display

    def step(self, action):
        """Apply action id `action` to the current seed; return `(state, reward)`."""
        self.seed_history.append(self.seed)
        self.action_history.append(action)
        self.display_history.append(self.display)

        self.display = self.actions[action](self.seed, self.k)
        self.seed = self.choose_seed()
        reward = self.get_reward(action)

        self.reward_history.append(reward)
        # The state is computed before the seen sets are updated below.
        self.state = self.get_state()

        if reward > 0:
            self.seen_targets = union(self.seen_targets, set(intersect(users_list[self.seed], self.targets)))
            self.seen_groups = union(self.seen_groups, [self.seed])

        return self.state, reward

    def choose_seed(self):
        """Pick the next seed among displayed groups that were not rewarded before.

        Groups containing at least one unseen target are preferred; otherwise
        any unseen displayed group is drawn at random.  Returns None when every
        displayed group has already been seen (or the display is empty).
        """
        groups = list(set(self.display) - set(self.seen_groups))

        unseen_targets_in = lambda g: set(intersect(users_list[g], self.targets)) - set(self.seen_targets)
        seed_candidates = [g for g in groups if len(unseen_targets_in(g)) > 0]

        pool = seed_candidates if len(seed_candidates) > 0 else groups
        if len(pool) > 0:
            return np.random.choice(pool)
        return None

    def get_reward(self, action):
        """Reward for the current seed (`action` is unused).

        Returns the number of targets in the seed that are not in
        `seen_targets`, provided targets make up at least `self.C` of the
        seed's users; otherwise 0.  Also increments `targets_visits` for every
        target contained in a qualifying seed.
        """
        if self.seed is not None:
            targets_in_seed = set(intersect(users_list[self.seed], self.targets))

            if len(targets_in_seed)/len(users_list[self.seed]) >= self.C:
                reward = len(intersect(set(users_list[self.seed]) - set(self.seen_targets), self.targets))

                for t in range(len(self.targets)):
                    if self.targets[t] in targets_in_seed:
                        self.targets_visits[t] += 1

                return reward

        return 0

    def get_state(self):
        """Encode the current situation as a list of 54 binary features.

        Layout (slot 53 is unused):
          0-4    diversity of the display, buckets of width 0.2
          5-6    display holds one group / more than one group
          7-10   coverage of the previous seed by the display, buckets of width 0.25
          11-16  support of the seed (<=15, <=50, <=100, <=200, <=500, >500)
          17-19  number of conferences describing the seed (0, 1-2, >2)
          20-22  number of non-conference items describing the seed (0, 1-2, >2)
          23-24  seed contains no / some already-seen targets
          25-27  reference distribution (uniform, all mass first, all mass last)
                 with the largest EMD to the seed's distribution
          28-32  number of seen targets, buckets of width 2
          33-40  presence/absence pairs for the gender, country, seniority and
                 productivity facets in the seed description
          41-42  last reward positive / not positive
          43-52  one-hot of the last action
        """
        new_state = [0]*54

        if len(self.display)>0:
            new_state[min(int(diversity(self.display)/0.2),4)] = 1 #[0,4]
            new_state[5+int(len(self.display)>1)] = 1 #[5,6]
            if len(self.seed_history)>0 and self.seed_history[-1] is not None:
                # Four buckets so the feature stays within slots 7..10: slot 11 is
                # the first support-size slot, and Policy.get_action reads
                # state[11:17] to detect a missing seed.
                cov = coverage(self.seed_history[-1], self.display)
                new_state[7+min(int(cov/0.25),3)] = 1 #[7,10]

        if self.seed is not None:
            new_state[11] = int(supports_list[self.seed] <= 15)
            new_state[12] = int((supports_list[self.seed] > 15) and (supports_list[self.seed] <= 50))
            new_state[13] = int((supports_list[self.seed] > 50) and (supports_list[self.seed] <= 100))
            new_state[14] = int((supports_list[self.seed] > 100) and (supports_list[self.seed] <= 200))
            new_state[15] = int((supports_list[self.seed] > 200) and (supports_list[self.seed] <= 500))
            new_state[16] = int(supports_list[self.seed] > 500)#[11,16]

            num_it = len(conference_list[self.seed])
            new_state[17] = int(num_it == 0)
            new_state[18] = int((num_it > 0) and (num_it <= 2))
            new_state[19] = int(num_it > 2)

            num_it = len(set(items_list[self.seed]) - set(conference_list[self.seed]))
            new_state[20] = int(num_it == 0)
            new_state[21] = int((num_it > 0) and (num_it <= 2))
            new_state[22] = int(num_it > 2)

            div = len(intersect(users_list[self.seed], self.seen_targets))
            new_state[23] = int(div == 0)
            new_state[24] = int(div > 0) #[23,24]

            emds = np.array([emd(distributions_list[self.seed],np.array([1./5,1./5,1./5,1./5,1./5]),distance_matrix),
                    emd(distributions_list[self.seed],np.array([1.,0,0,0,0]),distance_matrix),
                    emd(distributions_list[self.seed],np.array([0,0,0,0,1.]),distance_matrix)])

            new_state[25+np.argmax(emds)] = 1 #[25,27]

            div = len(self.seen_targets)
            new_state[28+min(int(div/2),4)] = 1 #[28,32]

            if len(intersect(facets['gender'], items_list[self.seed]))>0:
                new_state[33] = 1
            else:
                new_state[34] = 1
            if len(intersect(facets['country'], items_list[self.seed]))>0:
                new_state[35] = 1
            else:
                new_state[36] = 1
            if len(intersect(facets['seniority'], items_list[self.seed]))>0:
                new_state[37] = 1
            else:
                new_state[38] = 1
            if len(intersect(facets['productivity'], items_list[self.seed]))>0:
                new_state[39] = 1
            else:
                new_state[40] = 1

        if self.reward_history[-1] > 0:
            new_state[41] = 1
        else:
            new_state[42] = 1

        # NOTE(review): the last action is only encoded once more than two
        # actions have been taken -- confirm this threshold is intended.
        if len(self.action_history) > 2:
            new_state[43+self.action_history[-1]] = 1

        return new_state

    def _start_episode(self, display):
        """Clear all per-episode bookkeeping and start again from `display`."""
        self.display = display
        self.display_history = [self.display]
        self.action_history = []
        self.reward_history = [0]
        self.seen_targets = []
        self.seen_groups = []
        self.targets_visits = [0]*len(self.targets)

        self.seed = self.choose_seed()
        self.seed_history = [self.seed]
        self.state = self.get_state()

        return self.state, 0

    def reset(self):
        """Restart from `init_display`; return `(state, 0)`."""
        return self._start_episode(self.init_display)

    def random_reset(self):
        """Restart from `k` groups drawn at random from `all_groups`; return `(state, 0)`."""
        return self._start_episode(np.random.permutation(all_groups)[:self.k])

#### Agent 

In [187]:
class Policy:
    """Epsilon-greedy policy on top of a linear action-value function.

    The weight vector holds one slice of `len(state)` weights per action;
    `tiling(state, a)` places the state features into the slice of action `a`,
    so Q(state, a) is the sum of `tiling(state, a) * weights`.

    Parameters
    ----------
    init_weights : initial weights, length `len(state) * 10`.
    eps          : probability of taking an exploratory action.
    alpha        : step size of the SARSA updates.
    gamma        : discount factor.
    """

    def __init__(self, init_weights, eps=0.1, alpha=0.01, gamma=0.5):
        self.eps = eps
        self.alpha = alpha
        self.gamma = gamma
        self.weights = np.array(init_weights)
        self.actions = list(range(10))
        # How many times each action has been selected by this policy.
        self.freq = np.zeros(len(self.actions), dtype=int)

    def tiling(self, state, a):
        """Return the feature vector of (state, a): `state` in slot `a`, zeros elsewhere."""
        width = len(state)
        leading = [0] * (width * a)
        trailing = [0] * (width * (len(self.actions) - a - 1))
        return np.array(leading + list(state) + trailing)

    def _q_value(self, state, a):
        """Linear action value of (state, a) under the current weights."""
        return sum(self.tiling(state, a) * self.weights)

    def get_random_action(self, state):
        """Exploratory action: the least frequently selected one (`state` is unused)."""
        least_used = np.argmin(self.freq)
        self.freq[least_used] += 1
        return least_used

    def get_action(self, state):
        """Select an action for `state`.

        Returns 2 (undo) when none of the support-size features state[11:17]
        is set, i.e. when there is no seed.  Otherwise explores with
        probability `eps` and acts greedily (ties broken at random) the rest
        of the time.
        """
        if sum(state[11:17]) == 0: #if seed is None only undo
            return 2

        explore = np.random.choice([0, 1], p = [1 - self.eps, self.eps])
        if explore == 1:
            return self.get_random_action(state)

        # The value of this draw is not used; it is kept so the global RNG
        # stream is consumed in the same way.
        np.random.choice(self.actions)

        q_values = [self._q_value(state, a) for a in self.actions]

        best_q = -np.inf
        best_actions = []
        for a, q in zip(self.actions, q_values):
            if q > best_q:
                best_q, best_actions = q, [a]
            elif q == best_q:
                best_actions.append(a)

        greedy = np.random.choice(best_actions)
        self.freq[greedy] += 1
        return greedy

    def sarsa_update_weights(self,s,a,r,new_s,new_a):
        """Semi-gradient SARSA step for the transition (s, a, r, new_s, new_a)."""
        features = self.tiling(s, a)
        td_error = r + self.gamma * self._q_value(new_s, new_a) - self._q_value(s, a)
        self.weights = self.weights + self.alpha * td_error * features

    def sarsa_terminal_update(self,s,a,r):
        """SARSA step without bootstrapping: move Q(s, a) towards `r`."""
        features = self.tiling(s, a)
        error = r - self._q_value(s, a)
        self.weights = self.weights + self.alpha * error * features

#### SARSA

In [188]:
# Human-readable names of the action ids, used for verbose printing.
action_names = {0:'by_example_around', 1:'by_example_within', 2:'undo',
                3:'by_gender', 4:'by_country', 5:'by_seniority', 6:'by_productivity',
                7:'by_distribution', 8: 'by-topic', 9:'by-conference'}

def semi_gradient_sarsa(environment, policy, episodes_num = 100, episode_length = 15, verbose = False):
    """Run episodic semi-gradient SARSA and update `policy` in place.

    An episode ends when the accumulated reward reaches half the number of
    targets (goal) or after `episode_length` steps (time-out).

    Parameters
    ----------
    environment    : object with `reset()`, `step(a)`, `targets`, `display`,
                     `seed` and `targets_visits`.
    policy         : object with `get_action(s)`, `sarsa_update_weights(...)`,
                     `sarsa_terminal_update(...)` and `weights`.
    episodes_num   : number of episodes to run.
    episode_length : maximum number of steps per episode.
    verbose        : print every step (action, display, seed, reward).

    Returns
    -------
    rewards : list with the total reward of every episode.
    history : dict with, per episode, the list of weight vectors ('weights'),
              the list of step rewards ('rewards') and the zero-based index of
              the last executed step ('steps', -1 if no step was executed).
    """
    history = {'weights': [], 'rewards' : [], 'steps': []}
    rewards = []
    for episode in range(episodes_num):
        episode_rewards = []
        s, r = environment.reset()
        a = policy.get_action(s)

        episode_weights = [policy.weights]

        t = -1  # stays -1 when episode_length == 0
        for t in range(episode_length):
            new_s, r = environment.step(a)

            if verbose:
                print('Action: ', action_names[a])
                print('Display:')
                for i,g in enumerate(environment.display):
                    print(i, ': ', items_list[g], len(users_list[g]), len(intersect(users_list[g],environment.targets)))

                if environment.seed is None:
                    print('Seed: None')
                else:
                    print('Seed: ', items_list[environment.seed])
                    print('size: ', len(users_list[environment.seed]),
                          ', targets: ', len(intersect(users_list[environment.seed],environment.targets)))

                print('Reward: ', r)

            # Each step reward is recorded exactly once.
            episode_rewards.append(r)

            if sum(episode_rewards) >= len(environment.targets)/2:
                # Goal reached: the transition (s, a) led to a terminal state,
                # so it is updated without bootstrapping.
                policy.sarsa_terminal_update(s,a,r)
                episode_weights.append(policy.weights)
                break

            new_a = policy.get_action(new_s)
            policy.sarsa_update_weights(s,a,r,new_s,new_a)
            episode_weights.append(policy.weights)
            s = new_s
            a = new_a
        # On time-out every executed transition has already been updated above;
        # no update is applied to the (state, action) pair that was never executed.

        rewards.append(sum(episode_rewards))
        history['weights'].append(episode_weights)
        history['rewards'].append(episode_rewards)
        history['steps'].append(t)

        if verbose:
            print(environment.targets_visits)

    return rewards, history

#### Offline learning

In [194]:
# Size of the state vector produced by Environment.get_state and number of actions.
num_fet, num_actions = 54, 10

# Training uses the 2017 PC as targets, evaluation the 2018 PC.
environment = Environment(init_display=[0], targets=PC, k=5)
test_environment = Environment(init_display=[0], targets=testPC, k=5)

# Both policies start from all-zero weights; the test policy never learns (alpha = 0)
# and never explores (eps = 0).
new_policy = Policy(init_weights=[0]*(num_fet*num_actions), eps=1.0, alpha=0.002, gamma=0.5)
test_policy = Policy(init_weights=[0]*(num_fet*num_actions), eps=0.0, alpha=0.0, gamma=0.5)

new_rewards, new_steps = [], []
test_rewards, test_steps = [], []

new_history = {key: [] for key in ('weights', 'rewards', 'steps')}
test_history = {key: [] for key in ('weights', 'rewards', 'steps')}

In [195]:
# Accumulators filled by the learning loop below: one entry per learning curve
# (training steps, test steps and the final weight vector of each run).
learning_curves_train, learning_curves_test, learned_weights = [], [], []

In [None]:
curves_number = 10
episodes_number = 500

for i in tqdm(range(curves_number)):
    # Every learning curve starts from an all-zero weight vector.
    new_policy = Policy(init_weights = [0]*(num_fet*num_actions))
    test_policy = Policy(init_weights = [0]*(num_fet*num_actions))
    new_rewards = []
    new_steps = []
    test_rewards = []
    test_steps = []

    # Exploration rate decays as 10/(10 + episode): 1.0 for the first episode,
    # about 0.02 for the last one.
    for eps in tqdm(10./np.array(range(10,episodes_number+10,1))):
        # The action-frequency counter is carried over from policy to policy;
        # the test policy shares (and therefore also increments) the same array.
        freq = new_policy.freq

        # Evaluate the current weights on the test targets without learning (alpha = 0).
        test_policy = Policy(init_weights = list(new_policy.weights[:]), eps = eps, alpha = 0.0, gamma = 0.5)
        test_policy.freq = freq
        rewards, history = semi_gradient_sarsa(test_environment, test_policy, episodes_num = 1, episode_length = 100)
        test_rewards += rewards
        # Needed by the 'test' curve plotted and stored below.
        test_steps += history['steps']

        # One training episode on the training targets.
        new_policy = Policy(init_weights = list(new_policy.weights[:]), eps = eps, alpha = 0.002, gamma = 0.5)
        new_policy.freq = freq
        rewards, history = semi_gradient_sarsa(environment, new_policy, episodes_num = 1, episode_length = 100)
        new_rewards += rewards
        new_steps += history['steps']

    # Moving average over 10 episodes of the recorded step index per episode.
    plt.figure(figsize=(12,6))
    plt.plot([np.mean(new_steps[j:j+10]) for j in range(len(new_steps)-10)], label = 'train')
    plt.plot([np.mean(test_steps[j:j+10]) for j in range(len(test_steps)-10)], label = 'test')
    plt.xlabel('episode')
    plt.ylabel('steps per episode (10-episode moving average)')

    plt.legend()
    plt.grid()
    plt.show()

    learning_curves_train.append(new_steps)
    learning_curves_test.append(test_steps)
    learned_weights.append(list(new_policy.weights[:]))

In [None]:
# Persist the learning curves and the learned weights of every run.
for path, payload in (('steps_train.pickle', learning_curves_train),
                      ('steps_test.pickle', learning_curves_test),
                      ('steps_learned_weights.pickle', learned_weights)):
    with open(path, 'wb') as f:
        pickle.dump(payload, f)

In [151]:
# Print a session: replay one short episode on the test targets with the learned
# weights, acting greedily (eps = 0) and without learning (alpha = 0).
new_policy = Policy(
    init_weights=list(new_policy.weights[:]),
    eps=0.00,
    alpha=0.0,
    gamma=0.0,
)
rewards, history = semi_gradient_sarsa(
    test_environment,
    new_policy,
    episodes_num=1,
    episode_length=10,
    verbose=True,
)

Action:  by_distribution
Display:
0 :  ['VLDB J.', 'Europe', 'confirmed', 'ICDE', 'male'] 12 0
1 :  ['IEEE Data Eng. Bull.', 'highly senior', 'prolific', 'PVLDB'] 12 1
2 :  ['SAC', 'confirmed', 'CIKM', 'male'] 10 0
3 :  ['very productive', 'North America', 'PVLDB', 'male'] 51 5
4 :  ['female', 'confirmed', 'ICDE'] 27 1
Seed:  ['female', 'confirmed', 'ICDE']
size:  27 , targets:  1
Reward:  0
Action:  by-conference
Display:
0 :  ['highly senior', 'IEEE Trans. Knowl. Data Eng.', 'North America', 'ICDE'] 14 0
1 :  ['highly senior', 'IEEE Trans. Knowl. Data Eng.', 'Europe', 'ICDE', 'male'] 10 2
2 :  ['VLDB J.', 'EDBT', 'North America', 'ICDE', 'male'] 11 0
3 :  ['Knowl. Inf. Syst.', 'WSDM', 'prolific', 'ICDE'] 10 0
4 :  ['Australia', 'Lecture Notes in Computer Science', 'prolific', 'ICDE', 'CoRR'] 10 0
Seed:  ['highly senior', 'IEEE Trans. Knowl. Data Eng.', 'Europe', 'ICDE', 'male']
size:  10 , targets:  2
Reward:  2
Action:  by-conference
Display:
0 :  ['very productive', 'IEEE Trans. Kn

Action:  by-topic
Display:
0 :  ['T. Large-Scale Data- and Knowledge-Centered Systems', 'female'] 10 0
1 :  ['GLOBECOM'] 25 0
2 :  ['CIDR', 'junior', 'North America'] 11 1
3 :  ['IEEE Data Eng. Bull.', 'productive'] 17 0
4 :  ['ICDCS', 'prolific', 'male'] 40 0
Seed:  ['CIDR', 'junior', 'North America']
size:  11 , targets:  1
Reward:  0
Action:  by-conference
Display:
0 :  ['CIDR', 'very productive', 'SIGMOD Conference', 'male', 'CoRR'] 34 3
1 :  ['CIDR', 'female', 'SIGMOD Conference', 'ICDE'] 10 1
2 :  ['CIDR', 'very active'] 16 1
3 :  ['CIDR', 'productive', 'male'] 18 1
4 :  ['CIDR', 'prolific', 'male', 'CoRR'] 36 1
Seed:  ['CIDR', 'very active']
size:  16 , targets:  1
Reward:  0
Action:  by-conference
Display:
0 :  ['CIDR', 'highly senior', 'prolific', 'CoRR', 'PVLDB'] 15 1
1 :  ['CIDR', 'highly senior', 'very productive', 'PVLDB', 'male'] 11 0
2 :  ['CIDR', 'very active', 'North America'] 11 1
3 :  ['starting', 'CIDR', 'CoRR'] 10 1
4 :  ['CIDR', 'North America', 'confirmed', 'CoRR

Seed:  ['very productive', 'CoRR']
size:  412 , targets:  8
Reward:  0
Action:  by-conference
Display:
0 :  ['ECIR', 'highly senior', 'Lecture Notes in Computer Science', 'male', 'CoRR'] 11 0
1 :  ['senior', 'ECIR', 'prolific', 'CoRR'] 10 0
2 :  ['ICWSM', 'WWW (Companion Volume)', 'Europe', 'prolific', 'CoRR'] 10 0
3 :  ['ICWSM', 'very productive', 'WWW (Companion Volume)', 'male', 'CoRR'] 10 0
4 :  ['ICWSM', 'WWW', 'North America', 'confirmed', 'CoRR'] 16 0
Seed:  ['ECIR', 'highly senior', 'Lecture Notes in Computer Science', 'male', 'CoRR']
size:  11 , targets:  0
Reward:  0
Action:  by-conference
Display:
0 :  ['ECIR', 'Lecture Notes in Computer Science', 'Europe', 'CIKM', 'CoRR'] 10 0
1 :  ['ECIR', 'highly senior', 'Lecture Notes in Computer Science', 'CIKM', 'CoRR'] 10 0
2 :  ['ECIR', 'CEUR Workshop Proceedings', 'Lecture Notes in Computer Science', 'CoRR'] 11 0
3 :  ['ECIR', 'CEUR Workshop Proceedings', 'Lecture Notes in Computer Science', 'male', 'CoRR'] 10 0
4 :  ['ECIR', 'Lect

#### Data for the reward constant C

In [None]:
%%time 
# NOTE: this rebinds the notebook-level `PC` to the test targets for the rest of the session.
PC = testPC
num_seen_users = []
num_seen_targets = []
num_target_groups = []
num_targets = len(PC)
C = np.arange(0,0.4,0.005)

# The share of targets inside a group does not depend on the threshold,
# so it is computed once instead of once per value of c.
group_target_ratios = [(g, len(intersect(users_list[g], PC))/supports_list[g]) for g in all_groups]

for c in tqdm(C):
    # Despite its name this set collects ALL users of the qualifying groups;
    # the targets among them are counted separately below.
    discoverable_targets = set()
    num_g = 0
    for g, ratio in group_target_ratios:
        if ratio > c:
            discoverable_targets.update(users_list[g])
            num_g += 1
    num_target_groups.append(num_g)
    num_seen_users.append(len(discoverable_targets))
    num_seen_targets.append(len(intersect(discoverable_targets,PC)))
    print(num_seen_users[-1], num_seen_targets[-1])

In [50]:
# Persist the three series computed for the reward-constant sweep.
for path, payload in (('pickles/WebDB2018_num_users.pickle', num_seen_users),
                      ('pickles/WebDB2018_num_targets.pickle', num_seen_targets),
                      ('pickles/WebDB2018_num_target_groups.pickle', num_target_groups)):
    with open(path, 'wb') as f:
        pickle.dump(payload, f)

### Generate text for topic extraction

In [204]:
%%time
# Map every known user to a space-separated string of paper ids.
# A paper id is the running index `r` of the row among the rows that belong to
# known users (rows of users missing from `user_id_mapping` are skipped and do
# not advance the counter).
all_pb_by_user_id = {}
pb_list = []

with open('csv/publications.csv') as fin:
    fin.readline()  # skip the header line
    r = 0
    for line in fin:
        parts = line.split(',')
        id_ = int(parts[0])

        if id_ not in user_id_mapping:
            continue

        id_ = user_id_mapping[id_]
        all_pb_by_user_id[id_] = all_pb_by_user_id.get(id_, '') + str(r) + ' '
        pb_list.append(r)
        r += 1

CPU times: user 1.38 s, sys: 93.8 ms, total: 1.48 s
Wall time: 1.68 s


In [205]:
%%time
# Map every known user to {conference: space-separated string of paper ids}.
# Paper ids are the running index `r` over the rows of known users, i.e. the
# same numbering as in the cell that builds `all_pb_by_user_id`.
pb_by_user_id = {}

with open('csv/publications.csv') as fin:
    fin.readline()  # skip the header line
    r = 0
    for line in fin:
        parts = line.split(',')
        id_ = int(parts[0])

        if id_ not in user_id_mapping:
            continue

        id_ = user_id_mapping[id_]
        conf = parts[2].strip(' ')

        by_conf = pb_by_user_id.setdefault(id_, {})
        by_conf[conf] = by_conf.get(conf, '') + str(r) + ' '

        r += 1


CPU times: user 1.5 s, sys: 67 ms, total: 1.56 s
Wall time: 1.83 s


In [52]:
# Export, for every group, the ids of the papers that make up its text:
# papers of the group's users in the group's conferences, or all papers of the
# users when the group description contains no conference.
G = range(len(users_list))
texts = {}

with open('context/text_processing/groupid_paperid.csv', 'w') as fout:
    fout.write('group_id;items;users;paper_ids\n')
    for g in tqdm(G):
        num_confs = len(conference_list[g])
        chunks = []
        for userid in users_list[g]:
            if num_confs == 0:
                chunks.append(all_pb_by_user_id[userid])
            else:
                for conf in conference_list[g]:
                    chunks.append(pb_by_user_id[userid][conf])

        # Every chunk ends with a space; drop the trailing one.
        paper_ids = ''.join(chunks)[:-1]
        fout.write(';'.join([str(g), str(items_list[g]), str(users_list[g]), paper_ids]) + '\n')

        # Only an empty placeholder is kept per group once its row is written.
        texts[g] = ''


100%|██████████| 33302/33302 [00:04<00:00, 7395.15it/s] 
