In [16]:
#DATA_PATH = "/Users/bhaskar/meltwater/codeRepo/nlp-research/influencer-data/2016-02-11"
dataset_list = ["/Users/bhaskar/meltwater/codeRepo/nlp-research/influencer-data/selected_influencers_50_2016-02-15",
               "/Users/bhaskar/meltwater/codeRepo/nlp-research/influencer-data/selected_influencers_30_2016-02-12"]

Functions to help with JSON

In [17]:
import json
import glob

def read_file(fpath):
    with open(fpath) as f:
        return json.load(f)
        
def read_data(path):
    return [read_file(fp) for fp in glob.glob(path + "/*.json")]

Reading all JSON in a list

In [18]:
#inf_files = read_data(DATA_PATH)

In [19]:
def record2str(beat, field1, field2):
    return "{}_{}".format(beat[field1], beat[field2].replace(" ", "-"))

In [20]:
def get_edition_beat_dict(inf_files):
    edition_beats_dict = {}
    for data in inf_files:
        influencer_id = data["influencer"]["id"]
        for edition in data["editions"]:
            ed_beats = set()
            for element in edition["beats"]:
                ed_beats.add(record2str(element, "id", "name"))
        edition_beats_dict[influencer_id] = [ed_beats]
    return edition_beats_dict

In [21]:
def get_influencer_id_role(inf_files):
    influencer_id_role = {}
    for data in inf_files:
        influencer_id = data["influencer"]["id"]
        role = set()
        for element in data["influencer"]["roles"] :
            role.add(record2str(element, "id", "name"))
        influencer_id_role[influencer_id] = role
    return influencer_id_role

We have now the following data structures:
1. edition beasts per influencer
2. role per influencer

We need:
1. aggregated keyphrase per influencer
2. aggregated categories per influencer
3. aggregated ground truth child_beats per influencer

In [22]:
def getEnrichmentPerInfluencer(data, enrichment_name):
    enrichment_dict = {}
    for element in data:
        influencer_id = element["influencer"]["id"]
        enrichment_set = set()
        for enrichment in element["results"][enrichment_name]:
            enrichment_set.add(enrichment["name"])
        enrichment_dict[influencer_id] = [enrichment_set]
    return enrichment_dict

#keyphrase_dict = getEnrichmentPerInfluencer(inf_files, "keyphrases")
#category_dict = getEnrichmentPerInfluencer(inf_files, "parentCategories")

In [23]:
def get_ground_truth_beats_dict(inf_files):
    ground_truth_beats_dict = {}
    for data in inf_files:
        influencer_id = data["influencer"]["id"]
        gt_beats = set()
        for beats in data["influencer"]["beats"]:
            gt_beats.add(record2str(beats, "id", "name"))
        ground_truth_beats_dict[influencer_id] = gt_beats
    return ground_truth_beats_dict

We got all the data that we will need for the experiment

Checking the similarity between edition_beats and ground truth child beat

In [24]:
def robust_intersect(*args):
    try:
        return set.intersection(*args)
    except:
        return {}

In [25]:
def get_role_all_match(influencer_id_role, edition_beats_dict, ground_truth_beats_dict):
    result_sim_ed_beat_gt_beat = {}
    all_match_percent_match_role = []
    for key in influencer_id_role:
        if key in edition_beats_dict:
            ed_beats = edition_beats_dict[key][0]
            gt_beats = ground_truth_beats_dict[key]
            if len(gt_beats) != 0:
                intersection = set.intersection(ed_beats, gt_beats)
                result_sim_ed_beat_gt_beat[key] = 100.0 * len(intersection)/len(gt_beats)
                if result_sim_ed_beat_gt_beat[key] == 100.0 :
                    all_match_percent_match_role.append(influencer_id_role[key])
    return result_sim_ed_beat_gt_beat, all_match_percent_match_role

Computing some stats

In [26]:
def get_stats(result_sim_ed_beat_gt_beat) :
    num_zero = 0
    num_100 = 0
    num_more_50 = 0
    for values in result_sim_ed_beat_gt_beat.values() :
        if values == 0.0:
            num_zero = num_zero + 1
        elif values == 100.0:
            num_100 = num_100 + 1
        elif values >= 50.0 :
            num_more_50 = num_more_50 + 1
    num_zero_match = 0
    num_100_match = 0
    num_50_100_match = 0
    if len(result_sim_ed_beat_gt_beat.keys()) != 0:
        num_zero_match = 100.0 * num_zero/len(result_sim_ed_beat_gt_beat.keys())
        num_100_match = 100.0 * num_100/len(result_sim_ed_beat_gt_beat.keys())
        num_50_100_match = 100.0 * num_more_50/len(result_sim_ed_beat_gt_beat.keys())
        print ("No match:", 100.0 * num_zero/len(result_sim_ed_beat_gt_beat.keys()),
           "100% match:", 100.0 * num_100/len(result_sim_ed_beat_gt_beat.keys()),
          ">=50% < 100% match:", 100.0 * num_more_50/len(result_sim_ed_beat_gt_beat.keys()))
    return [num_zero, num_100, num_50_100_match]

for dataset in dataset_list:
    inf_files = read_data(dataset)
    edition_beat_dict = get_edition_beat_dict(inf_files)
    influencer_role_id = get_influencer_id_role(inf_files)
    ground_truth_beat_dict = get_ground_truth_beats_dict(inf_files)
    result_sim_ed_beat_gt_beat, all_match_role = get_role_all_match(influencer_role_id, edition_beat_dict, ground_truth_beat_dict)
    print (all_match_role)
    print (get_stats(result_sim_ed_beat_gt_beat))


[{'4_Blogger', '51_Managing-Editor'}, {'46_Associate-Editor'}, {'9_Editor'}, {'33_Online-Editor', '28_Reporter'}, {'28_Reporter', '15_Host'}, {'13_Freelance-Journalist'}, {'51_Managing-Editor'}]
No match: 38.297872340425535 100% match: 14.893617021276595 >=50% < 100% match: 17.02127659574468
[18, 7, 17.02127659574468]
[{'4_Blogger', '51_Managing-Editor'}, {'46_Associate-Editor'}, {'13_Freelance-Journalist'}, {'9_Editor'}, {'33_Online-Editor', '28_Reporter'}]
No match: 33.333333333333336 100% match: 18.51851851851852 >=50% < 100% match: 14.814814814814815
[9, 5, 14.814814814814815]
