In [1]:
# Path of the word2vec binary that the next cell loads with binary=True.
# NOTE(review): absolute, machine-specific path -- point it at your own copy
# of the vectors before running this notebook on another machine.
MODEL_PATH = "/Users/bhaskar/meltwater/GoogleNews-vectors-negative300.bin"

In [2]:
import gensim
# The word2vec-format loader moved to ``KeyedVectors`` in gensim 1.0 and the
# ``Word2Vec.load_word2vec_format`` classmethod no longer works in later
# releases. Prefer ``KeyedVectors`` when this gensim provides it and fall back
# to ``Word2Vec`` only on releases that predate it. Either object exposes the
# ``similarity`` method this notebook relies on.
model = getattr(
    gensim.models, "KeyedVectors", gensim.models.Word2Vec
).load_word2vec_format(MODEL_PATH, binary=True)

In [71]:
# Directories to evaluate: the driver loop passes each entry to read_data,
# which loads every "*.json" file found directly inside it.
# NOTE(review): absolute, machine-specific paths -- adjust before running elsewhere.
dataset_list = ["/Users/bhaskar/meltwater/codeRepo/nlp-research/influencer-data/selected_influencers_50_2016-02-15",
               "/Users/bhaskar/meltwater/codeRepo/nlp-research/influencer-data/selected_influencers_30_2016-02-12"]

In [72]:
import json
import glob

def read_file(fpath):
    """Open the file at ``fpath`` and return its decoded JSON content."""
    with open(fpath) as handle:
        parsed = json.load(handle)
    return parsed
        
def read_data(path):
    """Load every ``*.json`` file found directly inside the directory ``path``.

    ``glob.glob`` makes no promise about the order of its results, so the
    matching paths are sorted first. That keeps the order of the returned
    list identical from run to run and from machine to machine.

    Returns:
        A list with one decoded JSON object per matching file, in sorted
        path order; an empty list when nothing matches.
    """
    return [read_file(fp) for fp in sorted(glob.glob(path + "/*.json"))]

In [73]:
def record2str(beat, field1, field2):
    """Build a ``<field1 value>_<field2 value>`` label from a record.

    The ``field1`` value is formatted as-is; spaces inside the ``field2``
    value are replaced with hyphens before the two are joined.
    """
    prefix = beat[field1]
    hyphenated = beat[field2].replace(" ", "-")
    return "{}_{}".format(prefix, hyphenated)

In [74]:
def get_edition_beats_dict(inf_files):
    """Map each influencer id to the beats listed across all of its editions.

    Args:
        inf_files: iterable of decoded influencer records; each one provides
            ``record["influencer"]["id"]`` and ``record["editions"]``, where
            every edition has a ``"beats"`` list of ``{"id", "name"}`` items.

    Returns:
        dict: influencer id -> one-element list holding the set of
        ``"<id>_<name>"`` beat labels (the same ``[set]`` shape that
        ``get_enrichment_per_influencer`` produces). The set is empty when
        the influencer has no editions or no beats.
    """
    edition_beats_dict = {}
    for data in inf_files:
        influencer_id = data["influencer"]["id"]
        # A fresh set per influencer, created before the edition loop: an
        # influencer without editions must not fail on an unbound name nor
        # pick up the set left over from the previous influencer.
        ed_beats = set()
        for edition in data["editions"]:
            # Accumulate across editions instead of keeping only the last one.
            ed_beats.update(
                record2str(element, "id", "name") for element in edition["beats"]
            )
        edition_beats_dict[influencer_id] = [ed_beats]
    return edition_beats_dict

In [75]:
def get_influencer_role_id(inf_files):
    """Map each influencer id to the set of ``"<id>_<name>"`` labels of its roles.

    Reads ``influencer.id`` and ``influencer.roles`` from every record in
    ``inf_files``; a later record with the same id replaces the earlier entry.
    """
    roles_by_influencer = {}
    for record in inf_files:
        influencer = record["influencer"]
        key = influencer["id"]
        roles_by_influencer[key] = {
            record2str(role, "id", "name") for role in influencer["roles"]
        }
    return roles_by_influencer

In [76]:
def get_enrichment_per_influencer(data, enrichment_name):
    """Collect the names of one enrichment type for every influencer.

    For each record the ``"name"`` of every item under
    ``record["results"][enrichment_name]`` is gathered into a set.

    Returns:
        dict: influencer id -> one-element list containing that set.
    """
    per_influencer = {}
    for record in data:
        key = record["influencer"]["id"]
        names = {entry["name"] for entry in record["results"][enrichment_name]}
        per_influencer[key] = [names]
    return per_influencer

In [77]:
def get_ground_truth_beat_dict(inf_files):
    """Map each influencer id to the set of its own ``"<id>_<name>"`` beat labels.

    The beats are taken from ``record["influencer"]["beats"]``; a later record
    with the same influencer id replaces the earlier entry.
    """
    beats_by_influencer = {}
    for record in inf_files:
        influencer = record["influencer"]
        key = influencer["id"]
        beats_by_influencer[key] = {
            record2str(beat, "id", "name") for beat in influencer["beats"]
        }
    return beats_by_influencer

Computing the relation between category and ground-truth child beats

In [104]:
import numpy
import re

def prepare_word(word):
    """Normalise a label into the underscore-joined form passed to the similarity lookup.

    Every character that is neither a word character nor whitespace becomes a
    space, underscores are treated as separators as well, and the remaining
    whitespace-separated pieces are joined with single underscores.
    """
    without_punctuation = re.sub(r"[^\w\s]", ' ', word)
    pieces = without_punctuation.replace('_', ' ').split()
    return '_'.join(pieces)


def do_similarity_computation(ground_truth_beats_dict, category_dict,
                              similarity_fn=None, threshold=0.5):
    """Return the percentage of influencers for whom no beat matches any category.

    For every influencer, each beat label has its ``"<id>_"`` prefix removed,
    is normalised with ``prepare_word`` and lower-cased, and is then compared
    with each normalised (case preserved) category of the same influencer.
    A beat counts as matched as soon as one category scores ``threshold`` or
    more.

    Args:
        ground_truth_beats_dict: influencer id -> collection of
            ``"<id>_<name>"`` beat labels.
        category_dict: influencer id -> list whose first element is the
            collection of category names for that influencer.
        similarity_fn: callable ``(beat_word, category_word) -> float``.
            Defaults to the notebook-level ``model.similarity``. A
            ``KeyError`` raised by it (a word missing from the vocabulary)
            is treated as a similarity of 0.0.
        threshold: minimum similarity for a beat to count as matched.

    Returns:
        float: ``100 * (influencers with a 0% match rate) / (influencers
        scored)``, or 0.0 when no influencer could be scored. Influencers
        with no ground-truth beats are skipped, since they have nothing to
        match and would otherwise cause a division by zero.

    Raises:
        KeyError: if a scored influencer id is missing from ``category_dict``.
    """
    if similarity_fn is None:
        # Resolved at call time so the function still uses whatever ``model``
        # the notebook has loaded.
        similarity_fn = model.similarity

    similarity_category = {}
    for key, gt_beats in ground_truth_beats_dict.items():
        if not gt_beats:
            continue
        # Normalise the categories once per influencer, not once per beat.
        category_words = [prepare_word(cat) for cat in category_dict[key][0]]
        matched = 0
        for beat in gt_beats:
            # Drop the "<id>_" prefix; a label without "_" is used whole.
            beat_word = prepare_word(beat[beat.find('_') + 1:]).lower()
            for category_word in category_words:
                try:
                    similarity_score = similarity_fn(beat_word, category_word)
                except KeyError:
                    # Out-of-vocabulary word: no evidence of similarity.
                    similarity_score = 0.0
                if similarity_score >= threshold:
                    matched += 1
                    break
        similarity_category[key] = 100.0 * matched / len(gt_beats)

    if not similarity_category:
        return 0.0
    zero_match = sum(1 for rate in similarity_category.values() if rate == 0.0)
    return 100.0 * zero_match / len(similarity_category)

# One entry per dataset, in dataset_list order: the value returned by
# do_similarity_computation, i.e. the percentage of influencers for whom no
# ground-truth beat reached the similarity threshold against any category.
zero_match_score = []
for dataset in dataset_list:
    print (dataset)
    inf_files = read_data(dataset)
    ground_truth_beats_dict = get_ground_truth_beat_dict(inf_files)
    # Categories are the names listed under results["parentCategories"] of each record.
    category_dict = get_enrichment_per_influencer(inf_files, "parentCategories")
    zero_match_score.append(do_similarity_computation(ground_truth_beats_dict, category_dict))
print (zero_match_score)  


/Users/bhaskar/meltwater/codeRepo/nlp-research/influencer-data/selected_influencers_50_2016-02-15
/Users/bhaskar/meltwater/codeRepo/nlp-research/influencer-data/selected_influencers_30_2016-02-12
[46.808510638297875, 59.25925925925926]
