<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Load-all-submissions-to-dictionaries" data-toc-modified-id="Load-all-submissions-to-dictionaries-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load all submissions to dictionaries</a></span></li><li><span><a href="#Select-fixed-number-of-articles,-baseed-on-10-that-appear-in-all" data-toc-modified-id="Select-fixed-number-of-articles,-baseed-on-10-that-appear-in-all-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Select fixed number of articles, baseed on 10 that appear in all</a></span></li><li><span><a href="#Get-all-across-submissions" data-toc-modified-id="Get-all-across-submissions-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Get all across submissions</a></span></li><li><span><a href="#Write-out-to-file---All-counts-across-CSV" data-toc-modified-id="Write-out-to-file---All-counts-across-CSV-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Write out to file - All counts across CSV</a></span></li><li><span><a href="#Write-out-to-file---By-publication-groupings" data-toc-modified-id="Write-out-to-file---By-publication-groupings-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Write out to file - By publication groupings</a></span></li><li><span><a href="#Write-out-to-file---Sampled-publications" data-toc-modified-id="Write-out-to-file---Sampled-publications-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Write out to file - Sampled publications</a></span></li></ul></div>

In [1]:
import json
import os
from collections import Counter
import random
import csv
from shutil import copyfile

In [2]:
# Run configuration: where submission results are read from, where the source
# documents live, and where the qualitative-review output is written.

# Earlier relative location of the results (kept for reference).
#inputs_dir = './methods_fields_mentions/'

# dev fold results folder.
inputs_dir = "/work/evaluate/rcc-14/2018.11.19/results"
# First path component containing 'rcc' ('rcc-14' for the path above);
# raises IndexError if no component matches.
project_label = [i for i in inputs_dir.split('/') if 'rcc' in i][0]
# Root directory holding the text/ and pdf/ copies of each publication.
# NOTE(review): this "dev" path contains 'holdout' while the commented-out
# "holdout" path below does not -- confirm the two labels are not swapped.
files_path = '/work/evaluate/data/holdout/data/input/files/'

# holdout fold results folder.
#inputs_dir = "/work/evaluate/rcc-14/2018.11.19/results"
# project_label = "rcc-14"
#files_path = '/work/evaluate/data/input/files'

# List of (submission_dir, project_label) tuples to process.
# Alternative: treat every non-hidden entry of inputs_dir as its own submission.
#all_submissions = [ ( f, f ) for f in os.listdir(inputs_dir) if not f.startswith('.')]
all_submissions = [ ( inputs_dir, project_label ) ]

# configs for output
#output_main_dir = './evaluate'
output_main_dir = '/work/evaluate/rcc-14/2018.11.19/evaluate/qualitative'

# Sample size for the (currently commented-out) random publication sampling.
TOTAL_RANDOM_PUBS = 5
# Maximum number of shared publications selected for review.
TOTAL_NUM_PUBS = 10

In [3]:
# function to create a path if it does not exist
def make_path(path_to_make):
    """Make sure ``path_to_make`` exists and hand it back.

    Prints one line saying whether the path was created or was already
    present. Intermediate directories are created as needed.
    """
    if os.path.exists(path_to_make):
        print("{} already exists".format(path_to_make))
        return path_to_make

    print("Creating path {}".format(path_to_make))
    os.makedirs(path_to_make)
    return path_to_make


def clean_text(s):
    """Normalise ``s`` for comparison; currently this only lower-cases it."""
    return s.lower()


def get_all(data_file, input_type):
    """Return the cleaned ``input_type`` value of every record in a JSON file.

    Args:
        data_file: path to a JSON file containing a list of dict records.
        input_type: key to read from each record (e.g. ``'method'``).

    Returns:
        A list with one ``clean_text``-normalised string per record, in file
        order (duplicates are kept so they can be counted).

    Raises:
        OSError: if the file cannot be opened.
        KeyError: if a record has no ``input_type`` key.
    """
    # Use a context manager so the handle is closed deterministically, and
    # read as UTF-8 (the JSON interchange encoding) instead of the platform
    # default.
    with open(data_file, 'r', encoding='utf-8') as infile:
        data = json.load(infile)
    return [clean_text(record[input_type]) for record in data]


def get_all_by_pub(data_file, input_type):
    """Group the cleaned ``input_type`` values of a JSON file by publication.

    Args:
        data_file: path to a JSON file containing a list of dict records,
            each with a ``'publication_id'`` key.
        input_type: key to read from each record (e.g. ``'mention'``).

    Returns:
        A dict mapping each ``publication_id`` (as it appears in the file) to
        the sorted list of ``clean_text``-normalised values found for it.
        Keys appear in order of first occurrence in the file.

    Raises:
        OSError: if the file cannot be opened.
        KeyError: if a record lacks ``'publication_id'`` or ``input_type``.
    """
    # Context manager closes the handle; UTF-8 is the JSON interchange encoding.
    with open(data_file, 'r', encoding='utf-8') as infile:
        data = json.load(infile)

    all_found_by_pub = {}
    for record in data:
        all_found_by_pub.setdefault(record['publication_id'], []).append(
            clean_text(record[input_type])
        )

    # sort each list
    return {pub_id: sorted(values) for pub_id, values in all_found_by_pub.items()}


def get_random_pubs(data, num):
    """Pick a random sample of publication ids for every submission.

    Args:
        data: dict mapping a submission label to a dict that has a
            ``'by_pubs'`` mapping keyed by publication id.
        num: desired number of publications per submission.

    Returns:
        A dict mapping each submission label to a list of up to ``num``
        distinct publication ids drawn without replacement.
    """
    random_pub_listing = {}
    for submission, results in data.items():
        # dict keys are already unique, so no set() round-trip is needed
        available_list = list(results['by_pubs'])
        # random.sample raises ValueError when asked for more items than
        # exist; clamp so small submissions simply return everything they have.
        sample_size = min(num, len(available_list))
        random_pub_listing[submission] = random.sample(available_list, sample_size)
    return random_pub_listing


def get_all_common_pubids(data):
    """Return the publication ids present in every non-empty submission.

    Args:
        data: dict mapping a submission label to a dict that has a
            ``'by_pubs'`` mapping keyed by publication id.

    Returns:
        A list of the shared publication ids converted to ``int``, in no
        particular order. Submissions with no publications are ignored; the
        result is ``[]`` when no submission has any.

    Raises:
        ValueError: if a shared publication id cannot be converted to int.
    """
    # The previous filter `is not []` compared identity with a fresh list and
    # was therefore always true; test for emptiness instead so a submission
    # without results does not wipe out the intersection.
    all_pubs_separate = [v['by_pubs'] for v in data.values() if v['by_pubs']]
    if not all_pubs_separate:
        return []

    common_pubs = set(all_pubs_separate[0]).intersection(*all_pubs_separate)
    return [int(pub_id) for pub_id in common_pubs]

def select_set_of_n_pubs(pub_lists, num):
    """Select up to ``num`` publication ids that occur in every non-empty list.

    Args:
        pub_lists: iterable of lists of publication ids; empty lists are
            ignored rather than forcing an empty intersection.
        num: maximum number of ids to return.

    Returns:
        A list of at most ``num`` ids common to all non-empty lists, in set
        iteration order. Returns ``[]`` when every list is empty.
    """
    non_empty = [pubs for pubs in pub_lists if len(pubs) > 0]
    if not non_empty:
        # nothing to intersect; indexing non_empty[0] would raise IndexError
        return []

    top_num = list(set(non_empty[0]).intersection(*non_empty))
    return top_num[:num]

def get_all_across_submissions(submission_data):
    """Merge the counts and per-publication results of all submissions.

    Args:
        submission_data: dict mapping a submission label to a dict with
            ``'all_counts'`` (sequence of ``(term, count)`` pairs) and
            ``'by_pubs'`` (mapping of publication id to list of terms).

    Returns:
        A ``(all_counts_across, per_pub_across)`` tuple:
        * ``all_counts_across`` -- list of ``(term, total_count)`` pairs,
          highest count first (ties keep first-seen order);
        * ``per_pub_across`` -- list of ``(publication_id, sorted_terms)``
          pairs ordered by ``int(publication_id)``.

    Raises:
        ValueError: if a publication id cannot be converted to int for sorting.
    """
    all_counts_across = {}
    per_pub_across = {}
    for submission in submission_data.values():
        for entry in submission['all_counts']:
            term, count = entry[0], entry[1]
            all_counts_across[term] = all_counts_across.get(term, 0) + count

        # Distinct loop names (the outer pair is no longer rebound) and a
        # fresh list per publication, so the caller's lists are never aliased.
        for pub_id, terms in submission['by_pubs'].items():
            per_pub_across.setdefault(pub_id, []).extend(terms)

    # Sort once after everything is merged instead of once per submission.
    merged_per_pub = [(pub_id, sorted(terms)) for pub_id, terms in per_pub_across.items()]

    all_counts_across = sorted(all_counts_across.items(), key=lambda kv: kv[1], reverse=True)
    per_pub_across = sorted(merged_per_pub, key=lambda kv: int(kv[0]))
    return (all_counts_across, per_pub_across)
        
    
def write_to_csv(path, file, header, data):
    """Write ``header`` followed by the rows of ``data`` to ``path/file`` as CSV.

    Args:
        path: directory to write into (must already exist).
        file: name of the CSV file.
        header: sequence of column names written as the first row.
        data: iterable of row sequences.
    """
    filepath = os.path.join(path, file)
    # newline='' lets the csv module control line endings itself; without it
    # every row ends in '\r\r\n' on Windows and embedded newlines are mangled.
    with open(filepath, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)
        writer.writerows(data)
            
            
def write_to_json(path, file, data):
    """Serialise ``data`` as 4-space-indented JSON into ``path/file`` (UTF-8)."""
    with open(os.path.join(path, file), 'w', encoding='utf-8') as out:
        json.dump(data, out, indent=4)
        
        
def write_list_to_txt(path, file, list_of_items):
    """Write each item of ``list_of_items`` on its own line in ``path/file``."""
    destination = os.path.join(path, file)
    with open(destination, 'w', encoding='utf-8') as out:
        out.writelines("{}\n".format(entry) for entry in list_of_items)
            
def generate_scoring_sheet(file_path, pub_list):
    """Write a blank judges' scoring sheet as CSV to ``file_path``.

    The sheet has one column per task (fields, methods, mentions), a first
    row for the overall frequency score, one row per publication in
    ``pub_list``, a blank spacer row and a final ``totals`` row. All score
    cells are left empty.

    Args:
        file_path: full path of the CSV file to create (overwritten if present).
        pub_list: iterable of publication ids, one scoring row each.
    """
    blank_scores = ['', '', '']
    # newline='' and an explicit encoding match write_to_csv and avoid
    # '\r\r\n' line endings on Windows.
    with open(file_path, 'w', newline='', encoding='utf-8') as ofile:
        writer = csv.writer(ofile)
        writer.writerow(['', 'fields', 'methods', 'mentions'])
        writer.writerow(['submission_overall_frequencies (1-5)'] + blank_scores)
        for pub in pub_list:
            writer.writerow(["{} (-1, 0, 1)".format(pub)] + blank_scores)
        writer.writerow([''] + blank_scores)
        writer.writerow(['totals'] + blank_scores)

### Load all submissions to dictionaries

In [4]:
# Per-task stores, keyed by project label. Each value ends up as
# {'all_counts': [(term, count), ...], 'by_pubs': {publication_id: [terms]}}.
all_fields = {}
all_methods = {}
all_mentions = {}
for submission_tuple in all_submissions:

    submission_dir, project_label = submission_tuple

    # instantiate place to store all result data by submission type
    all_fields[project_label] = {}
    all_methods[project_label] = {}
    all_mentions[project_label] = {}

    # build the paths to the submission files
    # (os.path.join discards inputs_dir when submission_dir is absolute, so
    # both absolute and inputs_dir-relative submission dirs work)
    fields_data = os.path.join(inputs_dir, submission_dir, 'research_fields.json')
    methods_data = os.path.join(inputs_dir, submission_dir, 'methods.json')

    # The mentions file is looked up under two names. Probe the first and
    # fall back to the second only when it cannot be opened; catching OSError
    # (rather than everything) keeps KeyboardInterrupt and real bugs visible.
    mentions_data = os.path.join(inputs_dir, submission_dir, 'data_set_mentions.json')
    try:
        with open(mentions_data, 'r'):
            pass
    except OSError:
        mentions_data = os.path.join(inputs_dir, submission_dir, 'dataset_mentions.json')
    #-- try...except --#

    # get all the data for each submission type
    curr_fields = get_all(fields_data, 'research_field')
    curr_methods = get_all(methods_data, 'method')
    curr_mentions = get_all(mentions_data, 'mention')

    # count every result for a submission type, most frequent first
    # (most_common() is a stable sort by count, descending)
    all_curr_fields_counts = Counter(curr_fields).most_common()
    all_curr_methods_counts = Counter(curr_methods).most_common()
    all_curr_mentions_counts = Counter(curr_mentions).most_common()

    # store all results for a publication with that publication
    curr_fields_by_pub = get_all_by_pub(fields_data, 'research_field')
    curr_methods_by_pub = get_all_by_pub(methods_data, 'method')
    curr_mentions_by_pub = get_all_by_pub(mentions_data, 'mention')

    # store the counts of everything and by publication list in the dictionary store for each submission
    all_fields[project_label]['all_counts'] = all_curr_fields_counts
    all_fields[project_label]['by_pubs'] = curr_fields_by_pub
    all_methods[project_label]['all_counts'] = all_curr_methods_counts
    all_methods[project_label]['by_pubs'] = curr_methods_by_pub
    all_mentions[project_label]['all_counts'] = all_curr_mentions_counts
    all_mentions[project_label]['by_pubs'] = curr_mentions_by_pub

In [5]:
all_fields

{'rcc-14': {'all_counts': [], 'by_pubs': {}}}

In [6]:
all_mentions

{'rcc-14': {'all_counts': [('national youth survey', 85),
   ('national violence against women survey', 79),
   ('civil war', 75),
   ('chap', 69),
   ('add up', 66),
   ('ethos', 58),
   ('the cold', 45),
   ('foreign direct investment', 42),
   ('the cold war', 40),
   ('add to', 21),
   ('corporate bond and cds markets', 19),
   ("fbi's", 19),
   ('american national election study', 18),
   ('national longitudinal study of youth', 17),
   ('handicap', 15),
   ('kappa', 15),
   ('violent crime and the spatial dynamics of neighborhood transition', 14),
   ('cap', 14),
   ('national education longitudinal study', 13),
   ('bridging gaps in police crime data', 13),
   ('national intimate partner and sexual violence survey', 13),
   ('communities and adolescent violence', 12),
   ('bsc', 11),
   ('add little', 10),
   ('police involvement in domestic violence', 10),
   ('uniform crime reporting (ucr)', 10),
   ('duh', 9),
   ('international financial competitiveness and incentives to for

In [7]:
all_methods

{'rcc-14': {'all_counts': [], 'by_pubs': {}}}

### Select fixed number of articles, based on 10 that appear in all

In [8]:
# Random sampling alternative (disabled):
# methods_random = get_random_pubs(all_methods, TOTAL_RANDOM_PUBS)
# fields_random = get_random_pubs(all_fields, TOTAL_RANDOM_PUBS)
# mentions_random = get_random_pubs(all_mentions, TOTAL_RANDOM_PUBS)

# Publication ids common to all submissions, computed per task
# (evaluated in the order methods, fields, mentions).
methods_pubs, fields_pubs, mentions_pubs = map(
    get_all_common_pubids,
    (all_methods, all_fields, all_mentions),
)

In [9]:
sorted(methods_pubs)

[]

In [10]:
sorted(fields_pubs)

[]

In [11]:
sorted(mentions_pubs)

[103,
 106,
 107,
 112,
 114,
 120,
 121,
 126,
 128,
 129,
 130,
 132,
 136,
 138,
 139,
 140,
 141,
 144,
 146,
 151,
 152,
 154,
 156,
 157,
 159,
 161,
 2817,
 2822,
 2825,
 2828,
 2830,
 2831,
 2833,
 2837,
 2838,
 2840,
 2841,
 2843,
 2845,
 2847,
 2848,
 2851,
 2852,
 2854,
 2855,
 2856,
 2857,
 2858,
 2859,
 2860,
 2867,
 2870,
 2873,
 2875,
 2877,
 2878,
 2883,
 2884,
 2885,
 2887,
 2889,
 2895,
 2897,
 2899,
 2912,
 2918,
 2919,
 2921,
 2925,
 2929,
 2940,
 2942,
 2943,
 2945,
 2949,
 2950,
 2952,
 2954,
 2962,
 2963,
 3159,
 3161,
 3162,
 5713,
 5714,
 5715,
 5717,
 5719,
 5721,
 5722,
 5723,
 5724,
 5725,
 5727,
 5729,
 5731,
 5732,
 5733,
 5734,
 5737,
 5738,
 5741,
 5742,
 5744,
 5749,
 5750,
 5751,
 5754,
 5755,
 5756,
 5758,
 5760,
 5761,
 5762,
 5763,
 5764,
 5765,
 5769,
 5770,
 5771,
 5772,
 5774,
 5775,
 5776,
 5777,
 5778,
 5780,
 5781,
 5782,
 5783,
 5784,
 5787,
 5789,
 5791,
 5792,
 5795,
 5796,
 5797,
 5799,
 5803,
 5805,
 5807,
 5808,
 5810,
 5812,
 5815,
 581

In [12]:
# Choose up to TOTAL_NUM_PUBS publication ids shared by the three tasks' id
# lists; select_set_of_n_pubs skips any list that is empty before intersecting.
pub_selection = select_set_of_n_pubs([methods_pubs, fields_pubs, mentions_pubs],TOTAL_NUM_PUBS)

In [13]:
pub_selection

[8192, 8193, 8196, 8199, 8201, 8202, 8203, 8205, 8207, 8215]

### Get all across submissions

In [14]:
# Merge every submission's results per task; each call yields a
# (counts, by-publication) pair. Evaluated in the order methods, fields, mentions.
(
    (methods_counts_across, methods_bypub_across),
    (fields_counts_across, fields_bypub_across),
    (mentions_counts_across, mentions_bypub_across),
) = map(get_all_across_submissions, (all_methods, all_fields, all_mentions))

### Write out to file - All counts across CSV

In [15]:
# Create (or reuse) one output directory per task under output_main_dir.
mentions_path, methods_path, fields_path = (
    make_path(os.path.join(output_main_dir, task_dir))
    for task_dir in ('mentions', 'methods', 'fields')
)

/work/evaluate/rcc-14/2018.11.19/evaluate/qualitative/mentions already exists
/work/evaluate/rcc-14/2018.11.19/evaluate/qualitative/methods already exists
/work/evaluate/rcc-14/2018.11.19/evaluate/qualitative/fields already exists


In [16]:
# One CSV of merged term counts per task: (directory, file name, header, rows).
for _counts_dir, _counts_file, _counts_header, _counts_rows in (
    (mentions_path, 'mentions_all_counts_across.csv', ['mention', 'count'], mentions_counts_across),
    (methods_path, 'methods_all_counts_across.csv', ['method', 'count'], methods_counts_across),
    (fields_path, 'fields_all_counts_across.csv', ['field', 'count'], fields_counts_across),
):
    write_to_csv(_counts_dir, _counts_file, _counts_header, _counts_rows)

### Write out to file - By publication groupings

In [17]:
# One JSON file of merged per-publication results per task; the sorted
# (publication_id, terms) pairs are turned back into a dict for serialisation.
for _bypub_dir, _bypub_file, _bypub_pairs in (
    (mentions_path, 'mentions_bypub_across.json', mentions_bypub_across),
    (methods_path, 'methods_bypub_across.json', methods_bypub_across),
    (fields_path, 'fields_bypub_across.json', fields_bypub_across),
):
    write_to_json(_bypub_dir, _bypub_file, dict(_bypub_pairs))

### Write out to file - Sampled publications

In [18]:
# FIELDS
# For every submission: write its field counts and per-publication results,
# a blank judges' scoring sheet, and one folder per selected publication
# holding that publication's fields plus copies of its text and PDF.

for k, v in all_fields.items():
    field_output_dir = make_path(os.path.join(fields_path, k))
    csvpath = '{}_fields_counts.csv'.format(k)
    jsonpath = '{}_fields_bypub.json'.format(k)
    # write the scoring sheet (one per submission; only this cell writes it)
    scoring_path = os.path.join(output_main_dir, '{}_judges_scoring_sheet.csv'.format(k))
    generate_scoring_sheet(scoring_path, pub_selection)

    write_to_csv(field_output_dir, csvpath, ['field', 'count'], v['all_counts'])
    write_to_json(field_output_dir, jsonpath, v['by_pubs'])

    #for pubid in fields_random[k]:
    for pubid in pub_selection:
        pubdir = make_path(os.path.join(field_output_dir, str(pubid)))
        txtfile = '{}_fields.txt'.format(str(pubid))
        # The selected ids may have been normalised to int while by_pubs is
        # keyed by the raw JSON value, so fall back to the string form
        # instead of silently skipping the publication.
        fields_list = v['by_pubs'].get(pubid)
        if fields_list is None:
            fields_list = v['by_pubs'].get(str(pubid))

        if fields_list is not None:
            write_list_to_txt(pubdir, txtfile, fields_list)
            copyfile(os.path.join(files_path, 'text', '{}.txt'.format(pubid)), os.path.join(pubdir, '{}.txt'.format(str(pubid))))
            copyfile(os.path.join(files_path, 'pdf', '{}.pdf'.format(pubid)), os.path.join(pubdir, '{}.pdf'.format(str(pubid))))

/work/evaluate/rcc-14/2018.11.19/evaluate/qualitative/fields/rcc-14 already exists
Creating path /work/evaluate/rcc-14/2018.11.19/evaluate/qualitative/fields/rcc-14/8192
Creating path /work/evaluate/rcc-14/2018.11.19/evaluate/qualitative/fields/rcc-14/8193
Creating path /work/evaluate/rcc-14/2018.11.19/evaluate/qualitative/fields/rcc-14/8196
Creating path /work/evaluate/rcc-14/2018.11.19/evaluate/qualitative/fields/rcc-14/8199
Creating path /work/evaluate/rcc-14/2018.11.19/evaluate/qualitative/fields/rcc-14/8201
Creating path /work/evaluate/rcc-14/2018.11.19/evaluate/qualitative/fields/rcc-14/8202
Creating path /work/evaluate/rcc-14/2018.11.19/evaluate/qualitative/fields/rcc-14/8203
Creating path /work/evaluate/rcc-14/2018.11.19/evaluate/qualitative/fields/rcc-14/8205
Creating path /work/evaluate/rcc-14/2018.11.19/evaluate/qualitative/fields/rcc-14/8207
Creating path /work/evaluate/rcc-14/2018.11.19/evaluate/qualitative/fields/rcc-14/8215


In [19]:
# METHODS
# For every submission: write its method counts and per-publication results,
# then one folder per selected publication holding that publication's
# methods plus copies of its text and PDF.

for k, v in all_methods.items():
    method_output_dir = make_path(os.path.join(methods_path, k))
    csvpath = '{}_methods_counts.csv'.format(k)
    jsonpath = '{}_methods_bypub.json'.format(k)

    write_to_csv(method_output_dir, csvpath, ['method', 'count'], v['all_counts'])
    write_to_json(method_output_dir, jsonpath, v['by_pubs'])

    #for pubid in methods_random[k]:
    for pubid in pub_selection:
        pubdir = make_path(os.path.join(method_output_dir, str(pubid)))
        txtfile = '{}_methods.txt'.format(str(pubid))
        # The selected ids may have been normalised to int while by_pubs is
        # keyed by the raw JSON value, so fall back to the string form
        # instead of silently skipping the publication.
        methods_list = v['by_pubs'].get(pubid)
        if methods_list is None:
            methods_list = v['by_pubs'].get(str(pubid))

        if methods_list is not None:
            write_list_to_txt(pubdir, txtfile, methods_list)
            copyfile(os.path.join(files_path, 'text', '{}.txt'.format(pubid)), os.path.join(pubdir, '{}.txt'.format(str(pubid))))
            copyfile(os.path.join(files_path, 'pdf', '{}.pdf'.format(pubid)), os.path.join(pubdir, '{}.pdf'.format(str(pubid))))

/work/evaluate/rcc-14/2018.11.19/evaluate/qualitative/methods/rcc-14 already exists
Creating path /work/evaluate/rcc-14/2018.11.19/evaluate/qualitative/methods/rcc-14/8192
Creating path /work/evaluate/rcc-14/2018.11.19/evaluate/qualitative/methods/rcc-14/8193
Creating path /work/evaluate/rcc-14/2018.11.19/evaluate/qualitative/methods/rcc-14/8196
Creating path /work/evaluate/rcc-14/2018.11.19/evaluate/qualitative/methods/rcc-14/8199
Creating path /work/evaluate/rcc-14/2018.11.19/evaluate/qualitative/methods/rcc-14/8201
Creating path /work/evaluate/rcc-14/2018.11.19/evaluate/qualitative/methods/rcc-14/8202
Creating path /work/evaluate/rcc-14/2018.11.19/evaluate/qualitative/methods/rcc-14/8203
Creating path /work/evaluate/rcc-14/2018.11.19/evaluate/qualitative/methods/rcc-14/8205
Creating path /work/evaluate/rcc-14/2018.11.19/evaluate/qualitative/methods/rcc-14/8207
Creating path /work/evaluate/rcc-14/2018.11.19/evaluate/qualitative/methods/rcc-14/8215


In [20]:
# MENTIONS
# For every submission: write its mention counts and per-publication results,
# then one folder per selected publication holding that publication's
# mentions plus copies of its text and PDF.

for k, v in all_mentions.items():
    mention_output_dir = make_path(os.path.join(mentions_path, k))
    csvpath = '{}_mentions_counts.csv'.format(k)
    jsonpath = '{}_mentions_bypub.json'.format(k)

    write_to_csv(mention_output_dir, csvpath, ['mention', 'count'], v['all_counts'])
    write_to_json(mention_output_dir, jsonpath, v['by_pubs'])

    # for pubid in mentions_random[k]:
    for pubid in pub_selection:
        pubdir = make_path(os.path.join(mention_output_dir, str(pubid)))
        txtfile = '{}_mentions.txt'.format(str(pubid))
        # The selected ids may have been normalised to int while by_pubs is
        # keyed by the raw JSON value, so fall back to the string form
        # instead of silently skipping the publication.
        mentions_list = v['by_pubs'].get(pubid)
        if mentions_list is None:
            mentions_list = v['by_pubs'].get(str(pubid))

        if mentions_list is not None:
            write_list_to_txt(pubdir, txtfile, mentions_list)
            copyfile(os.path.join(files_path, 'text', '{}.txt'.format(pubid)), os.path.join(pubdir, '{}.txt'.format(str(pubid))))
            copyfile(os.path.join(files_path, 'pdf', '{}.pdf'.format(pubid)), os.path.join(pubdir, '{}.pdf'.format(str(pubid))))

/work/evaluate/rcc-14/2018.11.19/evaluate/qualitative/mentions/rcc-14 already exists
Creating path /work/evaluate/rcc-14/2018.11.19/evaluate/qualitative/mentions/rcc-14/8192
Creating path /work/evaluate/rcc-14/2018.11.19/evaluate/qualitative/mentions/rcc-14/8193
Creating path /work/evaluate/rcc-14/2018.11.19/evaluate/qualitative/mentions/rcc-14/8196
Creating path /work/evaluate/rcc-14/2018.11.19/evaluate/qualitative/mentions/rcc-14/8199
Creating path /work/evaluate/rcc-14/2018.11.19/evaluate/qualitative/mentions/rcc-14/8201
Creating path /work/evaluate/rcc-14/2018.11.19/evaluate/qualitative/mentions/rcc-14/8202
Creating path /work/evaluate/rcc-14/2018.11.19/evaluate/qualitative/mentions/rcc-14/8203
Creating path /work/evaluate/rcc-14/2018.11.19/evaluate/qualitative/mentions/rcc-14/8205
Creating path /work/evaluate/rcc-14/2018.11.19/evaluate/qualitative/mentions/rcc-14/8207
Creating path /work/evaluate/rcc-14/2018.11.19/evaluate/qualitative/mentions/rcc-14/8215
