In [41]:
# Append and deduplicate partitions

import os
import json


def flatten(l):
    sl = [item for sublist in l for item in sublist]
    return sl

def read_partitions(partitions):
    pub_list = []
    for p in partitions:
        with open(p) as json_file:
            partition = json.load(json_file)
            pub_list.append(partition)
    pub_list = flatten(pub_list)
    return pub_list




import collections
from collections import defaultdict
import pandas as pd

def combine_metadata(dics):
    dd = defaultdict(list)
    for dic in dics:
        for key, val in dic.items():  
            if not pd.isnull(val):
                dd[key].append(val)
#                 dd[key].append(val.lower())
                dd[key] = list(set(dd[key]))
            else: 
                continue
    return dict(dd)

def merge_metadata(dicts):
    res = collections.defaultdict(list)
    md_list = []
    for d in dicts:
        for k, v in d.items():
            if k  == 'original':
                md_list.append(v)
            if k == 'title':
                res[k].append(v)
            if k == 'datasets':
                res[k].append(v)
    res['original']  = combine_metadata(md_list)
    res['title'] = list(set(res['title']))[0]
    res['datasets'] = list(set(flatten(res['datasets'])))
    return dict(res)


#### Account for duplicates, merging any metadata into one subdictionary ('original')


In [80]:


# read in partitions


partitions = [os.path.join('/Users/sophierand/RCPublications/partitions',f) for f in os.listdir('/Users/sophierand/RCPublications/partitions') if f.endswith('.json')]
pub_list_flat = read_partitions(partitions)



In [81]:
pub_list_flat

test = [p for p in pub_list_flat if p['title']=='USDA Special Supplemental Nutrition Program for Women, Infants, and Children (WIC): A New Look at Key Questions 10 Years After USDA Added Whole-Grain Bread to WIC Food Packages in 2009']
test

[{'title': 'USDA Special Supplemental Nutrition Program for Women, Infants, and Children (WIC): A New Look at Key Questions 10 Years After USDA Added Whole-Grain Bread to WIC Food Packages in 2009',
  'datasets': ['dataset-025'],
  'original': {'url': 'https://www.ers.usda.gov/publications/pub-details/?pubid=93650',
   'sheet_name': 'FoodAPS',
   'original_dataset_name': 'FoodAPS',
   'date_added': '2019-10-11 11:13:08'}},
 {'title': 'USDA Special Supplemental Nutrition Program for Women, Infants, and Children (WIC): A New Look at Key Questions 10 Years After USDA Added Whole-Grain Bread to WIC Food Packages in 2009',
  'datasets': ['dataset-025'],
  'original': {'url': 'https://www.ers.usda.gov/publications/pub-details/?pubid=93650',
   'sheet_name': 'IRI',
   'original_dataset_name': '▪IRI',
   'date_added': '2019-10-11 11:13:08'}}]

In [84]:
groups = {}
for d in pub_list_flat:
    if d['title'] not in groups:
        groups[d['title']] = {'datasets': d['datasets'], 'original': d['original']}
    else:
        for ds in d['datasets']:
            groups[d['title']]['datasets'].append(ds)
        groups[d['title']]['datasets'] = list(set(groups[d['title']]['datasets']))
        groups[d['title']]['original'].update(d['original'])
result = [{**{'title': k}, **v} for k, v in groups.items()]

In [95]:
result

[{'title': 'Supply-side subsidies to improve food access and dietary outcomes: Evidence from the New Markets Tax Credit',
  'datasets': ['dataset-002', 'dataset-001'],
  'original': {'doi': '10.1177/0042098017740285',
   'dataset_name': 'IRI Consumer Network',
   'journal': nan,
   'url': 'https://www.doi.org/10.1177/0042098017740285',
   'date_added': '2019-10-10 13:19:04',
   'keywords': 'IRI'}},
 {'title': 'The healthfulness of food and beverage purchases after the federal food package revisions: The case of two New England states',
  'datasets': ['dataset-002', 'dataset-001'],
  'original': {'doi': '10.1016/j.ypmed.2016.08.018',
   'dataset_name': 'IRI Consumer Network',
   'journal': nan,
   'url': 'https://www.doi.org/10.1016/j.ypmed.2016.08.018',
   'date_added': '2019-10-10 13:19:04',
   'keywords': 'IRI'}},
 {'title': 'Relationships between Diet, Alcohol Preference, and Heart Disease and Type 2 Diabetes among Americans',
  'datasets': ['dataset-002', 'dataset-001'],
  'origina

In [94]:
n_titles = len(result)
n_dupes = len(pub_list_flat) - n_titles
n_dupes

272

In [93]:
n_titles

1823

In [46]:

# create counter on title for deduplications
from collections import Counter

ct = Counter(i['title'] for i in pub_list_flat)

# deduplicate by title
new_l = []
for i,v in ct.items():
    if v > 1:
        dup_pubs = [p for p in pub_list_flat if p['title'] == i]
        dup_pubs_merged = merge_metadata(dup_pubs)
        new_l.append(dup_pubs_merged)
    if v == 1:
        unique_pubs = [p for p in pub_list_flat if p['title'] == i]
        new_l.append(unique_pubs)
# new_l = flatten(new_l)

In [49]:
dup_pubs_merged

{'title': 'USDA Special Supplemental Nutrition Program for Women, Infants, and Children (WIC): A New Look at Key Questions 10 Years After USDA Added Whole-Grain Bread to WIC Food Packages in 2009',
 'datasets': ['dataset-025'],
 'original': {'url': ['https://www.ers.usda.gov/publications/pub-details/?pubid=93650'],
  'sheet_name': ['IRI', 'FoodAPS'],
  'original_dataset_name': ['▪IRI', 'FoodAPS'],
  'date_added': ['2019-10-11 11:13:08']}}

In [47]:
new_l

[{'title': 'Supply-side subsidies to improve food access and dietary outcomes: Evidence from the New Markets Tax Credit',
  'datasets': ['dataset-002', 'dataset-001'],
  'original': {'doi': ['10.1177/0042098017740285'],
   'dataset_name': ['IRI Consumer Network'],
   'journal': ['Urban Studies'],
   'url': ['https://www.doi.org/10.1177/0042098017740285'],
   'date_added': ['2019-10-14 16:34:18', '2019-10-10 13:19:04'],
   'keywords': ['IRI']}},
 {'title': 'The healthfulness of food and beverage purchases after the federal food package revisions: The case of two New England states',
  'datasets': ['dataset-002', 'dataset-001'],
  'original': {'doi': ['10.1016/j.ypmed.2016.08.018'],
   'dataset_name': ['IRI Consumer Network'],
   'journal': ['Preventive Medicine'],
   'url': ['https://www.doi.org/10.1016/j.ypmed.2016.08.018'],
   'date_added': ['2019-10-14 16:34:18', '2019-10-10 13:19:04'],
   'keywords': ['IRI']}},
 {'title': 'Relationships between Diet, Alcohol Preference, and Heart Di

In [42]:


# create df from counter to print # of duplicates
df = pd.DataFrame.from_dict(ct, orient='index').reset_index()
df = df.rename(columns={'index':'title', 0:'count'})
nduplicate_titles = df.loc[df['count'] > 1].title.nunique()
print('Among all publication partitions, there were {} duplicate titles - they have been deduplicated, their metadata merged and exported to publications.json'.format(nduplicate_titles))

json_pub_path = '/Users/sophierand/RCPublications/publications.json'
with open(json_pub_path, 'w') as outfile:
    json.dump(new_l, outfile, indent=2)
    
    
n_unique_titles = df.title.nunique()
print('There were {} unique publications found in partitions which have been compiled and exported to publications.json'.format(n_unique_titles))



Among all publication partitions, there were 196 duplicate titles - they have been deduplicated, their metadata merged and exported to publications.json
There were 1823 unique publications found in partitions which have been compiled and exported to publications.json


In [32]:

partitions = [os.path.join('/Users/sophierand/RCPublications/partitions',f) for f in os.listdir('/Users/sophierand/RCPublications/partitions') if f.endswith('.json')]
pub_list_flat = read_partitions(partitions)



In [33]:

# create counter on title for deduplications
from collections import Counter

ct = Counter(i['title'] for i in pub_list_flat)

# deduplicate by title
new_l = []
for i,v in ct.items():
    if v > 1:
        dup_pubs = [p for p in pub_list_flat if p['title'] == i]
        dup_pubs_merged = merge_metadata(dup_pubs)
        new_l.append(dup_pubs_merged)
    if v == 1:
        unique_pubs = [p for p in pub_list_flat if p['title'] == i]
        new_l.append(unique_pubs)

# create df from counter to print # of duplicates


In [35]:
df = pd.DataFrame.from_dict(ct, orient='index').reset_index()
df = df.rename(columns={'index':'title', 0:'count'})
nduplicate_titles = df.loc[df['count'] > 1].title.nunique()
nduplicate_titles

196

In [39]:
print('Among all publication partitions, there were {} duplicate titles - they have been deduplicated, their metadata merged and exported to publications.json'.format(str(nduplicate_titles)))

Among all publication partitions, there were 196 duplicate titles - they have been deduplicated, their metadata merged and exported to publications.json


In [None]:
print('Among all publication partitions, there were {} duplicate titles - they have been deduplicated, their metadata merged and exported to publications.json').format(nduplicate_titles)

json_pub_path = '/Users/sophierand/RCPublications/publications.json'
with open(json_pub_path, 'w') as outfile:
    json.dump(new_l, outfile, indent=2)
    
    
n_unique_titles = df.title.nunique()
print('There were {} unique publications found in partitions which have been compiled and exported to publications.json').format(n_unique_titles)



In [40]:
# new_l


[{'title': 'Supply-side subsidies to improve food access and dietary outcomes: Evidence from the New Markets Tax Credit',
  'datasets': ['dataset-002', 'dataset-001'],
  'original': {'doi': ['10.1177/0042098017740285'],
   'dataset_name': ['IRI Consumer Network'],
   'journal': ['Urban Studies'],
   'url': ['https://www.doi.org/10.1177/0042098017740285'],
   'date_added': ['2019-10-14 16:34:18', '2019-10-10 13:19:04'],
   'keywords': ['IRI']}},
 {'title': 'The healthfulness of food and beverage purchases after the federal food package revisions: The case of two New England states',
  'datasets': ['dataset-002', 'dataset-001'],
  'original': {'doi': ['10.1016/j.ypmed.2016.08.018'],
   'dataset_name': ['IRI Consumer Network'],
   'journal': ['Preventive Medicine'],
   'url': ['https://www.doi.org/10.1016/j.ypmed.2016.08.018'],
   'date_added': ['2019-10-14 16:34:18', '2019-10-10 13:19:04'],
   'keywords': ['IRI']}},
 {'title': 'Relationships between Diet, Alcohol Preference, and Heart Di

### To Do
* update for additional nested metadata fields
* create that building block - get API metadata on each of the partitions so that its `python api.py <filename>_publications.json` 
* then second step is below - stacking, deduplication and formatting
* `python test.py` should be the final step

In [30]:
!jupyter nbconvert --to script append_partitions.ipynb

[NbConvertApp] Converting notebook append_partitions.ipynb to script
[NbConvertApp] Writing 2913 bytes to append_partitions.py


In [1]:
import os

#### Import and stack the partitions

In [2]:
partitions = [os.path.join('/Users/sophierand/RCPublications/partitions',f) for f in os.listdir('/Users/sophierand/RCPublications/partitions') if f.endswith('.json')]
# partitions                                               

In [3]:
import json
pub_list = []
for p in partitions:
    with open(p) as json_file:
        partition = json.load(json_file)
        pub_list.append(partition)

In [4]:
flatten = lambda l: [item for sublist in l for item in sublist]

In [5]:
pub_list_flat = flatten(pub_list)

In [6]:
import collections
from collections import defaultdict
import pandas as pd

def combine_metadata(dics):
    dd = defaultdict(list)
    for dic in dics:
        for key, val in dic.items():  
            if not pd.isnull(val):
                dd[key].append(val)
#                 dd[key].append(val.lower())
                dd[key] = list(set(dd[key]))
            else: 
                continue
    return dict(dd)

def merge_metadata(dicts):
    res = collections.defaultdict(list)
    md_list = []
    for d in dicts:
        for k, v in d.items():
            if k  == 'original':
                md_list.append(v)
            if k == 'title':
                res[k].append(v)
            if k == 'datasets':
                res[k].append(v)
    res['original']  = combine_metadata(md_list)
    res['title'] = list(set(res['title']))[0]
    res['datasets'] = list(set(flatten(res['datasets'])))
    return dict(res)

#### Account for duplicates, merging any metadata into one subdictionary ('original')

In [7]:
from collections import Counter
ct = Counter(i['title'] for i in pub_list_flat)

In [8]:
new_l = []
for i,v in ct.items():
    if v > 1:
        dup_pubs = [p for p in pub_list_flat if p['title'] == i]
        dup_pubs_merged = merge_metadata(dup_pubs)
        new_l.append(dup_pubs_merged)
    if v == 1:
        unique_pubs = [p for p in pub_list_flat if p['title'] == i]
        new_l.append(unique_pubs)

In [29]:
df = pd.DataFrame.from_dict(ct, orient='index').reset_index()
df = df.rename(columns={'index':'title', 0:'count'})
nduplicate_titles = df.loc[df['count'] > 1].title.nunique()
nduplicate_titles

196

In [10]:
json_pub_path = '/Users/sophierand/RCPublications/publications.json'
with open(json_pub_path, 'w') as outfile:
    json.dump(new_l, outfile, indent=2)

In [10]:
new_l

[{'title': 'Supply-side subsidies to improve food access and dietary outcomes: Evidence from the New Markets Tax Credit',
  'datasets': ['dataset-002', 'dataset-001'],
  'original': {'doi': ['10.1177/0042098017740285'],
   'dataset_name': ['IRI Consumer Network'],
   'journal': ['Urban Studies'],
   'url': ['https://www.doi.org/10.1177/0042098017740285'],
   'date_added': ['2019-10-14 16:34:18', '2019-10-10 13:19:04'],
   'keywords': ['IRI']}},
 {'title': 'The healthfulness of food and beverage purchases after the federal food package revisions: The case of two New England states',
  'datasets': ['dataset-002', 'dataset-001'],
  'original': {'doi': ['10.1016/j.ypmed.2016.08.018'],
   'dataset_name': ['IRI Consumer Network'],
   'journal': ['Preventive Medicine'],
   'url': ['https://www.doi.org/10.1016/j.ypmed.2016.08.018'],
   'date_added': ['2019-10-14 16:34:18', '2019-10-10 13:19:04'],
   'keywords': ['IRI']}},
 {'title': 'Relationships between Diet, Alcohol Preference, and Heart Di

In [11]:
len(new_l)

1823