In [2]:
import csv
import community
import pandas as pd
import networkx as nx
import math
import matplotlib.pyplot as plt
import pickle
import requests
import sys
from bs4 import BeautifulSoup
import re
import os

## Parse Projects from saved soupfiles

In [8]:
def getGrantListFromFile(filename):
    """Parse one saved soup file and return its grants.

    Every ``code`` element in the file is treated as a grant ID. For each one
    the function collects:

    * the text of the ``code`` element (the grant ID),
    * the text of a ``startdate`` sibling of that element, or the string
      ``'not available'`` when there is no such sibling (the date is returned
      as raw text, it is not parsed),
    * the distinct ``classname`` attribute values of the ``country`` elements
      found under the direct children of the ``rels`` element of the enclosing
      project element, in order of first appearance.

    Parameters
    ----------
    filename : str
        Path of the saved soup file.

    Returns
    -------
    list
        One ``[grantID, startdate, countries]`` row per ``code`` element.
    """
    with open(filename) as soupfile:
        soup = BeautifulSoup(soupfile.read(), 'lxml')

    grantlist = []
    for grantID in soup.find_all('code'):
        projecttag = grantID.parent

        countries = []
        relstag = projecttag.find('rels')
        # A project without a <rels> element simply has no countries.
        if relstag is not None:
            for rel in relstag.children:
                # Children include bare text nodes (whitespace between tags).
                # Calling .find() on those is str.find(), which returns an int,
                # so only real tags are searched.
                if getattr(rel, 'name', None) is None:
                    continue
                country = rel.find('country')
                if country is None:
                    continue
                classname = country.get('classname')
                if classname is not None and classname not in countries:
                    countries.append(classname)

        grantIDdate = 'not available'
        # Same scan order as before: previous siblings first, then next
        # siblings; when several <startdate> siblings exist the last one
        # visited wins.
        siblings = list(grantID.previous_siblings) + list(grantID.next_siblings)
        for sibling in siblings:
            if getattr(sibling, 'name', None) == 'startdate':
                grantIDdate = sibling.text  # raw text, still needs date parsing

        grantlist.append([grantID.text, grantIDdate, countries])
    return grantlist

## Parse Publications from saved soupfiles

In [13]:
def getPubListFromFile(filename):
    """Parse one saved soup file and return its publications.

    Every element whose tag name starts with ``oai:record`` is one candidate
    publication. A record is skipped (and counted) when its
    ``dateofacceptance`` element is missing or empty, or when its ``pid``
    element is missing or empty. The two skip counts and the number of records
    found are printed.

    The ``title``, ``code`` and ``instancetype`` elements are assumed to be
    present in every record that is kept; they are read without a fallback.

    Parameters
    ----------
    filename : str
        Path of the saved soup file.

    Returns
    -------
    list
        One ``[pid, dateofacceptance, title, code, pubtype]`` row per kept
        record, where ``pubtype`` is the ``classname`` attribute of the
        record's ``instancetype`` element and the other fields are element
        texts.
    """
    with open(filename) as soupfile:
        soup = BeautifulSoup(soupfile.read(), 'lxml')

    records = soup.find_all(re.compile("^oai:record"))
    # Single pre-formatted string so the output is identical under the
    # Python 2 print statement and the Python 3 print function.
    print('found  %d  records' % len(records))

    publist = []
    missing_date = 0
    missing_pid = 0
    for record in records:
        acceptdate = record.find('dateofacceptance')
        # A record with no <dateofacceptance> element at all is treated the
        # same as one with an empty date instead of raising AttributeError.
        if acceptdate is None or acceptdate.text == '':
            missing_date += 1
            continue  # not added to the publication list

        pubtype = record.find('instancetype')['classname']
        title = record.find('title')
        code = record.find('code')

        pid = record.find('pid')
        if pid is None or pid.text == '':
            missing_pid += 1
            continue  # not added to the publication list

        publist.append([pid.text, acceptdate.text, title.text, code.text, pubtype])

    print('%d  records not included due to absence of date' % missing_date)
    print('%d  records not included due to absence of doi' % missing_pid)
    return publist

In [20]:
def getSoftListFromFile(filename):
    """Parse one saved soup file and return its software records.

    Every element whose tag name starts with ``oaf:result`` is one candidate.
    A candidate is kept only when its first ``pid`` element has
    ``classname="doi"`` and its ``storagedate`` element has non-empty text;
    candidates lacking either element are skipped. The number of results found
    and the number of rows kept are printed.

    Parameters
    ----------
    filename : str
        Path of the saved soup file.

    Returns
    -------
    list
        One ``[doi, storagedate, title, code, "software"]`` row per kept
        result; the last field is always the literal string ``"software"``.
    """
    with open(filename) as soupfile:
        soup = BeautifulSoup(soupfile.read(), 'lxml')

    records = soup.find_all(re.compile("^oaf:result"))
    # Single pre-formatted string so the output is identical under the
    # Python 2 print statement and the Python 3 print function.
    print('found  %d  results' % len(records))

    softlist = []
    for record in records:
        storagedate = record.find('storagedate')
        pid = record.find('pid')
        # Missing elements mean "no usable DOI / date": skip instead of crashing.
        if pid is None or storagedate is None:
            continue
        # .get() tolerates a <pid> that carries no classname attribute.
        if pid.get("classname") != "doi" or storagedate.text == "":
            continue
        code = record.find('code')
        title = record.find('title')
        softlist.append([pid.text, storagedate.text, title.text, code.text, "software"])

    print(len(softlist))
    return softlist

## Construct overall output lists for each output type

In [16]:
def getTotalOutputList(outputType):
    """Build the combined output list for one output type.

    Reads every regular file in ``./<outputType>soups/`` (in sorted filename
    order, so the result is reproducible between runs), parses it with the
    parser that matches ``outputType`` and concatenates the rows. The running
    row count is printed after each file.

    Parameters
    ----------
    outputType : str
        One of ``'projects'``, ``'publications'`` or ``'software'``.

    Returns
    -------
    list
        All rows returned by the per-file parser, file after file.

    Raises
    ------
    ValueError
        If ``outputType`` is not one of the three supported values.
    """
    supported = ('projects', 'publications', 'software')
    # Fail early with a clear message; previously an unknown type surfaced as
    # a NameError on the first file, or silently returned [] for an empty
    # directory.
    if outputType not in supported:
        raise ValueError('unknown outputType %r, expected one of: %s'
                         % (outputType, ', '.join(supported)))

    directory = os.path.join('.', outputType + 'soups')
    totaloutputlist = []
    # os.listdir() returns entries in arbitrary order, hence sorted().
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        # Skip sub-directories (e.g. .ipynb_checkpoints); open() would fail on them.
        if not os.path.isfile(path):
            continue
        if outputType == 'projects':
            templist = getGrantListFromFile(path)
        elif outputType == 'publications':
            templist = getPubListFromFile(path)
        else:
            templist = getSoftListFromFile(path)
        # extend() appends in place instead of copying the whole list per file.
        totaloutputlist.extend(templist)
        print(len(totaloutputlist))
    return totaloutputlist

In [None]:
# Parse every file under ./projectssoups/ into one list of
# [grantID, start date, countries] rows.
# getTotalOutputList accepts: projects, publications, software
totalgrantlist = getTotalOutputList('projects')

In [None]:
# Parse every file under ./softwaresoups/ into one list of
# [doi, storage date, title, grant code, "software"] rows.
# getTotalOutputList accepts: projects, publications, software
totalsoftlist = getTotalOutputList('software')

In [None]:
# Parse every file under ./publicationssoups/ into one list of
# [pid, acceptance date, title, grant code, publication type] rows.
# getTotalOutputList accepts: projects, publications, software
totalpublist = getTotalOutputList('publications')

In [28]:
# Map each grant ID (row[0]) to that grant's list of countries (row[2]).
# If a grant ID occurs in several rows, the last row's countries are kept.
grantcountries = {row[0]: row[2] for row in totalgrantlist}

## Export lists to pickle

In [None]:
# Set UPDATE_PICKLES to True only when the saved lists must be refreshed:
# doing so overwrites the currently saved pickle files.
UPDATE_PICKLES = False

if UPDATE_PICKLES:
    # (file name, object) pairs; each object is dumped to its own file.
    for pickle_name, pickle_payload in [
        ("grantcountries.pickle", grantcountries),
        ("totalgrantlist.pickle", totalgrantlist),
        ("totalsoftlist.pickle", totalsoftlist),
        ("totalpublist.pickle", totalpublist),
    ]:
        # "with" closes the file even if pickle.dump raises.
        with open(pickle_name, "wb") as pickle_out:
            pickle.dump(pickle_payload, pickle_out)