In [None]:
'''data notes
nodelist length before graph - 261036
nodelist length after graph - 258619, duplicates were removed automatically by networkx
edgelist length before graph - 214843
edgelist length after graph - 213054
'''

In [None]:
import csv
import community
import pandas as pd
import networkx as nx
import math
import matplotlib.pyplot as plt
import pickle
import igraph as ig
import codecs

## Load saved nodelist and edgelist

In [None]:
# Load the previously saved node and edge records.
# NOTE(review): pickle.load can execute arbitrary code; only load pickle files produced by this project.
# The `with` blocks close each file handle as soon as its content has been read.
with open("nodelist.pickle","rb") as pickle_in:
    nodelist = pickle.load(pickle_in)

with open("edgelist.pickle","rb") as pickle_in:
    edgelist = pickle.load(pickle_in)

In [None]:
# Sanity check: number of node records loaded from the pickle (shown as the cell output).
len(nodelist)

## Create NetworkX graph

In [None]:
# Build the undirected NetworkX graph with one node per nodelist record.
# Record layout used here: [1] -> type, [2] -> node id / name, [4] -> grant, [5] -> year, [6] -> countries.
G = nx.Graph()
nodeids = []
for node in nodelist:
    node_attributes = {
        'name': node[2],
        'grant': node[4],
        'type': node[1],
        'year': node[5],
        'countries': str(node[6]),  # stored as text
    }
    G.add_node(node[2], **node_attributes)

In [None]:
# Connect the nodes: every edgelist record carries the source id at position 0
# and the target id at position 1.
for edge in edgelist:
    G.add_edge(edge[0], edge[1])

## Export NetworkX graph to graphml format for later conversion to iGraph

We convert the NetworkX graph to iGraph because centrality measures are calculated significantly faster in iGraph than in NetworkX.

In [None]:
# Write the NetworkX graph (nodes, attributes and edges) to a GraphML file on disk.
nx.write_graphml(G,'graph_17122018.graphml') # Export NX graph to file

In [None]:
# Read the GraphML file written in the previous cell back in as an igraph graph.
Gix = ig.read('graph_17122018.graphml',format="graphml") # Create new IG graph from file

## Calculate SNA measures for all nodes of the network

In [None]:
#network density of the igraph graph
#NOTE(review): loops=False should make igraph assume the graph has no self-loops - confirm against the igraph docs
density = Gix.density(loops=False)

In [None]:
#betweenness centrality, computed on the igraph graph as undirected and unweighted, with no path-length cutoff
#the result is indexed by position; nodesdict (built below) translates node names into those positions
#NOTE(review): nobigint is version-specific in python-igraph - confirm it is accepted by the installed release
betw = Gix.betweenness(vertices=None, directed=False, cutoff=None, weights=None, nobigint=True)

In [None]:
#eigenvector centrality, computed on the igraph graph as undirected and unweighted
#NOTE(review): scale=True presumably rescales the scores so that the largest one is 1 - confirm against the igraph docs
eigen = Gix.eigenvector_centrality(directed=False, scale=True, weights=None, return_eigenvalue=False)

In [None]:
#closeness centrality, computed on the igraph graph without weights or cutoff, with normalized=True
#NOTE(review): mode='ALL' presumably ignores edge direction - confirm against the igraph docs
closeness = Gix.closeness(vertices=None, mode='ALL', cutoff=None, weights=None, normalized=True)

In [None]:
#create a mapping between G names and Gix indices
# nodesdict[name] is the position of the vertex called `name` in Gix; the
# centrality lists are looked up through it (e.g. betw[nodesdict[name]]).
# A single enumerate() pass replaces the former namesGix.index(name) lookup,
# which rescanned the whole list for every name (quadratic in the node count).
# The per-vertex debug print was dropped: it emitted one output line per vertex.
namesGix = Gix.vs['name']
nodesdict = dict()
for position, name in enumerate(namesGix):
    # keep the first position of a repeated name, exactly as list.index() did
    if name not in nodesdict:
        nodesdict[name] = position

In [None]:
#create mapping between nodes and indices of G
#DISABLED: the code below is wrapped in a string literal, so it is never executed;
#it numbers the nodes in G's own iteration order instead of using the Gix vertex positions
'''nodesdict = dict()
counter = 0
for index,node in G.nodes(data=True):
    nodesdict[index] = counter
    counter = counter + 1
'''

## Calculate SNA measures for components

In [None]:
#find all components in the graph
#the result of nx.connected_components is materialised into a list so it can be indexed and reused
components = nx.connected_components(G)
complist = list(components)

In [None]:
# Keep only the components that have more than 200 members; complist is
# rebound to the filtered list.
largecomplist = [component for component in complist if len(component) > 200]
complist = largecomplist

In [None]:
#calculate SNA measures and put them into a table, where each row is a component
#NOTE: getAvgCentrality, getTypeList and getDiversity are defined in the "Functions" section further down,
#      so those cells have to be executed before this one
#columns written below:
#  0 = component id (row index), 1 = component members, 2 = component size,
#  3 / 4 / 5 = average betweenness / eigenvector / closeness centrality,
#  6 = number of distinct record types, 7 = diversity value returned by getDiversity
df = pd.DataFrame(complist)
for index, row in df.iterrows():
    df.at[index,0] = index
    df.at[index,1] = complist[index]
    df.at[index,2] = len(complist[index])
    df.at[index,3] = getAvgCentrality(complist[index], betw, nodesdict)
    df.at[index,4] = getAvgCentrality(complist[index], eigen, nodesdict)
    df.at[index,5] = getAvgCentrality(complist[index], closeness, nodesdict)
    df.at[index,6] = len(getTypeList(complist[index]))
    df.at[index,7] = getDiversity(complist[index])
df

In [None]:
# Write the per-component results as a "!"-separated text file.
# Column 1 (the component members) is skipped; commas inside any value are turned into ":".
with codecs.open('EC_results_17122018.csv','wb', 'utf-8') as outfile:
    outfile.write('componentid!componentsize!avgbetwcent!avgeigencent!avgclosecent!types!iqv' + '\n')
    for index, row in df.iterrows():
        cells = [str(row[column]) for column in (0, 2, 3, 4, 5, 6, 7)]
        line = "!".join(cells).replace(",", ":")
        outfile.write(line + '\n')

## Calculate SNA measures on a per-country basis

In [None]:
#load total countries list
# NOTE(review): pickle.load can execute arbitrary code; only load pickle files produced by this project.
# The `with` block closes the file handle as soon as the list has been read.
with open("countrylist.pickle","rb") as pickle_in:
    countrylist = pickle.load(pickle_in)

In [None]:
# Write one "!"-separated line per country: the average betweenness, eigenvector and
# closeness centrality of its nodes, followed by the number of matching nodes.
# getAvgBetwCentCountry averages whichever centrality list it is given.
with codecs.open('EC_results_countries_17122018.csv','wb', 'utf-8') as outfile:
    outfile.write('Country!Avg Betw Centrality!Avg Eigenvector Centrality!Avg Closeness Centrality!Number of records' + '\n')
    for country in countrylist:
        fields = [
            country,
            str(getAvgBetwCentCountry(country, betw, nodesdict)),
            str(getAvgBetwCentCountry(country, eigen, nodesdict)),
            str(getAvgBetwCentCountry(country, closeness, nodesdict)),
            str(len(getCountryNodes(country))),
        ]
        outfile.write("!".join(fields) + '\n')

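## Functions

Note: the analysis cells above call the helper functions defined in this section, so after a kernel restart these cells have to be run first.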

In [None]:
#calculate average centrality
def getAvgCentrality(component, inputbetwCent, nodesdict):
    """Return the mean centrality of the entities in `component`.

    component     -- sized collection of node names
    inputbetwCent -- sequence of centrality values, indexed by position
    nodesdict     -- mapping from node name to its position in `inputbetwCent`

    Returns 0 for an empty component. The division is forced to floating
    point so that integer inputs are not truncated under Python 2.
    Raises KeyError if an entity is missing from `nodesdict`.
    """
    if len(component) == 0:
        return 0
    total = 0
    for entity in component:
        total = total + inputbetwCent[nodesdict[entity]]
    return total / float(len(component))

In [None]:
#returns the list of unique record types of the given component
def getTypeList(component):
    """Collect the distinct record types of the entities in `component`.

    Types are returned in order of first appearance; each entity's type is
    obtained through getEntityType.
    """
    distinct_types = []
    for member in component:
        member_type = getEntityType(member)
        if member_type in distinct_types:
            continue
        distinct_types.append(member_type)
    return distinct_types

In [None]:
#returns the type of the given record
def getEntityType(entity):
    """Look up the 'type' attribute stored on node `entity` of the global graph G."""
    # NOTE(review): G.node is the older NetworkX accessor (newer releases expose G.nodes) - confirm the installed version
    node_attributes = G.node[entity]
    return node_attributes['type']

In [None]:
#for selected country, get all nodes with those countries, find their centralities, return average
def getAvgBetwCentCountry(country, centrality, nodesdict):
    """Return the mean of `centrality` over all nodes of G that mention `country`.

    country    -- value searched for inside each node's 'countries' attribute
    centrality -- sequence of centrality values, indexed by position
                  (any centrality measure, not only betweenness)
    nodesdict  -- mapping from node name to its position in `centrality`

    Nodes whose 'countries' attribute reads 'nan' are skipped. Returns -1 when
    no node matches. The division is forced to floating point so the average
    is not truncated under Python 2.
    """
    total = 0
    counter = 0
    for index, node in G.nodes(data=True):
        countries = node['countries']
        # NOTE(review): `in` is a substring test when 'countries' is a string,
        # so a country name contained in a longer one would also match - verify the data format
        if str(countries) != 'nan' and country in countries:
            total = total + centrality[nodesdict[index]]
            counter = counter + 1
    if counter > 0:
        return total / float(counter)
    return -1

In [None]:
#returns the list of records (node attribute dicts) in the network for the given country
def getCountryNodes(country):
    """Return the attribute dicts of all nodes of G that mention `country`.

    Nodes whose 'countries' attribute reads 'nan' are skipped. Callers take
    len() of the result to get the number of records.
    """
    countrynodelist = list()
    for index, node in G.nodes(data=True):
        countries = node['countries']
        # NOTE(review): `in` is a substring test when 'countries' is a string - verify the data format
        if str(countries) != 'nan' and country in countries:
            countrynodelist.append(node)
    return countrynodelist

In [None]:
def getEntityYear(entity):
    """Look up the 'year' attribute stored on node `entity` of the global graph G."""
    node_attributes = G.node[entity]
    return node_attributes['year']

In [None]:
def getDiversity(component, k=19):
    """Return the diversity (IQV) of the record types found in `component`.

    component -- iterable of node names
    k         -- K parameter handed to calcIQV; defaults to the value 19 that
                 was previously hard-coded
                 TODO automate K-parameter

    The sum of squared type percentages comes from calcSumSquare. The
    percentage table used to be computed here as well, but its result was
    never used, so that redundant pass was removed.
    """
    sumSq = calcSumSquare(component) #returns sum of squares
    return calcIQV(k, sumSq)

In [None]:
def getTypeDict(component):
    """Return a dictionary of the form "type: occurrences" for `component`.

    Each entity is tallied through addtoTypeDict.
    """
    occurrences = {}
    for member in component:
        occurrences = addtoTypeDict(member, occurrences)
    return occurrences

In [None]:
def addtoTypeDict(entity, entTypeDict):
    """Increase the occurrence count of `entity`'s type in `entTypeDict`.

    The dictionary is updated in place and also returned; a type that is seen
    for the first time starts at 1.
    """
    kind = getEntityType(entity)
    entTypeDict[kind] = entTypeDict.get(kind, 0) + 1
    return entTypeDict

In [None]:
#function that takes year as an input and returns subgraph for that year including years before that
def getSubGraph(year):
    """Return the subgraph of G made of the nodes whose year is <= `year`.

    A node's 'year' attribute is rounded to the nearest integer before the
    comparison; nodes whose year is NaN are left out.

    The debug prints (one line per node plus the full node list) were removed:
    they flooded the cell output and used Python-2-only print statements.
    """
    cutoff = int(year)  # loop-invariant, converted once
    subnodes = list()
    #obtain list of nodes for that year
    for node, fields in G.nodes(data=True):
        node_year = fields['year']
        if math.isnan(node_year):
            continue
        if int(round(node_year)) <= cutoff:
            subnodes.append(node)
    return G.subgraph(subnodes)

In [None]:
#gets earliest year of all neighbours
def getNeighborYear(node):
    """Return the smallest 'year' attribute among the neighbours of `node` in G.

    The search starts from the sentinel value 5000, which is returned
    unchanged when no neighbour year compares below it.
    """
    earliest = 5000
    for neighbor in G.neighbors(node):
        candidate = G.node[neighbor]['year']
        if earliest > candidate:
            earliest = candidate
    return earliest

In [None]:
#takes component, returns a dictionary in form of type:percentage
#slightly differs from SNA.xls, because in excel, grants were subtracted from the percentage - here we dont subtract
def calcPctperType(component):
    """Return, for each record type in `component`, its share in percent.

    The occurrence counts come from getTypeDict; the total is accumulated as a
    float so the percentages are never integer-divided.
    """
    occurrences = getTypeDict(component)
    total = sum(occurrences.values(), 0.0)
    return {kind: 100*count/total for kind, count in occurrences.items()}

In [None]:
#returns a sum of squares for type occurence percentages for each component
def calcSumSquare(component):
    """Return the sum of the squared per-type percentages of `component`.

    The percentages come from calcPctperType. The running-total dictionary
    that used to be filled here was never read or returned, so it was removed.
    """
    pctperType = calcPctperType(component)
    sumSQ = 0.0
    for pct in pctperType.values():
        sumSQ = sumSQ + pct*pct
    return sumSQ

In [None]:
def calcIQV(k, sumSQ):
    """Return k * (10000 - sumSQ) / (10000 * (k - 1)).

    k     -- K parameter of the formula
    sumSQ -- sum of squared percentages (the 10000 constant is 100 squared)

    The denominator is converted to float so that integer arguments are not
    truncated by Python 2 integer division.
    Raises ZeroDivisionError when k == 1.
    """
    return k * (10000 - sumSQ) / float(10000 * (k - 1))

## Functions for time analysis

In [None]:
#first creates a subgraph for a given year, then returns the betweenness centrality for that year
def getBetwCent(year):
    """Return nx.betweenness_centrality for the subgraph built by getSubGraph(year).

    Only the year is printed, as a progress marker. The full centrality result
    is no longer dumped to the cell output.
    """
    Y = getSubGraph(year)
    yearbetwcent = nx.betweenness_centrality(Y)
    print(year)
    return yearbetwcent

In [None]:
#calculates the list of betweenness centralities once for each, retrieved later
def getBetwCentDict(startyear, finishyear):
    """Map every year in range(startyear, finishyear) to its getBetwCent result.

    `finishyear` itself is not included, following range() semantics.
    """
    return {year: getBetwCent(year) for year in range(startyear, finishyear)}

In [None]:
#function that takes a component list as an input and produces dataframe with components as rows and SNAs as columns
def getSNAdf (incomplist, inputbetwcent):
    """Build a DataFrame with one row per component of `incomplist`.

    Columns written: 0 = component members, 1 = size, 2 = average betweenness
    centrality, 3 = number of distinct types, 4 = diversity, 5 = list of types.
    """
    subdf = pd.DataFrame(incomplist)
    for index, row in subdf.iterrows():
        subdf.at[index,0] = incomplist[index]
        subdf.at[index,1] = len(incomplist[index])
        # NOTE(review): getAvgBetwCent is not defined in the cells visible here
        # (the visible helper is getAvgCentrality, which takes a third argument) - confirm where it comes from
        subdf.at[index,2] = getAvgBetwCent(incomplist[index], inputbetwcent)
        subdf.at[index,3] = len(getTypeList(incomplist[index]))
        subdf.at[index,4] = getDiversity(incomplist[index])
        subdf.at[index,5] = getTypeList(incomplist[index])
    return subdf;

In [None]:
# NOTE(review): subcomplist and subbetwCent are not assigned in any cell visible here -
# confirm where they are defined before running this cell on a fresh kernel
yeardf = getSNAdf(subcomplist, subbetwCent)
yeardf

In [None]:
# NOTE(review): betwCent is not assigned in any cell visible here (the betweenness list
# computed earlier is named `betw`) - confirm which object is meant
totaldf = getSNAdf(complist, betwCent)
totaldf

In [None]:
#get list of dataframes with SNA results for each year
def getSNAdfdict(start, finish):
    """Map every year in range(start, finish) to the SNA DataFrame for that year.

    For each year the global complist is restricted with getSubCompList and
    handed to getSNAdf together with globalbetwcentdict[year].
    """
    # NOTE(review): globalbetwcentdict is not assigned in the cells visible here - confirm where it is built
    return {
        year: getSNAdf(getSubCompList(complist, year), globalbetwcentdict[year])
        for year in range(start, finish)
    }

In [None]:
# Build the per-year SNA tables for range(2010, 2018) and display the one for 2014.
SNAdfdict = getSNAdfdict(2010,2018)
SNAdfdict[2014]

In [None]:
#input component index, output SNA measures by years for ONE component
def getcompSNAbyYears(componentIndex, inputSNAdfdict):
    """Return a DataFrame of SNA measures (rows) per year (columns) for one component.

    componentIndex -- row label of the component inside each yearly DataFrame
    inputSNAdfdict -- mapping year -> DataFrame laid out as produced by getSNAdf

    The years are fixed to range(2010, 2018), so `inputSNAdfdict` needs an
    entry for each of them.
    """
    yearlist = list(range(2010, 2018))
    compSNAdf = pd.DataFrame(index=['component_entities','size','avgBetwCent','numberoftypes','diversity'], columns=yearlist)
    #for yearindex,yearSNAdf in enumerate(inputSNAdflist):#for each dataframe for a year
    for year in yearlist:
        #get measures for componentindex
        #compSNAdf.at['component_entities', year] = inputSNAdfdict[year][0][componentIndex]
        compSNAdf.at['size', year] = inputSNAdfdict[year][1][componentIndex]
        compSNAdf.at['avgBetwCent', year] = inputSNAdfdict[year][2][componentIndex]
        compSNAdf.at['numberoftypes', year] = inputSNAdfdict[year][3][componentIndex]
        compSNAdf.at['diversity', year] = inputSNAdfdict[year][4][componentIndex]
        # NOTE(review): 'types' is not among the row labels created above, so this assignment
        # relies on pandas adding the row on the fly - confirm it behaves as intended
        compSNAdf.at['types', year] = inputSNAdfdict[year][5][componentIndex]
        #compSNAdf['component_entities'][yearlist[yearindex]] = inputSNAdflist[componentIndex][0]
    return compSNAdf;

In [None]:
# Yearly SNA measures for the component with index 1, displayed as the cell output.
outcompSNAdf = getcompSNAbyYears(1,SNAdfdict)
outcompSNAdf

In [None]:
#input list with component indexes, output SNA measures by years for ALL SPECIFIED components
def getcompSNAbyYearsAgg(componentIndexList, inputSNAdfdict):
    """Return a DataFrame of aggregated SNA measures (rows) per year (columns).

    For each year in range(2010, 2018) the listed components are combined into:
    their total size, the mean of their average betweenness centralities, the
    number of distinct types, and the diversity of the merged components.
    The 'component_entities' row is created but left unfilled.
    """
    years = list(range(2010, 2018))
    row_labels = ['component_entities','aggsize','aggavgBetwCent','aggnumberoftypes','aggdiversity']
    aggregated = pd.DataFrame(index=row_labels, columns=years)
    for year in years:
        call_args = (componentIndexList, inputSNAdfdict, year)
        aggregated.at['aggsize', year] = aggCompSizes(*call_args)
        aggregated.at['aggavgBetwCent', year] = aggavgBetwCent(*call_args)
        aggregated.at['aggnumberoftypes', year] = len(countUniqueTypes(*call_args))
        aggregated.at['aggdiversity', year] = aggDiversity(*call_args)
    return aggregated

In [None]:
#takes list of component indexes, aggregates their sizes into one sum
def aggCompSizes(componentIndexList,inputSNAdfdict,year):
    """Add up column 1 (the component size) of the `year` table over the given component indexes."""
    return sum(inputSNAdfdict[year][1][position] for position in componentIndexList)

In [None]:
#takes list of component indexes, finds an average of their BetwCentralities
def aggavgBetwCent(componentIndexList,inputSNAdfdict,year):
    """Return the mean of column 2 (average betweenness centrality) over the given components.

    componentIndexList -- component indexes to combine
    inputSNAdfdict     -- mapping year -> table whose column 2 holds the value per component
    year               -- key selecting the table

    Returns 0 for an empty index list, matching getAvgCentrality. The division
    is forced to floating point so it is not truncated under Python 2.
    """
    if len(componentIndexList) == 0:
        return 0
    total = 0
    for componentIndex in componentIndexList:
        total = total + inputSNAdfdict[year][2][componentIndex]
    return total / float(len(componentIndexList))

In [None]:
#takes list of component indexes, returns the unique types for the whole list
def countUniqueTypes(componentIndexList,inputSNAdfdict, year):
    """Return the distinct types found in column 5 of the `year` table.

    Only the listed components are visited; types are kept in order of first
    appearance. Callers take len() of the result to obtain the count.
    """
    distinct = []
    for position in componentIndexList:
        for kind in inputSNAdfdict[year][5][position]:
            if kind in distinct:
                continue
            distinct.append(kind)
    return distinct

In [None]:
#takes list of component indexes, returns diversity of the merged components
def aggDiversity(componentIndexList,inputSNAdfdict, year):
    """Return getDiversity of the union of the listed components for `year`.

    The entities of every listed component (column 0 of the `year` table) are
    concatenated into one list, which is then handed to getDiversity.

    The unique-type list and the type-occurrence dictionary that used to be
    computed here were never used, so those redundant passes were removed.
    """
    totalcompentities = list()
    for componentIndex in componentIndexList:
        totalcompentities.extend(inputSNAdfdict[year][0][componentIndex])
    return getDiversity(totalcompentities)

In [None]:
#function that takes list of components, and outputs another list of components,
#where each component only has nodes with a year that is equal or earlier to the specified year
def getSubCompList(complist, year):
    """Return one filtered list per component of `complist`.

    An entity is kept when getEntityYear(entity) <= year. A component with no
    matching entity still yields an (empty) list, so positions line up with
    `complist`.
    """
    return [
        [entity for entity in component if getEntityYear(entity) <= year]
        for component in complist
    ]