# Whole Earth Book Catalog: SNAP

During the Spring 2019 semester, we used the Stanford Network Analysis Project (SNAP) Python library to build and analyze our network. <br>
*For more information about snap: <br>
http://snap.stanford.edu/snappy/index.html <br>
For the documentation, tutorials, and reference manual:<br>
http://snap.stanford.edu/snappy/doc/index.html<br>*

# -> dataimport.py

Creates the initial graph of all data

In [None]:
import snap
import time
import os, sys
import csv

## *data import functions*

**loadNodes:**

In [None]:
def loadNodes(g, nodes, nodefile):
    """Load nodes from a tab-separated file into the dict `nodes` and graph `g`.

    Each row is ``<integer id> TAB <field> [TAB <field> ...]``.  The first
    row seen for an id wins: its remaining fields are stored as a list in
    ``nodes[id]`` and the id is added to the graph with ``g.AddNode``.  Later
    rows repeating an id are ignored, as are completely empty rows.

    Nodes whose first field is the string "None" are still loaded: loadRels
    looks up that field for both endpoints of every edge and does the
    filtering there.

    g        -- graph object providing AddNode(id)
    nodes    -- dict updated in place: id -> list of the row's other fields
    nodefile -- path of the TSV file to read
    """
    # The csv module wants a binary file on Python 2 and a text file on Python 3.
    mode = 'rb' if sys.version_info[0] < 3 else 'r'
    with open(nodefile, mode) as handle:
        for row in csv.reader(handle, delimiter='\t'):
            if not row:
                continue
            key = int(row[0])
            # if the node is not already in the dict + graph, add it
            if key not in nodes:
                nodes[key] = row[1:]
                g.AddNode(key)

**loadRels:**

In [None]:
def loadRels(g, nodes, relfile):
    """Add the edges listed in a tab-separated file to graph `g`.

    Each row is ``<source id> TAB <destination id>``.  An edge is added with
    ``g.AddEdge(src, dst)`` only when both endpoints have an entry in `nodes`
    and neither entry has "None" as its first field.  Empty rows, and rows
    that mention an id with no (or an empty) entry in `nodes`, are skipped.

    g       -- graph object providing AddEdge(src, dst)
    nodes   -- dict: id -> list of fields, as filled in by loadNodes
    relfile -- path of the TSV file to read
    """
    # The csv module wants a binary file on Python 2 and a text file on Python 3.
    mode = 'rb' if sys.version_info[0] < 3 else 'r'
    with open(relfile, mode) as handle:
        for row in csv.reader(handle, delimiter='\t'):
            if not row:
                continue
            src, dst = int(row[0]), int(row[1])
            srcFields = nodes.get(src)
            dstFields = nodes.get(dst)
            # When `nodes` was filled by loadNodes, an id without an entry was
            # never added to the graph, so there is nothing to attach to.
            if not srcFields or not dstFields:
                continue
            if srcFields[0] != "None" and dstFields[0] != "None":
                g.AddEdge(src, dst)

**outputGraph:**

In [None]:
def outputGraph(graph_name, g, output_dir):
    """Save graph `g` to the file <output_dir>/<graph_name>.

    `output_dir` is created (including missing parents) when it does not
    exist yet.  The graph is written with ``g.Save`` through a ``snap.TFOut``
    stream, and the destination path is printed once the stream is flushed.

    graph_name -- file name for the saved graph
    g          -- snap graph providing Save(stream)
    output_dir -- directory to write into
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    outputPath = os.path.join(output_dir, graph_name)
    FOut = snap.TFOut(outputPath)
    g.Save(FOut)
    FOut.Flush()
    # Report only after the save succeeded.  The whole formatted string sits
    # inside the parentheses so the line prints the same text on Python 2 and
    # does not fail on Python 3 (where `print(...) % x` is `None % x`).
    print("Saved in: %s" % outputPath)

## *main function*

Uses the functions above to build a graph from the small batch, prints the number of nodes and edges, and then iterates through the nodes and the edges.

In [None]:
# run to create graph and print number of nodes and edges

# Input files.  The small *_100 files are active; to use the larger batch,
# comment them out and uncomment the *_100000 pair instead.
#nodefile = "data/small_batch_100000.tsv"
#relfile = "data/small_rel_batch_100000.tsv"
nodefile = "data/small_batch_100.tsv"
relfile = "data/small_rel_batch_100.tsv"

nodes = {}
g = snap.TNGraph.New()
loadNodes(g, nodes, nodefile)
loadRels(g, nodes, relfile)
# uncomment to save the graph to graph/dhc_graph1
#outputGraph("dhc_graph1", g, "graph/")
# get number of nodes and edges
# (the concatenation is inside the parentheses so it works on Python 2 and 3)
print("Number of nodes: " + str(g.GetNodes()))
print("Number of edges: " + str(g.GetEdges()))

In [None]:
# run to iterate through all nodes
# prints "<node id>: <list of fields stored for that id in `nodes`>"
print("Iterating through nodes")
for NI in g.Nodes():
    nodeId = NI.GetId()
    # formatting happens inside print(...) so this runs on Python 2 and 3
    print("%s: %s" % (nodeId, nodes.get(nodeId)))

In [None]:
# run to iterate through edges
# prints "from <source id> (<first field of source>) to <destination id> (<first field of destination>)"
print("Iterating through edges")
for EI in g.Edges():
    src = EI.GetSrcNId()
    dst = EI.GetDstNId()
    # formatting happens inside print(...) so this runs on Python 2 and 3
    print("from %s (%s) to %s (%s)" % (src, nodes.get(int(src))[0],
                                       dst, nodes.get(int(dst))[0]))

# -> stats.py

Takes a graph and runs statistics on it

In [None]:
# Run snap.PrintInfo on the current graph `g`.  The arguments are the graph, a
# description string, the output file name ("stats.txt") and a final flag
# passed as False (presumably snap's fast-info switch -- TODO confirm).
snap.PrintInfo(g, "Python type PNGraph","stats.txt", False)
# IPython shell escape: display the contents of stats.txt in the notebook.
!cat "stats.txt"

# -> timeslice.py

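Takes two years and writes two text files: an edge file with every edge that touches an edition published between those two years (inclusive), and a node file with those editions plus the nodes at the other end of those edges.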

## *timeslice functions*

In [None]:
# loadNodesToDict: loads all nodes into dictionary for access in const time in the next step
def loadNodesToDict(nodes, nodefile):
    """Fill the dict `nodes` from a tab-separated node file.

    Each row is ``<integer id> TAB <field> [TAB <field> ...]``.  The first
    row seen for an id wins: its remaining fields are stored as a list in
    ``nodes[id]``.  Later rows repeating an id, and empty rows, are ignored.
    Unlike loadNodes, no graph is touched.

    nodes    -- dict updated in place: id -> list of the row's other fields
    nodefile -- path of the TSV file to read
    """
    # The csv module wants a binary file on Python 2 and a text file on Python 3.
    mode = 'rb' if sys.version_info[0] < 3 else 'r'
    with open(nodefile, mode) as handle:
        for row in csv.reader(handle, delimiter='\t'):
            if not row:
                continue
            # setdefault only stores the fields when the id is not in the dict yet
            nodes.setdefault(int(row[0]), row[1:])

In [None]:
# timeslice: takes in two years, which serve as an inclusive range of years, and two
# filenames, nodeout and relout. Every edge that has an edition from the range at either
# end is written to relout (source ID <TAB> destination ID, one edge per line). Those
# editions, plus the nodes at the other end of those edges, are written to nodeout
# (one ID per line).
# Reads the notebook-level graph `g` and node dictionary `nodes`.
def _yearsInField(field):
    """Return the years found in a year field as a list of ints ([] if none).

    Two layouts are understood: a single four-character year ("1944"), and
    two years joined by an en dash ("1943<dash>1945"), which yields both.
    Anything else, or a field whose year part is not numeric, gives [].
    """
    try:
        if len(field) == 4:
            return [int(field)]
        # "\xe2" is the first byte of the UTF-8 en dash, which is how the dash
        # shows up in the byte strings Python 2's csv module returns.  On
        # Python 3 the field is text and the dash is the single character U+2013.
        isRange = "\xe2" in field or (
            isinstance(field, type(u"")) and u"\u2013" in field)
        if isRange:
            # First four and last four characters, so the width of the dash
            # (three bytes or one character) does not matter.
            return [int(field[:4]), int(field[-4:])]
    except ValueError:
        # non-numeric year text: treat the field as having no usable year
        pass
    return []


def timeslice(lower, upper, nodeout, relout):
    """Write the nodes and edges around editions dated within [lower, upper].

    lower, upper -- first and last year of the range (both inclusive)
    nodeout      -- path of the node file to write (one ID per line)
    relout       -- path of the edge file to write ("src<TAB>dst" per line)
    """
    # editionIDs is a set because editions could be listed twice, and we want all unique editions
    editionIDs = set()
    for NI in g.Nodes():
        nodeId = NI.GetId()
        # only edition nodes (IDs 10000000-19999999) carry the year we filter on
        if not (10000000 <= nodeId < 20000000):
            continue
        fields = nodes.get(nodeId)
        # fields[2] refers to the year of a node; skip nodes without one
        if fields is None or len(fields) < 3:
            continue
        # if any year of the field falls into the range, keep the edition
        if any(lower <= year <= upper for year in _yearsInField(fields[2])):
            editionIDs.add(nodeId)

    relIDs = set()
    with open(relout, "w") as relFile:
        for EI in g.Edges():
            src, dst = EI.GetSrcNId(), EI.GetDstNId()
            # check both source and destination of the edge to see if they're
            # part of the set constructed above; remember the node at the
            # other end so it also reaches the node file
            if src in editionIDs:
                relIDs.add(dst)
            elif dst in editionIDs:
                relIDs.add(src)
            else:
                continue
            relFile.write(str(src) + "\t" + str(dst) + "\n")

    # final node output: the editions in range plus everything linked to them
    with open(nodeout, "w") as nodeFile:
        for ID in editionIDs | relIDs:
            nodeFile.write(str(ID) + "\n")

## *main function*

In [None]:
# dictionary of all nodes created 
nodes = { }
loadNodesToDict(nodes, nodefile)
nodeout = "timeslice/nodeout.txt"
relout = "timeslice/relout.txt"
# sample years
timeslice(1943, 1945, nodeout, relout)

print "nodes from timeslice:"
!cat "timeslice/nodeout.txt"
print "edges from timeslice:"
!cat "timeslice/relout.txt"

# -> makegraph.py

Makes graph of timeslice data

## *makegraph functions*

In [None]:
# different than original because file inputs are different, look at contents of nodeout.txt
def loadNodes2(g, nodein):
    """Add one node to `g` for every line of `nodein`.

    The file holds one integer node ID per line.  Blank lines are skipped.

    g      -- graph object providing AddNode(id)
    nodein -- path of the ID list to read
    """
    # `with` closes the file even if a line cannot be parsed
    with open(nodein, "r") as nodefile:
        for line in nodefile:
            line = line.strip()
            if line:
                g.AddNode(int(line))

In [None]:
def loadRels2(g, relin):
    """Add one edge to `g` for every row of the tab-separated file `relin`.

    Each row is ``<source id> TAB <destination id>``; empty rows are skipped.
    Unlike loadRels, no node dictionary is consulted: every listed edge is
    passed to ``g.AddEdge``.

    g     -- graph object providing AddEdge(src, dst)
    relin -- path of the TSV edge list to read
    """
    # The csv module wants a binary file on Python 2 and a text file on Python 3.
    mode = 'rb' if sys.version_info[0] < 3 else 'r'
    with open(relin, mode) as relfile:
        for row in csv.reader(relfile, delimiter='\t'):
            if row:
                g.AddEdge(int(row[0]), int(row[1]))

## *main function*

In [None]:
# Replace `g` with a fresh directed graph built from the two timeslice files:
# every node is added first, then the edges between them.
g = snap.TNGraph.New()
nodein, relin = "timeslice/nodeout.txt", "timeslice/relout.txt"
loadNodes2(g, nodein)
loadRels2(g, relin)

# -> community.py

Takes a graph and returns community info

In [None]:
from collections import defaultdict

# CommunityCNM is run on an undirected (PUNGraph) version of the current graph
g = snap.ConvertGraph(snap.PUNGraph, g)

# bookkeeping used by the printing cell: running community number, size of
# the community being printed, and community number -> size
commSize = defaultdict(int)
size = 0
cnt = 0

# conduct community analysis on a subgraph created previously;
# CmtyV is filled by the call and is what the printing cell iterates over
CmtyV = snap.TCnComV()
modularity = snap.CommunityCNM(g, CmtyV)

In [None]:
# print the output, specifically including the size of the community and each node's type (for future filtering)
# Output format: a "Community: <n>" line, then one "<node id>: <name> (<type>)" line per member.

# The node type is encoded in the ID: one block of 10,000,000 IDs per type,
# in this order.  IDs outside 0-39999999 are reported as "None".
nodeTypes = ["Place", "Edition", "Publisher", "Person"]

# Start from a clean slate so that re-running this cell numbers the
# communities from 0 again and leaves no stale sizes behind.
cnt = 0
commSize.clear()
for Cmty in CmtyV:
    print("Community: " + str(cnt))
    size = 0
    for NI in Cmty:
        if 0 <= NI < 40000000:
            nodeType = nodeTypes[NI // 10000000]
        else:
            nodeType = "None"
        print(str(NI) + ": " + nodes.get(NI)[0] + " (" + nodeType + ")")
        size += 1
    commSize[cnt] = size
    cnt += 1

In [None]:
rankfile = "community/ranks.txt"
# make sure the output directory exists before opening the file
rankdir = os.path.dirname(rankfile)
if not os.path.exists(rankdir):
    os.makedirs(rankdir)
# sort communities from largest to smallest and print to rank file
# (one title line, then "<community>: <size>" per line)
ranked = sorted(commSize.items(), key=lambda item: item[1], reverse=True)
with open(rankfile, "w") as f:
    f.write("Ranking by Community Size\n")
    for community, members in ranked:
        f.write("%s: %s\n" % (community, members))

In [None]:
# IPython shell escape: display the rank file in the notebook
!cat "community/ranks.txt"

# -> analysis.py

Takes a list of node IDs and writes files listing all people, publishers, and places in the communities that contain those IDs.

## analysis functions

In [None]:
def relatedNodes(comfile, rankfile, ids, prefix):
    """Collect the people, places and publishers sharing a community with `ids`.

    comfile  -- path of the community listing: "Community: <n>" header lines,
                each followed by one "<node id>: <name> (<type>)" line per
                member of that community
    rankfile -- path of the rank file: "<community>: <size>" lines, which may
                be preceded by a title line
    ids      -- iterable of integer node IDs to look up
    prefix   -- passed through to printOutput unchanged

    Four lists are built and handed to printOutput, together with the list
    of matching communities and `prefix`; nothing is returned.  Every list
    entry is a ``(payload, community)`` pair: the payload is the full node
    line for people/places/publishers and the number read from the rank file
    for ranks.
    """
    wanted = set(ids)

    # Pass 1: keep every member line with its community number, and note
    # which communities contain one of the requested IDs.
    members = []
    communities = []
    hits = set()
    curr = None
    with open(comfile) as handle:
        for line in handle.read().splitlines():
            head = line.split(": ")
            if head[0] == "Community":
                curr = int(head[1])
                continue
            if curr is None:
                # member line before any community header: nothing to attach it to
                continue
            members.append((line, curr))
            try:
                nodeId = int(head[0])
            except ValueError:
                # blank or malformed line: cannot be one of the requested IDs
                continue
            if nodeId in wanted and curr not in hits:
                hits.add(curr)
                communities.append(curr)

    # Pass 2: categorizes all member lines of the matching communities into
    # person, place, or publisher and adds them to the appropriate list.
    # Other node types (e.g. editions) are not collected.
    people, places, pubs = [], [], []
    buckets = (("(Person)", people), ("(Place)", places), ("(Publisher)", pubs))
    for line, community in members:
        if community not in hits:
            continue
        for suffix, bucket in buckets:
            if line.endswith(suffix):
                bucket.append((line, community))
                break

    # Rank-ordered list of the communities that the IDs in question are part
    # of, in the order they appear in the rank file.
    ranks = []
    with open(rankfile) as handle:
        for line in handle.read().splitlines():
            parts = line.split(": ")
            try:
                currCom, currSize = int(parts[0]), int(parts[1])
            except (ValueError, IndexError):
                # title line ("Ranking by Community Size") or blank line
                continue
            if currCom in hits:
                # (value, community): same layout as the other three lists,
                # which is what printHelp's rank mode reads
                ranks.append((currSize, currCom))
    printOutput(communities, people, places, pubs, ranks, prefix)

In [None]:
# printing (0, 1, or 2 specifies a type)
# 0 for printing people/places/publishers
# 1 for removing duplicates in printing
# 2 for the printing of ranks 
def printHelp(listo, filename, splitter, opt):
    """Write the pairs in `listo` to `filename` in the layout chosen by `opt`.

    listo    -- list of 2-tuples ``p``; for opt 0 and 1, ``p[0]`` is a string
    filename -- path of the file to (over)write
    splitter -- for opt 0 and 1, only the part of ``p[0]`` before the first
                occurrence of this text is written
    opt      -- 0: one line per pair, "<p[1]>   ***   <start of p[0]>"
                1: one line per distinct pair, "<start of p[0]>", in
                   first-seen order
                any other value: "Community <p[1]> rank: <p[0]>" per pair

    Afterwards prints the file name and ``len(listo)`` (duplicates included).
    """
    # a single `with` closes the file even if a pair has an unexpected shape
    with open(filename, "w") as f:
        if opt == 0:
            for p in listo:
                f.write(str(p[1]) + "   ***   " + str(p[0].split(splitter)[0]) + "\n")
        elif opt == 1:
            # Drop repeated pairs but keep the order in which they first
            # appear, so the file is the same on every run.
            seen = set()
            for p in listo:
                if p in seen:
                    continue
                seen.add(p)
                f.write(str(p[0].split(splitter)[0]) + "\n")
        else:
            for p in listo:
                f.write("Community " + str(p[1]) + " rank: " + str(p[0]) + "\n")
    print("Printed to " + filename + ", size of " + str(len(listo)))

In [None]:
def printOutput(communities, people, places, pubs, ranks, prefix):
    """Write the four result files whose names start with `prefix`.

    people, places and pubs go to <prefix>people.txt, <prefix>places.txt and
    <prefix>pubs.txt through printHelp mode 0; ranks goes to <prefix>ranks.txt
    through mode 2.  `communities` is accepted but not used here.
    """
    # (rows, file name ending, type tag that printHelp cuts each line at)
    nodeReports = (
        (people, "people.txt", "(Person)"),
        (places, "places.txt", "(Place)"),
        (pubs, "pubs.txt", "(Publisher)"),
    )
    for rows, ending, tag in nodeReports:
        printHelp(rows, prefix + ending, tag, 0)
    # De-duplicated variants (upeople.txt, uplaces.txt, upubs.txt) are
    # currently switched off; they would use the same arguments with mode 1.
    printHelp(ranks, prefix + "ranks.txt", " ", 2)

## *main function*

In [None]:
# usage: <script> <name> <node id> [<node id> ...]
# results are written to test/<name>/<name>people.txt, ...places.txt, ...pubs.txt and ...ranks.txt
name = sys.argv[1]
path = "test/" + name
# Create test/<name> (and test/ itself) unless it is already there.
# 0o755 is the octal spelling accepted by both Python 2.6+ and Python 3.
if not os.path.isdir(path):
    os.makedirs(path, 0o755)
# every argument after the name is a node ID to look up
listo = [int(arg) for arg in sys.argv[2:]]

relatedNodes("community.txt", "community/ranks.txt", listo, "test/" + name + "/" + name)