# Whole Earth Book Catalog: SNAP

During the 2019 Spring Semester, we used the Stanford Network Analysis Project Python Library to build and analyze our network <br>
*For more information about snap: <br>
http://snap.stanford.edu/snappy/index.html <br>
For the documentation, tutorials, and reference manual:<br>
http://snap.stanford.edu/snappy/doc/index.html<br>*

# -> dataimport.py

Creates the initial graph of all data

In [1]:
import snap
import time
import os, sys
import csv

## *data import functions*

**loadNodes:**

In [2]:
def loadNodes(g, nodes, nodefile):
  with open(nodefile,'rb') as nodefile:
    fileReader = csv.reader(nodefile, delimiter='\t')
    for row in fileReader:
      key = int(row[0])
# if the node is not already in the dict + graph, add it
      if nodes.get(key, -1) == -1:
        value = row[1:]
        if value != "None":
            nodes.update({key:value})
            g.AddNode(key)

**loadRels:**

In [3]:
def loadRels(g, nodes, relfile):
  with open(relfile, 'rb') as relfile:
    fileReader = csv.reader(relfile, delimiter='\t')
    for row in fileReader:
        if nodes.get(int(row[0]))[0] != "None" and nodes.get(int(row[1]))[0] != "None":
            g.AddEdge(int(row[0]), int(row[1]))

**outputGraph:**

In [4]:
def outputGraph(graph_name, g, output_dir):
    if not os.path.exists(output_dir):
      os.makedirs(output_dir)
    outputPath = os.path.join(output_dir, graph_name)
    print ("Saved in: %s") % outputPath
    FOut = snap.TFOut(outputPath)
    g.Save(FOut)
    FOut.Flush()

## *main function*

Uses the above functions to make graph with the small batch and prints the number of nodes and edges as well as iterating through them

In [5]:
# run to create graph and print number of nodes and edges
'''
nodefile = "data/small_batch_100000.tsv"
relfile = "data/small_rel_batch_100000.tsv"
'''

# uncomment to use smaller size
nodefile = "data/small_batch_100.tsv"
relfile = "data/small_rel_batch_100.tsv"


nodes = { }
g = snap.TNGraph.New()
loadNodes(g, nodes, nodefile)
loadRels(g, nodes, relfile)
#outputGraph("dhc_graph1", g, "graph/")
# get number of nodes and edges
print ("Number of nodes: ") + str(g.GetNodes())
print ("Number of edges: ") + str(g.GetEdges())

Number of nodes: 337
Number of edges: 298


In [6]:
# run to iterate through all nodes
print ("Iterating through nodes")
for NI in g.Nodes():
    print ("%s: %s") % (NI.GetId(), nodes.get(NI.GetId()))

Iterating through nodes
0: ['Hamburg', 'Place']
1: ['Dresden', 'Place']
2: ['Firenze', 'Place']
3: ['Paris', 'Place']
4: ['Bari', 'Place']
5: ['Mu\xcc\x88nchen', 'Place']
6: ['Roma', 'Place']
7: ['Berlin', 'Place']
10: ['Haarlem', 'Place']
11: ['Breslau', 'Place']
12: ["'s-Gravenhage", 'Place']
13: ['Amsterdam', 'Place']
15: ['Naarden', 'Place']
16: ['Milano', 'Place']
17: ['Padova', 'Place']
18: ['Antwerpen', 'Place']
19: ['Reichenberg', 'Place']
22: ['Bochum', 'Place']
23: ['Leipzig', 'Place']
26: ['Giessen', 'Place']
27: ['Berlin etc', 'Place']
28: ['Nu\xcc\x88rnberg', 'Place']
29: ['Karlsbad', 'Place']
30: ['Lie\xcc\x80ge', 'Place']
32: ['Berlin-Friedenau', 'Place']
33: ['Mu\xcc\x88hlhausen i. Thu\xcc\x88r.', 'Place']
34: ['Gu\xcc\x88tersloh', 'Place']
35: ['Brussel', 'Place']
39: ['Stuttgart', 'Place']
42: ['B.-Leipa', 'Place']
47: ['Bayreuth', 'Place']
48: ['Jena', 'Place']
49: ['Leiden', 'Place']
51: ['Voorhout', 'Place']
54: ['Utrecht', 'Place']
55: ['Go\xcc\x88ttingen', 'Place

In [7]:
# run to iterate through edges
print ("Iterating through edges")
for EI in g.Edges():
  print ("from %s (%s) to %s (%s)") % (EI.GetSrcNId(), nodes.get(int(EI.GetSrcNId()))[0], 
                                     EI.GetDstNId(), nodes.get(int(EI.GetDstNId()))[0])

Iterating through edges
from 10000000 (Das feldgraue spruchbuch) to 0 (Hamburg)
from 10000001 (Hölle am Panama-kanal, roman) to 1 (Dresden)
from 10000002 (Fiori freschi) to 2 (Firenze)
from 10000003 (Guerre et crises économiques face au chômage) to 3 (Paris)
from 10000004 (Il dissidio spirituale della Germania con l'Europa. (Ristampa)) to 4 (Bari)
from 10000005 (Schelmuffsky, wahrhaftige, kurïose und sehr gefährliche reisebeschreibung zu wasser und lande, herausgegeben von Will Vesper) to 5 (München)
from 10000006 (L'uomo senza volto, romanzo) to 6 (Roma)
from 10000007 (Moltke und das bauerntum) to 7 (Berlin)
from 10000008 (Wie ermittelt der betrieb das betriebsnotwendige kapital und die verbrauchsbedingte abschreibung?) to 0 (Hamburg)
from 10000009 (Die melodie Indiens) to 7 (Berlin)
from 10000010 (De gemeente in de wereld) to 10 (Haarlem)
from 10000011 (Tranzendentaler Gottesbeweis) to 11 (Breslau)
from 10000012 (Over het gebed) to 12 ('s-Gravenhage)
from 10000013 (Het christel

# -> stats.py

Takes a graph and runs statistics on it

In [8]:
snap.PrintInfo(g, "Python type PNGraph","stats.txt", False)
!cat "stats.txt"

Python type PNGraph: Directed
  Nodes:                    337
  Edges:                    298
  Zero Deg Nodes:           2
  Zero InDeg Nodes:         190
  Zero OutDeg Nodes:        49
  NonZero In-Out Deg Nodes: 100
  Unique directed edges:    298
  Unique undirected edges:  298
  Self Edges:               0
  BiDir Edges:              0
  Closed triangles:         0
  Open triangles:           660
  Frac. of closed triads:   0.000000
  Connected component size: 0.207715
  Strong conn. comp. size:  0.002967
  Approx. full diameter:    6
  90% effective diameter:  3.720302


# -> timeslice.py

Takes in two years and creates two text files (nodes & edges) of all of the nodes and edges between those two years

## *timeslice functions*

In [9]:
# loadNodes: loads all nodes into dictionary for access in const time in the next step
def loadNodesToDict(nodes, nodefile):
  with open(nodefile,'rb') as nodefile:
    fileReader = csv.reader(nodefile, delimiter='\t')
    for row in fileReader:
      key = int(row[0])
# if the node is not already in the dict + graph, add it
      if nodes.get(key, -1) == -1:
        value = row[1:]
        nodes.update({key:value})

In [10]:
# timeslice: takes in two years, which will serve as a range of years, and two filenames, 
# nodeout and relout. All nodes that fall within the range of years will be printed to 
# nodeout, and all edges that have a node within the range will be printed to relout. 
def timeslice(lower, upper, nodeout, relout):
    # editionIDs is a set because editions could be listed twice, and we want all unique editions
    editionIDs = set()
    for NI in g.Nodes():
        # if the node is an edition do the following 
        if NI.GetId() >= 10000000 and NI.GetId() < 20000000:
            # nodes.get(NI.GetId())[2] refers to the year of a node 
            curr = nodes.get(NI.GetId())[2]
            # some year fields have multiple years in them, this if-elif block handles
            # these cases separately by checking if the year(s) fall into the range,
            # and adding them to the edition ID set if so
            
            # if the length of the year is four characters (field has only one year)
            if len(curr) == 4:
              # enclosed in try-catch for possibility that curr could be non-numeric
                try:
                    curr = int(curr)
                    # if year falls into range, add to editionID set
                    if curr >= lower and curr <= upper:
                        editionIDs.add(NI.GetId())
                except:
                    pass
            #  "\xe2" is the '-', this means the year field contains two years 
            elif "\xe2" in curr:
                try:
                    curr1 = int(curr[0:3])
                    curr2 = int(curr[5:8])
                    # if a year falls into range, add to editionID set
                    if curr1 >= lower and curr1 <= upper:
                        editionIDs.add(NI.GetId())
                    if curr2 >= lower and curr2 <= upper:
                        editionIDs.add(NI.GetId())
                except:
                    pass
    relIDs = set()
    file2 = open(relout, "w")
    for EI in g.Edges():
        # check both source and destination of the edge to see if they're part of 
        # the set constructed above, this will indicate whether the edition
        # itself is from the year range
        if EI.GetSrcNId() in editionIDs:
            wr1 = str(EI.GetSrcNId()) + "\t" + str(EI.GetDstNId()) + "\n"
            file2.write(wr1)
            relIDs.add(EI.GetDstNId())
        elif EI.GetDstNId() in editionIDs:
            wr2 = str(EI.GetSrcNId()) + "\t" + str(EI.GetDstNId()) + "\n"
            file2.write(wr2)
            relIDs.add(EI.GetSrcNId())
        else:
            continue
    file2.close()
    
    # take all the nodes that are in editionIDs or relIDs as a precaution (or the sets)
    editionIDs = editionIDs | relIDs
    # this is your final node output
    file1 = open(nodeout, "w")
    for ID in editionIDs:
        file1.write(str(ID) + "\n")
    file1.close()

## *main function*

In [11]:
# dictionary of all nodes created 
nodes = { }
loadNodesToDict(nodes, nodefile)
nodeout = "timeslice/nodeout.txt"
relout = "timeslice/relout.txt"
# sample years
timeslice(1943, 1945, nodeout, relout)

print "nodes from timeslice:"
!cat "timeslice/nodeout.txt"
print "edges from timeslice:"
!cat "timeslice/relout.txt"

nodes from timeslice:
0
1
2
3
4
5
6
7
11
12
16
17
18
19
22
23
26
27
28
29
30
34
35
39
47
48
51
54
55
59
63
68
73
79
80
86
10000001
10000002
10000003
10000004
10000005
10000007
10000008
10000009
10000011
10000012
10000014
10000017
10000018
10000019
10000022
10000023
10000024
10000026
10000027
10000028
10000029
10000030
10000031
10000034
10000035
10000039
10000041
10000043
10000045
10000046
10000047
10000048
10000050
10000051
10000052
10000053
10000054
10000055
10000057
10000059
10000063
10000064
10000065
10000068
10000069
10000070
10000071
10000073
10000074
10000075
10000077
10000078
10000079
10000080
10000081
10000082
10000084
10000085
10000086
10000087
10000088
10000089
10000090
10000092
10000097
20000000
20000001
20000002
20000003
20000004
20000005
20000007
20000009
20000011
20000012
20000014
20000017
20000018
20000019
20000022
20000023
20000024
20000026
20000027
20000028
20000029
20000030
20000031
20000034
20000035
20000039
20000041
20000043
20000046
20000047
20000048
20000050
20000

# -> makegraph.py

Makes graph of timeslice data

## *makegraph functions*

In [12]:
# different than original because file inputs are different, look at contents of nodeout.txt
def loadNodes2(g, nodein):
    nodefile = open(nodein, "r")
    for row in nodefile:
        g.AddNode(int(row))

In [13]:
def loadRels2(g, relin):
  with open(relin, 'rb') as relfile:
    fileReader = csv.reader(relfile, delimiter='\t')
    for row in fileReader:
        g.AddEdge(int(row[0]), int(row[1]))

## *main function*

In [14]:
nodein = "timeslice/nodeout.txt"
relin = "timeslice/relout.txt"
g = snap.TNGraph.New()
loadNodes2(g, nodein)
loadRels2(g, relin)

# -> community.py

Takes a graph and returns community info

In [15]:
from collections import defaultdict

g = snap.ConvertGraph(snap.PUNGraph, g)

# conduct community analysis on a subgraph created previously 
CmtyV = snap.TCnComV()
modularity = snap.CommunityCNM(g, CmtyV)
cnt = 0
size = 0
commSize = defaultdict(int)

In [16]:
# print the output, specifically including the size of the community and each node's type (for future filtering)
for Cmty in CmtyV:
    print "Community: " + str(cnt)
    size = 0
    for NI in Cmty:
            nodeType = "None"
            if NI >= 0 and NI < 10000000:
                nodeType = "Place"
            elif NI >= 10000000 and NI < 20000000:
                nodeType = "Edition"
            elif NI >= 20000000 and NI < 30000000:
                nodeType = "Publisher"
            elif NI >= 30000000 and NI < 40000000:
                nodeType = "Person"
            print str(NI) + ": " + nodes.get(NI)[0] + " (" + nodeType + ")"
            size += 1
    commSize[cnt] = size
    cnt += 1

Community: 0
0: Hamburg (Place)
10000008: Wie ermittelt der betrieb das betriebsnotwendige kapital und die verbrauchsbedingte abschreibung? (Edition)
10000064: Heer und Staat in der deutschen Geschichte (Edition)
20000000: Hanseatische verlagsanstalt (Publisher)
20000064: Hanseatische Verlagsanstalt (Publisher)
30000008: Karl Schwantag (Person)
30000064: Ernst Rudolf Huber (Person)
Community: 1
1: Dresden (Place)
10000001: Hölle am Panama-kanal, roman (Edition)
20000001: F. Müller (Publisher)
30000001: Wilhelm Alexander von Tayenthal (Person)
Community: 2
2: Firenze (Place)
10000002: Fiori freschi (Edition)
20000002: Sansoni (Publisher)
30000002: Mario Praz (Person)
Community: 3
3: Paris (Place)
10000043: Vie de Fixlein, régent de cinquième (Edition)
10000050: La Victoire de Colmar, 20 janvier-9 février, 1945 (Edition)
10000070: Mon journal depuis la libération (Edition)
10000071: Au nom des silencieux (Edition)
10000075: Jean Jaurès. Conférence donnée le 16 février 1933 au T

In [17]:
rankfile = "community/ranks.txt"
# sort communities from largest to smallest and print to rank file
sort = sorted(commSize, key=commSize.get, reverse=True)
f = open(rankfile, "w")
f.write("Ranking by Community Size\n")
for num in range(0, len(commSize)):
    f.write("%s: %s\n" % (sort[num], commSize[sort[num]]))
f.close()

In [18]:
!cat "community/ranks.txt"

Ranking by Community Size
3: 38
7: 19
12: 10
5: 9
19: 9
0: 7
10: 7
15: 6
1: 4
2: 4
4: 4
6: 4
8: 4
9: 4
11: 4
13: 4
14: 4
16: 4
17: 4
18: 4
20: 4
21: 4
22: 4
23: 4
24: 4
25: 4
26: 4
27: 4
28: 4
29: 4
30: 4
31: 4
33: 4
34: 4
32: 3
35: 3
36: 3
37: 3
38: 3


# -> analysis.py

Takes a list of IDs and returns files of all people, publishers, and places in a community with those IDs

## analysis functions

In [19]:
def relatedNodes(comfile, rankfile, ids, prefix):
    communities = []
    places = [] 
    pubs = []
    people = []
    ranks = []
    f = open(comfile).read().splitlines()
    curr = 0
    for line in f:
        if line.split(': ')[0] == "Community":
            curr = int(line.split(': ')[1])
            continue
        elif int(line.split(": ")[0]) in ids:
            communities.append(curr)

    comFlag = False
    for line in f:
        if line.split(": ")[0] == "Community":
            if int(line.split(": ")[1]) in communities:
                comFlag = True
                curr = int(line.split(': ')[1])
            else:
                comFlag = False
        elif comFlag: 
            # categorizes all lines with a node into preson, place, or publisher
            # and adds to the appropriate list
            split = line.split("(")
            lineType = split[len(split)-1]
            if lineType == "Person)":
                elem = (line, curr)
                people.append(elem)
            elif lineType == "Place)":
                elem = (line, curr)
                places.append(elem)
            elif lineType == "Publisher)":
                elem = (line, curr)
                pubs.append(elem)
    # print rank ordered list of communities that the IDs in question are part of 
    rankfile = open(rankfile).read().splitlines()
    for line in rankfile:
        currCom = int(line.split(": ")[0])
        currRank = int(line.split(": ")[1])
        if currCom in communities:
            rank = (currCom, currRank)
            ranks.append(rank)
    printOutput(communities, people, places, pubs, ranks, prefix)

In [20]:
# printing (0, 1, or 2 specifies a type)
# 0 for printing people/places/publishers
# 1 for removing duplicates in printing
# 2 for the printing of ranks 
def printHelp(listo, filename, splitter, opt):
    if opt == 0:
        f = open(filename, "w")
        for p in listo:
            wr = str(p[1]) + "   ***   " + str(p[0].split(splitter)[0]) + "\n"
            f.write(wr)
        f.close()
    elif opt == 1:
        f = open(filename, "w")
        pSet = set(listo)
        for p in pSet:
            wr = str(p[0].split(splitter)[0]) + "\n"
            f.write(wr)
        f.close()
    else:
        f = open(filename, "w")
        for p in listo:
            wr = "Community " + str(p[1]) + " rank: " + str(p[0]) + "\n"
            f.write(wr)
        f.close()
    print("Printed to " + filename + ", size of " + str(len(listo)))

In [21]:
def printOutput(communities, people, places, pubs, ranks, prefix):
    printHelp(people, prefix+"people.txt", "(Person)", 0)
    printHelp(places, prefix+"places.txt", "(Place)", 0)
    printHelp(pubs, prefix+"pubs.txt", "(Publisher)", 0)
    #printHelp(people, prefix+"upeople.txt", "(Person)", 1)
    #printHelp(places, prefix+"uplaces.txt", "(Place)", 1)
    #printHelp(pubs, prefix+"upubs.txt", "(Publisher)", 1)
    printHelp(ranks, prefix+"ranks.txt", " ", 2) 

## *main function*

In [22]:
name = sys.argv[1]
path = "test/" + name
os.mkdir(path, 0755)
listo = []
for i in range(2, len(sys.argv)):
    listo.append(int(i))

relatedNodes("community.txt", "community/ranks.txt", listo, "test/" + name + "/" + name)

OSError: [Errno 17] File exists: 'test/-f'