## We will transform a well-known piece of planning literature into a word co-occurrence network. This uses scikit-learn, a machine learning package that makes it easy to extract co-occurrence patterns.

In [None]:
import numpy as np
import networkx as nx
from sklearn.feature_extraction.text import CountVectorizer
import re
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
def clean(s):
    """Return a normalized copy of the sentence *s*.

    Steps, applied in this order: lowercase the text, delete every
    '[sic]' marker, strip digits from the very start of the string
    (digits elsewhere are kept), then drop every character that is
    neither a word character nor whitespace.
    """
    lowered = s.lower()
    without_sic = lowered.replace('[sic]','')
    # lstrip only touches the leading run of digits, e.g. a paragraph number
    without_leading_digits = without_sic.lstrip('0123456789')
    return re.sub(r'[^\w\s]','',without_leading_digits)

doc = []  # one cleaned sentence per entry; this is the corpus fed to the vectorizer

# Read the whole charter. The `with` block closes the file handle as soon as
# the lines are loaded instead of leaving it open for the rest of the session.
with open(r'/Users/xyzjayne/Documents/CP290E/Lecture Files/Feb 20/athens_charter.txt','r') as f:
    text = f.readlines()  # a list of every line in the text file

for line in text:
    # split('.') leaves whatever follows the last period as the final piece;
    # [:-1] discards it, so a line containing no period contributes nothing.
    sentences = line.split('.')[:-1]
    for s in sentences:
        doc.append(clean(s))

In [None]:
# Unigram counts per sentence, with English stop words removed.
count_model = CountVectorizer(stop_words='english', ngram_range=(1,1))
X = count_model.fit_transform(doc)

# Multiplying the transposed count matrix by itself gives a word-by-word
# sparse matrix: entry (i, j) sums count_i * count_j over all sentences.
Xc = X.T.dot(X)
Xc.setdiag(0)  # a word co-occurring with itself is not an edge
print(Xc)

In [None]:
# Show the fitted mapping from each word to its column index.
print(count_model.vocabulary_)

In [None]:
# Invert the vocabulary (word -> column id) into a lookup from column id to word.
word_dict = {col_id: word for word, col_id in count_model.vocabulary_.items()}

In [None]:
G = nx.Graph()

# Visit only the entries the sparse matrix actually stores, rather than
# probing every (i, j) pair with a scalar lookup, which costs one sparse index
# operation per pair of vocabulary words. Sorting the column indices keeps the
# row-major, ascending-column visiting order of the nested-range version.
coo = Xc.tocsr().sorted_indices().tocoo()
for i, j, w in zip(coo.row, coo.col, coo.data):
    if w > 0:  # skip stored zeros (e.g. diagonal entries zeroed earlier)
        G.add_edge(word_dict[i], word_dict[j], weight=w)  # weight here indicates frequency of co-occurrence

In [None]:
# Restrict the analysis to the largest connected component: the average
# shortest path length below is only defined on a connected graph.
# nx.connected_component_subgraphs is deprecated and later removed in
# networkx 2.x; building the subgraph from connected_components is the
# documented replacement and yields the same component.
G = G.subgraph(max(nx.connected_components(G), key=len)).copy()
N = len(G.nodes)
print('Number of nodes in largest connected components: {}'.format(N))

# list(...) hands np.mean a real sequence on both Python 2 and Python 3
k_avg = np.mean(list(dict(G.degree()).values()))
print('Average degree: {}'.format(k_avg))

cc = nx.average_clustering(G)
sp = nx.average_shortest_path_length(G)
print('Average clustering coefficient: {}'.format(cc))
print('Average shortest path length: {}'.format(sp))

In [None]:
# Histogram of node degrees in the co-occurrence graph.
degree_by_word = dict(G.degree())
plt.figure()
plt.hist(list(degree_by_word.values()))
plt.show()

In [None]:
# List every word with its degree, highest degree first; ties are broken by
# the word itself, also in descending order.
for key, value in sorted(dict(G.degree()).items(), key=lambda kv: (kv[1], kv[0]), reverse=True):
    print("%s: %s" % (key, value))

In [None]:
# Partition the edges by co-occurrence weight in a single pass.
elarge = []  # high-weight edges (weight > 3)
esmall = []  # the remaining edges (weight <= 3)
for u, v, d in G.edges(data=True):
    if d['weight'] > 3:
        elarge.append((u, v))
    else:
        esmall.append((u, v))

# Graph holding only the high-weight edges, used for plotting. Only the
# endpoints are copied, so H carries no weight attributes.
H = nx.Graph()
H.add_edges_from(elarge)


In [None]:
plt.figure(figsize=(8,8))
pos = nx.spring_layout(H)  # positions for all nodes

# Draw nodes, edges and labels using the layout computed above. Without
# pos=..., draw_networkx ignores `pos` and computes a second layout of its own.
nx.draw_networkx(H, pos=pos, font_size=10)
plt.title('Most Commonly Seen Word Co-occurrence in Athens Charter')
plt.axis('off')
plt.show() # display