In [None]:
'''
This notebook does several things:
- Contains code to calculate categorical modularity with respect to Binder categories
- Contains code to calculate modularity with respect to unsupervised clusters
- Contains code to calculate single-category modularity
- Contains examples of code to calculate correlations between modularity and downstream task performance
'''

In [10]:
'''
Calculate general categorical modularity. Read in your category file where words/categories_2.csv is opened and your matrix file where data/muse_finnish.csv is opened.
To generate a matrix, run ftmatrices.py or musematrices.py depending on what model you want. Then, paste the
output file contents into an Excel sheet, split the data to columns, get rid of the [] characters, and re-download
as a csv.
'''
# imports
import matplotlib.pyplot as plt
import numpy as np
import csv 
from sklearn.neighbors import kneighbors_graph

# Category name of every word, one CSV row per word, first column only.
categories = []
# The csv module asks for newline='' on files handed to csv.reader.
with open('words/categories_2.csv', newline='') as csvfile:
    for row in csv.reader(csvfile):
        if row:  # a blank line parses to an empty row and holds no category
            categories.append(row[0])

# Run-length encode the category column: counters[n] is the length of the
# n-th run of identical, consecutive category names.
counters = []
i = 0
# Start from whatever category the file opens with instead of a hard-coded
# name, so a different category file cannot produce a spurious leading 0.
count = categories[0] if categories else None
for elt in categories:
    if elt == count:
        i += 1
    else:
        counters.append(i)
        i = 1
        count = elt
counters.append(i)  # for the last category

# Read in your matrix, no word labels. The with-block closes the file handle
# as soon as the matrix has been parsed.
with open("data/muse_finnish.csv") as file:
    M = np.loadtxt(file, delimiter=",")

lst = [2, 3, 4]  # neighbourhood sizes k to evaluate
for k in lst:
    # k-nearest-neighbour graph over the rows of M, expanded to a dense 0/1
    # adjacency matrix (include_self=True is passed to the graph builder).
    knn = kneighbors_graph(M, k, mode='connectivity', include_self=True)
    knnmatrix = knn.toarray()

    # Categories are addressed purely by row offset, so the category file and
    # the matrix have to list the same words in the same order.
    n_words = knnmatrix.shape[0]
    if sum(counters) != n_words:
        raise ValueError(
            "category file lists %d words but the matrix has %d rows"
            % (sum(counters), n_words))

    # Number of entries equal to 1 in the whole adjacency matrix. This is the
    # normaliser the modularity formula calls 2m.
    m = int(np.count_nonzero(knnmatrix == 1))

    # For each category (a contiguous run of c rows starting at row t):
    #   ac  - a_c in the modularity paper notation: entries equal to 1 in the
    #         category's rows, divided by m
    #   ecc - ell in the modularity paper: entries equal to 1 in the category's
    #         own square block (edges inside the category), divided by m
    ac = []
    ecc = []
    t = 0
    for c in counters:
        rows = knnmatrix[t:t + c]
        ac.append(int(np.count_nonzero(rows == 1)) / m)
        ecc.append(int(np.count_nonzero(rows[:, t:t + c] == 1)) / m)
        t += c

    # Given C total categories, the overall modularity Q sums ecc - ac^2 over
    # the categories; Qmax = 1 - sum(ac^2) is the normaliser.
    Q = 0
    Qmax = 0
    for within, share in zip(ecc, ac):
        Q += within - share * share
        Qmax += share * share
    Qmax = 1 - Qmax
    Qnorm = Q/Qmax

    print(Qnorm)

0.13476501711795832
0.12784867033233047
0.12283431891275029
0:00:01.114690


In [18]:
'''
Compute the modularity of an unsupervised network generated using a community detection algorithm. Read in your
own files as with the previous cell
'''
import math
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import csv 
import networkx.algorithms.community as nx_comm
from sklearn.neighbors import kneighbors_graph
from networkx.algorithms import community
from networkx.algorithms.community import greedy_modularity_communities
from sklearn.metrics.cluster import normalized_mutual_info_score
from datetime import datetime

# One integer label per word for each of the three category files; the
# sections that follow append to these lists.
binder_b_labels, binder_c_labels, binder_d_labels = [], [], []

# Timestamp taken when the cell starts running.
t_start = datetime.now()

# Category name of every word, one CSV row per word, first column only.
binder_b = []
# The csv module asks for newline='' on files handed to csv.reader.
with open('words/categories_1.csv', newline='') as csvfile:
    for row in csv.reader(csvfile):
        if row:  # a blank line parses to an empty row and holds no category
            binder_b.append(row[0])

# Turn the category column into integer labels: the label goes up by one each
# time the category name changes from one row to the next.
i = 0
# Start from whatever category the file opens with instead of a hard-coded
# name, so the first label is always 0.
count = binder_b[0] if binder_b else None
for elt in binder_b:
    if elt != count:
        i += 1
        count = elt
    binder_b_labels.append(i)

print(binder_b_labels)

# Category name of every word, one CSV row per word, first column only.
binder_c = []
# The csv module asks for newline='' on files handed to csv.reader.
with open('words/categories_2.csv', newline='') as csvfile:
    for row in csv.reader(csvfile):
        if row:  # a blank line parses to an empty row and holds no category
            binder_c.append(row[0])

# Turn the category column into integer labels: the label goes up by one each
# time the category name changes from one row to the next.
i = 0
# Start from whatever category the file opens with instead of a hard-coded
# name, so the first label is always 0.
count = binder_c[0] if binder_c else None
for elt in binder_c:
    if elt != count:
        i += 1
        count = elt
    binder_c_labels.append(i)

print(binder_c_labels)

# Category name of every word, one CSV row per word, first column only.
binder_d = []
# The csv module asks for newline='' on files handed to csv.reader.
with open('words/categories_3.csv', newline='') as csvfile:
    for row in csv.reader(csvfile):
        if row:  # a blank line parses to an empty row and holds no category
            binder_d.append(row[0])

# Turn the category column into integer labels: the label goes up by one each
# time the category name changes from one row to the next.
i = 0
# Start from whatever category the file opens with instead of a hard-coded
# name, so the first label is always 0.
count = binder_d[0] if binder_d else None
for elt in binder_d:
    if elt != count:
        i += 1
        count = elt
    binder_d_labels.append(i)

print(binder_d_labels)

# Load the embedding matrix; the with-block closes the file handle as soon as
# the matrix has been parsed.
with open("data/muse_finnish.csv") as file:
    M = np.loadtxt(file, delimiter=",")

values = [2, 3, 4]  # neighbourhood sizes k to evaluate
for k in values:
    # k-nearest-neighbour graph over the rows of M, expanded to a dense
    # adjacency matrix and handed to NetworkX.
    knn = kneighbors_graph(M, k, mode='connectivity', include_self=True)
    knnmatrix = knn.toarray()
    # from_numpy_matrix was removed in NetworkX 3.0; from_numpy_array is its
    # replacement and accepts the ndarray directly.
    G = nx.from_numpy_array(knnmatrix)

    # Communities found by greedy modularity maximisation, as sets of nodes.
    c = list(greedy_modularity_communities(G))

    # categories: every community as a sorted list of node indices.
    # emerging_labels: for every node, the index of the community it is in.
    # Sized from the graph rather than a fixed 500 so any matrix size works.
    categories = []
    emerging_labels = [0] * G.number_of_nodes()
    for i, members in enumerate(c):
        for x in members:
            emerging_labels[x] = i
        categories.append(sorted(members))

    # Modularity of the detected partition on the same graph.
    result = nx_comm.modularity(G, categories)
    print(result)



[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 

In [27]:
'''
Compute single-category modularities. TODO (Silvia): consider moving this into the single_category directory as its own .py file.
'''
import math
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import csv 
from sklearn.neighbors import kneighbors_graph

# each language, upload without the category words; LANGUAGE
# The with-block closes the file handle as soon as the matrix has been parsed.
with open("data/muse_finnish.csv") as file:
    M = np.loadtxt(file, delimiter=",")

# CATEGORIES D
# Category name of every word, one CSV row per word, first column only.
categories = []
# The csv module asks for newline='' on files handed to csv.reader.
with open('words/categories_3.csv', newline='') as csvfile:
    for row in csv.reader(csvfile):
        if row:  # a blank line parses to an empty row and holds no category
            categories.append(row[0])

# Run-length encode the category column: counters[n] is the length of the
# n-th run of identical, consecutive category names.
counters = []
i = 0
# Start from whatever category the file opens with instead of a hard-coded
# name, so a different category file cannot produce a spurious leading 0.
count = categories[0] if categories else None
for elt in categories:
    if elt == count:
        i += 1
    else:
        counters.append(i)
        i = 1
        count = elt
counters.append(i)  # for the last category

lst = [2, 3, 4]  # neighbourhood sizes k to evaluate
for k in lst:
    categories_modularity = []
    # k-nearest-neighbour graph over the rows of M, expanded to a dense 0/1
    # adjacency matrix (include_self=True is passed to the graph builder).
    knn = kneighbors_graph(M, k, mode='connectivity', include_self=True)
    knnmatrix = knn.toarray()

    # Categories are addressed purely by row offset, so the category file and
    # the matrix have to list the same words in the same order.
    n_words = knnmatrix.shape[0]
    if sum(counters) != n_words:
        raise ValueError(
            "category file lists %d words but the matrix has %d rows"
            % (sum(counters), n_words))

    # Number of entries equal to 1 in the whole adjacency matrix. This is the
    # normaliser the modularity formula calls 2m.
    m = int(np.count_nonzero(knnmatrix == 1))

    # For each category (a contiguous run of c rows starting at row t):
    #   ac  - a_c in the modularity paper notation: entries equal to 1 in the
    #         category's rows, divided by m
    #   ecc - ell in the modularity paper: entries equal to 1 in the category's
    #         own square block (edges inside the category), divided by m
    ac = []
    ecc = []
    t = 0
    for c in counters:
        rows = knnmatrix[t:t + c]
        ac.append(int(np.count_nonzero(rows == 1)) / m)
        ecc.append(int(np.count_nonzero(rows[:, t:t + c] == 1)) / m)
        t += c

    # Per-category term ecc - ac^2 (kept in categories_modularity), their sum
    # Q, and the normaliser Qmax = 1 - sum(ac^2).
    Q = 0
    Qmax = 0
    for within, share in zip(ecc, ac):
        term = within - share * share
        Q += term
        categories_modularity.append(term)  # currently not normalized
        Qmax += share * share
    Qmax = 1 - Qmax
    Qnorm = Q/Qmax

    print(categories_modularity)  # the whole array, UNNORMALIZED
    # Normalise each per-category term by Qmax, the same denominator that
    # turns Q into Qnorm, so the normalised terms add up to Qnorm.
    categories_modularity = [term / Qmax for term in categories_modularity]
    print(categories_modularity) # NORMALIZED

# CATEGORIES C
# Category name of every word, one CSV row per word, first column only.
categories = []
# The csv module asks for newline='' on files handed to csv.reader.
with open('words/categories_2.csv', newline='') as csvfile:
    for row in csv.reader(csvfile):
        if row:  # a blank line parses to an empty row and holds no category
            categories.append(row[0])

# Run-length encode the category column: counters[n] is the length of the
# n-th run of identical, consecutive category names.
counters = []
i = 0
# Start from whatever category the file opens with instead of a hard-coded
# name, so a different category file cannot produce a spurious leading 0.
count = categories[0] if categories else None
for elt in categories:
    if elt == count:
        i += 1
    else:
        counters.append(i)
        i = 1
        count = elt
counters.append(i)  # for the last category

for k in lst:
    categories_modularity = []
    # k-nearest-neighbour graph over the rows of M, expanded to a dense 0/1
    # adjacency matrix (include_self=True is passed to the graph builder).
    knn = kneighbors_graph(M, k, mode='connectivity', include_self=True)
    knnmatrix = knn.toarray()

    # Categories are addressed purely by row offset, so the category file and
    # the matrix have to list the same words in the same order.
    n_words = knnmatrix.shape[0]
    if sum(counters) != n_words:
        raise ValueError(
            "category file lists %d words but the matrix has %d rows"
            % (sum(counters), n_words))

    # Number of entries equal to 1 in the whole adjacency matrix. This is the
    # normaliser the modularity formula calls 2m.
    m = int(np.count_nonzero(knnmatrix == 1))

    # For each category (a contiguous run of c rows starting at row t):
    #   ac  - a_c in the modularity paper notation: entries equal to 1 in the
    #         category's rows, divided by m
    #   ecc - ell in the modularity paper: entries equal to 1 in the category's
    #         own square block (edges inside the category), divided by m
    ac = []
    ecc = []
    t = 0
    for c in counters:
        rows = knnmatrix[t:t + c]
        ac.append(int(np.count_nonzero(rows == 1)) / m)
        ecc.append(int(np.count_nonzero(rows[:, t:t + c] == 1)) / m)
        t += c

    # Per-category term ecc - ac^2 (kept in categories_modularity), their sum
    # Q, and the normaliser Qmax = 1 - sum(ac^2).
    Q = 0
    Qmax = 0
    for within, share in zip(ecc, ac):
        term = within - share * share
        Q += term
        categories_modularity.append(term)
        Qmax += share * share
    Qmax = 1 - Qmax
    Qnorm = Q/Qmax

    print(categories_modularity)  # the whole array, UNNORMALIZED
    # Normalise each per-category term by Qmax, the same denominator that
    # turns Q into Qnorm, so the normalised terms add up to Qnorm.
    categories_modularity = [term / Qmax for term in categories_modularity]
    print(categories_modularity) # NORMALIZED

# CATEGORIES B
# Category name of every word, one CSV row per word, first column only.
categories = []
# The csv module asks for newline='' on files handed to csv.reader.
with open('words/categories_1.csv', newline='') as csvfile:
    for row in csv.reader(csvfile):
        if row:  # a blank line parses to an empty row and holds no category
            categories.append(row[0])

# Run-length encode the category column: counters[n] is the length of the
# n-th run of identical, consecutive category names.
counters = []
i = 0
# Start from whatever category the file opens with instead of a hard-coded
# name, so a different category file cannot produce a spurious leading 0.
count = categories[0] if categories else None
for elt in categories:
    if elt == count:
        i += 1
    else:
        counters.append(i)
        i = 1
        count = elt
counters.append(i)  # for the last category

for k in lst:
    categories_modularity = []
    # k-nearest-neighbour graph over the rows of M, expanded to a dense 0/1
    # adjacency matrix (include_self=True is passed to the graph builder).
    knn = kneighbors_graph(M, k, mode='connectivity', include_self=True)
    knnmatrix = knn.toarray()

    # Categories are addressed purely by row offset, so the category file and
    # the matrix have to list the same words in the same order.
    n_words = knnmatrix.shape[0]
    if sum(counters) != n_words:
        raise ValueError(
            "category file lists %d words but the matrix has %d rows"
            % (sum(counters), n_words))

    # Number of entries equal to 1 in the whole adjacency matrix. This is the
    # normaliser the modularity formula calls 2m.
    m = int(np.count_nonzero(knnmatrix == 1))

    # For each category (a contiguous run of c rows starting at row t):
    #   ac  - a_c in the modularity paper notation: entries equal to 1 in the
    #         category's rows, divided by m
    #   ecc - ell in the modularity paper: entries equal to 1 in the category's
    #         own square block (edges inside the category), divided by m
    ac = []
    ecc = []
    t = 0
    for c in counters:
        rows = knnmatrix[t:t + c]
        ac.append(int(np.count_nonzero(rows == 1)) / m)
        ecc.append(int(np.count_nonzero(rows[:, t:t + c] == 1)) / m)
        t += c

    # Per-category term ecc - ac^2 (kept in categories_modularity), their sum
    # Q, and the normaliser Qmax = 1 - sum(ac^2).
    Q = 0
    Qmax = 0
    for within, share in zip(ecc, ac):
        term = within - share * share
        Q += term
        categories_modularity.append(term)
        Qmax += share * share
    Qmax = 1 - Qmax
    Qnorm = Q/Qmax

    print(categories_modularity)  # the whole array, UNNORMALIZED
    # Normalise each per-category term by Qmax, the same denominator that
    # turns Q into Qnorm, so the normalised terms add up to Qnorm.
    categories_modularity = [term / Qmax for term in categories_modularity]
    print(categories_modularity) # NORMALIZED

[0.047864, 0.021324, 0.070276, 0.008676, 0.037864, 0.017516, 0.009804, 0.010744, 0.020296, 0.026844, 0.031704, 0.049083999999999996, 0.030556, 0.0126, 0.043636, 0.004804, 0.012676, 0.001964, 0.040223999999999996, 0.010516, 0.0254, 0.019704000000000003, 0.015600000000000001, 0.013676, 0.015516, 0.0161, 0.010676, 0.003856, 0.003936, 0.018324, 0.042296]
[0.050064012986686966, 0.02230413281230388, 0.0735061544512037, 0.009074782230329604, 0.03960437463914248, 0.018321102529558937, 0.010254629435932623, 0.011237835440601805, 0.02122888199017631, 0.02807785318014844, 0.03316123741705507, 0.0513400888650874, 0.03196047093475696, 0.013179144317906065, 0.04564167789334516, 0.005024810262160375, 0.013258637569347401, 0.0020542729714577387, 0.042072849289162974, 0.01099935568627779, 0.026567481402763018, 0.020609671400001677, 0.016317035822169413, 0.014304601404101853, 0.01622917486005004, 0.016840017739546638, 0.011166709899838503, 0.004033236546813157, 0.004116913653593513, 0.019166241308040535