In [None]:
%%capture
import itertools
import pickle 
import networkx as nx
from collections import deque, defaultdict, Counter
from tqdm import tqdm
import community as community_louvain
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import regex as re
import math
import nltk
nltk.download('punkt')

In [None]:
# Set if you're starting over.
# True  -> the Louvain cell below recomputes the partition and its modularity
#          and writes them to partition.pkl / mod.pkl.
# False -> that cell loads the previously pickled partition.pkl / mod.pkl.
# Note: later cells reassign `new_df` to control their own caches.
new_df = False

In [None]:
# Load the pickled base graph and derive an undirected copy from it.
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load a base_graph.pkl that comes from a trusted source.
with open("base_graph.pkl", 'rb') as f:
    G_directed = pickle.load(f)
# The `with` block has already closed the file, so no explicit f.close() is needed.
G = G_directed.to_undirected()

In [None]:
# Compute the Louvain partition of G (or load the cached one).
# Produces: partition (node -> community id), mod (its modularity) and
# number_of_communities (count of distinct community ids).
if new_df:
    # compute the best partition
    # NOTE(review): no random seed is passed here, so the partition may differ
    # between runs - confirm whether a fixed seed is wanted.
    partition = community_louvain.best_partition(G)

    # compute modularity
    mod = community_louvain.modularity(partition, G)

    number_of_communities = len(set(partition.values()))
    print('Using the Louvain algortihm we identified', number_of_communities, 'communities')

    print('Pickling...')
    # Context managers guarantee the cache files are flushed and closed;
    # a bare open(...) passed to pickle.dump leaves the handle dangling.
    with open("partition.pkl", "wb") as f:
        pickle.dump(partition, f)
    with open("mod.pkl", "wb") as f:
        pickle.dump(mod, f)

else:
    with open("partition.pkl", 'rb') as f:
        partition = pickle.load(f)

    with open("mod.pkl", 'rb') as f:
        mod = pickle.load(f)

    number_of_communities = len(set(partition.values()))
    

In [None]:
def bfs_shortest_paths(G, root):
    """Enumerate every shortest path from ``root`` to each reachable node.

    Parameters
    ----------
    G : object exposing ``neighbors(node)`` (e.g. a networkx graph)
        For a directed graph ``neighbors`` yields successors, so paths
        follow edge direction.
    root : hashable
        Start node of the breadth-first search.

    Returns
    -------
    dict
        ``{node: [path, ...]}`` where every ``path`` is a list of nodes that
        starts at ``root`` and ends at ``node``; all paths stored for a node
        have the same (minimal) length. ``root`` maps to ``[[root]]``.
        Unreachable nodes are absent.
    """
    shortest_paths_dict = {root: [[root]]}
    queue = deque([(root, [root])])

    while queue:
        s, path = queue.popleft()

        for neighbor in G.neighbors(s):
            new_path = path + [neighbor]
            known_paths = shortest_paths_dict.get(neighbor)

            if known_paths is None or len(new_path) < len(known_paths[0]):
                # First (or strictly shorter) way to reach this node.
                shortest_paths_dict[neighbor] = [new_path]
                queue.append((neighbor, new_path))
            elif len(new_path) == len(known_paths[0]):
                # Another path of the same minimal length. It must be
                # enqueued as well, otherwise nodes further downstream only
                # ever see the first path and lose the alternatives.
                known_paths.append(new_path)
                queue.append((neighbor, new_path))

    return shortest_paths_dict

def edge_betweenness_centrality(G):
    """Count, for every edge, how many enumerated shortest paths traverse it.

    For each node of ``G`` the shortest paths returned by
    ``bfs_shortest_paths`` are walked, and every consecutive node pair
    ``(u, v)`` on a path adds 1.0 to that pair's score.

    Returns a ``defaultdict(float)`` keyed by ``(u, v)`` tuples.
    """
    scores = defaultdict(float)

    for source in G.nodes():
        paths_by_target = bfs_shortest_paths(G, source)

        for path_group in paths_by_target.values():
            for route in path_group:
                # zip pairs each node with its successor on the path.
                for hop in zip(route, route[1:]):
                    scores[hop] += 1.0

    return scores

def girvan_newman_directed(G):
    """Run a Girvan-Newman style edge-removal procedure on a directed graph.

    Works on a copy of ``G``. At each step the edge(s) with the highest score
    from ``edge_betweenness_centrality`` are removed and the weakly connected
    components of the remaining graph are recorded.

    Returns
    -------
    dict
        ``{step: [set_of_nodes, ...]}``; step 0 holds the components of the
        untouched graph, each later step the components after one more round
        of edge removal.
    """
    G_copy = G.copy()
    results = {0: list(nx.weakly_connected_components(G_copy))}

    step = 1

    while G_copy.number_of_edges() > 0:
        edge_betweenness = edge_betweenness_centrality(G_copy)
        if not edge_betweenness:
            # No remaining edge lies on any shortest path (e.g. only
            # self-loops are left). Nothing more can be ranked or removed;
            # without this guard max() would raise ValueError on an empty
            # sequence.
            break
        max_betweenness = max(edge_betweenness.values())
        highest_betweenness_edges = [
            edge for edge, value in edge_betweenness.items() if value == max_betweenness
        ]
        G_copy.remove_edges_from(highest_betweenness_edges)
        results[step] = list(nx.weakly_connected_components(G_copy))
        step += 1

    return results

def modularity(G, clusters_list):
    """Modularity-style score of a partition of ``G``.

    For every ordered pair of distinct nodes ``(v, w)`` inside the same
    community, adds ``A_vw - deg(v) * deg(w) / (2m)`` where ``A_vw`` is 1 if
    an edge exists between them in either direction and ``m`` is the number
    of edges; the sum is divided by ``2m``.

    NOTE(review): pairs with ``v == w`` are skipped, so the diagonal term of
    the textbook modularity formula is not included - confirm this is intended.

    Parameters
    ----------
    G : graph exposing ``edges()`` and ``degree(node)``
    clusters_list : iterable of iterables of nodes (one per community)

    Returns
    -------
    float
        The score; ``0.0`` when the graph has no edges (the formula would
        otherwise divide by zero).
    """
    edges = list(G.edges())
    m = len(edges)
    if m == 0:
        return 0.0
    # Build the edge lookup once; the membership test below then costs O(1)
    # instead of rebuilding and scanning the edge list for every node pair.
    edge_set = set(edges)

    Q = 0
    for aCommunity in clusters_list:
        members = list(aCommunity)
        degree_of = {v: G.degree(v) for v in members}
        for v in members:
            for w in members:
                if v != w:
                    avw = 1 if (v, w) in edge_set or (w, v) in edge_set else 0
                    Q += avw - (degree_of[v] * degree_of[w]) / (2 * m)
    return Q / (2 * m)

def compute_modularity_for_all_communities(G, all_communities):
    """Score every recorded partition with ``modularity``.

    Parameters
    ----------
    G : graph passed through to ``modularity``
    all_communities : dict
        ``{step: partition}`` as returned by ``girvan_newman_directed``.

    Returns
    -------
    list
        ``[[partition, modularity_value], ...]`` in the iteration order of
        ``all_communities.values()``.
    """
    result = []
    # Wrapping the iterable lets tqdm close the progress bar when the loop
    # ends; the manual version referenced `t.close` without calling it.
    for aCommunityRepartition in tqdm(list(all_communities.values())):
        aModularity = modularity(G, aCommunityRepartition)
        result.append([aCommunityRepartition, aModularity])
    return result

    
# Toy directed graph (8 nodes, 10 edges) used to exercise the
# Girvan-Newman functions defined above.
all_nodes = list('ABCDEFGH')
all_edges = [
    ('E', 'D'),
    ('E', 'F'),
    ('F', 'G'),
    ('D', 'G'),
    ('D', 'B'),
    ('B', 'A'),
    ('B', 'C'),
    ('A', 'H'),
    ('D', 'F'),
    ('A', 'C'),
]

B = nx.DiGraph()
B.add_nodes_from(all_nodes)
B.add_edges_from(all_edges)

# Step 1: record the component structure after each round of edge removal.
print('Finding communities...')
all_com = girvan_newman_directed(B)

# Step 2: score every recorded partition against the original graph.
print('Finding the modularity...')
all_clusters_with_modularity = compute_modularity_for_all_communities(B, all_com)

# Step 3: order the [partition, modularity] pairs, best score first.
print('Sorting')
all_clusters_with_modularity = sorted(
    all_clusters_with_modularity,
    key=lambda entry: entry[1],
    reverse=True,
)

print('Finding the best and pickling')
best_cluster = all_clusters_with_modularity[0]
print(best_cluster)
#pickle.dump(best_cluster, open( "best_cluster.pkl", "wb" ) )
    
    

In [None]:
## Dict where the key is the cluster number and the value is the list of
## members, each member stored as a one-element list holding the node id, e.g.
##
## community_dict[2] = [['27c5ea64-86cb-4e69-9d13-c8ba2654515d'],
##                      ['2ee9a087-6188-4ebd-95b9-6561cba0584c'],
##                      ['efe2dd1d-706c-4ab6-bd9b-90d35a81d04f']]

# Start with one empty member list per community id 0 .. number_of_communities-1.
community_dict = {}
for cluster_id in range(number_of_communities):
    community_dict[cluster_id] = []

# Distribute every node of the partition into its community.
for node_id, cluster_id in partition.items():
    community_dict[cluster_id].append([node_id])

# Filter out communities with only one element
community_dict_bigger_than_one = {}
for cluster_id, members in community_dict.items():
    if len(members) > 1:
        community_dict_bigger_than_one[cluster_id] = members

### Community sizes

In [None]:
# Member count of every community that has more than one member.
community_size_bigger_than_one = np.array(
    [len(members) for members in community_dict_bigger_than_one.values()],
    dtype=float,
)

# Member count of every community, in community_dict iteration order.
community_size = np.zeros(number_of_communities)
for position, members in enumerate(community_dict.values()):
    community_size[position] = len(members)

singleton_count = sum(community_size == 1)
larger_count = len(community_dict_bigger_than_one)
print('There is:', singleton_count, 'communities with only 1 member')
print('There is:', larger_count, 'communities with more than 1 member')


### Clean and tokenize abstracts 

In [None]:
def clean_and_tokenize(text):
    """Normalize ``text`` and split it into word tokens.

    The text is lower-cased, every non-word character (anything the regex
    class ``W`` matches) is replaced by a single space, and the result is
    tokenized with ``nltk.tokenize.word_tokenize``.

    Returns the list of tokens.
    """
    normalized = re.sub(r'\W', ' ', text.lower())
    return nltk.tokenize.word_tokenize(normalized)

Create a dict that maps each **paper** to its list of tokenized words

In [None]:
## Tokenized words in each paper
# Builds (new_df = True) or loads (new_df = False) paper_dict_clean, a dict
# mapping each paper id to the token list of its title + abstract.
new_df = False
if new_df:
    # {'01f02fae-97df-4207-a386-a1bc8ec0853b': "For stereoscopic optical see-through head-mounted display...
    abstracts = nx.get_node_attributes(G, 'abstract')

    # {'01f02fae-97df-4207-a386-a1bc8ec0853b': 'Modeling Physical Structure as Additional Constraints for Stereoscopic Optical See-Through Head-Mounted Display Calibration'
    titles = nx.get_node_attributes(G, 'title')

    # {'01f02fae-97df-4207-a386-a1bc8ec0853b': title + abstract}
    # A paper missing either attribute contributes an empty string for it.
    paper_dict = {key: titles.get(key, '') + ' ' + abstracts.get(key, '') for key in set(abstracts) | set(titles)}

    # paper_dict_clean['345a2369-8198-46db-8ebb-b3f622b35381'] = ['scalable', 'feature', 'extraction', 'for', 'coarse',
    paper_dict_clean = {key: clean_and_tokenize(text) for key, text in paper_dict.items()}

    print('Pickeling...')
    # Use a context manager so the cache file is flushed and closed.
    with open("paper_dict_clean.pkl", "wb") as f:
        pickle.dump(paper_dict_clean, f)

else:
    with open("paper_dict_clean.pkl", 'rb') as f:
        paper_dict_clean = pickle.load(f)
    


Creating a dict for each **cluster** with all words from the papers it contains

In [None]:
# Build (or load) community_texts_clean: for every community with more than
# one paper, the concatenated token lists of all its papers. Keys are the
# positions 0..k-1 of the communities in community_dict_bigger_than_one,
# not the original community ids.
new_df = True

if new_df:
    community_texts_clean = {position: [] for position in range(len(community_dict_bigger_than_one))}

    for cluster_id, paper_ids in enumerate(community_dict_bigger_than_one.values()):
        for paper_id in paper_ids:
            # Each member is a one-element list; element 0 is the paper id.
            community_texts_clean[cluster_id].extend(paper_dict_clean[paper_id[0]])

    #print('Pickling...')
    #with open("community_text_clean.pkl", "wb") as f:
    #    pickle.dump(community_texts_clean, f)

else:
    with open("community_text_clean.pkl", 'rb') as f:
        # Bind the same name the branch above (and the following cells) use;
        # loading into `community_text_clean` left `community_texts_clean`
        # undefined and the next cell failed with a NameError.
        community_texts_clean = pickle.load(f)

Turn the dicts into nltk format

In [None]:
# Wrap every token list in an nltk.Text object.

# One nltk.Text per community.
community_text_clean_text = {}
for cluster_id, tokens in community_texts_clean.items():
    community_text_clean_text[cluster_id] = nltk.Text(tokens)

# One nltk.Text per paper, e.g.
# {'345a2369-8198-46db-8ebb-b3f622b35381': <Text: scalable feature extraction for coarse to fine jpeg...>,
paper_dict_clean_text = {}
for paper_id, tokens in paper_dict_clean.items():
    paper_dict_clean_text[paper_id] = nltk.Text(tokens)

In [None]:
# Spot check: display the nltk.Text stored for one hard-coded paper id
# (raises KeyError if that id is not a key of paper_dict_clean_text).
paper_dict_clean_text['345a2369-8198-46db-8ebb-b3f622b35381']

# TF-IDF analysis

TF-IDF for each **community**, treating each cluster as one 'document'

In [None]:
# TF for each community
# TF_clusters[100] = Counter({'the': 28, 'of': 23,'in': 14,'we': 13,
# word_set collects every distinct token seen across the communities.
TF_clusters = {}
word_set = set()

for cluster, text in community_text_clean_text.items():
    try:
        fd = nltk.FreqDist(text)
    except Exception as err:
        # Best effort: skip a community whose text cannot be counted, but
        # say which one and why. `except Exception` (rather than a bare
        # `except:`) lets KeyboardInterrupt / SystemExit propagate.
        print('Breaked', cluster, repr(err))
        continue

    word_set.update(fd)                # iterating a FreqDist yields its tokens
    TF_clusters[cluster] = Counter(fd)  # plain Counter copy of the counts


In [None]:
# Persist the per-community term counts; the context manager makes sure the
# file is flushed and closed (a bare open(...) argument is never closed).
with open("TF_clusters.pkl", "wb") as f:
    pickle.dump(TF_clusters, f)

In [None]:
# Spot check: display the nltk.Text stored for one hard-coded paper id
# (raises KeyError if that id is not a key of paper_dict_clean_text).
paper_dict_clean_text['01f02fae-97df-4207-a386-a1bc8ec0853b']

TF-IDF for each **paper** inside its community, treating each paper as one 'document'

In [None]:
# TF for each paper in the community
# TF_papers[cluster][paper_id] is a Counter of the paper's tokens; word_set
# is reset here and collects every distinct token seen in these papers.
TF_papers = {}
word_set = set()

for cluster, paper_ids in community_dict_bigger_than_one.items():
    TF_papers[cluster] = {}

    for paper_id in paper_ids:
        try:
            # Each member is a one-element list; element 0 is the paper id.
            text = paper_dict_clean_text[paper_id[0]]
            fd = nltk.FreqDist(text)
        except Exception as err:
            # Best effort: skip papers without usable text, but report which
            # one and why. `except Exception` (rather than a bare `except:`)
            # lets KeyboardInterrupt / SystemExit propagate.
            print('Breaked', paper_id, repr(err))
            continue

        word_set.update(fd)  # iterating a FreqDist yields its tokens
        TF_papers[cluster][paper_id[0]] = Counter(fd)

In [None]:
# Spot check: display the term counts of one hard-coded paper in community 0
# (raises KeyError if community 0 or that paper id is not present).
TF_papers[0]['01f02fae-97df-4207-a386-a1bc8ec0853b']

In [None]:
def idf(word, n_docs=None):
    """Inverse document frequency of ``word`` over the community 'documents'.

    Computes ``log(N / df)`` where ``df`` is the number of entries of the
    global ``TF_clusters`` whose counter contains ``word`` (at least 1) and
    ``N`` is the number of documents.

    Parameters
    ----------
    word : str
    n_docs : int, optional
        Document count ``N``. Defaults to ``len(TF_clusters)``; the previous
        hard-coded value of 10 made the result negative for any word present
        in more than 10 communities.
    """
    if n_docs is None:
        # max(1, ...) avoids log(0) when TF_clusters is empty.
        n_docs = max(1, len(TF_clusters))
    occ = sum(1 for i in TF_clusters if word in TF_clusters[i])
    return np.log(n_docs / max(1, occ))

def tf_idf(number, com):
    """TF-IDF score of every word in ``com`` for community ``number``.

    Parameters
    ----------
    number : key into the global ``TF_clusters``
    com : iterable of words (typically ``TF_clusters[number]`` itself)

    Returns
    -------
    dict
        ``{word: tf * idf(word)}`` where ``tf`` is the word's count in the
        community divided by the community's total token count.
    """
    counts = TF_clusters[number]
    # Total number of tokens in the document. len(counts) would be the number
    # of *distinct* words, which is not the term-frequency denominator.
    total_words = sum(counts.values())
    if total_words == 0:
        # Empty document: every term frequency is zero.
        return {word: 0.0 for word in com}
    return {word: (counts[word] / total_words) * idf(word) for word in com}


In [None]:
# One {word: tf-idf score} dict per community, in TF_clusters iteration
# order (so list positions follow that order, not necessarily the keys).
tf_idf_all_communities = []
for cluster_key, term_counts in TF_clusters.items():
    tf_idf_all_communities.append(tf_idf(cluster_key, term_counts))

In [None]:
# Up to 100 words of the community at list position 6, highest TF-IDF first.
community = tf_idf_all_communities[6]
ranked_words = sorted(community, key=lambda word: community[word], reverse=True)
ranked_words[:100]

In [None]:
# Persist the TF-IDF results; the context manager makes sure the file is
# flushed and closed (a bare open(...) argument is never closed).
with open("tf_idf_all_communities.pkl", "wb") as f:
    pickle.dump(tf_idf_all_communities, f)