In [2]:
%%capture
import itertools
import pickle 
import networkx as nx
from collections import deque, defaultdict, Counter
from tqdm import tqdm
import community as community_louvain
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import regex as re
import math
import nltk
nltk.download('punkt')

In [3]:
# Set if you're starting over
new_df = False

In [4]:
with open("base_graph.pkl", 'rb') as f:
    G_directed = pickle.load(f)
f.close()
G = G_directed.to_undirected()

In [5]:
if new_df:
    # compute the best partition
    partition = community_louvain.best_partition(G)

    # compute modularity
    mod = community_louvain.modularity(partition, G)

    number_of_communities = len(set(partition.values()))
    print('Using the Louvain algortihm we identified', number_of_communities, 'communities')
    
    print('Pickling...')
    pickle.dump(partition, open( "partition.pkl", "wb" ) )
    pickle.dump(mod, open( "mod.pkl", "wb" ) )
    
else:
    with open("partition.pkl", 'rb') as f:
        partition = pickle.load(f)
    
    with open("mod.pkl", 'rb') as f:
        mod = pickle.load(f)  
        
    number_of_communities = len(set(partition.values()))
    

In [39]:
def bfs_shortest_paths(G, root):
    shortest_paths_dict = {root: [[root]]}
    queue = deque([(root, [root])])

    while queue:
        s, path = queue.popleft()

        for neighbor in G.neighbors(s):
            new_path = path + [neighbor]
            old_path = shortest_paths_dict.get(neighbor, [[None] * (len(new_path) + 1)])

            if len(new_path) == len(old_path[0]):
                shortest_paths_dict[neighbor].append(new_path)
            elif len(new_path) < len(old_path[0]):
                shortest_paths_dict[neighbor] = [new_path]
                queue.append((neighbor, new_path))

    return shortest_paths_dict

def edge_betweenness_centrality(G):
    edge_betweenness = defaultdict(float)

    for node in G.nodes():
        shortest_paths_dict = bfs_shortest_paths(G, node)

        for paths in shortest_paths_dict.values():
            for path in paths:
                for i in range(len(path) - 1):
                    edge = (path[i], path[i + 1])
                    edge_betweenness[edge] += 1.0

    return edge_betweenness

def girvan_newman_directed(G):
    G_copy = G.copy()
    communities = list(nx.weakly_connected_components(G_copy))
    results = {0: communities}
    
    step = 1
    
    while G_copy.number_of_edges() > 0:
        edge_betweenness = edge_betweenness_centrality(G_copy)
        max_betweenness = max(edge_betweenness.values())
        highest_betweenness_edges = [edge for edge, value in edge_betweenness.items() if value == max_betweenness]
        G_copy.remove_edges_from(highest_betweenness_edges)
        components = list(nx.weakly_connected_components(G_copy))
        results[step] = components
        step += 1
    
    return results

def modularity(G, clusters_list):
    Q = 0
    m = len(list(G.edges()))
    for aCommunity in clusters_list:
        print("aCommunity", aCommunity)
        for v in list(aCommunity):
            for w in list(aCommunity):
                if v != w:
                    avw = 1 if (v,w) in list(G.edges()) or (w,v) in list(G.edges()) else 0               
                    new_term = avw - (G.degree(v)*G.degree(w))/(2*m)
                    Q += new_term
    return Q/(2*m)

def compute_modularity_for_all_communities(G, all_communities):
    result = []
    t = tqdm(total=len(list(all_communities.values())))
    for aCommunityRepartition in list(all_communities.values()):
        t.update()
        aModularity = modularity(G, aCommunityRepartition)
        result.append(
            [aCommunityRepartition, aModularity]
        )
    t.close    
    return result

    
B = nx.DiGraph()
all_nodes = ['A','B','C','D','E','F','G','H']
B.add_nodes_from(all_nodes)
all_edges = [
    ('E','D'),('E','F'),
    ('F','G'),('D','G'),('D','B'),('B','A'),('B','C'),('A','H'),
    ('D','F'),('A','C')
]
B.add_edges_from(all_edges)

print('Finding communities...')
all_com = girvan_newman_directed(B)

print('Finding the modularity...')    
all_clusters_with_modularity = compute_modularity_for_all_communities(B, all_com)

print('Sorting')
all_clusters_with_modularity.sort(key= lambda x:x[1], reverse=True)

print('Finding the best and pickling')
best_cluster = all_clusters_with_modularity[0]
print(best_cluster)
#pickle.dump(best_cluster, open( "best_cluster.pkl", "wb" ) )
    
    

Finding communities...
Finding the modularity...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 5579.39it/s]

aCommunity {'C', 'H', 'A', 'G', 'D', 'B', 'E', 'F'}
aCommunity {'A', 'C', 'B', 'H'}
aCommunity {'G', 'D', 'E', 'F'}
aCommunity {'A', 'C', 'B'}
aCommunity {'D', 'F'}
aCommunity {'E'}
aCommunity {'G'}
aCommunity {'H'}
aCommunity {'A'}
aCommunity {'B'}
aCommunity {'C'}
aCommunity {'D'}
aCommunity {'E'}
aCommunity {'F'}
aCommunity {'G'}
aCommunity {'H'}
Sorting
Finding the best and pickling
[[{'A', 'C', 'B', 'H'}, {'G', 'D', 'E', 'F'}], 0.5349999999999999]





In [6]:
'''
community_dict[2] = [['27c5ea64-86cb-4e69-9d13-c8ba2654515d'],
 ['2ee9a087-6188-4ebd-95b9-6561cba0584c'],
 ['efe2dd1d-706c-4ab6-bd9b-90d35a81d04f']]
'''

community_dict = {new_list: [] for new_list in range(number_of_communities)}
for i, j in partition.items():  
    community_dict[j].append([i])
    
# Filter out communities with only one element
community_dict_bigger_than_one = {k: v for k, v in community_dict.items() if len(v) > 1}

In [7]:
community_size_bigger_than_one = np.zeros(len(community_dict_bigger_than_one))

for i,j in enumerate(community_dict_bigger_than_one):
    community_size_bigger_than_one[i] = (len(community_dict_bigger_than_one[j])) 

community_size = np.zeros(number_of_communities)

for i,j in enumerate(community_dict):
    community_size[i] = (len(community_dict[j]))
    
print('There is:', sum(community_size == 1), 'communities with only 1 member')
print('There is:', len(community_dict_bigger_than_one),'communities with more than 1 member')


There is: 9085 communities with only 1 member
There is: 1507 communities with more than 1 member


In [8]:
def clean_and_tokenize(text):
    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'\W', ' ', text)  # Remove non-alphanumeric characters
    tokens = nltk.tokenize.word_tokenize(text)# Tokenize text
    return tokens

In [9]:
## Tokenized words in each paper

if new_df:
    # {'01f02fae-97df-4207-a386-a1bc8ec0853b': "For stereoscopic optical see-through head-mounted display...
    abstracts = nx.get_node_attributes(G, 'abstract')
    
    # {'01f02fae-97df-4207-a386-a1bc8ec0853b': 'Modeling Physical Structure as Additional Constraints for Stereoscopic Optical See-Through Head-Mounted Display Calibration'
    titles = nx.get_node_attributes(G, 'title')

    # {'01f02fae-97df-4207-a386-a1bc8ec0853b': title + abstract}
    paper_dict = {key: titles.get(key, '') + ' ' + abstracts.get(key, '') for key in set(abstracts) | set(titles)}
    
    # paper_dict_clean['01f02fae-97df-4207-a386-a1bc8ec0853b'] = <Text: modeling physical structure as additional constraints for stereoscopic...>
    paper_dict_clean = {key:  nltk.Text(clean_and_tokenize(text)) for key, text in paper_dict.items()}
    
    print('Pickeling...')
    pickle.dump(paper_dict_clean, open( "paper_dict_clean.pkl", "wb" ) )
    
else:
    with open("paper_dict_clean.pkl", 'rb') as f:
        paper_dict_clean = pickle.load(f)
    


In [None]:
community_texts = defaultdict(str)

if new_df:
    for cluster, papers_ids in community_dict_bigger_than_one.items():
        community_texts[cluster] = ' '.join([paper_dict_clean.get(paper_id[0], '') for paper_id in paper_ids])

# Convert defaultdict to a regular dictionary if needed
community_texts  = dict(community_texts)

In [43]:
paper_dict_clean['01f02fae-97df-4207-a386-a1bc8ec0853b']

<Text: modeling physical structure as additional constraints for stereoscopic...>

In [10]:
# TF for each party
TF_papers = {}
word_set = set()

for cluster, papers in community_dict_bigger_than_one.items():
    overall_freq = Counter()
    for paper in papers:
        
        try:
            fd = nltk.FreqDist(paper_dict_clean[paper[0]])
            word_set.update(set(list(fd.keys())))
            overall_freq = overall_freq + Counter(fd)

        except:
            print('Breaked')
            continue
    
    TF_papers[cluster] = overall_freq
    break

In [35]:
# {0: ['for', 'stereoscopic', 'optical'
if new_df:        
    # Apply the function to each text in community_text
    community_text_clean = {i: clean_and_tokenize(text) for i, text in paper_dict.items()}
    print('Pickling...')
    pickle.dump(community_text_clean, open( "community_text_clean.pkl", "wb" ) )
    
else:
    with open("community_text_clean.pkl", 'rb') as f:
        community_text_clean = pickle.load(f)
    

Pickling...


In [15]:

'''
text_dict = {}

for i in community_text:
    no_punct = re.sub(r'[^\w\s]','',community_text[i])
    no_punct = re.sub(r'\n' , ' ', no_punct)
    no_punct = no_punct.lower()
    tokens = nltk.word_tokenize(no_punct)
    text_dict[i] = nltk.Text(tokens)
    
pickle.dump(text_dict, open( "text_dict.pkl", "wb"))

'''


In [41]:

if new_df:
    text_dict = {i: nltk.Text(nltk.word_tokenize(re.sub(r'[^\w\s]|[\n]', ' ', community_text_clean[i]).lower())) for i in community_text_clean}
    
    print('Pickling...')
    pickle.dump(text_dict, open("text_dict.pkl", "wb"))
else:
    with open("text_dict.pkl", 'rb') as f:
        text_dict = pickle.load(f)

Pickling...


In [44]:
if new_df:
    # TF for each community
    TF = {}
    word_set = set()

    overall_freq = Counter()
    for i in community_text_clean:
        try:
            fd = nltk.FreqDist(text_dict[i])
            word_set.update(set(list(fd.keys())))
            overall_freq = overall_freq + Counter(fd)
        except:
            continue
    TF[i] = overall_freq
    
    print('Pickling...')    
    pickle.dump(TF, open("TF.pkl", "wb"))
    
else :
    with open("TF.pkl", 'rb') as f:
        TF = pickle.load(f)
    
    

Pickling...


In [54]:
k = 0
while(k<10):
    for i in community_text_clean:
        print(i)
        k = k + 1
        break


0
0
0
0
0
0
0
0
0
0


In [55]:
TF[10394]

Counter({'the': 14736989,
         'of': 9062346,
         'and': 6593378,
         'a': 6127813,
         'to': 5620823,
         'in': 4871016,
         'is': 3619174,
         'for': 3020755,
         'we': 2477653,
         'that': 2349677,
         'this': 2165290,
         'on': 1893960,
         'are': 1775239,
         'with': 1765873,
         'by': 1436728,
         'as': 1413875,
         'an': 1412250,
         'be': 1104291,
         'based': 1047889,
         'can': 1020342,
         'paper': 1006924,
         'which': 921669,
         'data': 875914,
         'from': 862708,
         'it': 753295,
         'system': 752193,
         'model': 702705,
         'our': 701317,
         'results': 683729,
         'proposed': 651919,
         'using': 650031,
         'time': 623431,
         'algorithm': 621893,
         'method': 589865,
         'performance': 557850,
         'approach': 539591,
         'or': 538947,
         'problem': 530292,
         'have': 523986,
 

In [None]:
def idf(word):
    occ = sum(1 for i in TF if word in TF[i])
    return np.log(10 / max(1, occ))

In [None]:
def tf_idf(number, com):    
    total_words = len(TF[number])
    return {word: (TF[number][word] / total_words) * idf(word) for word in com}

In [None]:
tf_idf_all_communities = [tf_idf(i, TF.get(i)) for i in TF]

In [None]:
pickle.dump(tf_idf_all_communities, open("tf_idf_all_communities.pkl", "wb")) 

In [None]:
def idf(word):
    occ = 0
    for i in TF:
        com = TF.get(i)
        if word in com.keys():
            occ += 1
    if occ == 0:
        occ = 1
    return np.log(10/occ)

In [None]:
def tf_idf(number, com):    
    vec = {}
    # Go through every word 
    for word in com.keys():

        # calculate the term frequency by dividing
        # the occurance of a certain word with the
        # total number of words in the commnity
        tf = TF[number][word]/len(TF[number])
        
        # calculates the inverse document frequency,
        idfreq = idf(word)

        # Then we multiply the two measures 
        val = tf*idfreq

        # save the result in a dictionary with the word
        # as the key and the TF-IDF score as the value
        vec[word] = val
    return vec

In [None]:
TF_IDF_all_communities = []
for i in TF:
    v = tf_idf(i, TF.get(i))
    TF_IDF_all_communities.append(v)

In [None]:
TF_IDF_all_communities


In [None]:
TF = defaultdict(Counter)
word_set = set()

for i in community_text:
    # Join all sentences into a single string, then split and count
    overall_freq = Counter(' '.join(community_text[i]).split())
    TF[i] = overall_freq
    word_set |= set(TF[i].keys())

In [None]:
DF = Counter()
for word in word_set:
    for i in community_text:
        if word in TF[i]:
            DF[word] += 1

# Calculate Inverse Document Frequency (IDF)
N = len(community_text)
IDF = {word: math.log(N / DF[word]) for word in DF}

In [None]:
# Calculate TF-IDF
TF_IDF = {i: {word: TF[i][word] * IDF[word] for word in TF[i]} for i in community_text}