In [1]:
%%capture
import itertools
import pickle 
import networkx as nx
from collections import deque, defaultdict, Counter
from tqdm import tqdm
import community as community_louvain
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import regex as re
import math
import nltk
nltk.download('punkt')

In [11]:
# Set if you're starting over
new_df = False

In [3]:
with open("base_graph.pkl", 'rb') as f:
    G_directed = pickle.load(f)
f.close()
G = G_directed.to_undirected()

In [4]:
if new_df:
    # compute the best partition
    partition = community_louvain.best_partition(G)

    # compute modularity
    mod = community_louvain.modularity(partition, G)

    number_of_communities = len(set(partition.values()))
    print('Using the Louvain algortihm we identified', number_of_communities, 'communities')
    
    print('Pickling...')
    pickle.dump(partition, open( "partition.pkl", "wb" ) )
    pickle.dump(mod, open( "mod.pkl", "wb" ) )
    
else:
    with open("partition.pkl", 'rb') as f:
        partition = pickle.load(f)
    
    with open("mod.pkl", 'rb') as f:
        mod = pickle.load(f)  
        
    number_of_communities = len(set(partition.values()))
    

In [None]:
def bfs_shortest_paths(G, root):
    shortest_paths_dict = {root: [[root]]}
    queue = deque([(root, [root])])

    while queue:
        s, path = queue.popleft()

        for neighbor in G.neighbors(s):
            new_path = path + [neighbor]
            old_path = shortest_paths_dict.get(neighbor, [[None] * (len(new_path) + 1)])

            if len(new_path) == len(old_path[0]):
                shortest_paths_dict[neighbor].append(new_path)
            elif len(new_path) < len(old_path[0]):
                shortest_paths_dict[neighbor] = [new_path]
                queue.append((neighbor, new_path))

    return shortest_paths_dict

def edge_betweenness_centrality(G):
    edge_betweenness = defaultdict(float)

    for node in G.nodes():
        shortest_paths_dict = bfs_shortest_paths(G, node)

        for paths in shortest_paths_dict.values():
            for path in paths:
                for i in range(len(path) - 1):
                    edge = (path[i], path[i + 1])
                    edge_betweenness[edge] += 1.0

    return edge_betweenness

def girvan_newman_directed(G):
    G_copy = G.copy()
    communities = list(nx.weakly_connected_components(G_copy))
    results = {0: communities}
    
    step = 1
    
    while G_copy.number_of_edges() > 0:
        edge_betweenness = edge_betweenness_centrality(G_copy)
        max_betweenness = max(edge_betweenness.values())
        highest_betweenness_edges = [edge for edge, value in edge_betweenness.items() if value == max_betweenness]
        G_copy.remove_edges_from(highest_betweenness_edges)
        components = list(nx.weakly_connected_components(G_copy))
        results[step] = components
        step += 1
    
    return results

def modularity(G, clusters_list):
    Q = 0
    m = len(list(G.edges()))
    for aCommunity in clusters_list:
        print("aCommunity", aCommunity)
        for v in list(aCommunity):
            for w in list(aCommunity):
                if v != w:
                    avw = 1 if (v,w) in list(G.edges()) or (w,v) in list(G.edges()) else 0               
                    new_term = avw - (G.degree(v)*G.degree(w))/(2*m)
                    Q += new_term
    return Q/(2*m)

def compute_modularity_for_all_communities(G, all_communities):
    result = []
    t = tqdm(total=len(list(all_communities.values())))
    for aCommunityRepartition in list(all_communities.values()):
        t.update()
        aModularity = modularity(G, aCommunityRepartition)
        result.append(
            [aCommunityRepartition, aModularity]
        )
    t.close    
    return result

    
B = nx.DiGraph()
all_nodes = ['A','B','C','D','E','F','G','H']
B.add_nodes_from(all_nodes)
all_edges = [
    ('E','D'),('E','F'),
    ('F','G'),('D','G'),('D','B'),('B','A'),('B','C'),('A','H'),
    ('D','F'),('A','C')
]
B.add_edges_from(all_edges)

print('Finding communities...')
all_com = girvan_newman_directed(B)

print('Finding the modularity...')    
all_clusters_with_modularity = compute_modularity_for_all_communities(B, all_com)

print('Sorting')
all_clusters_with_modularity.sort(key= lambda x:x[1], reverse=True)

print('Finding the best and pickling')
best_cluster = all_clusters_with_modularity[0]
print(best_cluster)
#pickle.dump(best_cluster, open( "best_cluster.pkl", "wb" ) )
    
    

In [5]:
## Dict where the key is the cluster number and the values
'''
community_dict[2] = [['27c5ea64-86cb-4e69-9d13-c8ba2654515d'],
 ['2ee9a087-6188-4ebd-95b9-6561cba0584c'],
 ['efe2dd1d-706c-4ab6-bd9b-90d35a81d04f']]
'''

community_dict = {new_list: [] for new_list in range(number_of_communities)}
for i, j in partition.items():  
    community_dict[j].append([i])
    
# Filter out communities with only one element
community_dict_bigger_than_one = {k: v for k, v in community_dict.items() if len(v) > 1}

### Community sizes

In [6]:
community_size_bigger_than_one = np.zeros(len(community_dict_bigger_than_one))

for i,j in enumerate(community_dict_bigger_than_one):
    community_size_bigger_than_one[i] = (len(community_dict_bigger_than_one[j])) 

community_size = np.zeros(number_of_communities)

for i,j in enumerate(community_dict):
    community_size[i] = (len(community_dict[j]))
    
print('There is:', sum(community_size == 1), 'communities with only 1 member')
print('There is:', len(community_dict_bigger_than_one),'communities with more than 1 member')


There is: 9085 communities with only 1 member
There is: 1507 communities with more than 1 member


### Clean and tokenize abstracts 

In [7]:
def clean_and_tokenize(text):
    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'\W', ' ', text)  # Remove non-alphanumeric characters
    tokens = nltk.tokenize.word_tokenize(text)# Tokenize text
    return tokens

Creating a dict for each **paper** with their words

In [8]:
## Tokenized words in each paper
if new_df:
    # {'01f02fae-97df-4207-a386-a1bc8ec0853b': "For stereoscopic optical see-through head-mounted display...
    abstracts = nx.get_node_attributes(G, 'abstract')
    
    # {'01f02fae-97df-4207-a386-a1bc8ec0853b': 'Modeling Physical Structure as Additional Constraints for Stereoscopic Optical See-Through Head-Mounted Display Calibration'
    titles = nx.get_node_attributes(G, 'title')

    # {'01f02fae-97df-4207-a386-a1bc8ec0853b': title + abstract}
    paper_dict = {key: titles.get(key, '') + ' ' + abstracts.get(key, '') for key in set(abstracts) | set(titles)}
    
    # paper_dict_clean['345a2369-8198-46db-8ebb-b3f622b35381'] = ['scalable', 'feature', 'extraction', 'for', 'coarse',
    paper_dict_clean = {key:  clean_and_tokenize(text) for key, text in paper_dict.items()}
    
    print('Pickeling...')
    pickle.dump(paper_dict_clean, open( "paper_dict_clean.pkl", "wb" ) )
    
else:
    with open("paper_dict_clean.pkl", 'rb') as f:
        paper_dict_clean = pickle.load(f)
    


Creating a dict for each **cluster** with all words from the papers it contains

In [9]:
# Making
if new_df:
    community_texts_clean = {new_list: [] for new_list in range(len(community_dict_bigger_than_one))}
    
    for cluster_id, paper_ids in enumerate(community_dict_bigger_than_one.values()):
        for paper_id in paper_ids:  
            community_texts_clean[cluster_id].extend(paper_dict_clean[paper_id[0]])          
    
    #print('Pickling...')
    #pickle.dump(community_text_clean, open( "community_text_clean.pkl", "wb" ) )
    
else:
    with open("community_text_clean.pkl", 'rb') as f:
        community_texts_clean = pickle.load(f)

Turn the dicts into nltk format

In [10]:
# Turn into nltk format
community_text_clean_text = { cluster_id: nltk.Text(text) for cluster_id, text in community_texts_clean.items() } 

# {'345a2369-8198-46db-8ebb-b3f622b35381': <Text: scalable feature extraction for coarse to fine jpeg...>,
paper_dict_clean_text = { paper_id: nltk.Text(text) for paper_id, text in paper_dict_clean.items() } 

# TF-IDF analysis

TF-IDF for each **community**, looking at the 'document' as a cluster 

In [12]:
# TF for each community
if new_df:
    # TF_clusters[100] = Counter({'the': 28, 'of': 23,'in': 14,'we': 13,
    TF_clusters = {}
    word_set = set()

    for cluster_, text in enumerate(community_text_clean_text.values()):
        overall_freq = Counter()           
        try:
            fd = nltk.FreqDist(text)
            word_set.update(set(list(fd.keys())))
            overall_freq = overall_freq + Counter(fd)
        except:
            print('Breaked')
            continue
            
        TF_clusters[cluster] = overall_freq

    pickle.dump(TF_clusters, open( "TF_clusters.pkl", "wb" ) )

else:
    with open("TF_clusters.pkl", 'rb') as f:
        TF_clusters = pickle.load(f)

TF-IDF for each **paper** inside their community, looking at the 'document' as the paper

In [14]:
if new_df:    
    TF_papers = {}
    word_set = set()

    for cluster_, paper_ids in enumerate(community_dict_bigger_than_one.values()): 
        TF_papers[cluster_] = {}
        
        for paper_id in paper_ids:         
            overall_freq = Counter()     
                    
            try:
                text = paper_dict_clean_text[paper_id[0]]
                fd = nltk.FreqDist(text)
                word_set.update(set(list(fd.keys())))
                overall_freq = overall_freq + Counter(fd)
                
            except:
                print('Breaked')
                continue
            
            TF_papers[cluster_][paper_id[0]] = overall_freq
            
    pickle.dump(TF_papers, open( "TF_papers.pkl", "wb" ) )
    
else:
    with open("TF_papers.pkl", 'rb') as f:
        TF_papers = pickle.load(f)
       

### TF-IDF

TF-IDF for all communities

In [None]:
def idf(word, TF):
    occ = sum(1 for i in TF if word in TF[i])
    return np.log(10 / max(1, occ))

def tf_idf(number, com, TF):    
    total_words = len(TF[number])
    return {word: (TF[number][word] / total_words) * idf(word, TF) for word in com}

if new_df:
    tf_idf_all_communities = [tf_idf(i, TF_clusters.get(i), TF_clusters) for i in TF_clusters]
    
    print('Pickling...')
    pickle.dump(tf_idf_all_communities, open( "tf_idf_all_communities.pkl", "wb" ) )
    
else:
    with open("tf_idf_all_communities.pkl", 'rb') as f:
        tf_idf_all_communities = pickle.load(f)

TF-IDF for each paper

In [None]:
def idf_papers(word,papers):
    occ = sum(1 for paper in papers.values() if word in paper)
    return np.log(10 / max(1, occ))

def tf_idf_papers(cluster, paper_id, counter): 
    print(paper_id)
    total_words = counter.total()
    return {word: (counter[word] / total_words) * idf_papers(word, TF_papers[cluster]) for word in counter}

if new_df:
    tf_idf_all_papers = {}
    for cluster_, papers in tqdm(TF_papers.items(), desc="Processing clusters"):
        tf_idf_all_papers[cluster_] = [tf_idf_papers(cluster_, paper_id, paper) for paper_id, paper in papers.items()]
    #for cluster in tqdm(TF_papers, desc="Processing clusters"):    
    #    tf_idf_all_papers[cluster] = [tf_idf_papers(cluster, i, TF_papers.get(cluster)[i]) for i in TF_papers[cluster]]
        
    pickle.dump(tf_idf_all_papers, open( "tf_idf_all_papers.pkl", "wb" ) )
    
else:
    with open("tf_idf_all_papers.pkl", 'rb') as f:
        tf_idf_all_papers = pickle.load(f) 

## User input!!!

TF-IDF for user-input. At cluster level.

In [31]:
text = input()
print(text)

def text_cluster(text):
    overall_freq = Counter() 
    text = nltk.Text(clean_and_tokenize(text))
           
    try:
        fd = nltk.FreqDist(text)
        overall_freq = overall_freq + Counter(fd)
        
    except:
        print('Breaked')
    
            
def text_idf(word):
    occ = sum(word in cluster for cluster in TF_clusters.values())
    return np.log(10 / max(1, occ))

def text_tf_idf(tf_words):
    tf_idf_scores = {} 
    for word in tf_words:
        tf = TF_text[word]/len(TF_text)
        idfreq = text_idf(word)
        tf_idf_scores[word] = tf*idfreq
            
    return tf_idf_scores

# TF for the input text
TF_text = text_cluster(text)

# TF-IDF for the input text on cluster level
cLevel_tf_idf = text_tf_idf(TF_text)

print('TF-IDF scores: ', cLevel_tf_idf)

the inverse document frequency for each word
Term frequency:  Counter({'the': 1, 'inverse': 1, 'document': 1, 'frequency': 1, 'for': 1, 'each': 1, 'word': 1})
TF-IDF scores:  {'the': -0.7159002483770593, 'inverse': -0.2629356619139267, 'document': -0.1944252218765144, 'frequency': -0.4243449236528144, 'for': -0.7053353569037748, 'each': -0.502499691051188, 'word': -0.2270336007309401}


In [26]:
def idf_papers(word,papers):
    occ = sum(1 for paper in papers.values() if word in paper)
    return np.log(10 / max(1, occ))

def tf_idf_papers(cluster, paper_id, counter): 
    print(paper_id)
    total_words = len(counter)
    return {word: (counter[word] / total_words) * idf_papers(word, TF_papers[cluster]) for word in counter}

### Jaccard similarity

In [None]:
def jaccard_similarity(tf, keywords):
    set_input_words = set(tf.keys())
    set_keywords = set(keywords)
    
    intersection = len(set_input_words & set_keywords)
    union = len(set_input_words | set_keywords)
    
    similarity = intersection / union if union != 0 else 0
    return similarity

In [None]:
sim_cluster = {}
for i in TF_clusters:
    sim_cluster[i] = jaccard_similarity(TF_clusters[i], cLevel_tf_idf)
    
sorted_similarity = sorted(sim_cluster.items(), key=lambda x:x[1], reverse=True)[:5]
best_cluster, best_cluster_similarity = sorted_similarity[0]
print('The cluster choosen was:', best_cluster, '\nWith a jaccard similarity of:', best_cluster_similarity)

Words from the located cluster

In [None]:
community = tf_idf_all_communities[best_cluster]
sorted(community, key=community.get, reverse=True)[:20]

TF-IDF inside the cluster

In [None]:
def text_paper(text):
    overall_freq = Counter() 
    text = nltk.Text(clean_and_tokenize(text))
    
    try:
        fd = nltk.FreqDist(text)
        overall_freq = overall_freq + Counter(fd)
        
    except:
        print('Breaked')
            
    return overall_freq

def text_idf_paper(word):
    occ = sum(word in cluster for cluster in TF_papers[best_cluster].values())
    return np.log(10 / max(1, occ))

def text_tf_idf_paper(tf_words):
    tf_idf_scores = {} 
    for word in tf_words:
        tf = TF_papers[best_cluster][word]/len(TF_papers[best_cluster])
        idfreq = text_idf_paper(word)
        tf_idf_scores[word] = tf*idfreq
            
    return tf_idf_scores

TF_paper = text_paper(text)
dLevel_tf_idf = text_tf_idf(TF_paper)
print('TF-IDF scores for inside cluster: ', str(best_cluster), ', is ', dLevel_tf_idf)

In [None]:
sim_paper = {}
for i in TF_papers.get(best_cluster):
    sim_paper[i] = jaccard_similarity(TF_papers.get(best_cluster)[i], dLevel_tf_idf)
    
sorted_similarity_paper = sorted(sim_paper.items(), key=lambda x:x[1], reverse=True)[:5]
best_papers = sorted_similarity_paper[10]
print('The 10 best papers are ', best_papers.keys() , '\nWith a jaccard similarity of:', best_papers.values())

In [None]:
for b in best_papers.keys():
    print(titles[b])
    print(abstracts[b])

## Iony's work

In [None]:
def term_freq(term, paper_:Counter):
    return paper_.get(term)/paper_.total()

def inv_doc_freq(term_, cluster_:dict):
    docs =0 
    for _, p_counter in cluster_.items():
        if term_ in p_counter:
            docs +=1
    return np.log(len(cluster_)/docs)

tf_idf = lambda term, paper, cluster: term_freq(term, paper) * inv_doc_freq(term, cluster)

def paper_tfidf(paper_:Counter, cluster_: dict):
    terms_tfidf = Counter()
    for term in paper_:
        terms_tfidf[term] = tf_idf(term, paper_, cluster_)
    return terms_tfidf

In [None]:
cluster_dummy = {
    '01f02fae-97df-4207-a386-a1bc8ec0853b':TF_papers[0]['01f02fae-97df-4207-a386-a1bc8ec0853b'],
    '076bca9b-b30c-4c2c-8148-3b9b5d9aa939':TF_papers[0]['076bca9b-b30c-4c2c-8148-3b9b5d9aa939']
}

In [None]:
cluster = TF_papers[0]
TFIDFs = dict.fromkeys(cluster.keys())
for p_id, p_counter in cluster.items():
    TFIDFs[p_id] = paper_tfidf(p_counter, cluster)
    
c0_df = DataFrame(columns=list(TF_clusters[0].keys()))

# Things we don't think we need

In [34]:
if new_df:
    tf_idf_all_papers = {}
    for cluster in TF_papers:
        tf_idf_all_papers[cluster] = [tf_idf(i, TF_papers.get(cluster)[i], TF_papers.get(cluster)) for i in TF_papers.get(cluster)]
        
    #pickle.dump(tf_idf_all_papers, open( "tf_idf_all_papers.pkl", "wb" ) )
else:
    with open("tf_idf_all_papers.pkl", 'rb') as f:
        tf_idf_all_papers = pickle.load(f)
    

In [27]:
if new_df:
    tf_idf_all_communities = [tf_idf(i, TF_clusters.get(i), TF_clusters) for i in TF_clusters]
    
    print('Pickling...')
    pickle.dump(tf_idf_all_communities, open( "tf_idf_all_communities.pkl", "wb" ) )
    
else:
    with open("tf_idf_all_communities.pkl", 'rb') as f:
        tf_idf_all_communities = pickle.load(f)
      


Pickling...
