In [1]:
%%capture
import itertools
import pickle 
import networkx as nx
from collections import deque, defaultdict, Counter
from tqdm import tqdm
import community as community_louvain
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import regex as re
import math
import nltk
nltk.download('punkt')


In [2]:
# Set to True to recompute the cached artefacts and re-pickle them;
# leave False to load the previously pickled results instead.
new_df = False

In [3]:
# Load the pickled directed base graph.
# NOTE(review): pickle.load can execute arbitrary code -- only load pickles you trust.
with open("base_graph.pkl", 'rb') as f:
    G_directed = pickle.load(f)
# The `with` block has already closed the file, so no explicit f.close() is needed.

# Undirected copy of the graph
G = G_directed.to_undirected()

In [4]:
if new_df:
    # Compute the best partition (node -> community label) with Louvain.
    # NOTE(review): no random_state is passed, so the partition presumably differs
    # between runs -- confirm against the python-louvain documentation.
    partition = community_louvain.best_partition(G)

    # Modularity of that partition on G
    mod = community_louvain.modularity(partition, G)

    print('Using the Louvain algortihm we identified', len(set(partition.values())), 'communities')

    print('Pickling...')
    # Context managers make sure the pickle files are flushed and closed.
    with open("partition.pkl", "wb") as f:
        pickle.dump(partition, f)
    with open("mod.pkl", "wb") as f:
        pickle.dump(mod, f)

else:
    # Re-use the cached results of an earlier run
    with open("partition.pkl", 'rb') as f:
        partition = pickle.load(f)

    with open("mod.pkl", 'rb') as f:
        mod = pickle.load(f)

# Number of distinct community labels, needed in both branches
number_of_communities = len(set(partition.values()))
    

In [None]:
def bfs_shortest_paths(G, root):
    """Enumerate every shortest path from ``root`` to each node reachable from it.

    Parameters
    ----------
    G : graph-like
        Any object exposing ``neighbors(node)``.
    root : hashable
        Start node of the breadth-first search.

    Returns
    -------
    dict
        Maps each reached node to a list of paths; every path is a list of
        nodes starting at ``root`` and ending at that node. ``root`` itself
        maps to ``[[root]]``.

    Notes
    -----
    Every shortest path is expanded, not only the first one found for a node.
    Otherwise nodes behind a junction that is reachable by several equally
    short paths would be missing some of their shortest paths.
    """
    shortest_paths_dict = {root: [[root]]}
    # The queue holds complete paths; FIFO order means paths are processed in
    # non-decreasing length, so the first path recorded for a node is a shortest one.
    queue = deque([[root]])

    while queue:
        path = queue.popleft()
        new_length = len(path) + 1

        for neighbor in G.neighbors(path[-1]):
            known_paths = shortest_paths_dict.get(neighbor)

            if known_paths is None:
                # First time this node is reached
                new_path = path + [neighbor]
                shortest_paths_dict[neighbor] = [new_path]
                queue.append(new_path)
            elif len(known_paths[0]) == new_length:
                # Another path of the same (shortest) length: record it and
                # expand it too, so nodes further down get all their paths.
                new_path = path + [neighbor]
                known_paths.append(new_path)
                queue.append(new_path)
            # Longer paths are ignored.

    return shortest_paths_dict

def edge_betweenness_centrality(G):
    """Count, for every edge, how many enumerated shortest paths run over it.

    Each node of ``G`` is used once as a BFS root; every path returned by
    ``bfs_shortest_paths`` adds 1.0 to each consecutive node pair on it.
    Pairs are keyed as ``(from_node, to_node)`` in path direction.

    Returns
    -------
    collections.defaultdict
        ``{(u, v): count}`` with float counts.
    """
    edge_betweenness = defaultdict(float)

    for source in G.nodes():
        for path_group in bfs_shortest_paths(G, source).values():
            for path in path_group:
                # zip pairs every node with its successor on the path
                for edge in zip(path, path[1:]):
                    edge_betweenness[edge] += 1.0

    return edge_betweenness

def girvan_newman_directed(G):
    """Repeatedly remove the highest-betweenness edges and record the components.

    Works on a copy of ``G``. Step 0 holds the weakly connected components of
    the untouched graph. Every further step removes all edges that share the
    current maximum of ``edge_betweenness_centrality`` and stores the weakly
    connected components that remain. The loop stops once no edges are left.

    Returns
    -------
    dict
        ``{step: [component, ...]}`` with integer steps starting at 0.
    """
    work_graph = G.copy()
    results = {0: list(nx.weakly_connected_components(work_graph))}

    step = 0
    while work_graph.number_of_edges() > 0:
        step += 1
        scores = edge_betweenness_centrality(work_graph)
        top_score = max(scores.values())
        # Remove every edge tied for the highest score in one go
        work_graph.remove_edges_from(
            [edge for edge, score in scores.items() if score == top_score]
        )
        results[step] = list(nx.weakly_connected_components(work_graph))

    return results

def modularity(G, clusters_list):
    """Modularity-style score of a partition of ``G``.

    For every ordered pair of distinct nodes ``(v, w)`` inside the same
    community, adds ``A_vw - deg(v) * deg(w) / (2m)``, where ``A_vw`` is 1 if an
    edge exists between them in either direction and ``m`` is the number of
    edges. The sum is divided by ``2m``. Pairs with ``v == w`` are skipped.

    Parameters
    ----------
    G : graph-like
        Must expose ``edges()`` and ``degree(node)``.
    clusters_list : iterable of iterables
        The communities, each an iterable of nodes.

    Returns
    -------
    float
        The score; ``0.0`` for a graph without edges, where it is undefined.
    """
    edges = list(G.edges())
    m = len(edges)
    if m == 0:
        # Avoid dividing by zero on an edgeless graph
        return 0.0

    # Build the lookup once; the edge list used to be rebuilt for every node pair.
    edge_set = set(edges)

    Q = 0
    for aCommunity in clusters_list:
        members = list(aCommunity)
        for v in members:
            for w in members:
                if v != w:
                    avw = 1 if (v, w) in edge_set or (w, v) in edge_set else 0
                    Q += avw - (G.degree(v) * G.degree(w)) / (2 * m)
    return Q / (2 * m)

def compute_modularity_for_all_communities(G, all_communities):
    """Score every partition in ``all_communities`` with ``modularity``.

    Parameters
    ----------
    G : graph
        Graph the partitions refer to.
    all_communities : dict
        Only the values are used; each value is one partition (a list of
        communities).

    Returns
    -------
    list
        One ``[partition, modularity]`` pair per partition, in input order.
    """
    partitions = list(all_communities.values())
    result = []
    # The context manager closes the progress bar; the previous `t.close`
    # lacked parentheses and therefore never ran.
    with tqdm(total=len(partitions)) as progress:
        for aCommunityRepartition in partitions:
            progress.update()
            aModularity = modularity(G, aCommunityRepartition)
            result.append([aCommunityRepartition, aModularity])
    return result

    
# Small directed toy graph for trying out the Girvan-Newman functions
B = nx.DiGraph()
all_nodes = list('ABCDEFGH')
B.add_nodes_from(all_nodes)
# Every two-letter token is one directed edge: first letter -> second letter
all_edges = [tuple(pair) for pair in 'ED EF FG DG DB BA BC AH DF AC'.split()]
B.add_edges_from(all_edges)

print('Finding communities...')
all_com = girvan_newman_directed(B)

print('Finding the modularity...')    
all_clusters_with_modularity = compute_modularity_for_all_communities(B, all_com)

print('Sorting')
# Highest modularity first
all_clusters_with_modularity.sort(key=lambda entry: entry[1], reverse=True)

print('Finding the best and pickling')
best_cluster = all_clusters_with_modularity[0]
print(best_cluster)
#pickle.dump(best_cluster, open( "best_cluster.pkl", "wb" ) )
    
    

In [5]:
## Dict where the key is the cluster number and the value is the list of papers
## in that cluster; every paper id is wrapped in its own one-element list, e.g.
##   community_dict[2] = [['27c5ea64-86cb-4e69-9d13-c8ba2654515d'],
##                        ['2ee9a087-6188-4ebd-95b9-6561cba0584c'],
##                        ['efe2dd1d-706c-4ab6-bd9b-90d35a81d04f']]
community_dict = {cluster_no: [] for cluster_no in range(number_of_communities)}
for paper_id, cluster_no in partition.items():
    community_dict[cluster_no].append([paper_id])

# Keep only the communities that hold more than one paper
community_dict_bigger_than_one = {
    cluster_no: members
    for cluster_no, members in community_dict.items()
    if len(members) > 1
}

### Community sizes

In [6]:
# Member count of every community with more than one paper (float array)
community_size_bigger_than_one = np.array(
    [len(members) for members in community_dict_bigger_than_one.values()],
    dtype=float,
)

# Member count of every community, singletons included (float array)
community_size = np.array(
    [len(members) for members in community_dict.values()],
    dtype=float,
)

print('There is:', sum(community_size == 1), 'communities with only 1 member')
print('There is:', len(community_dict_bigger_than_one),'communities with more than 1 member')


There is: 9085 communities with only 1 member
There is: 1507 communities with more than 1 member


### Clean and tokenize abstracts 

In [7]:
def clean_and_tokenize(text):
    """Lower-case ``text``, blank out every ``\\W`` character and tokenize it.

    Characters matched by ``\\W`` are replaced with a space before the text is
    passed to ``nltk.tokenize.word_tokenize``.

    Returns
    -------
    list
        The tokens produced by ``word_tokenize``.
    """
    normalized = re.sub(r'\W', ' ', text.lower())
    return nltk.tokenize.word_tokenize(normalized)

Create a dict that maps each **paper** to the list of cleaned tokens from its title and abstract

In [62]:
## Tokenized words in each paper
if new_df:
    # {paper_id: abstract text}
    abstracts = nx.get_node_attributes(G, 'abstract')

    # {paper_id: title text}
    titles = nx.get_node_attributes(G, 'title')

    # {paper_id: title + ' ' + abstract}; a missing title or abstract counts as ''
    paper_dict = {key: titles.get(key, '') + ' ' + abstracts.get(key, '') for key in set(abstracts) | set(titles)}

    # {paper_id: list of cleaned tokens}, e.g. ['scalable', 'feature', 'extraction', ...]
    paper_dict_clean = {key:  clean_and_tokenize(text) for key, text in paper_dict.items()}

    print('Pickeling...')
    # Context managers make sure every pickle file is flushed and closed.
    with open("paper_dict_clean.pkl", "wb") as f:
        pickle.dump(paper_dict_clean, f)
    with open("abstracts.pkl", "wb") as f:
        pickle.dump(abstracts, f)
    with open("titles.pkl", "wb") as f:
        pickle.dump(titles, f)

else:
    # Re-use the cached results of an earlier run
    with open("paper_dict_clean.pkl", 'rb') as f:
        paper_dict_clean = pickle.load(f)

    with open("abstracts.pkl", 'rb') as f:
        abstracts = pickle.load(f)

    with open("titles.pkl", 'rb') as f:
        titles = pickle.load(f)
    


Pickeling...


Creating a dict for each **cluster** with all words from the papers it contains

In [9]:

if new_df:
    # One token list per multi-paper community, keyed by its position (0, 1, 2, ...)
    # in community_dict_bigger_than_one
    community_texts_clean = {new_list: [] for new_list in range(len(community_dict_bigger_than_one))}

    for cluster_id, paper_ids in enumerate(community_dict_bigger_than_one.values()):
        for paper_id in paper_ids:
            # paper_id is a one-element list, so [0] is the actual id
            community_texts_clean[cluster_id].extend(paper_dict_clean[paper_id[0]])

    print('Pickling...')
    # Dump the dict built above; the previous code referenced the undefined
    # name `community_text_clean` and raised a NameError.
    with open("community_text_clean.pkl", "wb") as f:
        pickle.dump(community_texts_clean, f)

else:
    with open("community_text_clean.pkl", 'rb') as f:
        community_texts_clean = pickle.load(f)

Wrap the token lists of both dicts in `nltk.Text` objects

In [10]:
# Wrap each community's token list in an nltk.Text object
community_text_clean_text = {
    key: nltk.Text(tokens)
    for key, tokens in community_texts_clean.items()
}

# Same per paper, e.g.
# {'345a2369-8198-46db-8ebb-b3f622b35381': <Text: scalable feature extraction for coarse to fine jpeg...>,
paper_dict_clean_text = {
    key: nltk.Text(tokens)
    for key, tokens in paper_dict_clean.items()
}

# TF-IDF analysis

Term frequency (TF) for each **community**, treating each cluster as a single document

In [11]:
# TF for each community
if new_df:

    # TF_clusters[100] = Counter({'the': 28, 'of': 23,'in': 14,'we': 13,
    TF_clusters = {}

    for cluster_, text in enumerate(community_text_clean_text.values()):
        try:
            # FreqDist counts the tokens; store the counts as a plain Counter
            TF_clusters[cluster_] = Counter(nltk.FreqDist(text))
        except Exception as exc:
            # Best effort: skip the cluster, but say which one failed and why
            print('Breaked', cluster_, repr(exc))
            continue

    with open("TF_clusters.pkl", "wb") as f:
        pickle.dump(TF_clusters, f)

else:
    with open("TF_clusters.pkl", 'rb') as f:
        TF_clusters = pickle.load(f)

Term frequency (TF) for each **paper** inside its community, treating each paper as a single document

In [12]:
if new_df:
    # TF_papers[cluster][paper_id] = Counter of the paper's tokens
    TF_papers = {}

    for cluster_, paper_ids in enumerate(community_dict_bigger_than_one.values()):
        TF_papers[cluster_] = {}

        for paper_id in paper_ids:
            try:
                # paper_id is a one-element list, so [0] is the actual id
                text = paper_dict_clean_text[paper_id[0]]
                TF_papers[cluster_][paper_id[0]] = Counter(nltk.FreqDist(text))
            except Exception as exc:
                # Best effort: skip the paper, but say which one failed and why
                print('Breaked', paper_id[0], repr(exc))
                continue

    with open("TF_papers.pkl", "wb") as f:
        pickle.dump(TF_papers, f)

else:
    with open("TF_papers.pkl", 'rb') as f:
        TF_papers = pickle.load(f)
       

### TF-IDF

TF-IDF for all communities

In [13]:
# NOTE(review): this overrides the flag set at the top of the notebook, so this
# cell and every later `if new_df:` cell recompute instead of loading their cache.
new_df = True

# Document frequency per word: number of clusters whose vocabulary contains it.
# Built once; the previous lambda rescanned every cluster for every word.
cluster_doc_freq = Counter()
for cluster_counter in TF_clusters.values():
    cluster_doc_freq.update(cluster_counter.keys())

def word_count(word):
    """Return the number of clusters in ``TF_clusters`` that contain ``word``."""
    return cluster_doc_freq[word]

def tf_idf(cluster_id, counter):
    """TF-IDF score of every word in ``counter``, with clusters as documents.

    TF is ``TF_clusters[cluster_id][word]`` divided by ``counter.total()``;
    IDF is ``log(number of clusters / number of clusters containing word)``.

    Returns
    -------
    dict
        ``{word: score}`` for every word in ``counter``.
    """
    # Total number of words in the cluster
    total_words = counter.total()
    n_clusters = len(TF_clusters)

    return {word: (TF_clusters[cluster_id][word] / total_words) * np.log( n_clusters / word_count(word) ) for word in counter}

if new_df:

    # Number of clusters
    N = len(TF_clusters)

    # One {word: score} dict per cluster, in TF_clusters iteration order
    tf_idf_all_communities = [tf_idf(i, TF_clusters[i]) for i in TF_clusters]

    print('Pickling...')
    with open("tf_idf_all_communities.pkl", "wb") as f:
        pickle.dump(tf_idf_all_communities, f)

else:
    with open("tf_idf_all_communities.pkl", 'rb') as f:
        tf_idf_all_communities = pickle.load(f)
      

Pickling...


In [14]:
# Four highest-scoring TF-IDF terms of community 6
community = tf_idf_all_communities[6]
[word for word, _ in sorted(community.items(), key=lambda item: item[1], reverse=True)][:4]

['performance', 'memory', 'parallel', 'scheduling']

### TF-IDF for each paper

with table

In [15]:
if new_df:
    
    # Dict to store the word counts:
    # word_count_papers[cluster][word] = number of papers in that cluster containing the word
    word_count_papers = defaultdict(lambda: defaultdict(int))

    for i in TF_papers:
        for paper in TF_papers[i].values():
            # Iterating a paper's counter yields each distinct word once
            for word in paper:
                word_count_papers[i][word] += 1

    # tf_idf_all_papers[cluster][paper_id] = {word: tf-idf score}
    tf_idf_all_papers = {}
    
    for cluster_id, papers in tqdm(TF_papers.items(), desc="Processing clusters"):
        # Number of papers inside the current cluster 
        N = len(TF_papers[cluster_id])

        # Redefined on every iteration; reads N and cluster_id of the current iteration
        def tf_idf_papers(counter): 
            """Return {word: tf-idf} for one paper, with the papers of the current cluster as documents."""

            # Total words in the current paper
            total_words = counter.total()

            return {word: (counter[word] / total_words) * np.log( N / word_count_papers[cluster_id][word] ) for word in counter}

        tf_idf_all_papers[cluster_id] = {paper_id_: tf_idf_papers(counter_) for paper_id_, counter_ in papers.items()}
    
    print('Pickeling...')
    # NOTE(review): the dump below is commented out, so nothing is written here even
    # though the message above is printed; the else-branch depends on a file written elsewhere.
    #pickle.dump(tf_idf_all_papers, open( "tf_idf_all_papers.pkl", "wb" ) )
    
else:
    with open("tf_idf_all_papers.pkl", 'rb') as f:
        tf_idf_all_papers = pickle.load(f)

Processing clusters: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 1507/1507 [02:43<00:00,  9.20it/s]

Pickeling...





In [16]:
# Five highest-scoring TF-IDF terms of one paper in cluster 0
community = tf_idf_all_papers[0]['01f02fae-97df-4207-a386-a1bc8ec0853b']
[word for word, _ in sorted(community.items(), key=lambda item: item[1], reverse=True)][:5]

['stereoscopic', 'physical', 'unreliable', 'calibration', 'user']

In [None]:
# {0: {'01f02fae-97df-4207-a386-a1bc8ec0853b': Counter({'the': 12,'for': 4,'user': 4,
# TF_papers

# User input!

TF-IDF for user-input. At cluster level.

In [76]:
# Read the query text interactively; the cell blocks until a line is entered
text = input()
print('The user input: ', text)

def text_cluster(text):
    """Count how often each cleaned token occurs in ``text``.

    The text is normalized and tokenized with ``clean_and_tokenize``.

    Returns
    -------
    collections.Counter
        ``{token: number of occurrences}``.
    """
    # Counting tokens cannot fail once tokenization succeeded, so the former
    # bare `except:` (which would also have swallowed KeyboardInterrupt) is gone.
    return Counter(clean_and_tokenize(text))

def text_idf(word):
    """Number of clusters in ``TF_clusters`` that contain ``word``, at least 1.

    Despite the name this is a document frequency, not an inverse one; the
    floor of 1 keeps callers from dividing by zero for unseen words.
    """
    clusters_with_word = 0
    for cluster in TF_clusters.values():
        if word in cluster:
            clusters_with_word += 1
    return clusters_with_word if clusters_with_word > 1 else 1

def text_tf_idf(tf_words):
    """TF-IDF score of every word in ``tf_words``, with clusters as documents.

    Parameters
    ----------
    tf_words : collections.Counter
        Term counts of the text to score.

    Returns
    -------
    dict
        ``{word: tf * idf}`` where ``tf`` is the word's share of all counts in
        ``tf_words`` and ``idf`` is ``log(len(TF_clusters) / text_idf(word))``.
    """
    N = len(TF_clusters)
    # Use the counts passed in; the previous code read the global TF_text instead.
    total_words = tf_words.total()

    tf_idf_scores = {}
    for word, count in tf_words.items():
        tf = count / total_words
        idfreq = np.log( N / text_idf(word) )
        tf_idf_scores[word] = tf*idfreq

    return tf_idf_scores

# TF for the input text: Counter of its cleaned tokens
TF_text = text_cluster(text)

# TF-IDF for the input text on cluster level (the IDF part is taken over the clusters in TF_clusters)
cLevel_tf_idf = text_tf_idf(TF_text)

print('TF-IDF scores: ', cLevel_tf_idf)


The user input:  *Evaluating the Efficacy of Renewable Energy Sources in Reducing Carbon Emissions*    This study seeks to investigate the role of various renewable energy sources, such as solar, wind, and hydro, in mitigating carbon emissions, comparing their effectiveness to traditional fossil fuels. The research will cover energy output comparisons between renewable and non-renewable sources, analyze the lifecycle carbon footprint of these energy systems, and assess technological advancements and economic feasibility. Additionally, the impact of relevant policies and regulations on the adoption of renewable energy will be examined.
TF-IDF scores:  {'evaluating': 0.03673247808449794, 'the': 0.0002783279297465243, 'efficacy': 0.04219763656409952, 'of': 0.0003482578226355083, 'renewable': 0.21875163588006288, 'energy': 0.12321971903259707, 'sources': 0.10707142871780709, 'in': 0.0006732210257032231, 'reducing': 0.034888233547560286, 'carbon': 0.11880883232087265, 'emissions': 0.0868455

### Jaccard similarity

In [77]:
def jaccard_similarity(tf, keywords):
    """Jaccard similarity between the keys of ``tf`` and the items of ``keywords``.

    Parameters
    ----------
    tf : mapping
        Only its keys are used.
    keywords : iterable
        Converted to a set; a mapping contributes its keys.

    Returns
    -------
    float or int
        ``|intersection| / |union|``, or ``0`` when both sets are empty.
    """
    input_words = set(tf.keys())
    keyword_set = set(keywords)

    combined = input_words.union(keyword_set)
    if not combined:
        return 0

    shared = input_words.intersection(keyword_set)
    return len(shared) / len(combined)

In [78]:
# Jaccard similarity between the input vocabulary and every cluster vocabulary
sim_cluster = {
    cluster_id: jaccard_similarity(cluster_tf, cLevel_tf_idf)
    for cluster_id, cluster_tf in TF_clusters.items()
}

# Five most similar clusters, best first; the first one is selected
sorted_similarity = sorted(sim_cluster.items(), key=lambda pair: pair[1], reverse=True)[:5]
best_cluster, best_cluster_similarity = sorted_similarity[0]
print('The cluster choosen was:', best_cluster, '\nWith a jaccard similarity of:', best_cluster_similarity)

The cluster choosen was: 53 
With a jaccard similarity of: 0.12345679012345678


Words from the located cluster

In [79]:
# Twenty highest-scoring TF-IDF terms of the selected cluster
community = tf_idf_all_communities[best_cluster]
[word for word, _ in sorted(community.items(), key=lambda item: item[1], reverse=True)][:20]

['turbines',
 'hydro',
 'feedback',
 'control',
 'nonlinear',
 'hydraulic',
 'controller',
 'locally',
 'electric',
 'computed',
 'controlled',
 'law',
 'strategy',
 'power',
 'system',
 'minimum',
 'phase',
 'turbine',
 'pid',
 'proposed']

TF-IDF inside the cluster

In [80]:
def text_paper(text):
    """Count how often each cleaned token occurs in ``text``.

    Same computation as ``text_cluster``; this name is kept so existing calls
    keep working, but the duplicated body has been replaced by a delegation.

    Returns
    -------
    collections.Counter
        ``{token: number of occurrences}``.
    """
    return text_cluster(text)

def text_idf_paper(word, cluster_id=None):
    """Number of papers in a cluster that contain ``word``, at least 1.

    Parameters
    ----------
    word : str
        Word to look up.
    cluster_id : hashable, optional
        Key into ``TF_papers``. Defaults to the global ``best_cluster`` so
        existing one-argument calls keep working.
    """
    if cluster_id is None:
        cluster_id = best_cluster
    papers_with_word = sum(word in paper_tf for paper_tf in TF_papers[cluster_id].values())
    # Floor of 1 keeps callers from dividing by zero for unseen words
    return max(1, papers_with_word)

def text_tf_idf_paper(tf_words, best_cluster):
    """TF-IDF score of every word in ``tf_words``, with one cluster's papers as documents.

    Parameters
    ----------
    tf_words : collections.Counter
        Term counts of the text to score.
    best_cluster : hashable
        Key into ``TF_papers`` selecting the cluster.

    Returns
    -------
    dict
        ``{word: tf * idf}`` where ``tf`` is the word's share of all counts in
        ``tf_words`` and ``idf`` is ``log(papers in cluster / papers containing word)``.
    """
    N = len(TF_papers[best_cluster])
    # Use the counts passed in; the previous code read the global TF_paper instead.
    total_words = tf_words.total()

    tf_idf_scores = {}
    for word, count in tf_words.items():
        tf = count / total_words
        # Pass the cluster on so N and the document frequency refer to the same cluster
        idfreq = np.log(N / text_idf_paper(word, best_cluster))
        tf_idf_scores[word] = tf*idfreq

    return tf_idf_scores

# Term counts of the input text
TF_paper = text_paper(text)
# TF-IDF of the input text on paper level, inside the selected cluster
dLevel_tf_idf = text_tf_idf_paper(TF_paper, best_cluster)
print('TF-IDF scores for inside cluster: ', str(best_cluster), ', is ', dLevel_tf_idf)

TF-IDF scores for inside cluster:  53 , is  {'evaluating': 0.00805985093674355, 'the': 0.0, 'efficacy': 0.00805985093674355, 'of': 0.0, 'renewable': 0.04029925468371775, 'energy': 0.04029925468371775, 'sources': 0.02417955281023065, 'in': 0.0, 'reducing': 0.00805985093674355, 'carbon': 0.02417955281023065, 'emissions': 0.0161197018734871, 'this': 0.0, 'study': 0.00805985093674355, 'seeks': 0.00805985093674355, 'to': 0.0, 'investigate': 0.00805985093674355, 'role': 0.00805985093674355, 'various': 0.00805985093674355, 'such': 0.00805985093674355, 'as': 0.00805985093674355, 'solar': 0.00805985093674355, 'wind': 0.00805985093674355, 'and': 0.0, 'hydro': 0.00805985093674355, 'mitigating': 0.00805985093674355, 'comparing': 0.00805985093674355, 'their': 0.00805985093674355, 'effectiveness': 0.00805985093674355, 'traditional': 0.00805985093674355, 'fossil': 0.00805985093674355, 'fuels': 0.00805985093674355, 'research': 0.00805985093674355, 'will': 0.0161197018734871, 'cover': 0.008059850936743

In [81]:
# Number of entries (papers) stored for the selected cluster
len(tf_idf_all_papers[best_cluster])

2

In [82]:
# Jaccard similarity between the input vocabulary and every paper of the selected cluster
sim_paper = {
    paper_id: jaccard_similarity(paper_tf, dLevel_tf_idf)
    for paper_id, paper_tf in TF_papers[best_cluster].items()
}

# Ten most similar papers, best first
sorted_similarity_paper = sorted(sim_paper.items(), key=lambda pair: pair[1], reverse=True)[:10]
best_papers = sorted_similarity_paper[:10]
print('\n###   Papers with highest jaccard similarity   ###\n')
print('( Paper id, jaccard similarity score) ')
for paper in best_papers:
    print(paper)



###   Papers with highest jaccard similarity   ###

( Paper id, jaccard similarity score) 
('d0ebbecd-04e1-415f-96ed-7d3146197b59', 0.12)
('4f947c17-08d1-49cc-be5a-cb298730da5a', 0.09401709401709402)


In [83]:
# Show title and abstract of every recommended paper
for paper_id, _score in best_papers:
    print('\nTitle: ',titles[paper_id])
    print('Abstract: ', abstracts[paper_id])


Title:  Control of hydraulic turbine generators using exact feedback linearization
Abstract:  Feedback control of hydro turbines in power systems is important for the transient stability enhancement. However, the hydro-electric power system is hard to be controlled due to its inherent non-minimum phase property. While global control of such a non-minimum phase system remains a challenging problem, it is shown in this paper that locally, the hydro-electric power plant is feedback equivalent to a linear and controllable system, and hence can be locally controlled, in the new coordinates, by using a linear feedback design technique such as pole assignment. Simulation results in a single machine infinite bus system are provided to illustrate the effectiveness of the proposed nonlinear control scheme. Performance comparisons between a PID controller and the nonlinear controller are also given in the paper.

Title:  Hydraulic-turbine start-up with “S-shaped” characteristic
Abstract:  Fast r

In [90]:
# Read a new query and run both recommendation stages again
text = input()
print('The user input: ', text)

# --- Cluster level -----------------------------------------------------------
# Term counts and TF-IDF of the query (text_tf_idf reads the global TF_text)
TF_text = text_cluster(text)
cLevel_tf_idf = text_tf_idf(TF_text)

# Jaccard similarity of the query vocabulary against every cluster vocabulary
sim_cluster = {
    cluster_id: jaccard_similarity(cluster_tf, cLevel_tf_idf)
    for cluster_id, cluster_tf in TF_clusters.items()
}

sorted_similarity = sorted(sim_cluster.items(), key=lambda pair: pair[1], reverse=True)[:5]
best_cluster, best_cluster_similarity = sorted_similarity[0]
print('The cluster choosen was:', best_cluster, '\nWith a jaccard similarity of:', best_cluster_similarity, '\n')

# --- Paper level -------------------------------------------------------------
TF_paper = text_paper(text)
dLevel_tf_idf = text_tf_idf_paper(TF_paper, best_cluster)

# Jaccard similarity of the query vocabulary against every paper in the chosen cluster
sim_paper = {
    paper_id: jaccard_similarity(paper_tf, dLevel_tf_idf)
    for paper_id, paper_tf in TF_papers[best_cluster].items()
}

sorted_similarity_paper = sorted(sim_paper.items(), key=lambda pair: pair[1], reverse=True)[:10]
best_papers = sorted_similarity_paper[:10]

print('\n###   Papers with highest jaccard similarity   ###\n')
print('( Paper id, jaccard similarity score) ')
for paper in best_papers:
    print(paper)

print('\n###   Papers recommended   ###')
for paper_id, _score in best_papers:
    print('\nTitle: ',titles[paper_id])
    print('Abstract: ', abstracts[paper_id])


The user input:  birds butterflies frogs sun plants
The cluster choosen was: 1251 
With a jaccard similarity of: 0.014492753623188406 


###   Papers with highest jaccard similarity   ###

( Paper id, jaccard similarity score) 
('1bf1acfd-1fe0-4b1e-a652-dc40ac4c92ca', 0.02702702702702703)
('e5b2f2f2-d71c-41d0-bbb5-35fa37b32236', 0.0)

###   Papers recommended   ###

Title:  On perturbation bounds of generalized eigenvalues for diagonalizable pairs
Abstract:  The purpose of this paper is to study the perturbation of generalized eigenvalues. Two perturbation bounds of the diagonalizable pairs are given. These results extend the corresponding ones given by Sun (Math Numer Sinica 4:23–29, 1982).

Title:  On perturbations of matrix pencils with real spectra, a revisit
Abstract:  This paper continues earlier studies by Bhatia and Li on eigenvalue perturbation theory for diagonalizable matrix pencils having real spectra. A unifying framework for creating crucial perturbation equations is deve

# Iony's work work work

In [201]:
def term_freq(term, paper_:Counter):
    return paper_.get(term)/paper_.total()

def inv_doc_freq(term_, cluster_:dict):
    """Inverse document frequency of ``term_`` over the papers of one cluster.

    Parameters
    ----------
    term_ : hashable
        Word to look up.
    cluster_ : dict
        ``{paper_id: term counts}``; a paper counts when ``term_ in counts``.

    Returns
    -------
    numpy.float64
        ``log(N / df)`` with ``N = len(cluster_)`` and ``df`` the number of
        papers containing the term. ``df`` is floored at 1, as in
        ``text_idf``, so an unseen term does not divide by zero.
    """
    N = len(cluster_)
    docs_w_term = sum(1 for p_counter in cluster_.values() if term_ in p_counter)
    return np.log(N / max(1, docs_w_term))

def tf_idf(term, paper, cluster):
    """TF-IDF of one term: ``term_freq(term, paper) * inv_doc_freq(term, cluster)``."""
    return term_freq(term, paper) * inv_doc_freq(term, cluster)

def paper_tfidf(paper_:Counter, cluster_: dict):
    """TF-IDF score of every term of one paper.

    Parameters
    ----------
    paper_ : collections.Counter
        Term counts of the paper.
    cluster_ : dict
        ``{paper_id: term counts}`` of the cluster used as document collection.

    Returns
    -------
    collections.Counter
        ``{term: tf-idf score}`` for every term in ``paper_``.
    """
    return Counter({term: tf_idf(term, paper_, cluster_) for term in paper_})

In [133]:
# DataFrame was used below without ever being imported (NameError on a fresh kernel).
# NOTE(review): move this import to the notebook's import cell when tidying up.
from pandas import DataFrame

cluster0_keys = TF_papers[0].keys()

# Show up to the first three paper ids of cluster 0; islice does not fail
# when the cluster holds fewer than three papers.
for paper_id in itertools.islice(cluster0_keys, 3):
    print(paper_id)

# Two-paper subset of cluster 0
cluster_dummy = {
    '01f02fae-97df-4207-a386-a1bc8ec0853b':TF_papers[0]['01f02fae-97df-4207-a386-a1bc8ec0853b'],
    '076bca9b-b30c-4c2c-8148-3b9b5d9aa939':TF_papers[0]['076bca9b-b30c-4c2c-8148-3b9b5d9aa939']
}

# TF-IDF scores of every paper in cluster 0: {paper_id: Counter of scores}
cluster = TF_papers[0]
TFIDFs = {p_id: paper_tfidf(p_counter, cluster) for p_id, p_counter in cluster.items()}

# Empty frame with one column per word of cluster 0
c0_df = DataFrame(columns=list(TF_clusters[0].keys()))

01f02fae-97df-4207-a386-a1bc8ec0853b
076bca9b-b30c-4c2c-8148-3b9b5d9aa939


In [None]:
def idf_papers(word,papers):
    """Inverse document frequency of ``word`` over ``papers``.

    Parameters
    ----------
    word : hashable
        Word to look up.
    papers : dict
        ``{paper_id: term counts}``; a paper counts when ``word in counts``.

    Returns
    -------
    numpy.float64
        ``log(len(papers) / occ)`` where ``occ`` is the number of papers
        containing the word, floored at 1 to avoid dividing by zero.
    """
    occ = sum(1 for paper in papers.values() if word in paper)
    # Use the real number of papers instead of the former hard-coded 10
    return np.log(len(papers) / max(1, occ))

def tf_idf_papers(cluster, paper_id, counter): 
    """TF-IDF score of every word of one paper.

    Parameters
    ----------
    cluster : hashable
        Key into ``TF_papers``; its papers are the documents used for the IDF.
    paper_id : hashable
        Id of the paper (not used in the computation).
    counter : collections.Counter
        Term counts of the paper.

    Returns
    -------
    dict
        ``{word: (count / total count) * idf_papers(word, TF_papers[cluster])}``.
    """
    # Total number of tokens in the paper; len(counter) would only give the
    # number of distinct words.
    total_words = counter.total()
    return {word: (counter[word] / total_words) * idf_papers(word, TF_papers[cluster]) for word in counter}

In [None]:
# NOTE(review): forces recomputation regardless of the flag set at the top of the notebook.
new_df = True

def word_count_papers(word, papers_in_cluster=None):
    """Number of papers whose term counts contain ``word``.

    ``papers_in_cluster`` is a ``{paper_id: term counts}`` dict; when omitted,
    the global ``papers`` (the cluster currently being processed) is used.
    """
    if papers_in_cluster is None:
        papers_in_cluster = papers
    return sum(1 for counter_ in papers_in_cluster.values() if word in counter_)

def tf_idf_papers(paper_id, counter, n_papers=None, doc_freq=None): 
    """TF-IDF score of every word of one paper, with the cluster's papers as documents.

    Parameters
    ----------
    paper_id : hashable
        Id of the paper (not used in the computation).
    counter : collections.Counter
        Term counts of the paper.
    n_papers : int, optional
        Number of papers in the cluster; defaults to the global ``N``.
    doc_freq : mapping, optional
        ``{word: number of papers containing it}``; when omitted the value is
        computed per word with ``word_count_papers``.

    Returns
    -------
    dict
        ``{word: (count / total count) * log(n_papers / document frequency)}``.
    """
    if n_papers is None:
        n_papers = N

    # Total words in the current paper
    total_words = counter.total()

    if doc_freq is None:
        return {word: (counter[word] / total_words) * np.log( n_papers / word_count_papers(word) ) for word in counter}
    return {word: (counter[word] / total_words) * np.log( n_papers / doc_freq[word] ) for word in counter}

if new_df:
    tf_idf_all_papers = {}
    for cluster_id, papers in tqdm(TF_papers.items(), desc="Processing clusters"):

        # Number of papers inside the current cluster
        N = len(TF_papers[cluster_id])

        # Document frequency of every word in this cluster, built in one pass
        # instead of rescanning all papers for every word.
        doc_freq = Counter()
        for counter_ in papers.values():
            doc_freq.update(counter_.keys())

        # Keyed by paper id, because other cells index tf_idf_all_papers[cluster][paper_id]
        # and load this same pickle file.
        tf_idf_all_papers[cluster_id] = {
            paper_id_: tf_idf_papers(paper_id_, counter_, N, doc_freq)
            for paper_id_, counter_ in papers.items()
        }

    with open("tf_idf_all_papers.pkl", "wb") as f:
        pickle.dump(tf_idf_all_papers, f)

else:
    with open("tf_idf_all_papers.pkl", 'rb') as f:
        tf_idf_all_papers = pickle.load(f)