In [1]:
%%capture
import itertools
import pickle 
import networkx as nx
from collections import deque, defaultdict, Counter
from tqdm import tqdm
import community as community_louvain
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import regex as re
import math
import nltk
nltk.download('punkt')


In [2]:
# Set if you're starting over
new_df = False

In [3]:
with open("base_graph.pkl", 'rb') as f:
    G_directed = pickle.load(f)
f.close()
G = G_directed.to_undirected()

In [4]:
if new_df:
    # compute the best partition
    partition = community_louvain.best_partition(G)

    # compute modularity
    mod = community_louvain.modularity(partition, G)

    number_of_communities = len(set(partition.values()))
    print('Using the Louvain algortihm we identified', number_of_communities, 'communities')
    
    print('Pickling...')
    pickle.dump(partition, open( "partition.pkl", "wb" ) )
    pickle.dump(mod, open( "mod.pkl", "wb" ) )
    
else:
    with open("partition.pkl", 'rb') as f:
        partition = pickle.load(f)
    
    with open("mod.pkl", 'rb') as f:
        mod = pickle.load(f)  
        
    number_of_communities = len(set(partition.values()))
    

In [None]:
def bfs_shortest_paths(G, root):
    shortest_paths_dict = {root: [[root]]}
    queue = deque([(root, [root])])

    while queue:
        s, path = queue.popleft()

        for neighbor in G.neighbors(s):
            new_path = path + [neighbor]
            old_path = shortest_paths_dict.get(neighbor, [[None] * (len(new_path) + 1)])

            if len(new_path) == len(old_path[0]):
                shortest_paths_dict[neighbor].append(new_path)
            elif len(new_path) < len(old_path[0]):
                shortest_paths_dict[neighbor] = [new_path]
                queue.append((neighbor, new_path))

    return shortest_paths_dict

def edge_betweenness_centrality(G):
    edge_betweenness = defaultdict(float)

    for node in G.nodes():
        shortest_paths_dict = bfs_shortest_paths(G, node)

        for paths in shortest_paths_dict.values():
            for path in paths:
                for i in range(len(path) - 1):
                    edge = (path[i], path[i + 1])
                    edge_betweenness[edge] += 1.0

    return edge_betweenness

def girvan_newman_directed(G):
    G_copy = G.copy()
    communities = list(nx.weakly_connected_components(G_copy))
    results = {0: communities}
    
    step = 1
    
    while G_copy.number_of_edges() > 0:
        edge_betweenness = edge_betweenness_centrality(G_copy)
        max_betweenness = max(edge_betweenness.values())
        highest_betweenness_edges = [edge for edge, value in edge_betweenness.items() if value == max_betweenness]
        G_copy.remove_edges_from(highest_betweenness_edges)
        components = list(nx.weakly_connected_components(G_copy))
        results[step] = components
        step += 1
    
    return results

def modularity(G, clusters_list):
    Q = 0
    m = len(list(G.edges()))
    for aCommunity in clusters_list:
        print("aCommunity", aCommunity)
        for v in list(aCommunity):
            for w in list(aCommunity):
                if v != w:
                    avw = 1 if (v,w) in list(G.edges()) or (w,v) in list(G.edges()) else 0               
                    new_term = avw - (G.degree(v)*G.degree(w))/(2*m)
                    Q += new_term
    return Q/(2*m)

def compute_modularity_for_all_communities(G, all_communities):
    result = []
    t = tqdm(total=len(list(all_communities.values())))
    for aCommunityRepartition in list(all_communities.values()):
        t.update()
        aModularity = modularity(G, aCommunityRepartition)
        result.append(
            [aCommunityRepartition, aModularity]
        )
    t.close    
    return result


print('Finding communities...')
all_com = girvan_newman_directed(G)

print('Finding the modularity...')    
all_clusters_with_modularity = compute_modularity_for_all_communities(G, all_com)

print('Sorting')
all_clusters_with_modularity.sort(key= lambda x:x[1], reverse=True)

print('Finding the best and pickling')
best_cluster = all_clusters_with_modularity[0]
print(best_cluster)
#pickle.dump(best_cluster, open( "best_cluster.pkl", "wb" ) )
    

In [5]:
## Dict where the key is the cluster number and the values
'''
community_dict[2] = [['27c5ea64-86cb-4e69-9d13-c8ba2654515d'],
 ['2ee9a087-6188-4ebd-95b9-6561cba0584c'],
 ['efe2dd1d-706c-4ab6-bd9b-90d35a81d04f']]
'''

community_dict = {new_list: [] for new_list in range(number_of_communities)}
for i, j in partition.items():  
    community_dict[j].append([i])
    
# Filter out communities with only one element
community_dict_bigger_than_one = {k: v for k, v in community_dict.items() if len(v) > 1}

### Community sizes

In [6]:
community_size_bigger_than_one = np.zeros(len(community_dict_bigger_than_one))

for i,j in enumerate(community_dict_bigger_than_one):
    community_size_bigger_than_one[i] = (len(community_dict_bigger_than_one[j])) 

community_size = np.zeros(number_of_communities)

for i,j in enumerate(community_dict):
    community_size[i] = (len(community_dict[j]))
    
print('There is:', sum(community_size == 1), 'communities with only 1 member')
print('There is:', len(community_dict_bigger_than_one),'communities with more than 1 member')


There is: 9085 communities with only 1 member
There is: 1515 communities with more than 1 member


### Clean and tokenize abstracts 

In [7]:
def clean_and_tokenize(text):
    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'\W', ' ', text)  # Remove non-alphanumeric characters
    tokens = nltk.tokenize.word_tokenize(text)# Tokenize text
    return tokens

Creating a dict for each **paper** with their words

In [8]:
## Tokenized words in each paper
if new_df:
    # {'01f02fae-97df-4207-a386-a1bc8ec0853b': "For stereoscopic optical see-through head-mounted display...
    abstracts = nx.get_node_attributes(G, 'abstract')
    
    # {'01f02fae-97df-4207-a386-a1bc8ec0853b': 'Modeling Physical Structure as Additional Constraints for Stereoscopic Optical See-Through Head-Mounted Display Calibration'
    titles = nx.get_node_attributes(G, 'title')

    # {'01f02fae-97df-4207-a386-a1bc8ec0853b': title + abstract}
    paper_dict = {key: titles.get(key, '') + ' ' + abstracts.get(key, '') for key in set(abstracts) | set(titles)}
    
    # paper_dict_clean['345a2369-8198-46db-8ebb-b3f622b35381'] = ['scalable', 'feature', 'extraction', 'for', 'coarse',
    paper_dict_clean = {key:  clean_and_tokenize(text) for key, text in paper_dict.items()}
    
    print('Pickeling...')
    pickle.dump(paper_dict_clean, open( "paper_dict_clean.pkl", "wb" ) )
    pickle.dump(abstracts, open( "abstracts.pkl", "wb" ) )
    pickle.dump(titles, open( "titles.pkl", "wb" ) )
    
else:
    
    with open("paper_dict_clean.pkl", 'rb') as f:
        paper_dict_clean = pickle.load(f)
    
    with open("abstracts.pkl", 'rb') as f:
        abstracts = pickle.load(f)
    
    with open("titles.pkl", 'rb') as f:
        titles = pickle.load(f)
    


Creating a dict for each **cluster** with all words from the papers it contains

In [9]:

if new_df:
    community_texts_clean = {new_list: [] for new_list in range(len(community_dict_bigger_than_one))}
    
    for cluster_id, paper_ids in enumerate(community_dict_bigger_than_one.values()):
        for paper_id in paper_ids:  
            community_texts_clean[cluster_id].extend(paper_dict_clean[paper_id[0]])          
    
    print('Pickling...')
    pickle.dump(community_texts_clean, open( "community_texts_clean.pkl", "wb" ) )
    
else:
    with open("community_texts_clean.pkl", 'rb') as f:
        community_texts_clean = pickle.load(f)

Turn the dicts into nltk format

In [10]:
# Turn into nltk format
community_text_clean_text = { cluster_id: nltk.Text(text) for cluster_id, text in community_texts_clean.items() } 

# {'345a2369-8198-46db-8ebb-b3f622b35381': <Text: scalable feature extraction for coarse to fine jpeg...>,
paper_dict_clean_text = { paper_id: nltk.Text(text) for paper_id, text in paper_dict_clean.items() } 

# TF-IDF analysis

TF for each **community**, looking at the 'document' as a cluster 

In [11]:
# TF for each community
if new_df:
    
    # TF_clusters[100] = Counter({'the': 28, 'of': 23,'in': 14,'we': 13,
    TF_clusters = {}

    for cluster_, text in enumerate(community_text_clean_text.values()):
        overall_freq = Counter()           
        try:
            fd = nltk.FreqDist(text)
            overall_freq = overall_freq + Counter(fd)
        except:
            print('Breaked')
            continue
            
        TF_clusters[cluster_] = overall_freq

    pickle.dump(TF_clusters, open( "TF_clusters.pkl", "wb" ) )

else:
    with open("TF_clusters.pkl", 'rb') as f:
        TF_clusters = pickle.load(f)

TF for each **paper** inside their community, looking at the 'document' as the paper

In [12]:
if new_df:    
    TF_papers = {}

    for cluster_, paper_ids in enumerate(community_dict_bigger_than_one.values()): 
        TF_papers[cluster_] = {}
        
        for paper_id in paper_ids:         
            overall_freq = Counter()     
                    
            try:
                text = paper_dict_clean_text[paper_id[0]]
                fd = nltk.FreqDist(text)
                overall_freq = overall_freq + Counter(fd)
                
            except:
                print('Breaked')
                continue
            
            TF_papers[cluster_][paper_id[0]] = overall_freq
            
    pickle.dump(TF_papers, open( "TF_papers.pkl", "wb" ) )
    
else:
    with open("TF_papers.pkl", 'rb') as f:
        TF_papers = pickle.load(f)
       

### TF-IDF

TF-IDF for all communities

In [13]:
new_df = True

word_count = lambda word: sum(1 for i in TF_clusters if word in TF_clusters[i])

def tf_idf(cluster_id, counter):  
    # Total number of words in the cluster
    total_words = counter.total()
    
    return {word: (TF_clusters[cluster_id][word] / total_words) * np.log( N / word_count(word) ) for word in counter}

if new_df:

    # Number of clusters
    N = len(TF_clusters)
    
    tf_idf_all_communities = [tf_idf(i, TF_clusters[i]) for i in TF_clusters]
    
    print('Pickling...')
    pickle.dump(tf_idf_all_communities, open( "tf_idf_all_communities.pkl", "wb" ) )
    
else:
    with open("tf_idf_all_communities.pkl", 'rb') as f:
        tf_idf_all_communities = pickle.load(f)
      

Pickling...


including top n

In [103]:
def get_top_n_items(dict_, n):
    # Sort the dictionary by value in descending order and get the top n items
    top_n_items = sorted(dict_.items(), key=lambda x: x[1], reverse=True)[:n]
    
    # Convert the list of tuples back to a dictionary
    return dict(top_n_items)

def tf_idf(cluster_id, counter):  
    # Total number of words in the cluster
    total_words = counter.total()
    
    return {word: (TF_clusters[cluster_id][word] / total_words) * np.log( N / word_count(word) ) for word in counter}

if new_df:

    # Number of clusters
    N = len(TF_clusters)
    
    tf_idf_all_communities = {i: tf_idf(i, TF_clusters[i]) for i in TF_clusters}
    top_tf_idf_all_communities = {i: get_top_n_items(tf_idf_all_communities[i], 40) for i in tf_idf_all_communities}
    
    print('Pickling...')
    pickle.dump(tf_idf_all_communities, open( "tf_idf_all_communities.pkl", "wb" ) )
    pickle.dump(top_tf_idf_all_communities, open( "top_tf_idf_all_communities.pkl", "wb" ) )
    
else:
    with open("tf_idf_all_communities.pkl", 'rb') as f:
        tf_idf_all_communities = pickle.load(f)
    with open("top_tf_idf_all_communities.pkl", 'rb') as f:
        top_tf_idf_all_communities = pickle.load(f)

Pickling...


### TF-IDF for each paper

with table

In [15]:
if new_df:
    
    # Dict to store the word counts
    word_count_papers = defaultdict(lambda: defaultdict(int))

    for i in TF_papers:
        for paper in TF_papers[i].values():
            for word in paper:
                word_count_papers[i][word] += 1

    tf_idf_all_papers = {}
    
    for cluster_id, papers in tqdm(TF_papers.items(), desc="Processing clusters"):
        # Number of papers inside the current cluster 
        N = len(TF_papers[cluster_id])

        def tf_idf_papers(counter): 

            # Total words in the current paper
            total_words = counter.total()

            return {word: (counter[word] / total_words) * np.log( N / word_count_papers[cluster_id][word] ) for word in counter}

        tf_idf_all_papers[cluster_id] = {paper_id_: tf_idf_papers(counter_) for paper_id_, counter_ in papers.items()}
    
    print('Pickeling...')
    #pickle.dump(tf_idf_all_papers, open( "tf_idf_all_papers.pkl", "wb" ) )
    
else:
    with open("tf_idf_all_papers.pkl", 'rb') as f:
        tf_idf_all_papers = pickle.load(f)

Processing clusters:   0%|                                                                                                                                            | 1/1515 [00:11<5:00:00, 11.89s/it]

Processing clusters: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1515/1515 [02:48<00:00,  9.00it/s]

Pickeling...





including the top n

In [104]:
def get_top_n_items(dict_, n):
    # Sort the dictionary by value in descending order and get the top n items
    top_n_items = sorted(dict_.items(), key=lambda x: x[1], reverse=True)[:n]
    
    # Convert the list of tuples back to a dictionary
    return dict(top_n_items)

def tf_idf_papers(counter): 
    # Total words in the current paper
    total_words = counter.total()

    return {word: (counter[word] / total_words) * np.log( N / word_count_papers[cluster_id][word] ) for word in counter}

if new_df:
    # Dict to store the word counts
    word_count_papers = defaultdict(lambda: defaultdict(int))

    for i in TF_papers:
        for paper in TF_papers[i].values():
            for word in paper:
                word_count_papers[i][word] += 1

    tf_idf_all_papers = {}
    top_tf_idf_all_papers = {}

    for cluster_id, papers in tqdm(TF_papers.items(), desc="Processing clusters"):
        # Number of papers inside the current cluster 
        N = len(TF_papers[cluster_id])

        tf_idf_all_papers[cluster_id] = {paper_id_: tf_idf_papers(counter_) for paper_id_, counter_ in papers.items()}
        top_tf_idf_all_papers[cluster_id] = {paper_id_: get_top_n_items(tf_idf_all_papers[cluster_id][paper_id_], 40) for paper_id_ in tf_idf_all_papers[cluster_id]}

    print('Pickling...')
    pickle.dump(tf_idf_all_papers, open( "tf_idf_all_papers.pkl", "wb" ) )
    pickle.dump(top_tf_idf_all_papers, open( "top_tf_idf_all_papers.pkl", "wb" ) )
    
else:
    with open("tf_idf_all_papers.pkl", 'rb') as f:
        tf_idf_all_papers = pickle.load(f)
    with open("top_tf_idf_all_papers.pkl", 'rb') as f:
        top_tf_idf_all_papers = pickle.load(f)

Processing clusters: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1515/1515 [03:03<00:00,  8.25it/s]


Pickling...


OSError: [Errno 28] No space left on device

See which ones

In [16]:
community = tf_idf_all_papers[0]['01f02fae-97df-4207-a386-a1bc8ec0853b']
sorted(community, key=community.get, reverse=True)[:5]

['stereoscopic', 'calibration', 'unreliable', 'eyes', 'optical']

# User input!

TF-IDF for user-input. At cluster level.

In [108]:
text = input()
print('The user input: ', text)

def text_cluster(text):
    overall_freq = Counter() 
    text = nltk.Text(clean_and_tokenize(text))
           
    try:
        fd = nltk.FreqDist(text)
        overall_freq = overall_freq + Counter(fd)
        
    except:
        print('Breaked')
            
    return overall_freq

def text_idf(word):
    return max(1, sum(word in cluster for cluster in TF_clusters.values()))

def text_tf_idf(tf_words):
    tf_idf_scores = {}
    N = len(TF_clusters) 
    for word in tf_words:
        tf = TF_text[word]/TF_text.total()
        idfreq = np.log( N / text_idf(word) )
        tf_idf_scores[word] = tf*idfreq
            
    return tf_idf_scores

# TF for the input text
TF_text = text_cluster(text)

# TF-IDF for the input text on cluster level
cLevel_tf_idf = text_tf_idf(TF_text)

print('TF-IDF scores: ', cLevel_tf_idf)

cLevel_sorted = sorted(cLevel_tf_idf.items(), key=lambda x:x[1], reverse=True)[:40]
print(cLevel_sorted[:10])

The user input:  *Assessing the Impact of Climate Change on Coral Reef Ecosystems*    This project aims to explore and synthesize the current body of research concerning the effects of climate change on coral reef ecosystems across the globe. The focus will be on understanding how rising sea temperatures and ocean acidification are impacting coral biodiversity, altering reef structures, and affecting the adaptive mechanisms of various coral species. Additionally, this project will delve into the long-term ecological implications of these changes on marine life and ecosystem health.
TF-IDF scores:  {'assessing': 0.04134551328849299, 'the': 0.00033068826463505583, 'impact': 0.028962161782401373, 'of': 0.0003151499593436911, 'climate': 0.09040024269826924, 'change': 0.059401356681932575, 'on': 0.009270227250639671, 'coral': 0.2060684973518799, 'reef': 0.16728976386886465, 'ecosystems': 0.0941874973323094, 'this': 0.003262533652884385, 'project': 0.06593556729387924, 'aims': 0.036774573968

### Jaccard similarity

In [126]:
def jaccard_similarity(tfidf, keywords):
    keywords_40 = [x[0] for x in keywords]
    set_input_words = set(tfidf.keys())
    set_keywords = set(keywords_40)
    
    intersection = len(set_input_words & set_keywords)
    union = len(set_input_words | set_keywords)
    
    similarity = intersection / union if union != 0 else 0
    return similarity

In [119]:
top_tf_idf_all_communities[100].keys()

dict_keys(['flow', 'binding', 'heparin', 'growth', 'hollow', 'capture', 'fgf', 'fibers', 'factor', 'endothelial', 'pulsatile', 'heparan', 'receptor', 'circulation', 'sulfate', 'fibroblast', 'equations', 'bioreactor', 'predicted', 'rates', 'dissociation', 'pdes', 'odes', 'experimental', 'proteins', 'fgfr', 'biologicals', 'fiber', 'delivery', 'synthetic', 'model', 'hspg', 'reactions', 'was', 'proteoglycan', 'proteoglycans', 'quantitative', 'under', 'transport', 'cells'])

In [127]:
sim_cluster = {}
for i in range(len(tf_idf_all_communities)):
    sim_cluster[i] = jaccard_similarity(top_tf_idf_all_communities[i], cLevel_sorted)
    
sorted_similarity = sorted(sim_cluster.items(), key=lambda x:x[1], reverse=True)[:5]
best_cluster, best_cluster_similarity = sorted_similarity[0]
print('The cluster choosen was:', best_cluster, '\nWith a jaccard similarity of:', best_cluster_similarity)

The cluster choosen was: 585 
With a jaccard similarity of: 0.05263157894736842


Words from the located cluster

In [93]:
#[x[0] for x in tf_idf_all_communities[best_cluster]]

TF-IDF inside the cluster

In [132]:
def text_paper(text):
    overall_freq = Counter() 
    text = nltk.Text(clean_and_tokenize(text))
           
    try:
        fd = nltk.FreqDist(text)
        overall_freq = overall_freq + Counter(fd)
        
    except:
        print('Breaked')
            
    return overall_freq

def text_idf_paper(word):
    return max(1,sum(word in cluster for cluster in TF_papers[best_cluster].values()))

def text_tf_idf_paper(tf_words, best_cluster):
    N = len(TF_papers[best_cluster])
    tf_idf_scores = {} 
    for word in tf_words:
        tf = TF_paper[word]/TF_paper.total()
        idfreq = np.log(N / text_idf_paper(word))
        tf_idf_scores[word] = tf*idfreq
            
    return tf_idf_scores

TF_paper = text_paper(text)
dLevel_tf_idf = text_tf_idf_paper(TF_paper, best_cluster)
dLevel_sorted = sorted(dLevel_tf_idf.items(), key=lambda x:x[1], reverse=True)[:40]
print('TF-IDF scores for inside cluster: ', str(best_cluster), ', is ', dLevel_sorted)

TF-IDF scores for inside cluster:  585 , is  [('coral', 0.03300700859809263), ('reef', 0.024755256448569473), ('ecosystems', 0.016503504299046314), ('project', 0.016503504299046314), ('will', 0.016503504299046314), ('assessing', 0.008251752149523157), ('aims', 0.008251752149523157), ('explore', 0.008251752149523157), ('synthesize', 0.008251752149523157), ('current', 0.008251752149523157), ('body', 0.008251752149523157), ('concerning', 0.008251752149523157), ('effects', 0.008251752149523157), ('across', 0.008251752149523157), ('globe', 0.008251752149523157), ('focus', 0.008251752149523157), ('be', 0.008251752149523157), ('understanding', 0.008251752149523157), ('how', 0.008251752149523157), ('rising', 0.008251752149523157), ('sea', 0.008251752149523157), ('temperatures', 0.008251752149523157), ('ocean', 0.008251752149523157), ('acidification', 0.008251752149523157), ('impacting', 0.008251752149523157), ('biodiversity', 0.008251752149523157), ('altering', 0.008251752149523157), ('structu

In [102]:
for i in tf_idf_all_papers[best_cluster]:
    print(i)
    break

73d121f9-0014-434c-b689-e1024ebb95f8


In [131]:
#top_tf_idf_all_papers[best_cluster]

In [134]:
sim_paper = {}
for i in TF_papers[best_cluster]:
    sim_paper[i] = jaccard_similarity(top_tf_idf_all_papers[best_cluster][i], dLevel_sorted)
  
sorted_similarity_paper = sorted(sim_paper.items(), key=lambda x:x[1], reverse=True)[:10]
best_papers = sorted_similarity_paper[:10]
print('\n###   Papers with highest jaccard similarity   ###\n')
print('( Paper id, jaccard similarity score) ')
for paper in best_papers:
    print(paper)



###   Papers with highest jaccard similarity   ###

( Paper id, jaccard similarity score) 
('73d121f9-0014-434c-b689-e1024ebb95f8', 0.03896103896103896)
('f188c8be-987b-4fce-984a-95a5105e5a8e', 0.012658227848101266)


In [135]:
for b in best_papers:
    print('\nTitle: ',titles[b[0]])
    print('Abstract: ', abstracts[b[0]])


Title:  Systematically linking qualitative elements of scenarios across levels, scales, and sectors
Abstract:  New scenarios for climate change research connect climate model results based on Representative Concentration Pathways to nested interpretations of Shared Socioeconomic Pathways. Socioeconomic drivers of emissions and determinants of impacts are now decoupled from climate model outputs. To retain scenario credibility, more internally consistent linking across scales must be achieved. This paper addresses this need, demonstrating a modification to cross impact balances (CIB), a method for systematically deriving qualitative socioeconomic scenarios. Traditionally CIB is performed with one cross-impact matrix. This poses limitations, as more than a few dozen scenario elements with sufficiently varied outcomes can become computationally infeasible to comprehensively explore. Through this paper, we introduce the concept of 'linked CIB', which takes the structure of judgements for 

# Merged user input box

In [141]:
text = input()
print('The user input:\n', text)

# TF for the input text
TF_text = text_cluster(text)

# TF-IDF for the input text on cluster level
cLevel_tf_idf = text_tf_idf(TF_text)
cLevel_sorted = sorted(cLevel_tf_idf.items(), key=lambda x:x[1], reverse=True)[:40]
print('10 best TF-IDF scores:')
print(cLevel_sorted[:10])

# Similarity on cluster-level
sim_cluster = {}
for i in range(len(tf_idf_all_communities)):
    sim_cluster[i] = jaccard_similarity(top_tf_idf_all_communities[i], cLevel_sorted)
    
sorted_similarity = sorted(sim_cluster.items(), key=lambda x:x[1], reverse=True)[:5]
    
c1, c2, c3 = sorted_similarity[0][0], sorted_similarity[1][0], sorted_similarity[2][0]

print('## Best matching clusters: ##')
print(c1, c2, c3)

# Similarity on paper-level
TF_paper = text_paper(text)
dLevel_sorted = sorted(dLevel_tf_idf.items(), key=lambda x:x[1], reverse=True)[:40]

c1dLevel_tf_idf = text_tf_idf_paper(TF_paper, c1)
c1dLevel_sorted = sorted(c1dLevel_tf_idf.items(), key=lambda x:x[1], reverse=True)[:40]

c2dLevel_tf_idf = text_tf_idf_paper(TF_paper,c2)
c2dLevel_sorted = sorted(c2dLevel_tf_idf.items(), key=lambda x:x[1], reverse=True)[:40]

c3dLevel_tf_idf = text_tf_idf_paper(TF_paper, c3)
c3dLevel_sorted = sorted(c3dLevel_tf_idf.items(), key=lambda x:x[1], reverse=True)[:40]


#for i in TF_papers[best_cluster]:
#    sim_paper[i] = jaccard_similarity(top_tf_idf_all_papers[best_cluster][i], dLevel_sorted)
 
clusters = [c1,c2,c3]
sim_papers = {}
 
for i, cluster in enumerate(clusters):
    sim_papers[i] = {}
    for n in TF_papers[cluster]:
        sim_papers[i] = {jaccard_similarity(top_tf_idf_all_papers[cluster][n], dLevel_sorted)}

print(sim_papers)
sorted_similarity_papers = {}
for i, sim_paper in enumerate(sim_papers):
    sorted_similarity_papers[i] = sorted(sim_papers[i].items(), key=lambda x:x[1], reverse=True)[:10]
    
for i, cluster in enumerate(clusters):
    print('\n###   Cluster', i, '- Papers with highest jaccard similarity   ###\n')
    print('( Paper id, jaccard similarity score) ')
    for paper in sorted_similarity_papers[i]:
        print(paper)

for i, cluster in enumerate(clusters):
    print('\n###   Papers recommended   ###')
    for b in sorted_similarity_papers[i]:
        print('\nTitle: ',titles[b[0]])
        print('Abstract: ', abstracts[b[0]])
#c1sorted_similarity_paper = sorted(c1sim_paper.items(), key=lambda x:x[1], reverse=True)[:10]
#c2sorted_similarity_paper = sorted(c2sim_paper.items(), key=lambda x:x[1], reverse=True)[:10]
#c3sorted_similarity_paper = sorted(c3sim_paper.items(), key=lambda x:x[1], reverse=True)[:10]

'''
c1sim_paper = {}
for i in TF_papers[c1]:
    c1sim_paper[i] = jaccard_similarity(top_tf_idf_all_papers[c1][i], dLevel_tf_idf)
    
c2sim_paper = {}
for i in TF_papers[c2]:
    c2sim_paper[i] = jaccard_similarity(top_tf_idf_all_papers[c2][i], dLevel_tf_idf)
    
c3sim_paper = {}
for i in TF_papers[c3]:
    c3sim_paper[i] = jaccard_similarity(top_tf_idf_all_papers[c3][i], dLevel_tf_idf)


c1best_papers = c1sorted_similarity_paper[:5]
c2best_papers = c2sorted_similarity_paper[:5]
c3best_papers = c3sorted_similarity_paper[:5]

print('\n###   Cluster 1 - Papers with highest jaccard similarity   ###\n')
print('( Paper id, jaccard similarity score) ')
for paper in c1best_papers:
    print(paper)
    
print('\n###   Cluster 2 - Papers with highest jaccard similarity   ###\n')
print('( Paper id, jaccard similarity score) ')
for paper in c2best_papers:
    print(paper)
    
print('\n###   Cluster 3 - Papers with highest jaccard similarity   ###\n')
print('( Paper id, jaccard similarity score) ')
for paper in c3best_papers:
    print(paper)

print('\n###   Papers recommended   ###')
for b in c1best_papers:
    print('\nTitle: ',titles[b[0]])
    print('Abstract: ', abstracts[b[0]])

print('\n###   Papers recommended   ###')
for b in c2best_papers:
    print('\nTitle: ',titles[b[0]])
    print('Abstract: ', abstracts[b[0]])
    
print('\n###   Papers recommended   ###')
for b in c3best_papers:
    print('\nTitle: ',titles[b[0]])
    print('Abstract: ', abstracts[b[0]])
''' 

The user input:
 *Assessing the Impact of Climate Change on Coral Reef Ecosystems*    This project aims to explore and synthesize the current body of research concerning the effects of climate change on coral reef ecosystems across the globe. The focus will be on understanding how rising sea temperatures and ocean acidification are impacting coral biodiversity, altering reef structures, and affecting the adaptive mechanisms of various coral species. Additionally, this project will delve into the long-term ecological implications of these changes on marine life and ecosystem health.
10 best TF-IDF scores:
[('coral', 0.2060684973518799), ('reef', 0.16728976386886465), ('ecosystems', 0.0941874973323094), ('climate', 0.09040024269826924), ('project', 0.06593556729387924), ('acidification', 0.06401500677247804), ('change', 0.059401356681932575), ('biodiversity', 0.05093628905023865), ('delve', 0.04985329169064666), ('impacting', 0.04839374023716651)]
## Best matching clusters: ##
585 616 33

AttributeError: 'set' object has no attribute 'items'

# Junk

In [201]:
def term_freq(term, paper_:Counter):
    return paper_.get(term)/paper_.total()

def inv_doc_freq(term_, cluster_:dict):
    N = len(cluster_)
    docs =0
    for _, p_counter in cluster_.items():
        if term_ in p_counter:
            docs +=1
    docs_w_term = tuple((p_id,p_counter) for  p_id, p_counter in cluster_.items() if term_ in p_counter.keys())
    return np.log(N/len(docs_w_term))

tf_idf = lambda term, paper, cluster: term_freq(term, paper) * inv_doc_freq(term, cluster)

def paper_tfidf(paper_:Counter, cluster_: dict):
    terms_tfidf = Counter()
    for term in paper_:
        terms_tfidf[term] = tf_idf(term, paper_, cluster_)
    return terms_tfidf

In [133]:
cluster0_keys = TF_papers[0].keys()

print(list(cluster0_keys)[0])
print(list(cluster0_keys)[1])
print(list(cluster0_keys)[2])

cluster_dummy = {
    '01f02fae-97df-4207-a386-a1bc8ec0853b':TF_papers[0]['01f02fae-97df-4207-a386-a1bc8ec0853b'],
    '076bca9b-b30c-4c2c-8148-3b9b5d9aa939':TF_papers[0]['076bca9b-b30c-4c2c-8148-3b9b5d9aa939']
}

cluster = TF_papers[0]
TFIDFs = dict.fromkeys(cluster.keys())
for p_id, p_counter in cluster.items():
    TFIDFs[p_id] = paper_tfidf(p_counter, cluster)
    
c0_df = DataFrame(columns=list(TF_clusters[0].keys()))

01f02fae-97df-4207-a386-a1bc8ec0853b
076bca9b-b30c-4c2c-8148-3b9b5d9aa939


In [None]:
def idf_papers(word,papers):
    occ = sum(1 for paper in papers.values() if word in paper)
    return np.log(10 / occ)

def tf_idf_papers(cluster, paper_id, counter): 
    print(paper_id)
    total_words = len(counter)
    return {word: (counter[word] / total_words) * idf_papers(word, TF_papers[cluster]) for word in counter}

In [None]:
new_df = True

word_count_papers = lambda word: sum(1 for i in papers if word in papers[i])

def tf_idf_papers(paper_id, counter): 
    print(paper_id)
    
    # Total words in the current paper
    total_words = counter.total()
       
    return {word: (counter[word] / total_words) * np.log( N / word_count_papers(word) ) for word in counter}

if new_df:
    tf_idf_all_papers = {}
    for cluster_id, papers in tqdm(TF_papers.items(), desc="Processing clusters"):
        
        # Number of papers inside the currect cluster 
        N = len(TF_papers[cluster_id])
        
        tf_idf_all_papers[cluster_id] = [tf_idf_papers(paper_id_, counter_) for paper_id_, counter_ in papers.items()]
    
    pickle.dump(tf_idf_all_papers, open( "tf_idf_all_papers.pkl", "wb" ) )
    
else:
    with open("tf_idf_all_papers.pkl", 'rb') as f:
        tf_idf_all_papers = pickle.load(f) 

In [55]:
def tf_idf(cluster_id, counter):  
    # Total number of words in the cluster
    total_words = counter.total()
    
    # Calculate TF-IDF for each word and sort the results
    tf_idf_values = {word: (TF_clusters[cluster_id][word] / total_words) * np.log( N / word_count(word) ) for word in counter}
    sorted_tf_idf_values = sorted(tf_idf_values.items(), key=lambda x: x[1], reverse=True)
    
    # Return the top 40 items
    return sorted_tf_idf_values[:40]

if new_df:

    # Number of clusters
    N = len(TF_clusters)
    
    tf_idf_all_communities = {i: tf_idf(i, TF_clusters[i]) for i in TF_clusters}
    
    print('Pickling...')
    pickle.dump(tf_idf_all_communities, open( "tf_idf_all_communities.pkl", "wb" ) )
    
else:
    with open("tf_idf_all_communities.pkl", 'rb') as f:
        tf_idf_all_communities = pickle.load(f)

Pickling...
