In [43]:
import os
import numpy as np
import pandas as pd
import netwulf as nw
import networkx.algorithms.community as nx_comm

from collections import Counter
from ast import literal_eval
from tqdm import tqdm
from IPython.display import clear_output

# Fixing the path:
if os.getcwd()[-1] in '0123456798':
    path_parent = os.path.dirname(os.getcwd())
    os.chdir(path_parent)

In [44]:
stylized_network, config, G = nw.load("BigData/Graph.json")

# Finding the louvain communities of our Graph:
set_seed = 42
communities = nx_comm.louvain_communities(G, seed=set_seed)


# Importing the paper_dataset from a few weeks back:
paper_dataset = pd.read_csv("BigData/paper_dataset.csv", converters={"authorIds": literal_eval, "authorNames": literal_eval})
# Converting the paper_dataset into an authorId lookup by exploding and then setting a new index:
author_lookup = paper_dataset.explode(["authorIds", "authorNames"]).set_index("authorIds").rename(columns={'authorIds': 'authorId'}).dropna()
# Removing the incorrectly exploded authorNames:
author_lookup = author_lookup[author_lookup.index.notnull()]

# Importing the tokenized paper abstracts:
paper_tokens_lookup = pd.read_csv("BigData/paper_tokens_dataset.csv", index_col=0, converters={"Tokens": literal_eval})

KeyboardInterrupt: 

# TF-IDF

### What does TF stand for?
TF stands for Term Frequency. It is simply the frequency of a specific term appearing in a document. Therefore the TF is the number of times a word appears in a document, divided by the total number of words in the document.

### What does IDF stand for?
IDF stands for Inverse Document Frequency. The purpose of IDF is to emphasize rarer words, and avoiding the common words like "the" and "and" etc. The IDF is the log of the number of documents divided by the number of documents that contain the word. The more documents that contain a word, the lower the word's IDF, and the less important that word is.

In [None]:
# Defining global counters to keep track of missing authors and papers:
missing_authors = 0
missing_papers = 0

# Get a list of all the papers authored by the authors in a community:
def get_papers_to_check(community: list) -> list:
    global missing_authors
    papers_to_check = []
    print("Getting papers from authorId...")
    for author_id in tqdm(community):
        try: papers_to_check.extend(author_lookup.loc[str(author_id)]["paperId"].tolist())
        except: missing_authors += 1
    return papers_to_check

# Get a list of all the tokens from each paper in a list:
def get_tokens_from_papers(papers_to_check: list) -> list:
    global missing_papers
    total_tokens = []
    print("Getting tokens from papers...")
    for paper_id in tqdm(papers_to_check):
        try: total_tokens.extend(paper_tokens_lookup.loc[paper_id]["Tokens"])
        except: missing_papers += 1
    return total_tokens

# Get the total tokens for each community in the list:
def generate_token_groups(communities):
    paper_groups = []
    token_groups = []
    for idx, community in enumerate(communities):
        print(f"Generating token groups... {idx + 1} of {len(communities)}\n")
        papers_to_check = get_papers_to_check(list(community))
        paper_groups.extend(papers_to_check)
        token_groups.extend(get_tokens_from_papers(papers_to_check))
        clear_output()
    print(f"All {len(communities)} communities successfully generated.")
    return token_groups, paper_groups

In [None]:
# It takes way too much time to genereate all the ~600 token groups, so I opted to just generate the top 10 and ignore the rest.
# Finding the top 10 communities by author count:
sorted_args = np.array([len(community) for community in communities]).argsort()[::-1]
top_communities = [communities[i] for i in sorted_args[:10]]

print("Top 10 communities by author count:", [(len(communities[i])) for i in sorted_args][:10])

Top 10 communities by author count: [2594, 2469, 1819, 1574, 1326, 1026, 1006, 1005, 981, 977]


In [None]:
# Generating token groups for each of the top communities:
top_communities_tokens, top_communities_papers = generate_token_groups(top_communities)

All 10 successfully generated.


In [None]:
# Reformatting the top_communities_tokens and saving it:

# Zips the all unique tokens with their # of occurences in the list:
zipped_unique = lambda lister: zip(*np.unique(lister, return_counts=True))
# Sorts the zipped list by the second element in descending order:
sorter = lambda zipper: sorted(zipper, key=lambda x: x[1], reverse=True)
# Making the sorted dicts for each top community:
top_communities_tokens_uniquecount = [dict(sorter(zipped_unique(i))) for i in top_communities_tokens]

# Making it into a dataframe:
tokens_df = pd.DataFrame({"Communities": top_communities, "PaperIds": top_communities_papers, "Tokens": top_communities_tokens_uniquecount})
# tokens_df.to_csv("BigData/top_community_tokens.csv")

In [None]:
tokens_df = pd.read_csv("BigData/top_community_tokens.csv", index_col=0, converters={"Communities": literal_eval, "PaperIds": literal_eval, "Tokens": literal_eval})
tokens_df

Unnamed: 0,Communities,PaperIds,Tokens
0,"{1409253380, 8085509, 12926984, 115777544, 308...","[19c03d3a03dd21e1dd74c3cd9ca57825d7440d88, b54...","{'patients': 5145, 'data': 4470, 'health': 401..."
1,"{118390786, 35053571, 1867785, 2079686669, 281...","[3528385c6eef96422b4cf7d3a7f87ef59ea12ac1, cc3...","{'data': 5550, 'model': 3909, 'results': 3103,..."
2,"{2252806, 49479696, 1679379, 2809876, 35332118...","[6f80d1ade43ae048763d65c6e8e913d9a31de4be, 8f9...","{'data': 7595, 'information': 3213, 'paper': 3..."
3,"{103301121, 2555924, 2117787677, 46702624, 170...","[34503c0b6a615124eaf82cb0e4a1dab2866e8980, 094...","{'models': 9180, 'model': 8473, 'tasks': 7899,..."
4,"{79101961, 11493385, 2301965, 88317978, 144179...","[c117553b2eac5d02eaac3c9bc33a44fe2e1c3ca7, 857...","{'patients': 341, 'usa': 290, 'ffr': 177, 'cor..."
5,"{103362561, 104681475, 144484357, 113463301, 2...","[dd491b812f8acfec86f855e1cac8ed72ca062b53, 050...","{'gamma-ray': 3693, 'emission': 3463, 'data': ..."
6,"{2090979328, 46567426, 66150403, 2174097413, 1...","[fbfc15492c8e114f2884d7cc11ba21f7f350285f, eca...","{'workshop': 1272, 'intelligence': 1268, 'rese..."
7,"{2153465857, 32350210, 13398019, 3948545, 5103...","[14692c6785ce842eae91eb7cf6fffb0c7bbb805f, 868...","{'model': 1617, 'data': 1364, 'models': 1229, ..."
8,"{1484736514, 3121155, 4771844, 104284168, 2068...","[c7293f9dd2ec3d34edf5331eb5a6ad7614723197, 2ff...","{'emotions': 83466, 'interventions': 56043, 'r..."
9,"{145199105, 1761281, 48494598, 51390475, 49647...","[b13799435551d4f2b45f46ebb59e481baf42b11c, e8f...","{'research': 10436, 'health': 10034, 'original..."


### Describe similarities and differences between the communities:
There are some generic similarities between the communities, all of them have the "data" token in their top most frequent, similarly "model", "models", "patients", "research", "paper", etc. are also words that come up a lot and are all clearly words generally related to the topic of scientific research. But in the top 3 there are also some differences. Community 5 mentions "gamma-ray" and "emission", while community 6 
mentions "workshop" and "intelligence", and community 8 focuses on "emotions" and "interventions".

In [None]:
# TF formula:
TF = lambda term_num, N: (term_num / N) * 100

communities_TFs = []
for token_group in tqdm(tokens_df['Tokens']):
    counts = Counter(token_group)
    TF_dict = dict([(token, TF(count, sum(counts.values()))) for token, count in counts.items()])
    communities_TFs.append(TF_dict)

# Just a little teaser:
print("first 5 in first community:")
list(communities_TFs[0].items())[:5]

100%|██████████| 10/10 [00:37<00:00,  3.73s/it]

first 5 in first community:





[('patients', 0.5563587643536777),
 ('data', 0.48336708973001735),
 ('health', 0.4341652942429574),
 ('results', 0.378475349900461),
 ('model', 0.3625793852046416)]

### Why aren't the TFs not necessarily a good description of the communities?
As mentionend when describing the similarities, there are a lot of generic words related to the topic of scientific research that are to be expected in an abstract. So looking purely at the frequency of words doesn't tell you much.

### Next, we calculate IDF for every word.

In [None]:
# IDF formula:
IDF = lambda term_occurences, N: np.log10(N / term_occurences)

communities_IDFs = []
for i in tqdm(range(len(tokens_df['Tokens']))):
    counts, papers = Counter(tokens_df['Tokens'][i]), tokens_df["PaperIds"][i]
    IDF_dict = dict([(token, IDF(count, len(papers))) for token, count in counts.items()])
    communities_IDFs.append(IDF_dict)
    
# Just a little teaser:
print("first 5 in first community:")
list(communities_IDFs[0].items())[:5]

100%|██████████| 10/10 [00:00<00:00, 18.82it/s]

first 5 in first community:





[('patients', 0.20131482909240864),
 ('data', 0.2623926850589239),
 ('health', 0.3090146585761606),
 ('results', 0.3686321638405847),
 ('model', 0.38726665476204025)]

### What base logarithm did you use to calculate the IDF? Is that important?
The base of the logarithm is important. Since how the IDF "dampens" depends on the logarithm base, i.e. a smaller base would dampen the frequent terms more, while a bigger base could be preffered if you want to give more weight to rare terms. I chose to go with np.log10 which is the logartithm with base 10, because it felt appropriate for this case, making generic terms less apparent.

### We're ready to calculate TF-IDF. Do that for the top 9 communities (by number of authors). Then for each community:

In [None]:
communities_TF_IDFs = []
for idx, token_group in enumerate([list(i) for i in tokens_df["Tokens"]]):
    group_TF_IDF = []
    for token in token_group:
        # An error in some cases causes negative IDF values, so I just set them to 0.
        group_TF_IDF.append(communities_TFs[idx][token] * max(0,communities_IDFs[idx][token]))
    communities_TF_IDFs.append(dict(sorter(zip(token_group, group_TF_IDF))))
    
# Just a little teaser:
print("first 5 in first community:")
list(communities_TF_IDFs[0].items())[:5]

first 5 in first community:


[('models', 0.14113037359939387),
 ('research', 0.14106781219015055),
 ('study', 0.14102358371643073),
 ('mortality', 0.1409867230188593),
 ('methods', 0.14058421302837504)]

In [46]:
# Since the last round of neighbors was foud without getting their names and such info, I can
# only display the author ids:
def print_community(community_idx):
    print(f"COMMUNITY {community_idx + 1}:\n")
    print("Top 10 TF terms:")
    print(list(communities_TFs[community_idx])[:10])
    print("Top 10 TF-IDF terms:")
    print(list(communities_TF_IDFs[community_idx])[:10])
    print("Top 3 authors:")
    print(list(tokens_df["Communities"][community_idx])[:3])
    print()

In [47]:
print_community(0)
print_community(1)
print_community(2)

COMMUNITY 1:

Top 10 TF terms:
['patients', 'data', 'health', 'results', 'model', 'study', 'models', 'research', 'mortality', 'methods']
Top 10 TF-IDF terms:
['models', 'research', 'study', 'mortality', 'methods', 'model', 'social', 'using', 'results', 'use']
Top 3 authors:
[1409253380, 8085509, 12926984]

COMMUNITY 2:

Top 10 TF terms:
['data', 'model', 'results', 'paper', 'children', 'social', 'study', 'systems', 'system', 'health']
Top 10 TF-IDF terms:
['results', 'model', 'paper', 'children', 'social', 'study', 'systems', 'system', 'health', 'different']
Top 3 authors:
[118390786, 35053571, 1867785]

COMMUNITY 3:

Top 10 TF terms:
['data', 'information', 'paper', 'model', 'results', 'system', 'social', 'performance', 'problem', 'using']
Top 10 TF-IDF terms:
['results', 'system', 'social', 'performance', 'problem', 'using', 'model', 'study', 'models', 'paper']
Top 3 authors:
[2252806, 49479696, 1679379]



In [48]:
print_community(4)
print_community(5)
print_community(6)

COMMUNITY 5:

Top 10 TF terms:
['patients', 'usa', 'ffr', 'coronary', 'lesions', 'p', 'uk', 'risk', 'clinical', 'group']
Top 10 TF-IDF terms:
['coronary', 'lesions', 'p', 'uk', 'ffr', 'risk', 'clinical', 'group', 'disease', 'bifurcation']
Top 3 authors:
[79101961, 11493385, 2301965]

COMMUNITY 6:

Top 10 TF terms:
['gamma-ray', 'emission', 'data', 'telescope', 'fermi', 'energy', 'large', 'γ-ray', 'gev', 'flux']
Top 10 TF-IDF terms:
['information', 'network', 'high', 'different', 'x-ray', 'paper', 'p.', 'j.', 'distribution', 'present']
Top 3 authors:
[103362561, 104681475, 144484357]

COMMUNITY 7:

Top 10 TF terms:
['workshop', 'intelligence', 'research', 'artificial', 'new', 'learning', 'model', 'systems', 'language', 'workshops']
Top 10 TF-IDF terms:
['data', 'pp', 'reviewed', 'tasks', 'models', 'ai', 'games', 'us', 'safety', 'analysis']
Top 3 authors:
[2090979328, 46567426, 66150403]



In [49]:
print_community(7)
print_community(8)

COMMUNITY 8:

Top 10 TF terms:
['model', 'data', 'models', 'results', 'new', 'social', 'tasks', 'language', 'using', 'cancer']
Top 10 TF-IDF terms:
['original', 'across', 'replication', 'scale', 'development', 'systems', 'research', 'present', 'big-bench', 'performance']
Top 3 authors:
[2153465857, 32350210, 13398019]

COMMUNITY 9:

Top 10 TF terms:
['emotions', 'interventions', 'reappraisal', 'one', 'positive', 'negative', 'effects', 'original', 'effect', 'pandemic']
Top 10 TF-IDF terms:
['low', 'lab', 'among', 'important', 'greater', 'engagement', 'direct', 'extremely', 'showed', 'future']
Top 3 authors:
[1484736514, 3121155, 4771844]



### Are these 10 words more descriptive of the community? If yes, what is it about IDF that makes the words more informative?
In some cases they are, for example the word "data" has in all cases gone a few places down the list, but community 6 it has unfortunately lost some of its uniqueness. But I'd still argue that the TF-IDF has helped dampen these most common/generic words, and the new top 10 is a little more informative than the TF top 10.
