In [9]:
import os
import numpy as np
import pandas as pd
import netwulf as nw
import networkx.algorithms.community as nx_comm

from ast import literal_eval
from tqdm import tqdm
from IPython.display import clear_output

# Fixing the path:
if os.getcwd()[-1] in '0123456798':
    path_parent = os.path.dirname(os.getcwd())
    os.chdir(path_parent)

In [10]:
stylized_network, config, G = nw.load("BigData/Graph.json")

# Finding the louvain communities of our Graph:
set_seed = 42
communities = nx_comm.louvain_communities(G, seed=set_seed)


# Importing the paper_dataset from a few weeks back:
paper_dataset = pd.read_csv("BigData/paper_dataset.csv", converters={"authorIds": literal_eval, "authorNames": literal_eval})
# Converting the paper_dataset into an authorId lookup by exploding and then setting a new index:
author_lookup = paper_dataset.explode(["authorIds", "authorNames"]).set_index("authorIds").rename(columns={'authorIds': 'authorId'}).dropna()
# Removing the incorrectly exploded authorNames:
author_lookup = author_lookup[author_lookup.index.notnull()]

# Importing the tokenized paper abstracts:
paper_tokens_lookup = pd.read_csv("BigData/paper_tokens_dataset.csv", index_col=0, converters={"Tokens": literal_eval})

# TF-IDF

### What does TF stand for?
TF stands for Term Frequency. It is simply the frequency of a specific term appearing in a document. Therefore the TF is the number of times a word appears in a document, divided by the total number of words in the document.

### What does IDF stand for?
IDF stands for Inverse Document Frequency. The purpose of IDF is to emphasize rarer words, and avoiding the common words like "the" and "and" etc. The IDF is the log of the number of documents divided by the number of documents that contain the word. The more documents that contain a word, the lower the word's IDF, and the less important that word is.

In [11]:
# Defining global counters to keep track of missing authors and papers:
missing_authors = 0
missing_papers = 0

# Get a list of all the papers authored by the authors in a community:
def get_papers_to_check(community: list) -> list:
    global missing_authors
    papers_to_check = []
    print("Getting papers from authorId...")
    for author_id in tqdm(community):
        try: papers_to_check.extend(author_lookup.loc[str(author_id)]["paperId"].tolist())
        except: missing_authors += 1
    return papers_to_check

# Get a list of all the tokens from each paper in a list:
def get_tokens_from_papers(papers_to_check: list) -> list:
    global missing_papers
    total_tokens = []
    print("Getting tokens from papers...")
    for paper_id in tqdm(papers_to_check):
        try: total_tokens.extend(paper_tokens_lookup.loc[paper_id]["Tokens"])
        except: missing_papers += 1
    return total_tokens

# Get the total tokens for each community in the list:
def generate_token_groups(communities):
    paper_groups = []
    token_groups = []
    for idx, community in enumerate(communities):
        print(f"Generating token groups... {idx + 1} of {len(communities)}\n")
        papers_to_check = get_papers_to_check(list(community))
        paper_groups.extend(papers_to_check)
        token_groups.extend(get_tokens_from_papers(papers_to_check))
        clear_output()
    print(f"All {len(communities)} communities successfully generated.")
    return token_groups, paper_groups

In [126]:
# It takes way too much time to genereate all the ~600 token groups, so I opted to just generate the top 10 and ignore the rest.
# Finding the top 10 communities by author count:
sorted_args = np.array([len(community) for community in communities]).argsort()[::-1]
top_communities = [communities[i] for i in sorted_args[:10]]

print("Top 10 communities by author count:", [(len(communities[i])) for i in sorted_args][:10])

Top 10 communities by author count: [2594, 2469, 1819, 1574, 1326, 1026, 1006, 1005, 981, 977]


In [59]:
# Generating token groups for each of the top communities:
top_communities_tokens, top_communities_papers = generate_token_groups(top_communities)

All 10 successfully generated.


In [None]:
# Reformatting the top_communities_tokens and saving it:

# Zips the all unique tokens with their # of occurences in the list:
zipped_unique = lambda lister: zip(*np.unique(lister, return_counts=True))
# Sorts the zipped list by the second element in descending order:
sorter = lambda zipper: sorted(zipper, key=lambda x: x[1], reverse=True)
# Making the sorted dicts for each top community:
top_communities_tokens_uniquecount = [dict(sorter(zipped_unique(i))) for i in top_communities_tokens]

# Making it into a dataframe:
tokens_df = pd.DataFrame({"Communities": top_communities, "PaperIds": top_communities_papers, "Tokens": top_communities_tokens_uniquecount})
# tokens_df.to_csv("BigData/top_community_tokens.csv")

In [12]:
tokens_df = pd.read_csv("BigData/top_community_tokens.csv", index_col=0, converters={"Tokens": literal_eval})
tokens_df

Unnamed: 0,Communities,Tokens
0,"{1409253380, 8085509, 12926984, 115777544, 308...","{'patients': 5145, 'data': 4470, 'health': 401..."
1,"{118390786, 35053571, 1867785, 2079686669, 281...","{'data': 5550, 'model': 3909, 'results': 3103,..."
2,"{12759041, 51884035, 1732611, 1637421061, 2116...","{'data': 7595, 'information': 3213, 'paper': 3..."
3,"{103301121, 22532100, 2166793, 3182604, 939745...","{'models': 9180, 'model': 8473, 'tasks': 7899,..."
4,"{6590464, 39137288, 79101961, 11493385, 153317...","{'patients': 341, 'usa': 290, 'ffr': 177, 'cor..."
5,"{103362561, 104681475, 113463301, 25157637, 15...","{'gamma-ray': 3693, 'emission': 3463, 'data': ..."
6,"{2090979328, 46567426, 66150403, 2174097413, 1...","{'workshop': 1272, 'intelligence': 1268, 'rese..."
7,"{2153465857, 32350210, 13398019, 3948545, 5103...","{'model': 1617, 'data': 1364, 'models': 1229, ..."
8,"{1484736514, 3121155, 4771844, 104284168, 2068...","{'emotions': 83466, 'interventions': 56043, 'r..."
9,"{145199105, 1761281, 48494598, 51390475, 49647...","{'research': 10436, 'health': 10034, 'original..."


In [13]:
[get_papers_to_check(list(community)) for community in tokens_df["Communities"]]

Getting papers from authorId...


  1%|          | 223/26831 [00:20<35:08, 12.62it/s]

### Describe similarities and differences between the communities:
There are some generic similarities between the communities, all of them have the "data" token in their top most frequent, similarly "model", "models", "patients", "research", "paper", etc. are also words that come up a lot and are all clearly words generally related to the topic of scientific research. But in the top 3 there are also some differences. Community 5 mentions "gamma-ray" and "emission", while community 6 
mentions "workshop" and "intelligence", and community 8 focuses on "emotions" and "interventions".

In [8]:
# dict([(item[0], item[1] / sum(list(tokens_df["Tokens"][0].values())) * 100) for item in tokens_df["Tokens"][0].items()])

### Why aren't the TFs not necessarily a good description of the communities?
As I mentionend when describing the similarities, there are a lot of generic words related to the topic of scientific research that you're basically expected to use in your abstract. So looking purely at the frequency of words doesn't tell you much.

### Next, we calculate IDF for every word.


### What base logarithm did you use? Is that important?

### We're ready to calculate TF-IDF. Do that for the top 9 communities (by number of authors). Then for each community:

In [None]:
display("These are the top 9 communities:")


display("These are the top 10 TF words:")

display("These are the top 10 TF-IDF words:")

display("These are the top 3 authors (by degree):")


### Are these 10 words more descriptive of the community? If yes, what is it about IDF that makes the words more informative?
