In [None]:
import pandas as pd
import numpy as np
import json
from numpy import linalg as LA
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.cluster import AgglomerativeClustering

import random
from datetime import datetime, timedelta
from multiprocessing import Pool

import itertools
from itertools import groupby
from operator import itemgetter
import os
import re
import sys
import pickle
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.mplot3d import axes3d

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix, find
import sparse_dot_topn.sparse_dot_topn as ct
from sparse_dot_topn import awesome_cossim_topn
from gensim.models.ldamodel import LdaModel
from collections import Counter


from AToMS.atoms_gdltm_utils import *
from AToMS.author_disambiguation import *

# Print NumPy floats in fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import xml.etree.ElementTree as Xet

import gensim
from gensim.utils import simple_preprocess
import nltk
from nltk.corpus import stopwords
import gensim.corpora as corpora

import pyLDAvis.gensim_models
import pyLDAvis

# Fetch the NLTK stopword corpus; stopwords.words('english') is read later in
# this notebook when the stop-word set is assembled.
nltk.download('stopwords')

In [None]:
# --- Dataset selection -------------------------------------------------------
dset = 'arxiv'
# dsub format: 'oai_cs_CL' for the cs.CL category, 'oai_multi' / 'ml_special'
# for hand-selected subject lists, or 'oai_all' for the full category list.
dsub = 'ml_special'
MAX_AUTHORS_PER_DOC = 20

# --- Vocabulary pruning by document frequency --------------------------------
# A word found in more than this fraction of the documents is removed.
VOCAB_HIGH_FREQ_THRESH_FRAC = 0.995
# A word found in no more than this many documents is removed.
VOCAB_LOW_FREQ_THRESH = 1

# --- Relevancy-based term filtering ------------------------------------------
lambda_param = 0.6
top_n = 2000
num_topics = 50

# --- Directory layout --------------------------------------------------------
src_dir = f'../data/{dset}/original/'
int_dir = f'../data/{dset}/intermediate/{dsub}/'
fig_dir = f'../experiments/{dset}/{dsub}/'

In [None]:
# Create the source, intermediate and figure directories if they don't exist.
# exist_ok=True avoids the check-then-create race of a separate
# os.path.exists() test and makes the cell safe to re-run.
for required_dir in (src_dir, int_dir, fig_dir):
    os.makedirs(required_dir, exist_ok=True)

In [None]:
# Load the arXiv metadata snapshot (JSON Lines file in src_dir) and choose the
# category labels (cat_lbl) that define the subset selected by dsub.
src_json = src_dir + 'arxiv-metadata-oai-snapshot.json'

if dsub == 'oai_multi':
    #cat_lbl = ['cs.CV', 'hep-th', 'q-bio', 'econ', 'math.AG', 'astro-ph']
    #cat_lbl = ['cs.DS', 'physics.flu-dyn', 'math.GT', 'cond-mat.quant-gas']
    cat_lbl = [
        'cs.CR',  # Cryptography and Security
        'math.DG',  # Differential Geometry
        'math.PR',  # Probability
        'physics.flu-dyn',  # Fluid Dynamics
    ]
elif dsub == 'ml_special':
    # NOTE(review): 'stat.ML' is not part of this list -- confirm that is intentional.
    cat_lbl = [
        'stat.AP',
        'stat.CO',
        'stat.ME',
        'stat.OT',
        'stat.TH',
        'cs.LG',
    ]
elif dsub != 'oai_all':
    # 'oai_cs_CL' -> 'cs.CL'. Archives without a subject class are accepted as
    # well ('oai_hep-th' -> 'hep-th') instead of failing with an IndexError.
    cat_parts = dsub.split('_')[1:3]
    if not cat_parts or not all(cat_parts):
        # An empty label would be a substring of every category string and
        # silently select the whole corpus, so reject it explicitly.
        raise ValueError("Cannot derive an arXiv category from dsub={!r}".format(dsub))
    cat_lbl = ['.'.join(cat_parts)]
else: # dsub == 'oai_all'
    cat_lbl = [
        'astro-ph.CO', 'astro-ph.EP', 'astro-ph.GA', 'astro-ph.HE', 'astro-ph.IM', 'astro-ph.SR',
        'cond-mat.dis-nn', 'cond-mat.mes-hall', 'cond-mat.mtrl-sci', 'cond-mat.other', 'cond-mat.quant-gas',
        'cond-mat.soft', 'cond-mat.stat-mech', 'cond-mat.str-el', 'cond-mat.supr-con',
        'cs.AI', 'cs.AR', 'cs.CC', 'cs.CE', 'cs.CG', 'cs.CL', 'cs.CR', 'cs.CV', 'cs.CY', 'cs.DB', 'cs.DC',
        'cs.DL', 'cs.DM', 'cs.DS', 'cs.ET', 'cs.FL', 'cs.GL', 'cs.GR', 'cs.GT', 'cs.HC', 'cs.IR', 'cs.IT',
        'cs.LG', 'cs.LO', 'cs.MA', 'cs.MM', 'cs.MS', 'cs.NA', 'cs.NE', 'cs.NI', 'cs.OH', 'cs.OS', 'cs.PF',
        'cs.PL', 'cs.RO', 'cs.SC', 'cs.SD', 'cs.SE', 'cs.SI', 'cs.SY',
        'econ.EM', 'econ.GN', 'econ.TH',
        'eess.AS', 'eess.IV', 'eess.SP', 'eess.SY',
        'gr-qc',
        'hep-ex', 'hep-lat', 'hep-ph', 'hep-th',
        'math.AC', 'math.AG', 'math.AP', 'math.AT', 'math.CA', 'math.CO', 'math.CT', 'math.CV', 'math.DG',
        'math.DS', 'math.FA', 'math.GM', 'math.GN', 'math.GR', 'math.GT', 'math.HO', 'math.IT', 'math.KT',
        'math.LO', 'math.MG', 'math.MP', 'math.NA', 'math.NT', 'math.OA', 'math.OC', 'math.PR', 'math.QA',
        'math.RA', 'math.RT', 'math.SG', 'math.SP', 'math.ST',
        'nlin.AO', 'nlin.CD', 'nlin.CG', 'nlin.PS', 'nlin.SI',
        'nucl-ex', 'nucl-th',
        'physics.acc-ph', 'physics.ao-ph', 'physics.app-ph', 'physics.atm-clus', 'physics.atom-ph',
        'physics.bio-ph', 'physics.chem-ph', 'physics.class-ph', 'physics.comp-ph', 'physics.data-an',
        'physics.ed-ph', 'physics.flu-dyn', 'physics.gen-ph', 'physics.geo-ph', 'physics.hist-ph',
        'physics.ins-det', 'physics.med-ph', 'physics.optics', 'physics.plasm-ph', 'physics.pop-ph',
        'physics.soc-ph', 'physics.space-ph',
        'q-bio.BM', 'q-bio.CB', 'q-bio.GN', 'q-bio.MN', 'q-bio.NC', 'q-bio.OT', 'q-bio.PE', 'q-bio.QM',
        'q-bio.SC', 'q-bio.TO',
        'q-fin.CP', 'q-fin.EC', 'q-fin.GN', 'q-fin.MF', 'q-fin.PM', 'q-fin.PR', 'q-fin.RM', 'q-fin.ST',
        'q-fin.TR',
        'quant-ph',
        'stat.AP', 'stat.CO', 'stat.ME', 'stat.ML', 'stat.OT', 'stat.TH'
    ]


# Read the ids as strings: new-style identifiers such as '0704.0001' look
# numeric and would otherwise be eligible for float conversion, which drops
# the leading zero.
arxiv_data = pd.read_json(src_json, lines=True, dtype={'id': str})
# The snapshot's last-update date serves as the document date.
arxiv_data['date'] = arxiv_data['update_date']

# Filter and collate data

In [None]:
# Use the snapshot's update_date as the document date. The loading cell
# already performs this same assignment, so re-running it here is idempotent.
arxiv_data['date'] = arxiv_data['update_date']

In [None]:
# Order the documents by date, keep only the columns used downstream, discard
# rows with any missing value and renumber the index from zero.
columns_of_interest = ['id', 'title', 'categories', 'abstract', 'date', 'authors_parsed']
arxiv_data = (
    arxiv_data
    .sort_values(by='date')[columns_of_interest]
    .dropna()
    .reset_index(drop=True)
)

In [None]:
# Build the 'authors' column: each parsed author entry becomes
# "<first field>, <second field>", trimmed of surrounding whitespace and
# upper-cased.
arxiv_data['authors'] = arxiv_data['authors_parsed'].map(
    lambda parsed: [(entry[0] + ', ' + entry[1]).strip().upper() for entry in parsed]
)

In [None]:
# Keep the rows whose 'categories' value contains at least one of the selected
# labels (substring test, so a bare archive prefix also matches).
cat_lbl_mask = arxiv_data['categories'].map(
    lambda cats: any(label in cats for label in cat_lbl)
)
arxiv_data = arxiv_data[cat_lbl_mask]

arxiv_data

In [None]:
# Tokenize every abstract with super_simple_preprocess; per the original note
# it strips punctuation and yields a list of lowercase words per document.
arxiv_data['text_processed'] = arxiv_data['abstract'].map(super_simple_preprocess)

In [None]:
# Token lists, one per document, and the gensim dictionary built from them
data_words = arxiv_data['text_processed'].tolist()
id2word = corpora.Dictionary(data_words)
vlen1 = len(id2word)
print("Original vocab length: {}".format(vlen1))

In [None]:
# Run lemmatize_mp over every document's token list in a pool of worker
# processes; results come back in the same order as the input documents.
with Pool() as workers:
    data_words = workers.map(lemmatize_mp, data_words)

In [None]:
# Rebuild the dictionary from the lemmatized tokens and report its size
id2word = corpora.Dictionary(data_words)
vlen2 = len(id2word)
print("Vocab length after lemmatization: {}".format(vlen2))

In [None]:
# Remove rare and common words to shrink the vocab (improves topic model)
id2docfreq = count_doc_freq_of_words(data_words, id2word)
ndocs = len(data_words)

# Hand-picked corpus-specific stop words; frequency-based ones are appended below.
sw_extension = ['system', 'word', 'method', 'text', 'approach', 'based', 'case', 'model', 'well']
n_manual_stopwords = len(sw_extension)

high_freq_cutoff = VOCAB_HIGH_FREQ_THRESH_FRAC * ndocs
for wordId, docfreq in id2docfreq.items():
    # A single combined test so a word is never appended twice.
    if docfreq > high_freq_cutoff or docfreq <= VOCAB_LOW_FREQ_THRESH:
        sw_extension.append(id2word[wordId])

# Report only the frequency-based removals; the hand-picked words above are
# not "extremely rare or common" and must not inflate the count.
print("Cutting {} words from vocab due to being extremely rare or common...".format(
    len(sw_extension) - n_manual_stopwords))

# Assemble the full stop-word set: NLTK English list + generic extras + the
# corpus-specific extension computed above.
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'sub', 'sup', 'used', 'using'])
stop_words.extend(sw_extension)
stop_words = set(stop_words)

# Create approved words list: dictionary tokens that are not stop words and
# are longer than two characters, in sorted order.
all_tokens = set(id2word.token2id.keys())
approved_tokens = sorted([token for token in all_tokens.difference(stop_words) if len(token) > 2])

In [None]:
# One (token_list, approved_tokens) argument tuple per document; starmap
# unpacks each tuple into a filter_stopwords_by_approval call in a worker.
mp_iterable = [(doc_tokens, approved_tokens) for doc_tokens in data_words]
with Pool() as workers:
    data_words = workers.starmap(filter_stopwords_by_approval, mp_iterable)

In [None]:
# Rebuild the dictionary from the stop-word-filtered tokens.
# Note: vlen2 is reused here, so it now holds the post-stop-word vocab size
# rather than the post-lemmatization one.
id2word = corpora.Dictionary(data_words)
vlen2 = len(id2word)
print("Vocab length after removing stopwords: {}".format(vlen2))

In [None]:
# Reduce the vocabulary to the terms most relevant to an LDA topic model.
# relevancy(w, t) = lambda * log p(w|t) + (1 - lambda) * log(p(w|t) / p(w))

# Step 1: Prepare the corpus and train LDA model
# NOTE(review): the corpus is built from arxiv_data['text_processed'] (the
# tokens before lemmatization / stop-word filtering) while id2word was built
# from the filtered data_words; doc2bow ignores tokens missing from id2word.
# Confirm this is intended rather than using data_words here.
corpus = [id2word.doc2bow(text) for text in arxiv_data['text_processed']]
# Fixed seed so the trained topics, and therefore the reduced vocabulary, are
# reproducible across Restart & Run All.
lda_random_state = 0
lda_model = LdaModel(corpus=corpus, num_topics=num_topics, id2word=id2word, passes=1,
                     random_state=lda_random_state)

# Step 2: Get the topic-term matrix (one row of term probabilities per topic)
topic_term_matrix = lda_model.get_topics()

# Convert topic-term matrix into a more accessible format:
# topic_id -> list of (word, probability), most probable word first
topic_term_dict = {}
for topic_id, topic_dist in enumerate(topic_term_matrix):
    top_term_indices = np.argsort(topic_dist)[::-1]  # descending probability
    topic_term_dict[topic_id] = [(id2word[term_id], topic_dist[term_id]) for term_id in top_term_indices]

# Step 3: Calculate overall term frequencies (P(w)) across the corpus.
# Count document by document instead of materializing one flat token list.
term_counts = Counter()
for doc_terms in arxiv_data['text_processed']:
    term_counts.update(doc_terms)

total_terms_in_corpus = sum(term_counts.values())
p_w = {term: count / total_terms_in_corpus for term, count in term_counts.items()}

# Step 4: Calculate relevancy scores using log odds for each term in each topic
epsilon = 1e-12  # Small constant to avoid log(0)
relevancy_scores = {}    # (term, topic_id) -> relevancy
relevancy_by_topic = {}  # topic_id -> {term: relevancy}, same insertion order

for topic_id, term_probs in topic_term_dict.items():
    topic_scores = relevancy_by_topic.setdefault(topic_id, {})
    for term, p_w_given_t in term_probs:
        # Terms that never occur in text_processed have no P(w); skip them.
        if term not in p_w:
            continue

        # Smooth probabilities to avoid log(0)
        log_p_w_given_t = np.log(max(p_w_given_t, epsilon))
        log_p_w = np.log(max(p_w[term], epsilon))
        log_odds = log_p_w_given_t - log_p_w

        # Combine log terms with lambda_param
        relevancy = lambda_param * log_p_w_given_t + (1 - lambda_param) * log_odds
        relevancy_scores[(term, topic_id)] = relevancy
        topic_scores[term] = relevancy

# Step 5: Get the top relevant terms for each topic.
# The per-topic dictionaries built in step 4 avoid rescanning every
# (term, topic) pair once per topic.
top_terms_by_topic = {}

for topic_id in range(num_topics):
    topic_terms = relevancy_by_topic.get(topic_id, {})
    top_terms_by_topic[topic_id] = sorted(topic_terms, key=topic_terms.get, reverse=True)[:top_n]

# Step 6: Reduce the vocab to the top terms across all topics
reduced_vocabulary = set()
for terms in top_terms_by_topic.values():
    reduced_vocabulary.update(terms)

# Step 7: Create a new filtered dictionary and corpus
new_id2word = corpora.Dictionary()  # Create a new dictionary for filtered vocabulary
new_id2word.add_documents([[term] for term in reduced_vocabulary])  # Add only terms from reduced vocabulary

# Recreate the corpus using the new filtered dictionary
filtered_corpus = []
for doc in arxiv_data['text_processed']:
    # Create BoW for the document using the filtered dictionary
    bow = new_id2word.doc2bow([term for term in doc if term in new_id2word.token2id])
    filtered_corpus.append(bow)

In [None]:
# Pass every document's tokens through filter_terms with the reduced vocabulary
arxiv_data['text_processed'] = arxiv_data['text_processed'].map(
    lambda tokens: filter_terms(tokens, reduced_vocabulary)
)

In [None]:
# Update text_processed to remove any words not in dictionary from the main dataframe
#arxiv_data['text_processed'] = data_words
# Keep only the documents that still have at least one token, then drop any
# row containing nulls and renumber the index from zero.
has_tokens = arxiv_data['text_processed'].map(lambda tokens: len(tokens) > 0)
arxiv_data = arxiv_data[has_tokens].dropna().reset_index(drop=True)

In [None]:
# Per-column memory footprint (deep=True also measures object contents)
mem_use = arxiv_data.memory_usage(deep=True)
total_bytes = mem_use.sum()
print("arxiv_data size: {:,} bytes".format(total_bytes))

In [None]:
# Save in pickle file: checkpoint of the collated, vocabulary-filtered frame,
# written to the intermediate directory before author disambiguation starts.
arxiv_data.to_pickle(int_dir + 'collated_only.pkl')

# Author disambiguation

In [None]:
# Create list of all authors, paired with the arXiv id of the document each
# occurrence came from. Documents with more than MAX_AUTHORS_PER_DOC authors
# are removed from arxiv_data and contribute no authors.
all_authors = []
author_to_arxivId = []
oversized_doc_index = []
for index, row in arxiv_data.iterrows():

    authors = row['authors']

    # If there are too many authors, scrap the document as it is not meaningful.
    # Skip the rest of the iteration so its authors are not collected either.
    if len(authors) > MAX_AUTHORS_PER_DOC:
        oversized_doc_index.append(index)
        continue

    arxiv_id = row['id']
    for author in authors:
        # Only add the author if their name starts with a letter (to prevent junk)
        if re.search("^[a-zA-Z]", author) is not None:
            author_to_arxivId.append(arxiv_id)
            all_authors.append(author)

# Drop all over-sized documents in one call instead of copying the frame once
# per dropped row.
arxiv_data = arxiv_data.drop(oversized_doc_index)

# Sort all_authors alphabetically along with the map to arXiv IDs.
# Unpacking the sorted pairs explicitly also works when no author was collected.
author_id_pairs = sorted(zip(all_authors, author_to_arxivId))
all_authors = [author for author, _ in author_id_pairs]
author_to_arxivId = [doc_id for _, doc_id in author_id_pairs]

# Upper case all author names and use first 20 characters only
all_authors = [name.upper()[0:20].strip() for name in all_authors]

with open(int_dir + 'all_authors.pkl', 'wb') as f:
    pickle.dump(all_authors, f)
with open(int_dir + 'author_to_arxivId.pkl', 'wb') as f:
    pickle.dump(author_to_arxivId, f)

In [None]:
# Distinct author names in sorted order; their count is the upper bound on
# the number of authors before disambiguation.
authors_collapsed = sorted(set(all_authors))
ceil_nauth = len(authors_collapsed)

print("Number of authors prior to disambiguation = {}".format(ceil_nauth))

In [None]:
# Row count of the frame at this point, followed by a display of the frame
dflen = arxiv_data.shape[0]
print("Number of docs in corpus: {}".format(dflen))

arxiv_data

In [None]:
# Upper bound on a dense (authors x original vocab) word-frequency embedding,
# at 4 bytes (single) and 8 bytes (double) per entry. The message reports GiB,
# so divide by 2**30 bytes rather than 10**9 (which would be GB).
max_sz_wfe_single = ceil_nauth * vlen1 * 4 / 2 ** 30
max_sz_wfe_double = ceil_nauth * vlen1 * 8 / 2 ** 30
print("Approx. size of author wf embeddings with all words (single|double): {:,}GiB|{:,}GiB".format(max_sz_wfe_single, max_sz_wfe_double))

In [None]:
# Same estimate using vlen2 (the vocab size after stop-word removal), at
# 4 bytes (single) and 8 bytes (double) per entry. The message reports GiB,
# so divide by 2**30 bytes rather than 10**9 (which would be GB).
max_sz_wfe_single = ceil_nauth * vlen2 * 4 / 2 ** 30
max_sz_wfe_double = ceil_nauth * vlen2 * 8 / 2 ** 30
print("Approx. size of author wf embeddings with reduced vocab (single|double): {:,}GiB|{:,}GiB".format(max_sz_wfe_single, max_sz_wfe_double))

In [None]:
# Partition the sorted author list into consecutive runs that share the same
# first two characters (groupby relies on the list already being sorted).
authors_split = [
    list(group)
    for _, group in groupby(authors_collapsed, key=lambda name: name[0:2])
]

In [None]:
# Number of author sub-lists, i.e. of distinct two-character name prefixes
num_first_letters = len(authors_split)
print("Number of unique first letters of author names = {}".format(num_first_letters))

## Run tfidf vectorizer and perform matching on each author sub-list

In [None]:
# For every author sub-list, store the find() result (row indices, column
# indices, values of the non-zero entries) of its name-vs-name match matrix.
fmatch_mat = []

for split_idx in range(num_first_letters):
    group_names = authors_split[split_idx]

    # TF-IDF over the features produced by the ngrams analyzer
    vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
    tf_idf_matrix = vectorizer.fit_transform(group_names)

    # Allow up to one match per name in the group.
    # NOTE(review): 0.90 is presumably the minimum similarity kept -- confirm
    # against awesome_cossim_top.
    topN = len(group_names)
    matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), topN, 0.90)

    # Progress line; the last value counts entries where matches and its
    # transpose differ by more than 1e-10.
    asymmetric_nnz = (abs(matches-matches.T)>1e-10).nnz
    print(split_idx, tf_idf_matrix.shape, matches.nnz, asymmetric_nnz)

    tmp = find(matches)
    fmatch_mat.append(tmp)

In [None]:
# Derive the name <-> id lookups from the per-group matches, then persist
# both mappings in the intermediate directory.
author_to_authorId, authorId_to_author = assign_author_ids(authors_split, fmatch_mat)

for pkl_name, mapping in (
    ('author_to_authorId.pkl', author_to_authorId),
    ('authorId_to_author.pkl', authorId_to_author),
):
    with open(int_dir + pkl_name, 'wb') as f:
        pickle.dump(mapping, f)

In [None]:
# Size of the id -> author mapping, i.e. the number of distinct author ids
print("Number of authors after disambiguation: {}".format(len(authorId_to_author)))

In [None]:
# Initiate column in df for authorID.
# The column is later filled with comma-separated id strings, so create it
# with object dtype (all-NaN) instead of float64; writing strings into a
# float column is a deprecated incompatible-dtype assignment in pandas.
arxiv_data['authorID'] = pd.Series(np.nan, index=arxiv_data.index, dtype=object)

In [None]:
# Build, for every document, a string of its authors' ids in author order,
# each id followed by a comma (e.g. "12,857,"), and store it in 'authorID'.
# The strings are collected first and assigned as one object-dtype column, so
# no string is ever written cell-by-cell into a float column.
auth_id_strings = []
for index, authors in arxiv_data['authors'].items():
    if index % 10000 == 0:
        print(index)

    # For each author, look up the author ID and create a list
    auth_ids_for_row = ''
    for auth in authors:
        # Same normalization as used when the author list was built
        auth = auth[0:20].strip()
        if re.search("^[a-zA-Z]", auth) is not None: # Throwaway all junk
            new_auth_id = author_to_authorId[auth]
            auth_ids_for_row = auth_ids_for_row + str(new_auth_id) + ','

    auth_id_strings.append(auth_ids_for_row)

arxiv_data['authorID'] = pd.Series(auth_id_strings, index=arxiv_data.index, dtype=object)

In [None]:
# Final token lists and dictionary, taken from the fully filtered frame
data_words = arxiv_data['text_processed'].tolist()
id2word = corpora.Dictionary(data_words)
vlen3 = len(id2word)
print("Final vocab length: {}".format(vlen3))
print("Final data_words length: {}".format(len(data_words)))

# Ensure the intermediate directory is there before the save cells run
int_dir_missing = not os.path.exists(int_dir)
if int_dir_missing:
    os.makedirs(int_dir)

In [None]:
# Persist the final token lists and the dictionary built from them
for pkl_name, payload in (('data_words.pkl', data_words), ('id2word.pkl', id2word)):
    with open(int_dir + pkl_name, 'wb') as f:
        pickle.dump(payload, f)

# Save the pkl file with all the added author ID information
arxiv_data.to_pickle(int_dir + 'main_df.pkl')

In [None]:
# Build and save a word cloud for the selected dataset/subset.
# NOTE(review): presumably reads the intermediate files saved above and writes
# under the experiments directory -- confirm against create_and_save_wordcloud.
create_and_save_wordcloud(dset, dsub)

In [None]:
# Generate both the normal multigraphs and the edge-labeled multigraphs
# First is for testing, second is for visualization purposes
# overwrite=True is passed explicitly; presumably it regenerates graphs that
# already exist on disk -- confirm against gen_nx_multigraphs_per_year.
gen_nx_multigraphs_per_year(dset, dsub, overwrite=True)

In [None]:
# Generate complete hypergraph for the selected dataset/subset and keep it in
# H for inspection (e.g. the commented-out drawing call in the next cell).
H = gen_hypergraph(dset, dsub)

In [None]:
#hnx.drawing.draw(H)