In [1]:
import numpy as np
import pandas as pd
from gensim.models.phrases import Phraser
import gensim
from tqdm import tqdm
from thefuzz import fuzz
from collections import defaultdict
from tqdm import tqdm

Our goal is now to take the preprocessed abstracts and identify the physics concepts within them.

In [2]:
# Load the preprocessed arXiv abstracts. Because column names are passed
# explicitly, pandas treats the first line of the file as data, not a header.
df_arx = pd.read_csv(
    'saved_files/arxiv_preprocessed.csv',
    names=["id", "abstract", "date"],
)
# Parse the date column so the publication year can be extracted below.
df_arx["date"] = pd.to_datetime(df_arx["date"])

# Plain NumPy views used by the rest of the notebook: abstract text and year.
ab_arr = df_arx["abstract"].to_numpy()
year_arr = df_arx["date"].dt.year.to_numpy()

As an example, this is an abstract after the preprocessing:

In [4]:
# Display one randomly drawn preprocessed abstract as an illustration.
# No random seed is set in the cells shown here, so the sample differs per run.
np.random.choice(ab_arr)

'light transport dense disordered cold atomic ensemble cooperation atomic dipole essentially modifies coupling radiation mode offer alternative approach light matter interfacing protocol cooperativity quasi static dipole interaction affect process light propagation condition electromagnetically induced transparency eit perform comparative analysis self consistent approach ab initio microscopic calculation emphasize role interatomic interaction dipolesdynamics result dense strongly disordered eit based light storage protocol stay relatively insensitive configuration variation obtained essentially atom normally needed dilute configuration'

In [5]:
def _read_concepts(path):
    """Read a concept list file into a single-column frame and a NumPy array.

    The file is parsed with ``pd.read_csv`` into one column named ``"con"``
    (presumably one concept per line -- verify against the files). Returns
    the frame together with an array built from that column's values.
    """
    frame = pd.read_csv(path, names=["con"])
    return frame, np.array(list(frame["con"].to_numpy()))


df_atomic, atomic_arr = _read_concepts('saved_files/arxiv_atomic_concept.txt')
df_optic, optic_arr = _read_concepts('saved_files/arxiv_optics_concept.txt')
df_quantum, quantum_arr = _read_concepts('saved_files/arxiv_quantum_concept.txt')

# Union of the three lists; np.unique removes duplicates and sorts the result.
all_concepts = np.concatenate((atomic_arr, optic_arr, quantum_arr))
concept_compare_arr = np.unique(all_concepts)

In [6]:
# Widest n-gram (in tokens) tried at each position. Concepts made of more
# tokens than this can never be matched.
MAX_CONCEPT_TOKENS = 6

# Create keyword lookup dictionary (each concept maps to a one-element list
# holding itself; only key membership is used for matching).
keyword_lookup = defaultdict(list)
for keyword in concept_compare_arr:
    keyword_lookup[keyword].append(keyword)


def tag_concepts(tokens, concepts, max_len=MAX_CONCEPT_TOKENS):
    """Greedily merge known multi-word concepts inside a token list.

    The tokens are scanned left to right. At each position the widest window
    (from ``max_len`` tokens down to one) whose space-joined text is contained
    in ``concepts`` is replaced by a single underscore-joined token, and the
    scan resumes right after that window. If no window matches, the current
    token is copied unchanged and the scan advances by one.

    Parameters
    ----------
    tokens : list of str
        Whitespace-free tokens of one abstract.
    concepts : container
        Anything supporting ``in`` for space-separated concept strings.
    max_len : int, optional
        Largest window width to try.

    Returns
    -------
    tagged : list of str
        The tokens with every matched window merged into one token.
    found : list of str
        The underscore-joined concepts matched, one entry per match, in order.
    """
    tagged, found = [], []
    n_tokens = len(tokens)
    pos = 0
    while pos < n_tokens:
        # Never try windows that would run past the end of the abstract.
        widest = min(max_len, n_tokens - pos)
        for width in range(widest, 0, -1):  # longest match wins
            window = tokens[pos:pos + width]
            if ' '.join(window) in concepts:
                merged = '_'.join(window)
                tagged.append(merged)
                found.append(merged)
                pos += width  # continue after the matched window
                break
        else:
            # No concept starts at this position: keep the token as is.
            tagged.append(tokens[pos])
            pos += 1
    return tagged, found


# List to store modified abstracts (one string per input abstract) and every
# concept occurrence found across all abstracts.
modified_ab_arr = []
matched_concepts = []

# Iterate through abstracts
for ab in tqdm(ab_arr):
    tagged_tokens, found_concepts = tag_concepts(ab.split(), keyword_lookup)
    modified_ab_arr.append(' '.join(tagged_tokens))
    matched_concepts.extend(found_concepts)

100%|██████████| 157821/157821 [00:33<00:00, 4657.79it/s]


In [8]:
# Number of distinct matched concepts relative to the size of the quantum list.
# NOTE(review): matched concepts are drawn from concept_compare_arr (atomic +
# optics + quantum), while the denominator counts quantum_arr only, so this
# ratio can exceed 1. If a coverage fraction is intended, confirm whether the
# denominator should be concept_compare_arr instead.
len(np.unique(matched_concepts)) / len(quantum_arr)

1.1036312323612418

In [16]:
def compute_word_count_subset(corpus, subset_words):
    """Count how often each tracked word occurs in a tokenised corpus.

    Parameters
    ----------
    corpus : iterable of iterables
        Documents, each an iterable of tokens.
    subset_words : dict
        Maps every word to track to its starting count. The dict is updated
        in place; words that are not already keys are ignored.

    Returns
    -------
    dict
        The same ``subset_words`` object with the counts incremented.
    """
    for tokens in tqdm(corpus):
        # Only tokens that are already keys are counted; no new keys are added.
        tracked_hits = (token for token in tokens if token in subset_words)
        for token in tracked_hits:
            subset_words[token] += 1
    return subset_words

# Count, over the concept-tagged abstracts, how often each distinct matched
# concept appears as a token. Every concept starts at zero.
word_count_subset = compute_word_count_subset(
    [row.split() for row in modified_ab_arr],
    {k: 0 for k in np.unique(matched_concepts)},
)

# Keep only the concepts seen more than four times (i.e. at least five).
filtered_arr = [
    concept for concept, count in word_count_subset.items() if count > 4
]
cnt = len(filtered_arr)
cnt

100%|██████████| 157821/157821 [00:00<00:00, 165763.68it/s]


19161

In [17]:
# Persist the results for later notebooks: the concept-tagged abstracts, the
# distinct matched concepts, the frequency-filtered concepts, and the years.
artefacts = (
    ("saved_files/ngram_abstracts.npy", modified_ab_arr),
    ("saved_files/overlapping_concepts.npy", np.unique(matched_concepts)),
    ("saved_files/overlapping_filtered_5_concepts.npy", np.unique(filtered_arr)),
    ("saved_files/year_arr.npy", year_arr),
)
for target_path, payload in artefacts:
    np.save(target_path, payload)

In [None]:
# def get_ngrams(sentences):
#     """ Detects n-grams with n up to 4, and replaces those in the abstracts. """
#     # Train a 2-word (bigram) phrase-detector
#     bigram_phrases = gensim.models.phrases.Phrases(sentences,min_count=5,threshold=10)
    
#     # And construct a phraser from that (an object that will take a sentence
#     # and replace in it the bigrams that it knows by single objects)
#     bigram = gensim.models.phrases.Phraser(bigram_phrases)
    
#     # Repeat that for trigrams; the input now are the bigrammed-titles
#     ngram_phrases = gensim.models.phrases.Phrases(bigram[sentences],min_count=5,threshold=10)
#     ngram         = gensim.models.phrases.Phraser(ngram_phrases)
    
#     # !! If you want to have more than 4-grams, just repeat the structure of the
#     #    above two lines. That is, train another Phrases on the ngram_phrases[titles],
#     #    that will get you up to 8-grams. 
    
#     # Now that we have phrasers for bi- and trigrams, let's analyze them
#     # The phrases.export_phrases(x) function returns pairs of phrases and their
#     # certainty scores from x.
#     bigram_info = {}
#     for b, score in bigram_phrases.export_phrases().items():
#         bigram_info[b] = [score, bigram_info.get(b,[0,0])[1] + 1]
#         len
#     ngram_info = {}
#     for b, score in ngram_phrases.export_phrases().items():
#         ngram_info[b] = [score, ngram_info.get(b,[0,0])[1] + 1]
    
#     # Return a list of 'n-grammed' abstracts, and the bigram and trigram info
#     return [ngram[t] for t in sentences], bigram_info, ngram_info

# sentences = [row.split() for row in ab_arr]
# ngram_abstracts, bigrams, ngrams = get_ngrams(sentences)