In [1]:
import numpy as np
import pandas as pd
from gensim.models.phrases import Phraser
from tqdm import tqdm
from collections import defaultdict
from tqdm import tqdm

Our goal is now to take the preprocessed abstracts and identify the physics concepts within them.

In [2]:
# Read the preprocessed abstracts. Because `names` is given without `header`,
# pandas treats the first line of the file as data, not as a header row.
df_arx = pd.read_csv(
    'saved_files/arxiv_preprocessed.csv',
    names=["id", "abstract", "date"],
)
ab_arr = df_arx["abstract"].to_numpy()
print(ab_arr.shape[0])

# Parse the date column, then keep one year value per abstract.
df_arx["date"] = pd.to_datetime(df_arx["date"])
year_arr = df_arx["date"].dt.year.to_numpy()

66839


As an example, this is an abstract after preprocessing:

In [3]:
# Show one randomly drawn abstract. A locally seeded generator keeps the
# displayed example identical across Restart & Run All without touching
# NumPy's global random state.
np.random.default_rng(0).choice(ab_arr)

'possible construct closed quantum system governed bilinear hamiltonian depending arbitrary input signal achieved coupling quantum input field performing feedback output field cancel stochastic effect signal added field event later subtracted assume zero time delay limit connection operation'

In [3]:
def _read_concepts(path):
    """Read a concept file into a single "con" column.

    Returns the DataFrame together with the column rebuilt as a NumPy array
    from a plain Python list.
    """
    frame = pd.read_csv(path, names=["con"])
    return frame, np.array(list(frame["con"].to_numpy()))


df_atomic, atomic_arr = _read_concepts('saved_files/arxiv_atomic_concept.txt')
df_optic, optic_arr = _read_concepts('saved_files/arxiv_optics_concept.txt')
df_quantum, quantum_arr = _read_concepts('saved_files/arxiv_quantum_concept.txt')

# Merge the three lists into one sorted array without duplicates.
concept_compare_arr = np.unique(np.concatenate([atomic_arr, optic_arr, quantum_arr]))

In [5]:
# Create keyword lookup. Only membership is ever tested, so a set of the
# concept strings is sufficient.
keyword_lookup = set(concept_compare_arr.tolist())

# Longest candidate sequence (in tokens) tried at each position.
MAX_CONCEPT_TOKENS = 6


def merge_concept_ngrams(abstract, concepts, max_tokens=MAX_CONCEPT_TOKENS):
    """Join every known multi-word concept in one abstract with underscores.

    The abstract is split on whitespace and scanned left to right. At each
    position the longest window of at most ``max_tokens`` tokens whose
    space-joined text is in ``concepts`` is replaced by a single
    underscore-joined token, and scanning resumes after it. Tokens that start
    no known concept are kept unchanged.

    Parameters
    ----------
    abstract : str
        Whitespace-separated text. Any non-string value (e.g. the NaN pandas
        produces for an empty CSV field) is treated as an empty abstract so
        the result stays aligned with the input order instead of raising.
    concepts : container of str
        Known concepts written with single spaces between words; must
        support ``in``.
    max_tokens : int, optional
        Maximum window length to try. Concepts with more tokens than this
        are never matched.

    Returns
    -------
    tuple of (str, list of str)
        The rewritten abstract, and the underscore-joined concepts found in
        it, one entry per occurrence, in order of appearance.
    """
    tokens = abstract.split() if isinstance(abstract, str) else []
    n_tokens = len(tokens)
    merged = []
    found = []
    pos = 0
    while pos < n_tokens:
        # Longest first, so "a b c" wins over "a b" when both are concepts.
        for width in range(min(max_tokens, n_tokens - pos), 0, -1):
            window = tokens[pos:pos + width]
            if ' '.join(window) in concepts:
                joined = '_'.join(window)
                merged.append(joined)
                found.append(joined)
                pos += width  # Continue right after the matched sequence
                break
        else:
            # No window starting here is a concept: keep the token as is.
            merged.append(tokens[pos])
            pos += 1
    return ' '.join(merged), found


# List to store modified abstracts, and every concept occurrence found
modified_ab_arr = []
matched_concepts = []

# Iterate through abstracts
for ab in tqdm(ab_arr):
    merged_ab, found_in_ab = merge_concept_ngrams(ab, keyword_lookup)
    modified_ab_arr.append(merged_ab)
    matched_concepts.extend(found_in_ab)

100%|██████████| 66839/66839 [00:10<00:00, 6674.18it/s]


In [6]:
# (number of distinct concepts found in the abstracts, size of the concept list)
len(set(matched_concepts)), len(concept_compare_arr)

(24302, 33420)

In [7]:
def compute_word_count_subset(corpus, subset_words):
    """Count occurrences of a fixed set of words across a tokenized corpus.

    Parameters
    ----------
    corpus : iterable
        Documents, each an iterable of tokens. Each document is iterated
        element by element, so a plain string would be counted character by
        character; pass pre-split token lists.
    subset_words : dict
        Maps each word to track to its starting count. Words that are not a
        key of this dict are ignored.

    Returns
    -------
    dict
        A new dict with the same keys as ``subset_words`` and each starting
        count increased by the number of occurrences in ``corpus``. The
        argument itself is left unmodified.
    """
    # Work on a copy so the caller's dict is not changed as a side effect.
    counts = dict(subset_words)
    for document in tqdm(corpus):
        for word in document:
            if word in counts:
                counts[word] += 1
    return counts

# Compute word count for the subset of words
tokenized_abstracts = [row.split() for row in modified_ab_arr]
initial_counts = dict.fromkeys(np.unique(matched_concepts), 0)
word_count_subset = compute_word_count_subset(tokenized_abstracts, initial_counts)

# Keep only the concepts whose count is greater than 4.
filtered_arr = [concept for concept, n_hits in word_count_subset.items() if n_hits > 4]
cnt = len(filtered_arr)
cnt

100%|██████████| 66839/66839 [00:00<00:00, 270506.65it/s]


10235

In [8]:
# Write the rewritten abstracts, the filtered concept list and the per-abstract
# years to disk, in this order.
outputs = {
    "saved_files/ngram_abstracts.npy": modified_ab_arr,
    "saved_files/overlapping_filtered_5_concepts.npy": np.unique(filtered_arr),
    "saved_files/year_arr.npy": year_arr,
}
for out_path, payload in outputs.items():
    np.save(out_path, payload)