In [1]:
import numpy as np
import pandas as pd
from gensim.models.phrases import Phraser
import gensim
from tqdm import tqdm
from thefuzz import fuzz
from collections import defaultdict
from tqdm import tqdm

Our goal is now to take the 157821 preprocessed abstracts and identify the physics concepts within them.

In [2]:
# Read the preprocessed arXiv table; the column names are supplied explicitly via `names`.
df_arx = pd.read_csv(
    'files/arxiv_preprocessed.csv',
    names=["id", "abstract", "date"],
)
# Parse the date column once so that year and month can be pulled from it below.
df_arx["date"] = pd.to_datetime(df_arx["date"])

ab_arr = df_arx["abstract"].to_numpy()
date_parts = df_arx["date"].dt
year_arr = date_parts.year.to_numpy()
month_arr = date_parts.month.to_numpy()

In [3]:
# Number of abstracts loaded from the CSV.
len(ab_arr)

157821

As an example, this is an abstract after the preprocessing:

In [4]:
# Show one abstract picked at random (no seed is set in the visible cells, so the sample varies between runs).
np.random.choice(ab_arr)

'trapped bose einstein condensate subject action alternating external field coherent topological mode resonantly excited depending amplitude external field detuning parameter principally different regime motion mode locking change dynamic regime corresponds dynamic phase transition transition characterized effective order parameter defined difference fractional mode population averaged temporal period oscillation behavior order parameter function detuning pumping amplitude atomic interaction carefully analyzed special attention payed numerical calculation realistic case quadrupole exciting field parameter accessible current experiment'

In [5]:
def _read_concept_file(path):
    """Read a one-column concept list; return the DataFrame and its values as a NumPy array."""
    frame = pd.read_csv(path, names=["con"])
    return frame, np.array(list(frame["con"].to_numpy()))


df_atomic, atomic_arr = _read_concept_file('files/arxiv_atomic_concept.txt')
df_optic, optic_arr = _read_concept_file('files/arxiv_optics_concept.txt')
df_quantum, quantum_arr = _read_concept_file('files/arxiv_quantum_concept.txt')

# Sorted, de-duplicated union of the three concept lists.
all_concepts = np.concatenate((atomic_arr, optic_arr, quantum_arr))
concept_compare_arr = np.unique(all_concepts)

In [6]:
# Lookup table: every known concept string maps to a one-element list holding itself.
keyword_lookup = defaultdict(list)
for concept in concept_compare_arr:
    keyword_lookup[concept].append(concept)

# Every concept occurrence found in any abstract (underscore-joined, with repeats).
matched_concepts = []


def _merge_known_concepts(tokens, lookup, hits, max_len=6):
    """Return `tokens` with known multi-word concepts merged into single '_'-joined tokens.

    At each position the longest window (at most `max_len` tokens) whose
    space-joined text is a key of `lookup` wins; the scan then resumes right
    after that window. Every match is also appended to `hits`.
    """
    merged = []
    n_tokens = len(tokens)
    pos = 0
    while pos < n_tokens:
        # Try the longest window that still fits first, then shrink it.
        for width in range(min(max_len, n_tokens - pos), 0, -1):
            window = tokens[pos:pos + width]
            phrase = ' '.join(window)
            if phrase in lookup:
                joined = '_'.join(window)
                for concept in lookup[phrase]:
                    merged.append(joined)
                    hits.append(concept.replace(' ', '_'))
                pos += width
                break
        else:
            # No window starting here is a known concept: keep the plain token.
            merged.append(tokens[pos])
            pos += 1
    return merged


# One rewritten abstract (as a single string) per input abstract.
modified_ab_arr = [
    ' '.join(_merge_known_concepts(ab.split(), keyword_lookup, matched_concepts))
    for ab in tqdm(ab_arr)
]

100%|██████████| 157821/157821 [00:24<00:00, 6548.79it/s]


In [7]:
# Ratio of distinct matched concepts to the size of the quantum concept list alone.
# NOTE(review): matches are drawn from the union of the atomic, optics and quantum lists,
# so this ratio can exceed 1; dividing by concept_compare_arr.shape[0] may be what is intended — confirm.
np.unique(matched_concepts).shape[0]/ quantum_arr.shape[0]

1.1036312323612418

In [8]:
# Persist the concept-merged abstracts and the distinct concepts that were matched.
# NOTE(review): a later cell of this notebook (the gensim-based pipeline) writes to these same two paths.
np.save("files/ngram_abstracts.npy",modified_ab_arr)
np.save("files/overlapping_concepts.npy",np.unique(matched_concepts))

In [9]:
# Show one randomly chosen abstract after concept merging (multi-word concepts appear joined by '_').
np.random.choice(modified_ab_arr)

'address question presence kerr_nonlinearity multiple_scattering optical_medium offer advantage respect design physical_unclonable_function result suggest certain condition nonlinear physical_unclonable_function robust potential cloning medium relative linear counterpart exploited context cryptographic application'

In [10]:
# Same expression as the In [7] cell: distinct matched concepts divided by the size of the quantum list only.
# NOTE(review): the numerator spans all three concept lists, so values above 1 are possible.
np.unique(matched_concepts).shape[0]/ quantum_arr.shape[0]

1.1036312323612418

In [15]:
def compute_word_count_subset(corpus, subset_words):
    """Count how often each key of `subset_words` occurs across `corpus`.

    `corpus` is an iterable of token sequences. `subset_words` maps each
    tracked token to its starting count; it is updated in place and the
    same dict object is returned. Tokens that are not keys are ignored.
    """
    for tokens in tqdm(corpus):
        for token in tokens:
            if token not in subset_words:
                continue
            subset_words[token] = subset_words[token] + 1
    return subset_words

# Count, over the concept-merged abstracts, how often each matched concept occurs.
tokenized_abstracts = [row.split() for row in modified_ab_arr]
initial_counts = {k: 0 for k in np.unique(matched_concepts)}
word_count_subset = compute_word_count_subset(tokenized_abstracts, initial_counts)

# Keep only the concepts that occur more than 10 times.
filtered_arr = [concept for concept, count in word_count_subset.items() if count > 10]
cnt = len(filtered_arr)
cnt

100%|██████████| 157821/157821 [00:00<00:00, 206845.50it/s]


12770

In [17]:

# Persist the de-duplicated list of concepts that passed the frequency filter.
np.save("files/overlapping_filtered_concepts.npy",np.unique(filtered_arr))

In [12]:
# np.save("files/ngram_abstracts.npy",ab_arr.tolist())
# np.save("files/overlapping_concepts.npy",np.unique(found_concept_list))
# np.save("files/year_arr.npy",year_arr)
# np.save("files/month_arr.npy",month_arr)

In [13]:
# NOTE(review): `stop` is an undefined name, so this cell raises NameError;
# presumably a deliberate way to halt "Run All" before the cells below — confirm.
stop

NameError: name 'stop' is not defined

In [None]:
def get_ngrams(sentences):
    """Detect multi-word phrases with two stacked gensim Phrases passes and merge them.

    The first pass learns bigrams from `sentences`. The second pass is trained
    on the bigram-merged sentences, so it can join already-merged tokens into
    phrases spanning up to four original words.

    Args:
        sentences: list of tokenised abstracts (each a list of str). It is
            iterated several times, so it must not be a one-shot generator.

    Returns:
        A 3-tuple `(ngrammed, bigram_info, ngram_info)`:
        `ngrammed` holds one token list per input sentence with detected
        phrases merged into single tokens; `bigram_info` and `ngram_info`
        map every phrase exported by the first / second pass to `[score, 1]`.
    """
    # First pass: train a 2-word (bigram) phrase detector and freeze it into a phraser.
    bigram_phrases = gensim.models.phrases.Phrases(sentences, min_count=5, threshold=10)
    bigram = gensim.models.phrases.Phraser(bigram_phrases)

    # Second pass: train on the bigram-merged sentences so merged tokens can be joined again.
    # To go beyond 4-grams, stack a further Phrases pass on ngram[bigram[sentences]].
    ngram_phrases = gensim.models.phrases.Phrases(bigram[sentences], min_count=5, threshold=10)
    ngram = gensim.models.phrases.Phraser(ngram_phrases)

    # export_phrases() yields each phrase once with its score, so the second list
    # entry (an occurrence counter) is always 1; it is kept so the value layout
    # stays [score, count] for existing consumers.
    bigram_info = {
        phrase: [score, 1] for phrase, score in bigram_phrases.export_phrases().items()
    }
    ngram_info = {
        phrase: [score, 1] for phrase, score in ngram_phrases.export_phrases().items()
    }

    # The second-pass model was trained on bigram-merged input, so it has to be
    # applied to bigram-merged sentences as well; applying it to the raw tokens
    # would leave every first-pass bigram unmerged.
    ngrammed = [ngram[bigram[tokens]] for tokens in sentences]
    return ngrammed, bigram_info, ngram_info

# Tokenise each abstract on whitespace, then run the phrase detection on the token lists.
sentences = [row.split() for row in ab_arr]
ngram_abstracts, bigrams, ngrams = get_ngrams(sentences)

In [None]:
# Dict used purely for fast membership tests on the found concepts.
# NOTE(review): found_concept_list is not defined in any visible cell — it looks like
# leftover kernel state; confirm where it comes from before re-running.
found_concept_dict = {k:1 for k in found_concept_list}

In [None]:
# Count the detected bigrams and n-grams that are also keys of found_concept_dict.
bigram_hits = sum(1 for n in tqdm(bigrams.keys()) if n in found_concept_dict)
ngram_hits = sum(1 for n in tqdm(ngrams.keys()) if n in found_concept_dict)
cnt = bigram_hits + ngram_hits
cnt

100%|██████████| 30078/30078 [00:00<00:00, 1679530.79it/s]


100%|██████████| 43990/43990 [00:00<00:00, 1608945.49it/s]


9640

This gives us a list of 10565 ngrams which contain physics concepts as well as some natural language we are not interested in.

We now load a given list of suitable physics concepts:

In [None]:
def _read_concepts_underscored(path):
    """Read a one-column concept list; return the DataFrame and the concepts with spaces turned into '_'."""
    frame = pd.read_csv(path, names=["con"])
    joined = [concept.replace(" ", "_") for concept in frame["con"].to_numpy()]
    return frame, np.array(joined)


df_atomic, atomic_arr = _read_concepts_underscored('files/arxiv_atomic_concept.txt')
df_optic, optic_arr = _read_concepts_underscored('files/arxiv_optics_concept.txt')
df_quantum, quantum_arr = _read_concepts_underscored('files/arxiv_quantum_concept.txt')

# Sorted, de-duplicated union of the three underscore-joined concept lists.
all_concepts = np.concatenate((atomic_arr, optic_arr, quantum_arr))
concept_compare_arr = np.unique(all_concepts)

In [None]:
# Number of distinct entries in the quantum concept list.
np.unique(quantum_arr).shape

(26575,)

and do a naive preprocessing step, where we check if there are some direct matches:

In [None]:
# Split the detected n-grams into those that appear verbatim in the curated
# concept list and those that still need fuzzy matching.
ngram_keys = np.array(list(ngrams.keys()))

# A set gives constant-time membership tests; `c in ndarray` would rescan the
# whole concept array for every single n-gram.
known_concepts = set(concept_compare_arr.tolist())
pre_selec_inx = np.array([key in known_concepts for key in ngrams.keys()]).astype(int)

concept_pre_compare_arr = ngram_keys[pre_selec_inx == 1]
concept_continue_compare_arr = ngram_keys[pre_selec_inx == 0]
print("Found ",concept_pre_compare_arr.shape[0],"entries directly")
print("Check ",concept_continue_compare_arr.shape[0],"further")

# Drop remaining candidates that contain any of these symbols, then de-duplicate.
excluded_symbols = ['<', '>', '(', ')', '#', '~', '*', '=', '[', ']']
inx_arr = np.array([1 if any(symbol in concept for symbol in excluded_symbols) else 0
                    for concept in concept_continue_compare_arr])

concept_continue_compare_arr = np.unique(concept_continue_compare_arr[inx_arr == 0])


Found  5883 entries directly
Check  38108 further


In [None]:
from joblib import Parallel, delayed
# import numpy as np
# from tqdm import tqdm
# from fuzzywuzzy import fuzz

def process_concept(concept1, concepts2, threshold):
    """Find the first entry of `concepts2` that fuzzily matches `concept1`.

    Args:
        concept1: candidate n-gram string.
        concepts2: iterable of reference concept strings, scanned in order.
        threshold: minimum `fuzz.partial_token_sort_ratio` score that counts as a match.

    Returns:
        Always a 3-tuple `(concept1, matched_concept, score)`. When nothing
        matches, `matched_concept` is None and `score` is 0. Candidates that
        contain one of the excluded symbols, or that are 3 characters or
        shorter, are never matched.
    """
    excluded_symbols = ('<', '>', '(', ')', '#', '~', '*', '=', '[', ']')
    if any(char in concept1 for char in excluded_symbols):
        # Same 3-tuple shape as every other return, so callers can unpack uniformly.
        return concept1, None, 0

    # The length requirement does not depend on concept2, so check it once up front
    # instead of scoring every reference concept for a candidate that cannot match.
    if len(concept1) <= 3:
        return concept1, None, 0

    for concept2 in concepts2:
        current_ratio = fuzz.partial_token_sort_ratio(concept1, concept2)
        if current_ratio >= threshold:
            # First match wins; later reference concepts are not inspected.
            return concept1, concept2, current_ratio
    return concept1, None, 0

def find_overlapping_concepts(concepts1, concepts2, threshold):
    """Fuzzy-match every entry of `concepts1` against `concepts2` in parallel.

    Args:
        concepts1: candidate n-grams to classify.
        concepts2: reference concepts handed to `process_concept`.
        threshold: minimum similarity score passed on to `process_concept`.

    Returns:
        `(overlapping, non_overlapping)` as NumPy arrays. `overlapping` has
        shape (k, 3) with rows `(candidate, matched_concept, score)`, and stays
        two-dimensional with shape (0, 3) when nothing matched.
        `non_overlapping` is a 1-D array of the unmatched candidates.
    """
    # One process_concept call per candidate, spread over all available cores.
    results = Parallel(n_jobs=-1)(
        delayed(process_concept)(concept1, concepts2, threshold)
        for concept1 in tqdm(concepts1)
    )

    # Index instead of unpacking so a result without a score field cannot raise.
    overlapping_concepts = [tuple(result) for result in results if result[1] is not None]
    non_overlapping_concepts = [result[0] for result in results if result[1] is None]

    # reshape keeps the (k, 3) layout for k == 0, so column indexing such as
    # overlapping[:, 0] keeps working on an empty result.
    return np.array(overlapping_concepts).reshape(-1, 3), np.array(non_overlapping_concepts)

# Fuzzy-match the n-grams without a direct hit against the full concept list,
# using a similarity threshold of 95, and report the sizes of both result arrays.
overlapping_concepts, non_overlapping_concepts = find_overlapping_concepts(concept_continue_compare_arr, concept_compare_arr, threshold=95)
print(overlapping_concepts.shape, non_overlapping_concepts.shape)

100%|██████████| 38108/38108 [16:27<00:00, 38.59it/s]


(7719, 3) (30389,)


In [None]:
# Persist the phrase-merged abstracts (re-joined into strings), the union of directly
# matched and fuzzily matched n-grams, and the year / month arrays.
# NOTE(review): the first two paths are also written by the In [8] cell above, so
# whichever cell runs last determines the file contents.
np.save("files/ngram_abstracts.npy",[' '.join(ab) for ab in ngram_abstracts])
np.save("files/overlapping_concepts.npy",np.unique(np.concatenate((concept_pre_compare_arr, overlapping_concepts[:,0]))))
np.save("files/year_arr.npy",year_arr)
np.save("files/month_arr.npy",month_arr)

In [None]:
# def find_overlapping_concepts(concepts1, concepts2, threshold):
#     # concepts1 are the various ngrams that were extracted with no regard about physics
#     # concepts2 are the physics concepts we are in general interested in
#     # Goal of this function is to compare the overlap, at a given threshhold the concept is assumed to match
#     overlapping_concepts = []
#     non_overlapping_concepts = []

#     for concept1 in tqdm(concepts1):
#         match = False 
#         if any(char in concept1 for char in ['<', '>', '(', ')','#', '~', '*', '=',"[","]"]):
#             non_overlapping_concepts.append(concept1)
#             continue 

#         for concept2 in concepts2:

#             # Use fuzz ratio to measure similarity
            
#             similarity_ratio = fuzz.partial_token_sort_ratio(concept1, concept2)
           
#             # Adjust the threshold based on your requirements
#             if similarity_ratio >= threshold and len(concept1)>3:
#                 overlapping_concepts.append((concept1, concept2, similarity_ratio))
#                 match = True 
#                 break 
#         if not match:
#             non_overlapping_concepts.append(concept1) 

#     return np.array(overlapping_concepts), np.array(non_overlapping_concepts)


# overlapping_concepts, non_overlapping_concepts = find_overlapping_concepts(concept_continue_compare_arr,concept_compare_arr,threshold=90)
# overlapping_concepts.shape, non_overlapping_concepts.shape


In [None]:
# Print every fuzzy match whose matched reference concept (column 1) contains "matrix".
# A named loop variable is used so that IPython's `_` (last output) is not shadowed
# and the shared `cnt` counter is left untouched.
for match_row in overlapping_concepts:
    if "matrix" in match_row[1]:
        print(match_row)

['adjacency_laplacian_matrix' 'laplacian_matrix' '100']
['adjacency_matrix_model' 'adjacency_matrix' '100']
['approximated_matrix_product' 'matrix_product' '100']
['bistochastic_map' 'bistochastic_matrix' '97']
['complex_hadamard' 'complex_hadamard_matrix' '100']
['corner_transfer_matrix' 'transfer_matrix' '100']
['density_matrix_exponentiation' 'matrix_exponentiation' '100']
['densitymatrix_element' 'density_matrix' '96']
['densitymatrix_renormalization' 'density_matrix' '96']
['densitymatrix_renormalization_group' 'density_matrix' '96']
['densitymatrix_renormalization_group_dmrg' 'density_matrix' '96']
['densitymatrix_renormalizationgroup' 'density_matrix' '96']
['densitymatrix_simulation' 'density_matrix' '96']
['diagonal_element' 'diagonal_density_matrix_element' '100']
['diagonal_matrix_element' 'diagonal_density_matrix_element' '100']
['dipole_matrix_element' 'dipole_transition_matrix_element' '100']
['inclusive_scattering_matrix' 'scatter_matrix' '100']
['injective_matrix_produc

In [None]:
# Show 30 randomly drawn n-grams that found no fuzzy match (sampled with replacement, unseeded).
np.random.choice(non_overlapping_concepts,size=30)

array(['multilevel_atom', 'ohmic_spectral', 'box_trap',
       'significantly_fewer', 'significantly_improve',
       'purely_dispersive', 'sachdevyekitaev_syk_model',
       'exponential_growth', 'tightbinding_lattice',
       'depends_sensitively', 'achieving_highfidelity',
       'ubiquitous_nature', 'kicking_strength', 'array_fpga',
       'uniformly_bounded', 'outofplane_magnetic',
       'photonnumber_resolving', 'criticism_raised', 'nuclear_physic',
       'computed_analytically', 'diode_spad', 'singlephoton_pulse',
       'guarantee_existence', 'relatively_insensitive', 'otimes_otimes',
       'provide_pedagogical', 'timeevolving_block', 'markov_process',
       'despite_decade', 'quasiperiodic_disorder'], dtype='<U51')

In [None]:
# NOTE(review): `p` is an undefined name, so this cell raises NameError;
# presumably a leftover or a deliberate stop for "Run All" — confirm or remove.
p

NameError: name 'p' is not defined

In [None]:
# Write the fuzzily matched n-grams, sorted, into a two-column CSV in which both
# columns start out identical.
# df = pd.DataFrame(np.concatenate((np.sort(concept_continue_compare_arr).reshape(-1,1),np.sort(concept_continue_compare_arr).reshape(-1,1)),axis=-1))  
sorted_column = np.sort(overlapping_concepts[:, 0]).reshape(-1, 1)
df = pd.DataFrame(np.concatenate((sorted_column, sorted_column), axis=-1))
filepath = 'check_concept.csv'
df.to_csv(filepath, index=False)

# Takes the hand inspected .csv file and extracts concepts

In [None]:

# Load the reviewed two-column table: column 0 is the detected n-gram, column 1 the verdict.
# NOTE(review): the preceding cell writes 'check_concept.csv' while this one reads
# "my_excel_file.csv" — presumably a hand-edited copy; confirm the file name.
df_repl = pd.read_csv("my_excel_file.csv")
repl_arr = df_repl.to_numpy()

non_phys_arr = []      # n-grams whose second column is "-"
replace_phys_arr = {}  # n-gram -> differing second-column value to substitute
match_phys_arr = []    # n-grams whose two columns are identical
for tupel in repl_arr:
    if tupel[1] == "-":
        non_phys_arr.append(tupel[0])
    elif tupel[0] != tupel[1]:
        replace_phys_arr[tupel[0]] = tupel[1]
    else:
        match_phys_arr.append(tupel[1])

# Build fresh token lists rather than editing in place: list.copy() is shallow, so
# in-place edits on the inner lists would also rewrite ngram_abstracts and make
# this cell give different results when run a second time.
ngram_abstracts_repl = [
    [replace_phys_arr.get(token, token) for token in sublist]
    for sublist in ngram_abstracts
]
# Number of tokens that were substituted.
cnt = sum(
    1
    for sublist in ngram_abstracts
    for token in sublist
    if token in replace_phys_arr
)

np.save("files/ngram_abstracts_repl.npy",[' '.join(ab) for ab in ngram_abstracts_repl])
np.save("files/overlapping_save_concepts.npy",np.unique(np.concatenate((concept_pre_compare_arr,match_phys_arr,list(replace_phys_arr.values())))))
np.save("files/non_overlapping_save_concepts.npy",non_phys_arr)

In [None]:
# ngram_abstracts_repl = ngram_abstracts.copy()
# words_to_replace_set = set(non_overlapping_concepts)
# cnt = 0 
# for sublist in ngram_abstracts:
#     for i in range(len(sublist)):
#         if sublist[i] in words_to_replace_set:
#             sublist[i] = sublist[i].replace("_", " ")
#             cnt += 1 

# np.save("files/ngram_abstracts_repl.npy",[' '.join(ab) for ab in ngram_abstracts_repl])

In [None]:
# phys_conc = set(np.unique(np.concatenate((overlapping_concepts[:,0],concept_pre_compare_arr))))

# sortedns  = sorted( [(ngrams[b][0], ngrams[b][1], b) for b in ngrams.keys()] )[::-1]
# print("Top bigrams by certainty:")
# i = 0 
# cnt = 0 
# while cnt < 20:
#     if sortedns[i][2] in phys_conc:
#         print("{0:2}: {1:50} \t({2}) ".format(i+1, str(sortedns[i][2]), "%.2f"%sortedns[i][0]))
#         cnt += 1 
#     i+= 1

In [None]:
# phys_conc = set(np.unique(np.concatenate((overlapping_concepts[:,0],concept_pre_compare_arr))))

# sortedns  = sorted( [(ngrams[b][0], ngrams[b][1], b) for b in ngrams.keys()] )#[::-1]
# print("Least certain bigrams:")
# i = 0 
# cnt = 0 
# while cnt < 20:
#     if sortedns[i][2] in phys_conc:
#         print("{0:2}: {1:50} \t({2}) ".format(i+1, str(sortedns[i][2]), "%.2f"%sortedns[i][0]))
#         cnt += 1 
#     i+= 1