In [1]:

import matplotlib.pyplot as plt
import numpy as np

import pandas as pd
import re

from gensim.models.phrases import Phrases, Phraser

import gensim
from tqdm import tqdm
from thefuzz import fuzz

Our goal is now to take the preprocessed 148564 abstracts and identify the physics concepts within them.

In [2]:
df_arx = pd.read_csv('files/arxiv_stop.csv',names=["id","abstract","date", "s_indc"])
ab_arr = df_arx["abstract"].to_numpy()
df_arx["date"] = pd.to_datetime(df_arx["date"])
year_arr = df_arx['date'].dt.year.to_numpy()
month_arr = df_arx['date'].dt.month.to_numpy()

In [3]:
len(ab_arr)

148564

As an example, this is a abstract after the preprocessing:

In [4]:
np.random.choice(ab_arr)

'   a property of a system be call actual if the observation of the test that pertain to that property yield an affirmation with certainty we formalize the act of observation by assume that the outcome correlate with the state of the observed system and be codify as an actual property of the state of the observer at the end of the measurement interaction for an actual property the observe outcome have to affirm that property with certainty hence in this case the correlation need to be perfect a property be call classical if either the property or its negation be actual it be show by a diagonal argument that there exist classical property of an observer that he can not observe perfectly because state be identify with the collection of property that be actual for that state it follow that no observer can perfectly observe his own state implication for the quantum measurement problem be briefly discuss'

This is the part where the second important design decision is made: 
The word2vec model we want to use later can only handle individual words.
In order to be able to process physics abstracts, which might contain more than one individual word, we want to represent them as n-grams.
I.e. the processed text should contain "schroedinger_equation" and not "schroedinger","equation".

gensim has a function which extracts common n-grams (max:4) from the text, given a min_count of 10 and some threshold.


In [5]:
def get_ngrams(sentences):
    """ Detects n-grams with n up to 4, and replaces those in the abstracts. """
    # Train a 2-word (bigram) phrase-detector
    bigram_phrases = gensim.models.phrases.Phrases(sentences,min_count=10,threshold=15)
    
    # And construct a phraser from that (an object that will take a sentence
    # and replace in it the bigrams that it knows by single objects)
    bigram = gensim.models.phrases.Phraser(bigram_phrases)
    
    # Repeat that for trigrams; the input now are the bigrammed-titles
    ngram_phrases = gensim.models.phrases.Phrases(bigram[sentences],min_count=10,threshold=15)
    ngram         = gensim.models.phrases.Phraser(ngram_phrases)
    
    # !! If you want to have more than 4-grams, just repeat the structure of the
    #    above two lines. That is, train another Phrases on the ngram_phrases[titles],
    #    that will get you up to 8-grams. 
    
    # Now that we have phrasers for bi- and trigrams, let's analyze them
    # The phrases.export_phrases(x) function returns pairs of phrases and their
    # certainty scores from x.
    bigram_info = {}
    for b, score in bigram_phrases.export_phrases().items():
        bigram_info[b] = [score, bigram_info.get(b,[0,0])[1] + 1]
        
    ngram_info = {}
    for b, score in ngram_phrases.export_phrases().items():
        ngram_info[b] = [score, ngram_info.get(b,[0,0])[1] + 1]
            
    # Return a list of 'n-grammed' abtracts, and the bigram and trigram info
    return [ngram[t] for t in sentences], bigram_info, ngram_info

sentences = [row.split() for row in ab_arr]
ngram_abstracts, bigrams, ngrams = get_ngrams(sentences)

This gives us a list of 10565 ngrams which contain physics concepts as well as some natural language we are not interested in.

In [6]:
len(ngrams)

10565

In [7]:
np.random.choice(list(ngrams.keys()),size=10)

array(['representability_condition', 'non_prime', 'species_selective',
       'nnn_hopping', 'wigner_friend', 'off_resonantly', 'wave_packet',
       'self_kerr', 'nucleon_nucleon', 'secure_key_leasing'], dtype='<U54')

We now load a given list of suitable physics concepts:

In [8]:
df_atomic = pd.read_csv('files/arxiv_atomic_concept.txt',names=["con"])
atomic_arr = np.array([con.replace(" ", "_") for con in df_atomic["con"].to_numpy()])

df_optic = pd.read_csv('files/arxiv_optics_concept.txt',names=["con"])
optic_arr = np.array([con.replace(" ", "_") for con in df_optic["con"].to_numpy()])

df_quantum = pd.read_csv('files/arxiv_quantum_concept.txt',names=["con"])
quantum_arr = np.array([con.replace(" ", "_") for con in df_quantum["con"].to_numpy()])

concept_compare_arr = np.unique(np.concatenate((atomic_arr,optic_arr,quantum_arr)))

and do a naive preprecess, where we check if there are some direct matches:

In [9]:
pre_selec_inx = np.array([c in ngrams for c in concept_compare_arr]).astype(int)
concept_pre_compare_arr = concept_compare_arr[pre_selec_inx==1]
concept_continue_compare_arr = concept_compare_arr[pre_selec_inx==0]
print("Found ",concept_pre_compare_arr.shape[0],"entries directly") 
print("Check ",concept_continue_compare_arr.shape[0],"further")      

Found  2716 entries directly
Check  30703 further


The goal now is to find ngrams with an overlap with a concept which exceeds some threshold.

With this, we want to catch concepts which only differ in their grammar slightly, or contain an additional word.

For simplicity, I chose "partial_token_sort_ratio", which simply compares the letters in both strings to be compared. 
There probably exists a better choice here.



In [10]:
def find_overlapping_concepts(concepts1, concepts2, threshold=80):
    # concepts1 are the various ngrams that were extracted with no regard about physics
    # concepts2 are the physics concepts we are in general interested in
    # Goal of this function is to compare the overlap, at a given threshhold the concept is assumed to match
    overlapping_concepts = []
    non_overlapping_concepts = []

    for concept1 in concepts1:
        match = False 
        if "<" in concept1 or ">" in concept1:
            non_overlapping_concepts.append(concept1)
            continue 

        for concept2 in concepts2:

            # Use fuzz ratio to measure similarity
            
            similarity_ratio = fuzz.partial_token_sort_ratio(concept1, concept2)
           
            # Adjust the threshold based on your requirements
            if similarity_ratio >= threshold and len(concept1)>3:
                overlapping_concepts.append((concept1, concept2, similarity_ratio))
                match = True 
                break 
        if not match:
            non_overlapping_concepts.append(concept1) 

    return np.array(overlapping_concepts), np.array(non_overlapping_concepts)

# # Example usage:
concept_list1 = ["quantum_phase_transition", "electron", "other_concept","other_concept_<"]
concept_list2 = ["phase_transition", "electron_hole", "another_concept"]

result = find_overlapping_concepts(concept_list1, concept_list2)
print("Overlapping Concepts:", result[0])
print("Non-Overlapping Concepts:", result[1])


Overlapping Concepts: [['quantum_phase_transition' 'phase_transition' '81']
 ['electron' 'electron_hole' '100']]
Non-Overlapping Concepts: ['other_concept' 'other_concept_<']


In [11]:
# runs in about 6 minutes 
check_arr = np.array(list(ngrams.keys()))
overlapping_concepts, non_overlapping_concepts = find_overlapping_concepts(check_arr,concept_continue_compare_arr,threshold=80)
overlapping_concepts.shape, non_overlapping_concepts.shape

((8925, 3), (1640,))

Print a few overlapping concepts

In [12]:
for cnt,_ in enumerate(overlapping_concepts):
    print(_)
    if cnt == 30:
        break

['relationship_between' 'non_linear_relationship' '82']
['truth_value' 'ct_value' '88']
['arrive_at' 'arrival_time_delay' '80']
['relative_entropy' 'entropy_equation' '80']
['density_matrix' 'coefficient_matrix' '83']
['per_unit' 'circular_unit' '86']
['fascinating_phenomenon' 'gibbs_phenomenon' '86']
['basic_idea' 'adiabatic_demagnetization' '80']
['arise_naturally' 'natural_bias' '83']
['travel_wave' 'alfven_wave' '80']
['lamb_dicke_limit' 'band_limit' '82']
['jaynes_cumming_model' 'abc_model' '80']
['rotate_wave' 'achromatic_half_wave_plate' '84']
['rotate_wave_approximation' 'edge_wave' '80']
['non_rwa' 'energy_transfer_phenomenon' '86']
['spin_dependent' 'dependent_source' '88']
['trap_ion' 'adiabatic_population_transfer' '88']
['difference_between' 'absolute_difference' '84']
['information_processing' 'advanced_quantum_information_processing' '100']
['quantum_information_processing'
 'advanced_quantum_information_processing' '100']
['time_dependent' 'dependent_source' '83']
['har

Print a few non-overlapping concepts

In [13]:
np.random.choice(non_overlapping_concepts,size=30)

array(['bisognano_wichmann', 'not_necessarily', 'critically_examine',
       'worst_case_scenario', 'fulde_ferrell_<bra>',
       'realignment_criterion', 'down_convert', 'gallium_arsenide',
       'almost_all', 'most_important', 'su_<bra>', 'weakly_perturb',
       'd_=', 'merlin_arthur', 'freely_expand', 'ramsey_fringe',
       'leggett_type', 'do_not_contribute', 'affleck_kennedy_lieb_tasaki',
       'most_importantly', 'few_dozen', 'loschmidt_echo_<bra>', 'i_d',
       'closely_analogous', 'delegate_her', 'year_ago', '<num>_hz',
       'liu_et_al', 'the_aharonov_casher', 'rigorously_prove'],
      dtype='<U48')

In [14]:
np.save("files/overlapping_save_concepts.npy",np.unique(np.concatenate((overlapping_concepts[:,0],concept_pre_compare_arr))))
np.save("files/non_overlapping_save_concepts.npy",non_overlapping_concepts)

This gives us 9000 ngrams of which we now assume that they are valid physics concepts.

In [15]:
np.unique(np.concatenate((overlapping_concepts[:,0],concept_pre_compare_arr))).shape

(9000,)

In [16]:
ngram_abstracts_repl = ngram_abstracts.copy()
words_to_replace_set = set(non_overlapping_concepts)
cnt = 0 
for sublist in ngram_abstracts:
    for i in range(len(sublist)):
        if sublist[i] in words_to_replace_set:
            sublist[i] = sublist[i].replace("_", " ")
            cnt += 1 

We remove the non_overlapping_concepts by replacing "_" with " " in 9827 cases.

In [17]:
cnt

112534

In [18]:
np.save("files/ngram_abstracts_repl.npy",[' '.join(ab) for ab in ngram_abstracts_repl])

Print some n-grams to check results

In [19]:
phys_conc = set(np.unique(np.concatenate((overlapping_concepts[:,0],concept_pre_compare_arr))))

sortedns  = sorted( [(ngrams[b][0], ngrams[b][1], b) for b in ngrams.keys()] )[::-1]
print("Top bigrams by certainty:")
i = 0 
cnt = 0 
while cnt < 20:
    if sortedns[i][2] in phys_conc:
        print("{0:2}: {1:50} \t({2}) ".format(i+1, str(sortedns[i][2]), "%.2f"%sortedns[i][0]))
        cnt += 1 
    i+= 1



Top bigrams by certainty:
 1: whisper_gallery                                    	(49019608.00) 
 3: fabry_perot                                        	(20610517.00) 
 6: gell_mann                                          	(14854426.67) 
 7: retro_reflecte                                     	(13368984.00) 
10: henon_heile                                        	(11140820.00) 
11: vice_versa                                         	(10787143.17) 
16: jaynes_cumming                                     	(6183155.10) 
18: majumdar_ghosh                                     	(6127451.00) 
22: diffie_hellman                                     	(3713606.67) 
23: poschl_teller                                      	(3416518.13) 
24: reissner_nordstrom                                 	(3342246.00) 
27: coarse_graining                                    	(3062297.19) 
28: korteweg_de_vrie                                   	(2847098.44) 
29: randall_sundrum                                    	(2

In [20]:
phys_conc = set(np.unique(np.concatenate((overlapping_concepts[:,0],concept_pre_compare_arr))))

sortedns  = sorted( [(ngrams[b][0], ngrams[b][1], b) for b in ngrams.keys()] )#[::-1]
print("Least certain bigrams:")
i = 0 
cnt = 0 
while cnt < 20:
    if sortedns[i][2] in phys_conc:
        print("{0:2}: {1:50} \t({2}) ".format(i+1, str(sortedns[i][2]), "%.2f"%sortedns[i][0]))
        cnt += 1 
    i+= 1



Least certain bigrams:
 1: three_partite                                      	(15.00) 
 3: security_guarantee                                 	(15.00) 
 4: wigner_friend_scenario                             	(15.00) 
 5: micro_ring_resonator                               	(15.01) 
 6: relation_between                                   	(15.01) 
 7: maximally_entangle_state                           	(15.01) 
 8: relativistic_covariance                            	(15.01) 
 9: post_selection_procedure                           	(15.02) 
10: highly_entangled                                   	(15.02) 
12: singular_value_transformation                      	(15.03) 
13: majorana_bind                                      	(15.03) 
14: phase_shift                                        	(15.03) 
15: tripartite_ghz                                     	(15.03) 
18: completely_destroy                                 	(15.05) 
20: entanglement_witness                               	(15.05) 
21