In [1]:
import sent2vec
import numpy as np

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.tokenize import  RegexpTokenizer
from nltk.stem import WordNetLemmatizer 
from functools import lru_cache
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import matplotlib.ticker as mtick

from sqlitedict import SqliteDict
from tqdm import tqdm
import json

import string

import collections
from rouge_score import rouge_scorer

from collections import OrderedDict

import faiss

from rouge_score import scoring

from scipy.optimize import curve_fit
import scipy.interpolate as interp
from scipy.stats import f_oneway

import random

import matplotlib.pyplot as plt

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/yingqiang/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/yingqiang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/yingqiang/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
scinf_full_text_sections = SqliteDict('../SQLite/scinf-biomed-body-text-sections.sqlite')

In [3]:
scinf_global_pairs = SqliteDict('../SQLite/scinf-biomed-global-arguments.sqlite', autocommit=True)

In [4]:
scinf_local_pairs_indicator = SqliteDict('../SQLite/scinf-biomed-LAC-indicator.sqlite.sqlite', autocommit=True)

In [5]:
with open('../SQLite/paper_ids.json', 'r') as f:
    paper_ids = json.load(f)

In [6]:
method_strings = ['method', 'procedure', 'data', 'theory', 'theorem', 'implementation']
result_strings = ['result', 'outcome', 'analysis', 'measure', 'evaluation']
conclusion_strings = ['conclusion', 'summary', 'concluding', 'remark', 'key point']

### unigram precision

#### premises

In [7]:
section_pre_precisions = {"method":list(), "result":list(), "conclusion":list(), "other":list()}
section_pre_lengths = {"method":list(), "result":list(), "conclusion":list(), "other":list()}

for key in tqdm(paper_ids):
    
    try:
        local_premises = scinf_local_pairs_indicator[key]['premise']
        global_premises = scinf_global_pairs[key]['premise']

        local_front = scinf_local_pairs_indicator[key]['front']
        local_back = scinf_local_pairs_indicator[key]['back']

        sections = scinf_full_text_sections[key]['body_text']
        
        for section in sections:
            for pre in local_premises:
                if pre and pre in section['text']:
                    pre_common_words = list(set(word_tokenize(global_premises)) & set(word_tokenize("\n".join(pre) + "\n".join(local_front) + "\n".join(local_back))))
                    pre_common_words = [word.lower() for word in pre_common_words if word.lower() not in set(stopwords.words('english')) and word not in string.punctuation]
                    
                    local_pre_no_punc = [word.lower() for word in set(word_tokenize("\n".join(pre) + "\n".join(local_front) + "\n".join(local_back))) if word not in string.punctuation]
                    local_pre_no_punc = [word for word in local_pre_no_punc if word not in stopwords.words('english')]
                    pre_precision = len(pre_common_words) / len(local_pre_no_punc)

                    section_length_in_words = len(word_tokenize(section['text']))

                    if any(section_name in ''.join(section['section']).lower() for section_name in method_strings):
                        section_pre_precisions['method'].append(pre_precision)
                        section_pre_lengths['method'].append(section_length_in_words)
                        continue
                    elif any(section_name in ''.join(section['section']).lower() for section_name in result_strings):
                        section_pre_precisions['result'].append(pre_precision)
                        section_pre_lengths['result'].append(section_length_in_words)
                        continue
                    elif any(section_name in ''.join(section['section']).lower() for section_name in conclusion_strings):
                        section_pre_precisions['conclusion'].append(pre_precision)
                        section_pre_lengths['conclusion'].append(section_length_in_words)
                        continue
                    else:
                        section_pre_precisions['other'].append(pre_precision)
                        section_pre_lengths['other'].append(section_length_in_words)
                        continue
                else:
                    continue
                
    except KeyError:
        continue

100%|██████████| 27924/27924 [24:37<00:00, 18.90it/s]  


In [8]:
print(f"method: precision {np.mean(section_pre_precisions['method'])}, std. {np.std(section_pre_precisions['method'])}")
print(f"result: precision {np.mean(section_pre_precisions['result'])}, std. {np.std(section_pre_precisions['result'])}")
print(f"conclusion: precision {np.mean(section_pre_precisions['conclusion'])}, std. {np.std(section_pre_precisions['conclusion'])}")
print(f"other: precision {np.mean(section_pre_precisions['other'])}, std. {np.std(section_pre_precisions['other'])}")

method: precision 0.11449390562055678, std. 0.06350609961608047
result: precision 0.12828548288278405, std. 0.07142852547911879
conclusion: precision 0.12222056075190889, std. 0.0691167533556677
other: precision 0.11568171975723499, std. 0.061870236975449674


#### conclusions

In [9]:
section_con_precisions = {"method":list(), "result":list(), "conclusion":list(), "other":list()}
section_con_lengths = {"method":list(), "result":list(), "conclusion":list(), "other":list()}

for key in tqdm(paper_ids):
    #
    try:
        local_conclusions = scinf_local_pairs_indicator[key]['conclusion']
        global_conclusions = scinf_global_pairs[key]['conclusion']

        local_front = scinf_local_pairs_indicator[key]['front']
        local_back = scinf_local_pairs_indicator[key]['back']

        sections = scinf_full_text_sections[key]['body_text']
        
        for section in sections:
            for con in local_conclusions:
                if con and con in section['text']:
                    con_common_words = list(set(word_tokenize(global_conclusions)) & set(word_tokenize("\n".join(con) + "\n".join(local_front) + "\n".join(local_back))))
                    con_common_words = [word.lower() for word in con_common_words if word.lower() not in set(stopwords.words('english')) and word not in string.punctuation]
                    
                    #if len(con_common_words) != 0:
                    local_con_no_punc = [word.lower() for word in set(word_tokenize("\n".join(con) + "\n".join(local_front) + "\n".join(local_back))) if word not in string.punctuation]
                    local_con_no_punc = [word for word in local_con_no_punc if word not in stopwords.words('english')]
                    con_precision = len(con_common_words) / len(local_con_no_punc)
                    #else:
                        #continue

                    section_length_in_words = len(word_tokenize(section['text']))

                    if any(section_name in ''.join(section['section']).lower() for section_name in method_strings):
                        section_con_precisions['method'].append(con_precision)
                        section_con_lengths['method'].append(section_length_in_words)
                        continue
                    elif any(section_name in ''.join(section['section']).lower() for section_name in result_strings):
                        section_con_precisions['result'].append(con_precision)
                        section_con_lengths['result'].append(section_length_in_words)
                        continue
                    elif any(section_name in ''.join(section['section']).lower() for section_name in conclusion_strings):
                        section_con_precisions['conclusion'].append(con_precision)
                        section_con_lengths['conclusion'].append(section_length_in_words)
                        continue
                    else:
                        section_con_precisions['other'].append(con_precision)
                        section_con_lengths['other'].append(section_length_in_words)
                        continue
                else:
                    continue
                
    except KeyError:
        continue

100%|██████████| 27924/27924 [24:46<00:00, 18.78it/s] 


In [10]:
print(f"method: precision {np.mean(section_con_precisions['method'])}, std. {np.std(section_con_precisions['method'])}")
print(f"result: precision {np.mean(section_con_precisions['result'])}, std. {np.std(section_con_precisions['result'])}")
print(f"conclusion: precision {np.mean(section_con_precisions['conclusion'])}, std. {np.std(section_con_precisions['conclusion'])}")
print(f"other: precision {np.mean(section_con_precisions['other'])}, std. {np.std(section_con_precisions['other'])}")

method: precision 0.08114209798249236, std. 0.05229816307229417
result: precision 0.08512990396895426, std. 0.05471712854954438
conclusion: precision 0.10442115014288407, std. 0.06674194467369407
other: precision 0.08624275965657859, std. 0.05154516382528145


### bigram precision

#### conclusions

In [11]:
# compute precisions of bigrams in related sections

section_con_precisions = {"method":list(), "result":list(), "conclusion":list(), "other":list()}
section_con_lengths = {"method":list(), "result":list(), "conclusion":list(), "other":list()}

for key in tqdm(paper_ids):
    #
    try:
        local_conclusions = scinf_local_pairs_indicator[key]['conclusion']
        global_conclusions = scinf_global_pairs[key]['conclusion']

        local_front = scinf_local_pairs_indicator[key]['front']
        local_back = scinf_local_pairs_indicator[key]['back']

        sections = scinf_full_text_sections[key]['body_text']

        for section in sections:
            for con in local_conclusions:
                if con and con in section['text']:
                    
                    global_con_bigrams = nltk.bigrams(word_tokenize(global_conclusions))
                    local_con_bigrams = nltk.bigrams(word_tokenize("\n".join(con) + "\n".join(local_front) + "\n".join(local_back)))
        
                    global_con_bigrams = [bigram for bigram in global_con_bigrams]
                    local_con_bigrams = [bigram for bigram in local_con_bigrams]
        
                    con_common_bigrams = list(set(global_con_bigrams) & set(local_con_bigrams))
        
                    #if len(con_common_bigrams) != 0:
                    con_precision = len(con_common_bigrams) / len(local_con_bigrams)
                    #else:
                        #continue
            
                    section_length_in_words = len(word_tokenize(section['text']))
                    
                    if any(section_name in ''.join(section['section']).lower() for section_name in method_strings):
                        section_con_precisions['method'].append(con_precision)
                        section_con_lengths['method'].append(section_length_in_words)
                        continue

                    elif any(section_name in ''.join(section['section']).lower() for section_name in result_strings):
                        section_con_precisions['result'].append(con_precision)
                        section_con_lengths['result'].append(section_length_in_words)
                        continue
                    elif any(section_name in ''.join(section['section']).lower() for section_name in conclusion_strings):
                        section_con_precisions['conclusion'].append(con_precision)
                        section_con_lengths['conclusion'].append(section_length_in_words)
                        continue
                    else:
                        section_con_precisions['other'].append(con_precision)
                        section_con_lengths['other'].append(section_length_in_words)
                        continue
                else:
                    continue
    except KeyError:
        continue

100%|██████████| 27924/27924 [04:31<00:00, 102.91it/s]


In [12]:
print(f"method: precision {np.mean(section_con_precisions['method'])}, std. {np.std(section_con_precisions['method'])}")
print(f"result: precision {np.mean(section_con_precisions['result'])}, std. {np.std(section_con_precisions['result'])}")
print(f"conclusion: precision {np.mean(section_con_precisions['conclusion'])}, std. {np.std(section_con_precisions['conclusion'])}")
print(f"other: precision {np.mean(section_con_precisions['other'])}, std. {np.std(section_con_precisions['other'])}")

method: precision 0.021565040712589172, std. 0.027008427180399145
result: precision 0.023301509272271297, std. 0.029353950676067452
conclusion: precision 0.0302953680406899, std. 0.03657745762761869
other: precision 0.02203967592564201, std. 0.02530390036067794


#### premises

In [13]:
# compute precisions of bigrams in related sections

section_pre_precisions = {"method":list(), "result":list(), "conclusion":list(), "other":list()}
section_pre_lengths = {"method":list(), "result":list(), "conclusion":list(), "other":list()}

for key in tqdm(paper_ids):
    #
    try:
        local_premises = scinf_local_pairs_indicator[key]['premise']
        global_premises = scinf_global_pairs[key]['premise']

        local_front = scinf_local_pairs_indicator[key]['front']
        local_back = scinf_local_pairs_indicator[key]['back']

        sections = scinf_full_text_sections[key]['body_text']

        for section in sections:
            for pre in local_premises:
                if pre and pre in section['text']:
                    
                    global_pre_bigrams = nltk.bigrams(word_tokenize(global_premises))
                    local_pre_bigrams = nltk.bigrams(word_tokenize("\n".join(pre) + "\n".join(local_front) + "\n".join(local_back)))
        
                    global_pre_bigrams = [bigram for bigram in global_pre_bigrams]
                    local_pre_bigrams = [bigram for bigram in local_pre_bigrams]
        
                    pre_common_bigrams = list(set(global_pre_bigrams) & set(local_pre_bigrams))
        
                    #if len(local_pre_bigrams) != 0:
                    pre_precision = len(pre_common_bigrams) / len(local_pre_bigrams)
                    
                    section_length_in_words = len(word_tokenize(section['text']))
                    
                    if any(section_name in ''.join(section['section']).lower() for section_name in method_strings):
                        section_pre_precisions['method'].append(pre_precision)
                        section_pre_lengths['method'].append(section_length_in_words)
                        continue

                    elif any(section_name in ''.join(section['section']).lower() for section_name in result_strings):
                        section_pre_precisions['result'].append(pre_precision)
                        section_pre_lengths['result'].append(section_length_in_words)
                        continue
                    elif any(section_name in ''.join(section['section']).lower() for section_name in conclusion_strings):
                        section_pre_precisions['conclusion'].append(pre_precision)
                        section_pre_lengths['conclusion'].append(section_length_in_words)
                        continue
                    else:
                        section_pre_precisions['other'].append(pre_precision)
                        section_pre_lengths['other'].append(section_length_in_words)
                        continue
                else:
                    continue
    except KeyError:
        continue

100%|██████████| 27924/27924 [04:22<00:00, 106.18it/s]


In [14]:
print(f"method: precision {np.mean(section_pre_precisions['method'])}, std. {np.std(section_pre_precisions['method'])}")
print(f"result: precision {np.mean(section_pre_precisions['result'])}, std. {np.std(section_pre_precisions['result'])}")
print(f"conclusion: precision {np.mean(section_pre_precisions['conclusion'])}, std. {np.std(section_pre_precisions['conclusion'])}")
print(f"other: precision {np.mean(section_pre_precisions['other'])}, std. {np.std(section_pre_precisions['other'])}")

method: precision 0.030426325150739456, std. 0.029171707873956922
result: precision 0.035869280463603236, std. 0.03429453955434652
conclusion: precision 0.03321985859806792, std. 0.03611131788516471
other: precision 0.02955333565264894, std. 0.027970106894858123


### trigram precision

#### conclusions

In [15]:
# compute precisions of trigrams in related sections

section_con_precisions = {"method":list(), "result":list(), "conclusion":list(), "other":list()}
section_con_lengths = {"method":list(), "result":list(), "conclusion":list(), "other":list()}

for key in tqdm(paper_ids):
    #
    try:
        local_conclusions = scinf_local_pairs_indicator[key]['conclusion']
        global_conclusions = scinf_global_pairs[key]['conclusion']

        local_front = scinf_local_pairs_indicator[key]['front']
        local_back = scinf_local_pairs_indicator[key]['back']

        sections = scinf_full_text_sections[key]['body_text']

        for section in sections:
            for con in local_conclusions:
                if con and con in section['text']:
                    global_con_trigrams = nltk.ngrams(word_tokenize(global_conclusions), 3)
                    local_con_trigrams = nltk.ngrams(word_tokenize("\n".join(con) + "\n".join(local_front) + "\n".join(local_back)), 3)
        
                    global_con_trigrams = [trigram for trigram in global_con_trigrams]
                    local_con_trigrams = [trigram for trigram in local_con_trigrams]
        
                    con_common_trigrams = list(set(global_con_trigrams) & set(local_con_trigrams))
        
                    #if len(con_common_trigrams) >= 5:
                    con_precision = len(con_common_trigrams) / len(local_con_trigrams)
            
                    section_length_in_words = len(word_tokenize(section['text']))
                    
                    if any(section_name in ''.join(section['section']).lower() for section_name in method_strings):
                        section_con_precisions['method'].append(con_precision)
                        section_con_lengths['method'].append(section_length_in_words)
                        continue

                    elif any(section_name in ''.join(section['section']).lower() for section_name in result_strings):
                        section_con_precisions['result'].append(con_precision)
                        section_con_lengths['result'].append(section_length_in_words)
                        continue
                    elif any(section_name in ''.join(section['section']).lower() for section_name in conclusion_strings):
                        section_con_precisions['conclusion'].append(con_precision)
                        section_con_lengths['conclusion'].append(section_length_in_words)
                        continue
                    else:
                        section_con_precisions['other'].append(con_precision)
                        section_con_lengths['other'].append(section_length_in_words)
                        continue
                else:
                    continue
    except KeyError:
        continue

100%|██████████| 27924/27924 [04:28<00:00, 103.89it/s]


In [16]:
print(f"method: precision {np.mean(section_con_precisions['method'])}, std. {np.std(section_con_precisions['method'])}")
print(f"result: precision {np.mean(section_con_precisions['result'])}, std. {np.std(section_con_precisions['result'])}")
print(f"conclusion: precision {np.mean(section_con_precisions['conclusion'])}, std. {np.std(section_con_precisions['conclusion'])}")
print(f"other: precision {np.mean(section_con_precisions['other'])}, std. {np.std(section_con_precisions['other'])}")

method: precision 0.008853747902477243, std. 0.02214665310223368
result: precision 0.010098362322544538, std. 0.025127175236043926
conclusion: precision 0.016886950630968307, std. 0.03347478225399961
other: precision 0.009301942823263857, std. 0.020883408764756042


#### premises

In [17]:
# compute precisions of trigrams in related sections

section_pre_precisions = {"method":list(), "result":list(), "conclusion":list(), "other":list()}
section_pre_lengths = {"method":list(), "result":list(), "conclusion":list(), "other":list()}

for key in tqdm(paper_ids):
    #
    try:
        local_premises = scinf_local_pairs_indicator[key]['premise']
        global_premises = scinf_global_pairs[key]['premise']

        local_front = scinf_local_pairs_indicator[key]['front']
        local_back = scinf_local_pairs_indicator[key]['back']

        sections = scinf_full_text_sections[key]['body_text']

        
        for section in sections:
            for pre in local_premises:    
                if pre and pre in section['text']:
                    global_pre_trigrams = nltk.ngrams(word_tokenize(global_premises), 3)
                    local_pre_trigrams = nltk.ngrams(word_tokenize("\n".join(pre) + "\n".join(local_front) + "\n".join(local_back)), 3)
        
                    global_pre_trigrams = [trigram for trigram in global_pre_trigrams]
                    local_pre_trigrams = [trigram for trigram in local_pre_trigrams]
        
                    pre_common_trigrams = list(set(global_pre_trigrams) & set(local_pre_trigrams))
        
                    #if len(local_pre_trigrams) != 0:
                    pre_precision = len(pre_common_trigrams) / len(local_pre_trigrams)
                    #else:
                        #continue
            
                    section_length_in_words = len(word_tokenize(section['text']))
                    
                    if any(section_name in ''.join(section['section']).lower() for section_name in method_strings):
                        section_pre_precisions['method'].append(pre_precision)
                        section_pre_lengths['method'].append(section_length_in_words)
                        continue

                    elif any(section_name in ''.join(section['section']).lower() for section_name in result_strings):
                        section_pre_precisions['result'].append(pre_precision)
                        section_pre_lengths['result'].append(section_length_in_words)
                        continue
                    elif any(section_name in ''.join(section['section']).lower() for section_name in conclusion_strings):
                        section_pre_precisions['conclusion'].append(pre_precision)
                        section_pre_lengths['conclusion'].append(section_length_in_words)
                        continue
                    else:
                        section_pre_precisions['other'].append(pre_precision)
                        section_pre_lengths['other'].append(section_length_in_words)
                        continue
                else:
                    continue
    except KeyError:
        continue

100%|██████████| 27924/27924 [04:21<00:00, 106.71it/s]


In [18]:
print(f"method: precision {np.mean(section_pre_precisions['method'])}, std. {np.std(section_pre_precisions['method'])}")
print(f"result: precision {np.mean(section_pre_precisions['result'])}, std. {np.std(section_pre_precisions['result'])}")
print(f"conclusion: precision {np.mean(section_pre_precisions['conclusion'])}, std. {np.std(section_pre_precisions['conclusion'])}")
print(f"other: precision {np.mean(section_pre_precisions['other'])}, std. {np.std(section_pre_precisions['other'])}")

method: precision 0.010974766931433738, std. 0.020821531604121705
result: precision 0.013895545278966803, std. 0.02668431670893354
conclusion: precision 0.013490445610424985, std. 0.029228161405617473
other: precision 0.010520794733471365, std. 0.0199192219535281
