# Semantic Scholar Scraping for Summarisation Literature Review

<b>url:</b> http://s2-public-api-prod.us-west-2.elasticbeanstalk.com/corpus/download/

In [21]:
import gzip
import json
import re

from langdetect import DetectorFactory, detect
from tqdm import tqdm

In [2]:
# Read the corpus manifest into memory. The context manager closes the file
# handle as soon as the text has been read, even if the read itself fails.
with open("../data/sample/semantic scholar manifest 2020-03.txt", "r") as manifestHandle:
    manifestFile = manifestHandle.read()

# One manifest entry per line; keep only the entries whose name contains 's2'.
manifestFileList = [file for file in manifestFile.split('\n') if 's2' in file]

## Extract Data

In [4]:
# Path of the gzip-compressed corpus file that is decompressed by unzipS2Contents.
# The trailing comment keeps an alternative path that was used previously.
s2CorpusUrl = '../data/s2-corpus-000.gz'    #'s2-corpus/sample-S2-records.gz'

In [5]:
def unzipS2Contents(url):
    """Read a gzip-compressed UTF-8 text file and return its lines.

    Args:
        url: Path of the gzip file to read (anything ``gzip.open`` accepts).

    Returns:
        list of str: The decompressed text split on the newline character.
        A file that ends with a newline therefore yields a trailing empty
        string as its last element.

    Raises:
        OSError: If the file cannot be opened or is not valid gzip data.
        UnicodeDecodeError: If the decompressed bytes are not valid UTF-8.
    """
    # The context manager guarantees the handle is released even when read()
    # or decode() raises part-way through.
    with gzip.open(url, 'rb') as f:
        file_content = f.read().decode('utf-8')
    return file_content.split('\n')

In [6]:
%%time
# Decompress the whole corpus file into memory as a list of raw text lines.
# The records are kept as unparsed strings here; later cells call json.loads on them.
fileContentsList = unzipS2Contents(s2CorpusUrl)

Wall time: 56.3 s


In [7]:
# Number of lines read from the corpus file. unzipS2Contents splits on '\n',
# so a file ending in a newline contributes one extra empty string to this count.
len(fileContentsList)

999674

In [9]:
# Peek at one raw line to see which JSON fields a record carries.
fileContentsList[3]

'{"entities":[],"journalVolume":"","journalPages":"","pmid":"","fieldsOfStudy":["Physics"],"year":2015,"outCitations":["2497ed63572e8d5e5fe7945f0b23e0d090acd51c","03b317054274da28acfb2c8e082f38d7dcfdce04","070c58ff3d4f5ca3383c20a23af3594ae6e564ab","f9a1951720cafa3706b341c0d14ddd57d9c83043","26052227014c270c3f6013d98fdb8db1b80f8607","8de63b8021633e45585874468ff5fe4bfe3ee476","91d9b8d56ce67a90abfe0c9fc7483b8220ad3c66","5baead167bceac9bdcbd7ac808620bb8987da323","778f2e33cb7b0dfc0d3925df852fa4e576e75890","88a11402d59f026ae5cd93f044e0c038f4373d51","c2038b5d11a4dd9017d7a410b93f088f4dc8d1e4","fa1aff91383e227fc115fc0621fd7452ebca46ab"],"s2Url":"https://semanticscholar.org/paper/1b2f4e5be76a0a746b72110b447b42fffa046b5c","s2PdfUrl":"","id":"1b2f4e5be76a0a746b72110b447b42fffa046b5c","authors":[{"name":"Xiang Fa Liu","ids":["153201706"]},{"name":"Guodong Xia","ids":["46932503"]},{"name":"Guo-zhen Yang","ids":["50147063"]}],"journalName":"","paperAbstract":"Abstract Experimental investigations on t

In [80]:
# Lower-case terms that are compared against the individual words of each title
# in the keyword-matching cell.
# NOTE(review): 'summeries' looks like a deliberate misspelling (see the
# "spelling problems" remark); the correctly spelled 'summaries' is not in the
# list - confirm whether that omission is intended.
keyTermsList = ['summarisation', 'summarization', 'nlg', 'extractive', 'summeries']    # spelling problems...; removed automatic as it's too general when it matches by itself

In [66]:
%%time
# Process documents and keep only the english ones.
fileContentsListEnglish = []
fileCount = 1
noFilesToProcess = 100000

for file in fileContentsList:
    try:
        fileJSON = json.loads(file)

        if detect(fileJSON["title"]) == 'en':
            fileContentsListEnglish.append(file)
            
            if fileCount % (noFilesToProcess/10) == 0:
                print(f'File No.: {fileCount} - Year: {fileJSON["year"]} - Title: {fileJSON["title"]}')

            fileCount += 1   # only counting for english documents.
            
    except Exception as e:
        print(f'File No.: {fileCount} - ERROR: {e}')
        
    if fileCount == noFilesToProcess+1:
        break

File No.: 1962 - ERROR: No features in text.
File No.: 3408 - ERROR: No features in text.
File No.: 3591 - ERROR: No features in text.
File No.: 7111 - ERROR: No features in text.
File No.: 9483 - ERROR: No features in text.
File No.: 10000 - Year: 1989 - Title: Epidemiological approaches for identifying risk factors in human congenital malformations : Abstracts of Papers Presented at the 29th Annual Meeting of the Japanese Teratology Society, Yamagata, Japan July 13 - 14
File No.: 10129 - ERROR: No features in text.
File No.: 13320 - ERROR: No features in text.
File No.: 16084 - ERROR: No features in text.
File No.: 17267 - ERROR: No features in text.
File No.: 17351 - ERROR: No features in text.
File No.: 19752 - ERROR: No features in text.
File No.: 20000 - Year: 2009 - Title: Indian Adaptations in Flooded Regions of South America: Introduction
File No.: 22540 - ERROR: No features in text.
File No.: 24989 - ERROR: No features in text.
File No.: 25199 - ERROR: No features in text.
Fi

In [81]:
%%time
for file in fileContentsListEnglish:
    
    fileJSON = json.loads(file)
    
    keyTermsMatched = set(fileJSON["title"].lower().split(' ')).intersection(set(keyTermsList))
    
    if (0 < len(keyTermsMatched)) :
        print(f'\nDOC MATCHED: {fileJSON["title"]}\n')
    else:
#         print(fileJSON["title"])
        pass


DOC MATCHED: An Effective Joint Framework for Document Summarization


DOC MATCHED: Developments in Physical Chemistry and Basic Principles of Extractive Metallurgy in 1980


DOC MATCHED: Political and economic implications of extractive industries


DOC MATCHED: Belief as summarization and meta-support


DOC MATCHED: Effect of Sea Cucumber Extractive on Erythrocyte Membrane Fluidity in Rats with Overtraining Syndrome


DOC MATCHED: Automatic Generation of Summeries for the Web


DOC MATCHED: A video summarization approach based on the emulation of bottom-up mechanisms of visual attention


DOC MATCHED: Summarization Experiments in DUC 2004


DOC MATCHED: QA@INEX Track 2011: Question Expansion and Reformulation Using the REG Summarization System


DOC MATCHED: Summarization of news speech with unknown topic boundary


DOC MATCHED: Attend to the beginning: A study on using bidirectional attention for extractive summarization


DOC MATCHED: Topical Summarization on the Mayor and Schoolm

## SciBERT

- SciBERT: https://github.com/allenai/scibert
- Notebook: https://github.com/huggingface/transformers/blob/master/notebooks/02-transformers.ipynb

In [32]:
import torch
from transformers import *

In [26]:
# Load the SciBERT (scivocab, uncased) tokenizer and encoder by their hub
# identifier. Both calls use the same identifier so the vocabulary matches the weights.
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=313.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=227845.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442221694.0, style=ProgressStyle(descri…




In [33]:
# Turn gradient tracking off globally: the cells shown here only run forward
# passes through the model, so no gradients are needed.
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x22c25bcd860>

In [34]:
# Step-by-step encoding of one example title, to show what the tokenizer does.

# Tokenisation splits the input text into the sub-units of the model's vocabulary.
tokens = tokenizer.tokenize("Complementary Dual-Contact Switch Using Soft and Hard Contact Materials for Achieving Low Contact Resistance and High Reliability Simultaneously")
print(f"Tokens: {tokens}")

# The model takes integers as input, so map every token to its vocabulary id.
tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
print(f"Tokens id: {tokens_ids}")

# Add the special tokens the model requires around the sequence.
tokens_ids = tokenizer.build_inputs_with_special_tokens(tokens_ids)

# Wrap the ids in a list to add a batch dimension and convert to a PyTorch tensor.
tokens_pt = torch.tensor([tokens_ids])
print(f"Tokens PyTorch: {tokens_pt}")

# Now we're ready to go through BERT with our input.
# Index the result with [:2] instead of unpacking it directly: older
# transformers releases return a plain tuple, newer ones return a ModelOutput
# whose iteration yields field names rather than tensors. Integer/slice
# indexing gives (token-wise output, pooled output) in both cases.
outputs, pooled = model(tokens_pt)[:2]
print(f"Token wise output: {outputs.shape}, Pooled output: {pooled.shape}")

Tokens: ['complementary', 'dual', '-', 'contact', 'switch', 'using', 'soft', 'and', 'hard', 'contact', 'materials', 'for', 'achieving', 'low', 'contact', 'resistance', 'and', 'high', 'reliability', 'simultaneously']
Tokens id: [8487, 4793, 579, 3585, 6216, 487, 1720, 137, 2723, 3585, 2518, 168, 9153, 629, 3585, 2661, 137, 597, 4817, 5364]
Tokens PyTorch: tensor([[ 102, 8487, 4793,  579, 3585, 6216,  487, 1720,  137, 2723, 3585, 2518,
          168, 9153,  629, 3585, 2661,  137,  597, 4817, 5364,  103]])
Token wise output: torch.Size([1, 22, 768]), Pooled output: torch.Size([1, 768])


In [39]:
# Display the pooled output of the forward pass above (one 768-value vector
# for the single input sequence, per the shape printed by that cell).
pooled

tensor([[-0.0416, -0.1480, -0.6753,  0.8675,  0.2088,  0.9992,  0.2641, -0.9997,
          0.2043,  0.7729,  0.1298,  0.5405,  0.7846,  0.7665, -0.4069, -0.0907,
         -0.5304, -0.4133,  0.7156, -0.9106, -0.1457,  0.5916, -0.5221,  0.6518,
          0.1237,  0.4443, -0.9847, -0.1341, -0.2130,  0.4250,  0.4813, -0.9459,
          0.4612, -0.3520, -0.3998,  0.0332,  0.9989,  0.2931,  0.7156,  0.0493,
         -0.5442,  0.2990,  0.4944, -0.4461,  0.5150, -0.0930, -0.4712,  0.3879,
         -0.6986,  0.9627,  0.0752, -0.3971, -0.1057,  0.5300,  0.9873,  0.0485,
         -0.2699,  0.3184, -0.6361,  0.5775,  0.0217, -0.9774,  0.4412,  0.5084,
          0.5062, -0.4235, -0.2550, -0.3205,  0.9495,  0.2289,  0.9662,  0.3846,
          0.1071, -0.7250,  0.9554,  0.0102, -0.0994, -0.3838,  0.0567,  0.1527,
          0.2297,  0.1733,  0.0352, -0.9719, -0.5161, -0.4337, -0.0113, -0.3069,
          0.9672, -0.7773, -0.4541,  0.0539, -0.4561,  0.0442, -0.4799,  0.2286,
         -0.5568,  0.1368,  

In [35]:
# encode_plus performs the tokenize -> ids -> special tokens -> tensor steps of
# the previous cell in a single call and returns the model's keyword inputs.
tokens_pt2 = tokenizer.encode_plus("Complementary Dual-Contact Switch Using Soft and Hard Contact Materials for Achieving Low Contact Resistance and High Reliability Simultaneously", return_tensors="pt")

for key, value in tokens_pt2.items():
    print(f"{key}:\n\t{value}")

# Index with [:2] rather than unpacking the call directly so this works whether
# the model returns a plain tuple or a ModelOutput object.
outputs2, pooled2 = model(**tokens_pt2)[:2]

# Sum the absolute differences: with a signed sum, positive and negative
# deviations could cancel out and report 0 even when the tensors differ.
print("Difference with previous code: ({}, {})".format((outputs2 - outputs).abs().sum(), (pooled2 - pooled).abs().sum()))

input_ids:
	tensor([[ 102, 8487, 4793,  579, 3585, 6216,  487, 1720,  137, 2723, 3585, 2518,
          168, 9153,  629, 3585, 2661,  137,  597, 4817, 5364,  103]])
token_type_ids:
	tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
attention_mask:
	tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
Difference with previous code: (0.0, 0.0)
