# Extract Abstracts of Academic Articles

Topics/Keywords: climate change, environment, sustainability, pollution, global warming, sea level rise, climate

## Import Packages

In [3]:
import Config as CON
import os
import sys
import random
import multiprocessing as mp
import ujson as json
import time
import numpy as np
import math
import csv
import matplotlib.pyplot as plt
import string
from langdetect import detect
import nltk
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.tokenize import word_tokenize

#nltk.download('tagsets')

#Module-level helpers shared by every cell below
def current_time_ms():
    """
    Return the current wall-clock time as a whole number of milliseconds
    output: int
    """
    return int(round(time.time() * 1000))

#English stopwords as a set, so membership tests during normalization are fast
stop_words = set(stopwords.words('english'))
#A single lemmatizer instance reused for every keyword
lemmatizer = WordNetLemmatizer()

## Normalize any Keyword

When we try to match keywords, we try to match them in their normalized form

In [2]:
def normalized_keyword(keyword):
    """
    Reduce a keyword to a canonical form so that spelling variants compare equal.
    The keyword is lower-cased, stripped of punctuation, tokenized, filtered of
    English stopwords and lemmatized; the surviving tokens are then glued together
    without any separator (e.g. "Machine Learning" -> "machinelearning").
    keyword: string
    output: string
    """
    #Translation table that deletes every character listed in string.punctuation
    punctuation_stripper = str.maketrans('', '', string.punctuation)
    cleaned = keyword.lower().translate(punctuation_stripper)

    #Tokenize, drop stopwords and lemmatize in a single pass
    kept_lemmas = (
        lemmatizer.lemmatize(tok)
        for tok in word_tokenize(cleaned)
        if tok not in stop_words
    )
    return "".join(kept_lemmas)

In [3]:
#How to apply normalization

print("Punctuations to be removed: ")
print(string.punctuation)
print("----------------------------\n")

#Normalize a handful of sample keywords, one result per line
for keyword in (
    "Machine Learning",
    "Human-Computer-Interaction",
    "Mig#23",
    "learning, interaction, and application",
):
    print(normalized_keyword(keyword))

#POS-tagging
print(nltk.pos_tag("I eat rice while playing video games".split(" ")))

#Details about the POS tags can be found here
#nltk.help.upenn_tagset()

Punctuations to be removed: 
!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
----------------------------

machinelearning
humancomputerinteraction
mig23
learninginteractionapplication
[('I', 'PRP'), ('eat', 'VBP'), ('rice', 'NN'), ('while', 'IN'), ('playing', 'VBG'), ('video', 'NNS'), ('games', 'NNS')]


## Define Search Keywords

In [4]:
#Human-readable topics we want to find among the paper keywords
search_keys = [
    "climate change",
    "sustainability",
    "pollution",
    "global warming",
    "sea level rise",
    "climate",
    "water stress",
    "coastal flooding",
]
#The same topics in normalized form; these are what the paper keywords are compared against
norm_search_keys = list(map(normalized_keyword, search_keys))
print(norm_search_keys)

['climatechange', 'sustainability', 'pollution', 'globalwarming', 'sealevelrise', 'climate', 'waterstress', 'coastalflooding']


## Filereader for Multiprocessing Environment

We want to process each input file independently first. Our goal is to produce, for each file, an independent list of abstracts that match the search keywords, considering only that local file.

<b>Input</b>: filename (absolute path), pid (process id for managing temporary files that will be used to merge the outputs)

<b>Output</b>: Tuples (keywords, abstract) in a json file <u>"tempAbstracts_[pid].json"</u>; total_papers: number of papers in this file that have an English abstract of 100 to 5000 characters; papers_with_keywords: number of those papers that have matching keywords

<b>JSON object</b>: {keywords: ["a","b"], abstract: "this will contain the paper abstract"}

In [5]:
def readOAGPaper(file_absolute_path, pid):
    """
    Scan one OAG paper file (one JSON object per line) and collect the papers whose
    keywords match the search keys.

    A paper is counted only if it has an abstract that is detected as English and whose
    length is between 100 and 5000 characters (inclusive). A counted paper matches when any
    entry of norm_search_keys is a substring of any of its normalized keywords.

    The matching papers are written, as a JSON list of
    {"keywords": [...], "abstract": "..."} objects, to
    CON.TEMP_DATA_DIRECTORY + "tempAbstracts_<pid>.json".

    file_absolute_path: string, path of the input file
    pid: int, index used to name the temporary output file
    output: tuple (total_papers, papers_with_keywords) where total_papers is the number of
            papers that passed the language/length filters and papers_with_keywords is the
            number of those that matched a search key
    """
    start = current_time_ms()
    outfilename = CON.TEMP_DATA_DIRECTORY + "tempAbstracts_"+str(pid)+".json"

    abstracts = []
    total_papers = 0
    papers_with_keywords = 0

    #The context manager releases the input handle even if a line fails to parse
    with open(file_absolute_path, mode="r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                #A blank line carries no paper and would make json.loads fail
                continue

            paper = json.loads(line)
            attrs = paper.keys()

            #Skip the paper if the abstract is not available
            if 'abstract' not in attrs:
                continue
            abstract = paper['abstract']

            #Discard papers of foreign language.
            #The 'lang' field of the dataset is not used: its accuracy is low,
            #so the language is detected from the abstract instead.
            #NOTE(review): langdetect is documented as non-deterministic unless
            #DetectorFactory.seed is set -- confirm whether reproducibility matters here.
            try:
                if detect(abstract) != 'en':
                    #Skip the paper if the language of abstract is not english
                    continue
            except Exception:
                #Skip the paper if the language of abstract cannot be detected
                #(Exception, not a bare except, so Ctrl-C / SystemExit still propagate)
                continue

            if len(abstract) < 100 or len(abstract) > 5000:
                continue

            ##Language test passed
            total_papers += 1

            #A null/missing keyword list is treated as "no keywords"
            for keyword in (paper.get('keywords') or []):
                norm_keyword = normalized_keyword(keyword)
                if any(search_key in norm_keyword for search_key in norm_search_keys):
                    abstracts.append({
                        "keywords": paper['keywords'],
                        "abstract": abstract,
                    })
                    papers_with_keywords += 1
                    #One matching keyword is enough; do not add the paper twice
                    break

    #Write the list of abstracts to the temporary json file.
    #The encoding is explicit because the file is read back as UTF-8.
    with open(outfilename, 'w', encoding='utf-8') as fout:
        json.dump(abstracts, fout)

    end = current_time_ms()

    print("Processing time [Process %d]: %d ms"%(pid, end-start))

    return (total_papers, papers_with_keywords)

## Multiprocessing Control Block

In [6]:
#Run readOAGPaper on every OAG paper file in parallel and sum the per-file counts
if __name__ == '__main__':
    global_start = current_time_ms()

    #makedirs also creates missing parent directories and tolerates an existing directory
    os.makedirs(CON.TEMP_DATA_DIRECTORY, exist_ok=True)

    #Define number of parallel processes
    num_workers = CON.NUM_POOLS

    #List OAG Paper Files
    print("Reading OAG Paper Files")
    print("-------------------------")
    CD = CON.INPUT_DATA_DIRECTORY + "OAG_Papers"
    #Sorted so that the file <-> temporary-file index mapping is the same on every run
    oag_paper_files = sorted((CD+"/"+f) for f in os.listdir(CD) if os.path.isfile(CD+"/"+f))
    print("Number of files: %d"%(len(oag_paper_files)))

    temp_abstract_files = [CON.TEMP_DATA_DIRECTORY+"tempAbstracts_"+str(i)+".json" for i in range(0,len(oag_paper_files))]

    temp_files = temp_abstract_files
    #Clear temporary files created by previous instance of this code
    for filename in temp_files:
        if os.path.exists(filename):
            print("Deleting "+filename)
            os.remove(filename)

    total_papers = 0
    papers_with_keywords = 0

    #The pool itself caps concurrency at num_workers, so every file can be submitted at
    #once; the index of a file doubles as the id of its temporary output file.
    #An empty input directory simply submits nothing.
    #Leaving the with-block shuts the worker processes down.
    with mp.Pool(processes=num_workers) as pool:
        pool_results = [
            pool.apply_async(readOAGPaper, args=(paper_file, index,))
            for index, paper_file in enumerate(oag_paper_files)
        ]

        #Wait for the termination; get() re-raises any exception raised in a worker
        for pool_result in pool_results:
            (tp, pwk) = pool_result.get()
            total_papers +=tp
            papers_with_keywords +=pwk

    pool_results.clear()

    print("All sub-processes terminated")
    print("Total papers: %d"%(total_papers))
    print("Papers with keywords: %d"%(papers_with_keywords))

Reading OAG Paper Files
-------------------------
Number of files: 15
Deleting /fred/oz130/comm_search/trend-analysis/Temp/tempAbstracts_0.json
Deleting /fred/oz130/comm_search/trend-analysis/Temp/tempAbstracts_1.json
Deleting /fred/oz130/comm_search/trend-analysis/Temp/tempAbstracts_2.json
Deleting /fred/oz130/comm_search/trend-analysis/Temp/tempAbstracts_3.json
Deleting /fred/oz130/comm_search/trend-analysis/Temp/tempAbstracts_4.json
Deleting /fred/oz130/comm_search/trend-analysis/Temp/tempAbstracts_5.json
Deleting /fred/oz130/comm_search/trend-analysis/Temp/tempAbstracts_6.json
Deleting /fred/oz130/comm_search/trend-analysis/Temp/tempAbstracts_7.json
Deleting /fred/oz130/comm_search/trend-analysis/Temp/tempAbstracts_8.json
Deleting /fred/oz130/comm_search/trend-analysis/Temp/tempAbstracts_9.json
Deleting /fred/oz130/comm_search/trend-analysis/Temp/tempAbstracts_10.json
Deleting /fred/oz130/comm_search/trend-analysis/Temp/tempAbstracts_11.json
Deleting /fred/oz130/comm_search/trend-a

## Read Back and Combine Results

In [8]:
#Merge the per-file temporary results into one json dataset
assert os.path.exists(CON.TEMP_DATA_DIRECTORY), "Temporary directory not found; run the multiprocessing cell first"
all_abstracts = []
count = 0
#Sorted so that the order of the combined dataset does not depend on the file system
for filename in sorted(os.listdir(CON.TEMP_DATA_DIRECTORY)):
    #Only the files produced by readOAGPaper are merged; sub-directories and
    #unrelated files in the temporary directory are ignored
    if not (filename.startswith("tempAbstracts_") and filename.endswith(".json")):
        continue
    with open(os.path.join(CON.TEMP_DATA_DIRECTORY, filename), encoding='utf-8') as data_file:
        data = json.loads(data_file.read())
        all_abstracts.extend(data)
        count += len(data)

#makedirs also creates missing parent directories and tolerates an existing directory
os.makedirs(CON.OUTPUT_DATA_DIRECTORY, exist_ok=True)

output_filename = os.path.join(CON.OUTPUT_DATA_DIRECTORY,"all_abstracts_with_keywords.json")

with open(output_filename, 'w', encoding='utf-8') as fout:
    json.dump(all_abstracts, fout)

print("Dataset collected successfully")
print("Number of articles %d\n"%len(all_abstracts))

Dataset collected successfully
Number of articles 234

