# Import Dependencies

In [1]:
import json
import pandas as pd
import numpy as np
import json
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from collections import Counter
from bert_embedding import BertEmbedding
from operator import itemgetter
import tensorflow as tf
import tensorflow_hub as hub
import spacy
import gensim.downloader as api
from nltk.corpus import wordnet
import num2words

# Load Data

In [2]:
# Accumulators for the fields read from every article file.
Title = []
Abstract = []
Url = []

path = 'C://Users//Abubakar//Desktop//articles//'
file_name = '_Article.json'

# The files are named 1_Article.json through 10_Article.json; each one holds
# a list of records with 'Title' and 'Abstract' keys.
for i in range(1, 11):
    with open(path + str(i) + file_name, encoding="utf8") as json_file:
        temp_data = json.load(json_file)
    for j in temp_data:
        temp_title, temp_abstract = j['Title'], j['Abstract']
        Title.append(temp_title)
        Abstract.append(temp_abstract)

# One row per article, columns in the order Title, Abstract.
df = pd.DataFrame({'Title': Title, 'Abstract': Abstract})
df

Unnamed: 0,Title,Abstract
0,Rhombic ZnO nanosheets modified with Pd nanopa...,The rhombic ZnO nanosheets were prepared via a...
1,The efficient mixed matrix antifouling membran...,Membrane technology has raised considerable in...
2,Three-dimensional carbonate reservoir geomodel...,To better know the spatial distribution and ar...
3,Development of Pr2-xSrxCuO4±δ mixed ion-electr...,Mixed ionic-electronic conducting oxides Pr2-x...
4,Comparison of methods for preparation of 125I ...,Two procedures for fixing the 125I activity on...
...,...,...
44861,Production and optimization of high grade cell...,Production of high grade cellulolytic enzymes ...
44862,Feasibility of acetone–butanol–ethanol ferment...,The economic feasibility of acetone–butanol–et...
44863,Index,
44864,Maximizing renewable hydrogen production from ...,Biological production of hydrogen from biomass...


# Data Cleaning Pipeline


In [3]:
def remove_stop_words(data):
    """Drop English stop words and short tokens, lemmatizing what remains.

    The input is coerced with ``str`` and tokenized with ``word_tokenize``.
    A token is kept only when it is not in NLTK's English stop-word list
    (exact match, no case folding is done here) and is longer than three
    characters. Kept tokens are lemmatized with ``WordNetLemmatizer``.

    Parameters
    ----------
    data : object
        Text to clean; anything accepted by ``str``.

    Returns
    -------
    str
        The kept lemmas, each preceded by a single space (so a non-empty
        result starts with a space), or ``""`` when nothing is kept.
    """
    # A set makes each membership test O(1) instead of scanning the list.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    kept = [
        lemmatizer.lemmatize(w)
        for w in word_tokenize(str(data))
        if w not in stop_words and len(w) > 3
    ]
    # Every lemma keeps its leading space so the output matches the format
    # callers already receive.
    return "".join(" " + lemma for lemma in kept)

def remove_punctuation(data):
    """Replace punctuation characters with spaces and delete commas.

    Each character of the symbol list below (ASCII punctuation plus the
    newline) is replaced by a single space; after every substitution one
    pass collapses double spaces into single ones. Commas are removed
    last, without leaving a space. Apostrophes are not touched here.

    Parameters
    ----------
    data : str or array of str
        Text to clean; it is handed to ``np.char.replace``.

    Returns
    -------
    numpy.ndarray
        The cleaned text as returned by ``np.char.replace`` (a 0-d string
        array for a plain ``str`` input; wrap in ``str()`` to get a str).
    """
    # The backslash is written as an explicit escape. The previous literal
    # relied on an invalid escape sequence before the closing bracket, which
    # newer Python versions warn about and will eventually reject.
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    for symbol in symbols:
        data = np.char.replace(data, symbol, ' ')
        # Collapse the double spaces the substitution may have created.
        data = np.char.replace(data, "  ", " ")
    # Commas are dropped outright rather than turned into spaces.
    data = np.char.replace(data, ',', '')
    return data

def remove_apostrophe(data):
    """Delete every ASCII apostrophe (') from *data*.

    The result is whatever ``np.char.replace`` returns: a string array
    (0-d for a plain ``str`` input).
    """
    apostrophe, nothing = "'", ""
    stripped = np.char.replace(data, apostrophe, nothing)
    return stripped

def convert_numbers(data):
    """Spell out integer tokens in the input text.

    The input is coerced with ``str`` and tokenized with ``word_tokenize``.
    Every token that ``int`` can parse is replaced by its ``num2words``
    spelling; all other tokens are kept unchanged. Hyphens in the final
    text are then turned into spaces.

    Parameters
    ----------
    data : object
        Text to convert; anything accepted by ``str``.

    Returns
    -------
    numpy.ndarray
        The converted text as returned by ``np.char.replace`` (a 0-d string
        array). Each token is preceded by a single space.
    """
    # Import the callable locally. The file-level ``import num2words`` binds
    # the module, and calling a module raises TypeError; the old bare
    # ``except`` hid that error, so no number was ever converted.
    from num2words import num2words as to_words

    pieces = []
    for token in word_tokenize(str(data)):
        try:
            token = to_words(int(token))
        except (ValueError, OverflowError):
            # Not an integer literal, or too large to spell out: keep as is.
            pass
        pieces.append(" " + token)
    return np.char.replace("".join(pieces), "-", " ")

def get_bigrams(text):
    """Return the unigram and bigram tokens of a preprocessed text.

    Input
    ----------
    text : str
        Raw text. It is passed through ``preprocess`` (which calls
        ``.lower()`` on it) before being tokenized.

    Output
    -------
    tokens : list of str
        A flat list: every unigram in order, followed by every bigram of
        adjacent unigrams, each bigram joined with a single space.
    """
    # Tokenize once and derive the bigrams from the same token list.
    unigrams = word_tokenize(preprocess(text))
    bigrams = [' '.join(pair) for pair in ngrams(unigrams, 2)]
    return unigrams + bigrams

def preprocess(data):
    """Lower-case *data* and run it through ``remove_stop_words``.

    Only these two steps are active. Punctuation removal, apostrophe
    removal and number conversion are deliberately not applied here.
    """
    lowered = data.lower()
    return remove_stop_words(lowered)

# Preprocess the Data for NER Filter

In [4]:
def get_tokens(dataframe, column):
    """Apply ``get_bigrams`` to every entry of ``dataframe[column]``.

    Returns a list with one ``get_bigrams`` result per entry, in the same
    order as the column.
    """
    return [get_bigrams(entry) for entry in dataframe[column]]

In [5]:
# Tokenize every title and every abstract into its unigram + bigram list.
titles = get_tokens(df, "Title")
abstracts = get_tokens(df, "Abstract")

# Filter the Articles if there's a Geopolitical Entity

We can filter articles on the basis of any named-entity type provided by spaCy's NER API.

In [6]:
def nlp_filter_regional(dataframe, column, NER_tag):
    '''Split articles by whether spaCy finds a given entity type in a column.

       Each entry of ``dataframe[column]`` is cleaned with
       ``remove_stop_words`` and tokenized; spaCy's ``en_core_web_sm`` model
       is then run on every remaining token separately, and the labels of all
       entities found are collected for that article.

       Input
       -------
       dataframe    : A pandas data frame. It must contain 'Title' and
                      'Abstract' columns, which are copied to the outputs.
       column       : Column of the data frame whose text is inspected.
       NER_tag      : spaCy entity label to look for (e.g. "GPE").

       Output
       -------
       data         : Data frame ('Title', 'Abstract') of the articles in
                      which NER_tag was NOT found.
       trashed_data : Data frame ('Title', 'Abstract') of the articles in
                      which NER_tag was found.

       Both outputs carry a fresh 0..n-1 index.'''

    nlp = spacy.load('en_core_web_sm') # python -m spacy download en_core_web_sm

    kept_positions = []
    trashed_positions = []
    # Iterate by position so the function also works on frames whose index
    # is not 0..n-1 (slices, filtered frames, ...).
    for position, document in enumerate(dataframe[column]):
        tokens = word_tokenize(remove_stop_words(document))
        # NOTE: the model sees one token at a time, not the full sentence.
        labels = {
            entity.label_
            for token in tokens
            for entity in nlp(token).ents
        }
        if NER_tag in labels:
            trashed_positions.append(position)
        else:
            kept_positions.append(position)

    def _select(positions):
        # Copy the selected rows from the frame that was passed in, not from
        # a module-level one.
        return pd.DataFrame({
            'Title': [dataframe['Title'].iloc[p] for p in positions],
            'Abstract': [dataframe['Abstract'].iloc[p] for p in positions],
        })

    return _select(kept_positions), _select(trashed_positions)


In [7]:
# Run the filter on the first 100 rows only, looking for the "GPE" label in the titles.
filtered_docs, trashed_docs = nlp_filter_regional(df[:100], "Title", "GPE")

In [18]:
# Show the title stored under index label 4 of the discarded articles.
trashed_docs["Title"][4]

'Algae community response to climate change and nutrient loading recorded by sedimentary phytoplankton pigments in the Changtan Reservoir, China'