# Import Dependencies

In [1]:
import json
import pandas as pd
import numpy as np
import json
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from collections import Counter
from bert_embedding import BertEmbedding
from operator import itemgetter
import tensorflow as tf
import tensorflow_hub as hub
import spacy
import gensim.downloader as api
from nltk.corpus import wordnet

# Load Data

In [68]:
Title = []
Abstract = []
Url = []

path = 'C://Users//Abubakar//Desktop//articles//'
file_name = '_Article.json'

for i in range(1, 11):
    with open(path+str(i)+file_name, encoding="utf8") as json_file:
        temp_data = json.load(json_file)
        for j in temp_data:
            temp_title = j['Title']
            temp_abstract = j['Abstract']
            Title.append(temp_title)
            Abstract.append(temp_abstract)
            
df = pd.DataFrame()
df['Title'] = Title
df['Abstract'] = Abstract
df

Unnamed: 0,Title,Abstract
0,Rhombic ZnO nanosheets modified with Pd nanopa...,The rhombic ZnO nanosheets were prepared via a...
1,The efficient mixed matrix antifouling membran...,Membrane technology has raised considerable in...
2,Three-dimensional carbonate reservoir geomodel...,To better know the spatial distribution and ar...
3,Development of Pr2-xSrxCuO4±δ mixed ion-electr...,Mixed ionic-electronic conducting oxides Pr2-x...
4,Comparison of methods for preparation of 125I ...,Two procedures for fixing the 125I activity on...
...,...,...
44861,Production and optimization of high grade cell...,Production of high grade cellulolytic enzymes ...
44862,Feasibility of acetone–butanol–ethanol ferment...,The economic feasibility of acetone–butanol–et...
44863,Index,
44864,Maximizing renewable hydrogen production from ...,Biological production of hydrogen from biomass...


# Preprocessing Pipeline

In [51]:
def remove_stop_words(data):
    # Tokenize the input text and remove stopwords from the corpus
    stop_words = stopwords.words('english')
    lemmatizer = WordNetLemmatizer()
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
        if w not in stop_words and len(w) > 3:
            new_text = new_text + " " + lemmatizer.lemmatize(w)
    return new_text

def remove_punctuation(data):
    # Remove punctuations defined below from input text
    symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
    for i in range(len(symbols)):
        data = np.char.replace(data, symbols[i], ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data

def remove_apostrophe(data):
    # Remove apostrophe from the input text
    return np.char.replace(data, "'", "")

def convert_numbers(data):
    # Convert numbers to text form in input text
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        try:
            w = num2words(int(w))
        except:
            a = 0
        new_text = new_text + " " + w
    new_text = np.char.replace(new_text, "-", " ")
    return new_text

def get_bigrams(text):
    
    """Input
    ----------
    text : str or list of strings
    n    : number of word in each combination string ie if n = 2 the tokenization will happen in two word pairs
    
    Output
    -------
    tokens : The output would be a list of lists and each element list of the list will contain
             unigram and n_gram tokens. This functions can be modified for a range of grams but right now
             it will be best to use it with n = 2.
    """
    text = preprocess(text)
    bi_grams = ngrams(word_tokenize(text), 2)
    unigrams = word_tokenize(text)
    bigrams = [' '.join(grams) for grams in bi_grams]
    tokens = unigrams + bigrams
    return tokens

def preprocess(data):
    # Preprocess the input text
    data = data.lower()
    #data = remove_punctuation(data) #remove comma seperately
    #data = remove_apostrophe(data)
    data = remove_stop_words(data)
    #data = convert_numbers(data)
    #data = remove_punctuation(data)
    #data = convert_numbers(data)
    #data = remove_punctuation(data) #needed again as num2word is giving few hypens and commas fourty-one
    #data = remove_stop_words(data) #needed again as num2word is giving stop words 101 - one hundred and one
    return data

# Prprocess Data

In [61]:
def get_tokens(dataframe, column):
    tokens = []
    for i in dataframe[column]:
        tokens.append(get_bigrams(i))
    return tokens

# TFIDF Algorithm to Rank Articles Based on Search Keywords

This algorithms is designed such that it calculates a combined TFIDF score of both Title and Body of the an Article.

In [53]:
def doc_freq(word):
    c = 0
    try:
        c = DF[word]
    except:
        pass
    return c

In [54]:
def tfidf_scores(abstract, title):
    
    """given body and titles of the articles this funtion calculates the tfidf scores of the words in the text corpus.
       
       Input
       -------
       body         : Body or abstracts of the articles.
       title        : Titles of the articles
       
       Output
       -------
       tf_idf       : A dictionary of tf_idf scorese of the vocabulary.
       """
   
    
    N = len(abstract)
    DF = {}

    for i in range(N):
        tokens = abstract[i]
        for w in tokens:
            try:
                DF[w].add(i)
            except:
                DF[w] = {i}
    for i in DF:
        DF[i] = len(DF[i])
    
    total_vocab = [x for x in DF]

    doc = 0

    tf_idf = {}

    for i in range(N):
    
        tokens = abstract[i]
    
        counter = Counter(tokens + title[i])
        words_count = len(tokens + title[i])
    
        for token in np.unique(tokens):
        
            tf = counter[token]/words_count
            df = doc_freq(token)
            idf = np.log((N+1)/(df+1))
        
            tf_idf[doc, token] = tf*idf

        doc += 1
    return tf_idf

In [62]:
def tfidf_abstract_title_filter(query, abstract, title,  k):
    """given the query, body of articles, titles of articles, k (output articles threshold) and tfifd_method
      this funtion extract k number of articles that are most relevent to the query keywords.
       
       Input
       -------
       body         : Body or abstracts of the articles.
       title        : Titles of the articles
       
       Output
       -------
       tf_idf       : A dictionary of tf_idf scorese of the vocabulary."""
    
    
    tokens = get_bigrams(query)
    
    tf_idf = tfidf_scores(abstract, title)
    
    relevent_indices = []
    
    query_weights = {}

    for key in tf_idf:
        
        if key[1] in tokens:
            try:
                query_weights[key[0]] += tf_idf[key]
            except:
                query_weights[key[0]] = tf_idf[key]
    
    query_weights = sorted(query_weights.items(), key=lambda x: x[1], reverse=True)    
    
    for i in query_weights[:k]:
        relevent_indices.append(i[0])
        
    title = []
    abstract = []
    for i in relevent_indices:
        title.append(df.loc[i, 'Title'])
        abstract.append(df.loc[i, 'Abstract'])
    data = pd.DataFrame()
    data['Title'] = title
    data['Abstract'] = abstract

        
    return data

In [65]:
titles = get_tokens(df, "Title")
abstracts = get_tokens(df, "Abstract")

In [64]:
tfidf_abstract_title_filter("climate change", abstracts, titles,  10)

Unnamed: 0,Title,Abstract
0,Modelling impacts of climate change on arable ...,"Combining climate change, crop growth and crop..."
1,9: Expected effect of climate change on foulin...,"In the coming decades, the marine environment ..."
2,The mysteries of the diatoms,Understanding the physiology of these unique a...
3,Biomass sustainability and certification,The major challenges for humanity include ener...
4,Climate change as a threat to biodiversity: An...,Climate change and its consequences present on...
5,Major threats of pollution and climate change ...,Coastal zone is of great importance in the pro...
6,Mapping the impact of climate change on biomas...,Several climate parameters affect the growth o...
7,Comparison of chemical solvents for mitigating...,There is a growing concern about the effect of...
8,Determination of the life cycle climate change...,Life cycle assessments (LCAs) of algal biofuel...
9,Climate change and ocean acidification—Interac...,The possibilities for interactions between tox...


In [66]:
tfidf_abstract_title_filter("Genetic Engineering", abstracts, titles,  10)

Unnamed: 0,Title,Abstract
0,"Social, political, legal and ethical areas of ...",Biotechnology and genetic engineering are havi...
1,"Social, political, legal and ethical areas of ...",Biotechnology and genetic engineering are havi...
2,Engineering proteins to facilitate bioprocessing,Genetic engineering is now being applied to ai...
3,2: Basic Biochemistry,In this chapter the main biological molecules ...
4,Chapter 14: Genetic Approaches for Improving P...,Multiple microorganisms couple between fuel ox...
5,Genetic engineering of multispecies microbial ...,There is currently much interest in developing...
6,Synthetic Biology,Synthetic biology is an emerging discipline st...
7,3: Genetic Engineering Approaches for Trait De...,This chapter discusses genetic engineering of ...
8,Chapter 18: Role of Genetic Engineering in Bio...,Algae are considered as a potential third-gene...
9,Chapter 18: Role of Genetic Engineering in Bio...,Algae are considered as a potential third-gene...
