# Import Dependencies 

In [1]:
import json
import pandas as pd
import numpy as np
import json
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from collections import Counter
from bert_embedding import BertEmbedding
from operator import itemgetter
import tensorflow as tf
import tensorflow_hub as hub
import spacy
import gensim.downloader as api
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Load Data

In [2]:
Title = []
Abstract = []
Url = []

path = 'C://Users//Abubakar//Desktop//articles//'
file_name = '_Article.json'

for i in range(1, 11):
    with open(path+str(i)+file_name, encoding="utf8") as json_file:
        temp_data = json.load(json_file)
        for j in temp_data:
            temp_title = j['Title']
            temp_abstract = j['Abstract']
            Title.append(temp_title)
            Abstract.append(temp_abstract)
            
df = pd.DataFrame()
df['Title'] = Title
df['Abstract'] = Abstract
df

Unnamed: 0,Title,Abstract
0,Rhombic ZnO nanosheets modified with Pd nanopa...,The rhombic ZnO nanosheets were prepared via a...
1,The efficient mixed matrix antifouling membran...,Membrane technology has raised considerable in...
2,Three-dimensional carbonate reservoir geomodel...,To better know the spatial distribution and ar...
3,Development of Pr2-xSrxCuO4±δ mixed ion-electr...,Mixed ionic-electronic conducting oxides Pr2-x...
4,Comparison of methods for preparation of 125I ...,Two procedures for fixing the 125I activity on...
...,...,...
44861,Production and optimization of high grade cell...,Production of high grade cellulolytic enzymes ...
44862,Feasibility of acetone–butanol–ethanol ferment...,The economic feasibility of acetone–butanol–et...
44863,Index,
44864,Maximizing renewable hydrogen production from ...,Biological production of hydrogen from biomass...


# Define Data Cleaning Pipeline

In [3]:
def remove_stop_words(data):
    # Tokenize the input text and remove stopwords from the corpus
    stop_words = stopwords.words('english')
    lemmatizer = WordNetLemmatizer()
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
        if w not in stop_words and len(w) > 3:
            new_text = new_text + " " + lemmatizer.lemmatize(w)
    return new_text

def remove_punctuation(data):
    # Remove punctuations defined below from input text
    symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
    for i in range(len(symbols)):
        data = np.char.replace(data, symbols[i], ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data

def remove_apostrophe(data):
    # Remove apostrophe from the input text
    return np.char.replace(data, "'", "")

def convert_numbers(data):
    # Convert numbers to text form in input text
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        try:
            w = num2words(int(w))
        except:
            a = 0
        new_text = new_text + " " + w
    new_text = np.char.replace(new_text, "-", " ")
    return new_text

def get_bigrams(text):
    
    """Input
    ----------
    text : str or list of strings
    n    : number of word in each combination string ie if n = 2 the tokenization will happen in two word pairs
    
    Output
    -------
    tokens : The output would be a list of lists and each element list of the list will contain
             unigram and n_gram tokens. This functions can be modified for a range of grams but right now
             it will be best to use it with n = 2.
    """
    text = preprocess(text)
    bi_grams = ngrams(word_tokenize(text), 2)
    unigrams = word_tokenize(text)
    bigrams = [' '.join(grams) for grams in bi_grams]
    tokens = unigrams + bigrams
    return tokens

def preprocess(data):
    # Preprocess the input text
    data = data.lower()
    #data = remove_punctuation(data) #remove comma seperately
    #data = remove_apostrophe(data)
    data = remove_stop_words(data)
    #data = convert_numbers(data)
    #data = remove_punctuation(data)
    #data = convert_numbers(data)
    #data = remove_punctuation(data) #needed again as num2word is giving few hypens and commas fourty-one
    #data = remove_stop_words(data) #needed again as num2word is giving stop words 101 - one hundred and one
    return data

In [4]:
def get_tokens(dataframe, column):
    tokens = []
    for i in dataframe[column]:
        tokens.append(get_bigrams(i))
    return tokens

# Define TFIDF Algorithm

In [5]:
def doc_freq(word):
    c = 0
    try:
        c = DF[word]
    except:
        pass
    return c

In [6]:
def tfidf_scores(abstract, title):
    
    """given body and titles of the articles this funtion calculates the tfidf scores of the words in the text corpus.
       
       Input
       -------
       body         : Body or abstracts of the articles.
       title        : Titles of the articles
       
       Output
       -------
       tf_idf       : A dictionary of tf_idf scorese of the vocabulary.
       """
   
    
    N = len(abstract)
    DF = {}

    for i in range(N):
        tokens = abstract[i]
        for w in tokens:
            try:
                DF[w].add(i)
            except:
                DF[w] = {i}
    for i in DF:
        DF[i] = len(DF[i])
    
    total_vocab = [x for x in DF]

    doc = 0

    tf_idf = {}

    for i in range(N):
    
        tokens = abstract[i]
    
        counter = Counter(tokens + title[i])
        words_count = len(tokens + title[i])
    
        for token in np.unique(tokens):
        
            tf = counter[token]/words_count
            df = doc_freq(token)
            idf = np.log((N+1)/(df+1))
        
            tf_idf[doc, token] = tf*idf

        doc += 1
    return tf_idf

In [7]:
def tfidf_abstract_title_filter(query, abstract, title):
    """given the query, body of articles, titles of articles and tfifd_method
      this funtion extract k number of articles that are most relevent to the query keywords.
       
       Input
       -------
       body         : Body or abstracts of the articles.
       title        : Titles of the articles
       
       Output
       -------
       tf_idf       : A dictionary of tf_idf scorese of the vocabulary."""
    
    
    tokens = get_bigrams(query)
    
    tf_idf = tfidf_scores(abstract, title)
    
    relevent_indices = []
    
    query_weights = {}

    for key in tf_idf:
        
        if key[1] in tokens:
            try:
                query_weights[key[0]] += tf_idf[key]
            except:
                query_weights[key[0]] = tf_idf[key]
    
    query_weights = sorted(query_weights.items(), key=lambda x: x[1], reverse=True)    
    
    for i in query_weights:
        relevent_indices.append(i[0])

        
    return relevent_indices

# Apply K Means Clustering to Get Keywords for Categories

By applying the K Means Clustering we can get the keywords that are most common in each cluster, later we can use these keywords to assign a class to each cluster and use these classes to label the unlabeled data.

In [8]:
titles = get_tokens(df, "Title")
abstracts = get_tokens(df, "Abstract")

In [9]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['Abstract'])

n_clusters = 3
model = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=100, n_init=1)
model.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
       n_clusters=3, n_init=1, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [10]:
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(n_clusters):
 print('Cluster %d:' % i),
 for ind in order_centroids[i, 0:3]:
        print('              %s' % terms[ind])
        

Cluster 0:
              lipid
              biomass
              microalgae
Cluster 1:
              production
              energy
              biomass
Cluster 2:
              algae
              water
              production


# Data Labeling Using Keywords from Cluster and TFIDF Similarity

Using the keywords in the clusters we can define categories and use these categories as queries in TFIDF Algorithm, the articles that are most similar to a certain category would be assigned that category.

clusters = ['Algae production in Water', 'Production of Biomass Energy', 'Liquid Biomass and Micro Algea']

In [36]:
from tqdm import tqdm_notebook
def get_labels_and_indices(n_cluster, centroid_keywords):
    relevent_indices = []
    labels = []
    for i in tqdm_notebook(range(len(clusters))):
        for cluster in clusters:
            query = cluster
            _relevent_indices = tfidf_abstract_title_filter(query, abstracts, titles)
            for i in _relevent_indices:
                if i not in relevent_indices:
                    relevent_indices.append(i)
                    labels.append(query)
    title = []
    abstract = []
    for i in relevent_indices:
        title.append(df.loc[i, 'Title'])
        abstract.append(df.loc[i, 'Abstract'])
    data = pd.DataFrame()
    data['Title'] = title
    data['Abstract'] = abstract
    data['Category'] = labels 
    return data 

labeled_data = get_labels_and_indices(3, order_centroids)
labeled_data

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




Unnamed: 0,Title,Abstract,Category
0,The potential of optimized process design to a...,Environmental impact is an essential aspect fo...,Algae production in Water
1,The potential of optimized process design to a...,Environmental impact is an essential aspect fo...,Algae production in Water
2,The potential of optimized process design to a...,Environmental impact is an essential aspect fo...,Algae production in Water
3,Sterols from green and blue-green algae grown ...,"Two green algae,Scenedesmus sp. andChlorella v...",Algae production in Water
4,Sterols from green and blue-green algae grown ...,"Two green algae,Scenedesmus sp. andChlorella v...",Algae production in Water
...,...,...,...
33808,Chapter 1: Advancement of Metabolomics Techniq...,Metabolomics is the study of the whole metabol...,Liquid Biomass and Micro Algea
33809,Development of a simple and efficient method o...,Chemical isotope labeling (CIL) liquid chromat...,Liquid Biomass and Micro Algea
33810,Production of biofuels by thermal catalytic cr...,"In this work, the residual fat material (scum)...",Liquid Biomass and Micro Algea
33811,State-of-the-art on detoxification of Jatropha...,Jatropha curcas seeds contain 250–300 g oil/kg...,Liquid Biomass and Micro Algea
