<a href="https://colab.research.google.com/github/DTU-Projects/GT-Project/blob/main/Keyword_Extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Run it yourself: <br> https://colab.research.google.com/drive/1j8aF_AksK3YT_rfqbyPyNdYgf2nQIo-M?usp=sharing
<br>
Authors: Udit Chauhan and Sidharth

In [7]:
from collections import OrderedDict
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load('en_core_web_sm')

class TextRank4Keyword():
    """Extract keywords from text"""
    
    def __init__(self):
        self.d = 0.85 # damping coefficient, usually is .85
        self.min_diff = 1e-5 # convergence threshold
        self.steps = 10 # iteration steps
        self.node_weight = None # save keywords and its weight

    
    def set_stopwords(self, stopwords):  
        """Set stop words"""
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True
    
    def sentence_segment(self, doc, candidate_pos, lower):
        """Store those words only in cadidate_pos"""
        sentences = []
        for sent in doc.sents:
            selected_words = []
            for token in sent:
                # Store words only with cadidate POS tag
                if token.pos_ in candidate_pos and token.is_stop is False:
                    if lower is True:
                        selected_words.append(token.text.lower())
                    else:
                        selected_words.append(token.text)
            sentences.append(selected_words)
        return sentences
        
    def get_vocab(self, sentences):
        """Get all tokens"""
        vocab = OrderedDict()
        i = 0
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = i
                    i += 1
        return vocab
    
    def get_token_pairs(self, window_size, sentences):
        """Build token_pairs from windows in sentences"""
        token_pairs = list()
        for sentence in sentences:
            for i, word in enumerate(sentence):
                for j in range(i+1, i+window_size):
                    if j >= len(sentence):
                        break
                    pair = (word, sentence[j])
                    if pair not in token_pairs:
                        token_pairs.append(pair)
        return token_pairs
        
    def symmetrize(self, a):
        return a + a.T - np.diag(a.diagonal())
    
    def get_matrix(self, vocab, token_pairs):
        """Get normalized matrix"""
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            i, j = vocab[word1], vocab[word2]
            g[i][j] = 1
            
        # Get Symmeric matrix
        g = self.symmetrize(g)
        
        # Normalize matrix by column
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, where=norm!=0) # this is ignore the 0 element in norm
        
        return g_norm

    
    def get_keywords(self, number=10):
        """Print top number keywords"""
        node_weight = OrderedDict(sorted(self.node_weight.items(), key=lambda t: t[1], reverse=True))
        for i, (key, value) in enumerate(node_weight.items()):
            print(key + ' - ' + str(value))
            if i > number:
                break
        
        
    def analyze(self, text, 
                candidate_pos=['NOUN', 'PROPN'], 
                window_size=4, lower=False, stopwords=list()):
        """Main function to analyze text"""
        
        # Set stop words
        self.set_stopwords(stopwords)
        
        # Pare text by spaCy
        doc = nlp(text)
        
        # Filter sentences
        sentences = self.sentence_segment(doc, candidate_pos, lower) # list of list of words
        
        # Build vocabulary
        vocab = self.get_vocab(sentences)
        
        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)
        
        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)
        
        # Initionlization for weight(pagerank value)
        pr = np.array([1] * len(vocab))
        
        # Iteration
        previous_pr = 0
        for epoch in range(self.steps):
            pr = (1-self.d) + self.d * np.dot(g, pr)
            if abs(previous_pr - sum(pr))  < self.min_diff:
                break
            else:
                previous_pr = sum(pr)

        # Get weight for each node
        node_weight = dict()
        for word, index in vocab.items():
            node_weight[word] = pr[index]
        
        self.node_weight = node_weight

In [15]:
text = '''
Prime Minister Narendra Modi launched the Ayushman Bharat Health Infrastructure Mission, one of the largest pan-India schemes for strengthening healthcare infrastructure, in his parliamentary constituency Varanasi in Uttar Pradesh on Monday.

The Prime Minister also inaugurated various development projects worth more than ₹5,200 crore for his constituency.

The Pradhan Mantri Ayushman Bharat Health Infrastructure Mission is one of the largest pan-India schemes for strengthening healthcare infrastructure across the country. It is in addition to the National Health Mission.

Its objective is to fill gaps in public health infrastructure, especially in critical care facilities and primary care in both urban and rural areas. It will provide support for 17,788 rural health and wellness centres in 10 high-focus states. Further, 11,024 urban health and wellness centres will be established in all the States.


Through this, critical care services will be available in all the districts of the country with more than five lakh population through exclusive critical care hospital blocks, while the remaining districts will be covered through referral services.


People will have access to a full range of diagnostic services in the public healthcare system through a network of laboratories across the country, and integrated public health labs will be set up in all the districts.

Under the scheme, a national institution for one health, four new national institutes for virology, a regional research platform for WHO South East Asia Region, nine biosafety level-III laboratories, five new regional national centre for disease control will be set up.
'''
tr=TextRank4Keyword()
tr.analyze(text, candidate_pos = ['NOUN', 'PROPN'], window_size=4, lower=False)
tr.get_keywords(10)

health - 2.841356677466052
care - 1.939419843626815
country - 1.7942668856339528
healthcare - 1.7361309140956975
infrastructure - 1.7069148707640296
laboratories - 1.6792358268334229
services - 1.6618475644014108
Health - 1.4379300022893773
constituency - 1.4240540436860147
districts - 1.3674166586618508
Ayushman - 1.3040098773911275
Bharat - 1.2888257923789173
