<a href="https://colab.research.google.com/github/AnupJoseph/NLP/blob/master/KeywordExtractionUsingPagerank.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from collections import OrderedDict
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import pandas as pd
nlp = spacy.load('en_core_web_sm')

In [None]:
class TextRank4Keyword():
    """Extract keywords from text.

    Words that pass a part-of-speech / stop-word filter become nodes of a
    co-occurrence graph; an iterative PageRank-style update then assigns a
    weight to every node.  Call :meth:`analyze` first, then
    :meth:`get_keywords` to print the highest-weighted words.
    """

    def __init__(self):
        self.d = 0.85  # damping coefficient, usually is .85
        self.min_diff = 1e-5  # convergence threshold on the summed weights
        self.steps = 10  # maximum number of iteration steps
        self.node_weight = None  # word -> weight mapping, set by analyze()

    def set_stopwords(self, stopwords):
        """Flag the built-in and the caller-supplied stop words.

        Args:
            stopwords: iterable of extra words to treat as stop words.

        The flag is written to the vocabulary of the module-level ``nlp``
        pipeline, so words flagged here stay flagged for later calls.
        """
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True

    def sentence_segment(self, doc, candidate_pos, lower):
        """Keep, per sentence, only the words whose POS tag is a candidate.

        Args:
            doc: parsed document exposing ``sents``; each token must expose
                ``pos_``, ``is_stop`` and ``text``.
            candidate_pos: container of accepted POS tags.
            lower: when true, the kept words are lower-cased.

        Returns:
            A list with one list of kept words per sentence (possibly empty).
        """
        sentences = []
        for sent in doc.sents:
            selected_words = [
                token.text.lower() if lower else token.text
                for token in sent
                # Store words only with a candidate POS tag
                if token.pos_ in candidate_pos and not token.is_stop
            ]
            sentences.append(selected_words)
        return sentences

    def get_vocab(self, sentences):
        """Map every distinct word to an index in first-seen order.

        Args:
            sentences: list of lists of words.

        Returns:
            An ``OrderedDict`` ``word -> index`` with indices 0, 1, 2, ...
        """
        vocab = OrderedDict()
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = len(vocab)
        return vocab

    def get_token_pairs(self, window_size, sentences):
        """Build the distinct ordered word pairs found inside a sliding window.

        Each word is paired with the following ``window_size - 1`` words of
        the same sentence.

        Args:
            window_size: width of the window, counted in words.
            sentences: list of lists of words.

        Returns:
            A list of ``(word, later_word)`` tuples without duplicates, in
            the order they were first encountered.
        """
        token_pairs = []
        seen = set()  # O(1) duplicate check; the list keeps the order
        for sentence in sentences:
            for i, word in enumerate(sentence):
                stop = min(i + window_size, len(sentence))
                for j in range(i + 1, stop):
                    pair = (word, sentence[j])
                    if pair not in seen:
                        seen.add(pair)
                        token_pairs.append(pair)
        return token_pairs

    def symmetrize(self, a):
        """Return ``a + a.T`` with the diagonal counted only once."""
        return a + a.T - np.diag(a.diagonal())

    def get_matrix(self, vocab, token_pairs):
        """Get the column-normalized adjacency matrix of the word graph.

        Args:
            vocab: mapping ``word -> index`` as returned by ``get_vocab``.
            token_pairs: iterable of ``(word1, word2)`` pairs; both words
                must be keys of ``vocab``.

        Returns:
            A ``(len(vocab), len(vocab))`` float array in which every column
            with at least one edge sums to 1 and every other column is 0.
        """
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            g[vocab[word1], vocab[word2]] = 1

        # Get symmetric matrix
        g = self.symmetrize(g)

        # Normalize matrix by column.  `out` must be supplied together with
        # `where`: without it the entries of the skipped (all-zero) columns
        # are left uninitialized instead of being 0.
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, out=np.zeros_like(g), where=norm != 0)

        return g_norm

    def get_keywords(self, number=10):
        """Print the top ``number`` keywords, highest weight first.

        Args:
            number: how many keywords to print; fewer are printed when fewer
                words were ranked.

        Each line has the form ``word - weight``.  ``self.node_weight`` is
        ``None`` until ``analyze`` has been called.
        """
        ranked = sorted(self.node_weight.items(), key=lambda t: t[1], reverse=True)
        for key, value in ranked[:max(number, 0)]:
            print(key + ' - ' + str(value))

    def analyze(self, text,
                candidate_pos=('NOUN', 'PROPN'),
                window_size=4, lower=False, stopwords=()):
        """Main function to analyze text.

        Args:
            text: raw text to process.
            candidate_pos: POS tags of the words allowed to become keywords.
            window_size: co-occurrence window width, in words.
            lower: when true, words are lower-cased before ranking.
            stopwords: extra stop words to exclude.

        The resulting ``word -> weight`` dict is stored in
        ``self.node_weight``; nothing is returned.
        """
        # Set stop words
        self.set_stopwords(stopwords)

        # Parse text with spaCy
        doc = nlp(text)

        # Filter sentences
        sentences = self.sentence_segment(doc, candidate_pos, lower)  # list of list of words

        # Build vocabulary
        vocab = self.get_vocab(sentences)

        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)

        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)

        # Initialization of the weights (pagerank values)
        pr = np.ones(len(vocab))

        # Iterate until the summed weight stops changing or steps run out
        previous_pr = 0
        for _ in range(self.steps):
            pr = (1 - self.d) + self.d * np.dot(g, pr)
            total = pr.sum()
            if abs(previous_pr - total) < self.min_diff:
                break
            previous_pr = total

        # Get weight for each node
        self.node_weight = {word: pr[index] for word, index in vocab.items()}

In [None]:
# Extractor instance used by the per-line demo loop that follows.
tr4w = TextRank4Keyword()

In [None]:
# Rank keywords for the leading lines of data.txt (line indices 0 through 10):
# echo the line, print its top keywords, then print a blank separator.
with open('data.txt') as handle:
    for position, raw_line in enumerate(handle):
        if position > 10:
            break
        text = raw_line.strip('\n')
        print(text)
        tr4w.analyze(text, candidate_pos=['NOUN', 'PROPN'], window_size=4, lower=False)
        tr4w.get_keywords(10)
        print('\n')

Robotics. RoboRobotics deals with the design, construction, operation, and use of robots, as well as computer systems for their control, sensory feedback, and information processing. These technologies are used to develop machines that can substitute for humans and replicate human actions.  students will be able to understand how robots are made and their functionality .They can also pursue doing research in it
robots - 1.7886898148148145
control - 1.1768865740740742
operation - 1.1688587962962962
systems - 1.1549675925925924
use - 1.1469398148148147
computer - 1.1456805555555558
construction - 1.024949074074074
technologies - 1.0
machines - 1.0
humans - 1.0
actions - 1.0
feedback - 0.9865416666666667


Personality Development and Communication Skill. Develop your own identity!!!   **Personality development**  is the relatively enduring pattern of the thoughts, feelings, and behaviours that distinguish individuals from one another. The dominant view in the field of personality psycholo

In [None]:
from summa import keywords
# Same walk over the leading lines of data.txt (line indices 0 through 10),
# this time printing the keywords reported by summa for comparison.
with open('data.txt') as handle:
    for position, raw_line in enumerate(handle):
        if position > 10:
            break
        text = raw_line.strip('\n')
        print(text)
        print(keywords.keywords(text))
        print('\n')

Robotics. RoboRobotics deals with the design, construction, operation, and use of robots, as well as computer systems for their control, sensory feedback, and information processing. These technologies are used to develop machines that can substitute for humans and replicate human actions.  students will be able to understand how robots are made and their functionality .They can also pursue doing research in it
roborobotics deals
robotics
robots


Personality Development and Communication Skill. Develop your own identity!!!   **Personality development**  is the relatively enduring pattern of the thoughts, feelings, and behaviours that distinguish individuals from one another. The dominant view in the field of personality psychology today holds that personality emerges early and continues to change in meaningful ways throughout the lifespan.  Adult personality traits are believed to have a basis in infant [temperament](https://en.wikipedia.org/wiki/Temperament "Temperament"), meaning th