In [1]:
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from itertools import groupby
from operator import itemgetter
from pattern.text.en import singularize
import nltk

In [17]:
class Keyword_Extraction:
    
    def __init__(self,text):
        """
        
        Input:
                    text : Text data as a string
        
        """
        # Into lower case
        self.text = text.lower()
        self.stop_words = set(stopwords.words("english"))
        
        
    def preprocess(self,pos = True,clean_html=False,singularize_nouns = False):
        """Input :
                    pos : If True, only nouns and adjectives are included in the words list
                          If False, all words are included 
                    
                    clean_html : If True, removes HTML tags (only when BeautifulSoup is used to get text from url)
                    
                    singularize_nouns : If true, convert plural nouns to singular
           Output :
                    None
        """
        if clean_html:
            self.text = nltk.clean_html(self.text)
        # Removing Punctuations and splitting the text
        self.tokenized_words = np.array(re.sub(r'[^\w\s]', ' ', self.text).split())
        # Removing the stop words
        self.words = np.array([w for w in self.tokenized_words if w not in self.stop_words])
        pos_tagged_words = nltk.pos_tag(self.words)
        if pos:
            # Including only nouns and adjacents
            self.words = np.array([w for (w,p) in pos_tagged_words if p.startswith("N") or p.startswith("J")])
            
        if singularize_nouns:
            self.words = np.array([singularize(word) for word in self.words])
        
        
    def build(self,tol=0.0001,max_iter=50,d=0.85,n_keywords=20,window=2):
        
        """ Input:
                    tol : Tolerance for stopping criteria. Default : 0.0001
                    max_iter : Maximum number of iterations taken for the solvers to converge. Default : 50
                    d = Damping factor. Should be in the range 0 to 1 . Default : 0.85
                    n_keywords : Total number of words to extract.
                    window : If window = 2, two words on the left and two words
                            on the right are considered for creating edges in the graph
                            
            Output:
                    keywords : list of keywords (format : numpy.ndarray)
        """
        # Each word as a vertix of graph
        vertices = np.array(list(set(self.words)),dtype="str")
        n_words = len(vertices)
        # Information regarding edges is stored in matrix 'Graph'
        Graph = np.zeros((n_words,n_words))
        
        for i in range(n_words):
            # Considering the words only in the window
            window_start = i-window if i-window >= 0 else 0 
            window_end = i+window if i+window < n_words else n_words-1 
            window_index = list(range(window_start,window_end+1))
            center_word = self.words[i]
            window_index.remove(i)
            window_words = self.words[window_index]
            # If a word j is in the window of word i, a edge created between those vertices
            for j in window_words:
                indx = np.where(vertices == j)[0][0]
                Graph[np.where(vertices==center_word)[0][0],indx] = 1
        # Initializing the score of each vertex to one      
        scores = np.ones(n_words)
        i = 0
        norm = 1
        # Iterating until convergence or maximum number of allowed times
        while norm > tol and i < max_iter:
            old_scores = list(scores)
            for j in range(n_words):
                # Get the list of vertices which had a edge with vertex j
                indices = np.where(Graph[j] == 1)[0]
                out = np.zeros(len(indices))
                for k in range(len(indices)):
                    # For each vertex k(which had an edge with j) counting total number of edges it had
                    out[k] = (Graph[indices[k]] > 0).sum()
                # Score of the vertex j is weighted average of scores of vertices with which it had edges
                # The weights are inverse of number of edges those vertices had
                update = (scores[indices]/out).sum()
                scores[j] = (1-d) + d*(update)
            norm = np.linalg.norm(old_scores-scores)
            i += 1  
        # Considering top N words
        result_index = np.argpartition(scores,-n_keywords)[-n_keywords:]
        self.unigram_keywords = vertices[result_index]
        # Index of keyword in the text
        index_keywrd_in_text = []
        for i in range(len(self.unigram_keywords)):
            to_extend = list(np.where(self.tokenized_words == self.unigram_keywords[i])[0])
            index_keywrd_in_text.extend(to_extend)
        # Sortng the indices
        sorted_ind_key_txt = np.array(sorted(index_keywrd_in_text))
        # iF the keywords are adjacent in the orignal text, considering them as single keyword
        # Example : if "machine" and "learning" are both in top N keywords and the are adjacent to each 
        # other in original text, then generating the new keyword as "machine learning"
        
        # Indices of the keywords which are adjacent in original text
        ngrams_index_list = []
        for k, g in groupby(enumerate(sorted_ind_key_txt), lambda ix : ix[0] - ix[1]):
            new_list = list(map(itemgetter(1), g))
            if len(new_list) > 1:
                ngrams_index_list.append(new_list)
        # Multiword keywords are stored in ngrams
        ngrams = []
        remove_words = []
        for i in range(len(ngrams_index_list)):
            words = self.tokenized_words[ngrams_index_list[i]]
            remove_words.extend(words)
            ngram = " ".join(words)
            ngrams.append(ngram)
        ngrams = list(self.filter_subsets(ngrams))
        self.ngrams = list(set(ngrams)) 
        # Removing those keywords which are included in multiwords
        remove_indx = np.in1d(self.unigram_keywords,remove_words)
        result_keywords = self.unigram_keywords[~remove_indx]
        self.keywords = np.concatenate((result_keywords,self.ngrams))
        return self.keywords

    def is_subset(self,l1,l2):
       """ Check if needle is ordered subset of haystack in O(n)  """

       if len(l2) < len(l1): return False

       index = 0
       for element in l1:
          try:
             index = l2.index(element, index) + 1
          except ValueError:
             return False
       else:
          return True

    def filter_subsets(self,lists):
        """ Given list of lists, return new list of lists without subsets  """

        for l1 in lists:
              if not any(self.is_subset(l1, l2) for l2 in lists if l1 is not l2):
                yield l1

In [3]:
data = """Junk foods taste good that’s why it is mostly liked by everyone of any age group especially kids and school going children. They generally ask for the junk food daily because they have been trend so by their parents from the childhood. They never have been discussed by their parents about the harmful effects of junk foods over health. According to the research by scientists, it has been found that junk foods have negative effects on the health in many ways. They are generally fried food found in the market in the packets. They become high in calories, high in cholesterol, low in healthy nutrients, high in sodium mineral, high in sugar, starch, unhealthy fat, lack of protein and lack of dietary fibers. Processed and junk foods are the means of rapid and unhealthy weight gain and negatively impact the whole body throughout the life. It makes able a person to gain excessive weight which is called as obesity. Junk foods tastes good and looks good however do not fulfil the healthy calorie requirement of the body. Some of the foods like french fries, fried foods, pizza, burgers, candy, soft drinks, baked goods, ice cream, cookies, etc are the example of high-sugar and high-fat containing foods. It is found according to the Centres for Disease Control and Prevention that Kids and children eating junk food are more prone to the type-2 diabetes. In type-2 diabetes our body become unable to regulate blood sugar level. Risk of getting this disease is increasing as one become more obese or overweight. It increases the risk of kidney failure. Eating junk food daily lead us to the nutritional deficiencies in the body because it is lack of essential nutrients, vitamins, iron, minerals and dietary fibers. It increases risk of cardiovascular diseases because it is rich in saturated fat, sodium and bad cholesterol. High sodium and bad cholesterol diet increases blood pressure and overloads the heart functioning. One who like junk food develop more risk to put on extra weight and become fatter and unhealthier. Junk foods contain high level carbohydrate which spike blood sugar level and make person more lethargic, sleepy and less active and alert. Reflexes and senses of the people eating this food become dull day by day thus they live more sedentary life. Junk foods are the source of constipation and other disease like diabetes, heart ailments, clogged arteries, heart attack, strokes, etc because of being poor in nutrition. Junk food is the easiest way to gain unhealthy weight. The amount of fats and sugar in the food makes you gain weight rapidly. However, this is not a healthy weight. It is more of fats and cholesterol which will have a harmful impact on your health. Junk food is also one of the main reasons for the increase in obesity nowadays.This food only looks and tastes good, other than that, it has no positive points. The amount of calorie your body requires to stay fit is not fulfilled by this food. For instance, foods like French fries, burgers, candy, and cookies, all have high amounts of sugar and fats. Therefore, this can result in long-term illnesses like diabetes and high blood pressure. This may also result in kidney failure. Above all, you can get various nutritional deficiencies when you don’t consume the essential nutrients, vitamins, minerals and more. You become prone to cardiovascular diseases due to the consumption of bad cholesterol and fat plus sodium. In other words, all this interferes with the functioning of your heart. Furthermore, junk food contains a higher level of carbohydrates. It will instantly spike your blood sugar levels. This will result in lethargy, inactiveness, and sleepiness. A person reflex becomes dull overtime and they lead an inactive life. To make things worse, junk food also clogs your arteries and increases the risk of a heart attack. Therefore, it must be avoided at the first instance to save your life from becoming ruined.The main problem with junk food is that people don’t realize its ill effects now. When the time comes, it is too late. Most importantly, the issue is that it does not impact you instantly. It works on your overtime; you will face the consequences sooner or later. Thus, it is better to stop now.You can avoid junk food by encouraging your children from an early age to eat green vegetables. Their taste buds must be developed as such that they find healthy food tasty. Moreover, try to mix things up. Do not serve the same green vegetable daily in the same style. Incorporate different types of healthy food in their diet following different recipes. This will help them to try foods at home rather than being attracted to junk food.In short, do not deprive them completely of it as that will not help. Children will find one way or the other to have it. Make sure you give them junk food in limited quantities and at healthy periods of time. """

In [35]:
ke = Keyword_Extraction(data)
ke.preprocess(pos=True,singularize_nouns=True)
keywords = ke.build(n_keywords=15,window=3)

In [36]:
keywords

array(['increase', 'person', 'body', 'weight', 'disease', 'good',
       'high level', 'high sugar', 'blood sugar level risk', 'high blood',
       'fat lack', 'high fat'], dtype='<U22')

In [37]:
ke.unigram_keywords

array(['increase', 'level', 'person', 'fat', 'lack', 'blood', 'body',
       'risk', 'sugar', 'weight', 'disease', 'good', 'food', 'junk',
       'high'], dtype='<U14')