In [1]:
%%time
from typing import List, Tuple
from collections import Counter
import re
import spacy

class StopWordKeywordExtractor:
    
    def _init_(self):
        # Set up SpaCy in a more efficient way by disabling what we do not need
        # This is the dependency parser (parser) and the named entity recognizer (ner)
        self.nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])
        # Add the sentencizer to quickly split our text into sentences
        self.nlp.add_pipe(self.nlp.create_pipe('sentencizer'))
        # Increase the maximum length of text SpaCy can parse in one go
        self.nlp.max_length = 1500000
        
    def is_proper_word(self, token:str) -> bool:
        '''
        Checks if the word is a proper word by our definition
        
        Arguments:
            token     -- The token as a string
        Return:
            is_proper -- True / False
        '''
        match = re.search(r'\b(\W+|\w+)\b', token)
        return match and token == match[0] 
    
    def keywords(self, text: str, n_keywords: int, min_words: int) -> List[Tuple[Tuple[str], int]]:
        '''
        Extract the top n most frequent keywords from the text.
        Keywords are sequences of adjectives and nouns that end in a noun
        
        Arguments:
            text       -- the raw text from which to extract keywords
            n_keywords -- the number of keywords to return
            min_words  -- the number of words a potential keyphrase has to include
                          if this is set to 2, then only keyphrases consisting of 2+ words are counted
        Returns:
            keywords   -- List of keywords and their count, sorted by the count
                          Example: [(('potato'), 12), (('potato', 'harvesting'), 9), ...]
        '''
        doc = self.nlp(text)
        keywords = []     
        perfect_keyword = []
        accumulator = []

                
        for token in doc:
            if (self.is_proper_word(str(token)) and not token.is_stop):
                accumulator.append(str(token))

            else:               
                if len(accumulator) >= min_words:
                    perfect_keyword.append(tuple(accumulator))
                    accumulator = []
                  
        keywords = Counter(perfect_keyword).most_common(n_keywords) 
        
        return keywords
        
with open('corpus.txt', 'r') as corpus_file:
    text = corpus_file.read()
    
keywords = StopWordKeywordExtractor().keywords(text.lower(), n_keywords=15, min_words=1)

'''
Expected output:
The keyword ('words',) appears 273 times.
The keyword ('text',) appears 263 times.
The keyword ('example',) appears 257 times.
The keyword ('word',) appears 201 times.
...
'''
for keyword in keywords:
    print('The keyword {} appears {} times.'.format(*keyword))

AttributeError: 'StopWordKeywordExtractor' object has no attribute 'nlp'