keyword_seeding_using_nltk.py
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from collections import Counter
from pprint import pprint

from model import Keyword, connect_to_db, db

# Build these once at import time; constructing a lemmatizer or re-reading
# the stopword list for every word is needlessly slow.
LEMMATIZER = WordNetLemmatizer()
STOPWORDS = set(stopwords.words('english'))


def extracting_keywords_from_text(list_of_book_objects):
    """Takes a list of book objects and, for each book that has a description,
    creates keywords and keyword phrases and stores the top twenty in the
    local database. Keywords and keyword phrases are stored in a keyword table
    as well as a keyword-book association table."""
    for book_obj in list_of_book_objects:
        if book_obj.description:
            print("book:", book_obj.title, "description:", book_obj.description)
            tree = tokenize_tag_text(book_obj.description)
            print("tree:")
            pprint(tree)
            # Materialize the generator so it can be both printed and iterated.
            terms = list(get_terms(tree))
            print("terms:")
            pprint(terms)
            print("#" * 40)
            print(book_obj.title)
            list_of_terms = []
            for term in terms:
                term_phrase = " ".join(term)
                print("term phrase:", term_phrase)
                list_of_terms.append(term_phrase)
            print(list_of_terms)
            count = Counter(list_of_terms)
            top_twenty_terms = count.most_common(20)
            print(top_twenty_terms)
            print("#" * 40)
            for term, _count in top_twenty_terms:
                # Reuse the existing keyword row if there is one; otherwise
                # create and add a new one.
                keyword_instance = Keyword.query.filter_by(keyword=term).first()
                if keyword_instance is None:
                    keyword_instance = Keyword(keyword=term)
                    db.session.add(keyword_instance)
                else:
                    print("the word", term, "is already in the database")
                # Associate the keyword with the book either way.
                keyword_instance.books.append(book_obj)
            # Persist the new keywords and book associations.
            db.session.commit()
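
# Illustration of the counting step above; the sample terms are made up.
# Counter.most_common(n) returns (term, count) pairs, most frequent first:
#   Counter(["magic", "magic", "dragon", "magic", "sword"]).most_common(2)
#   => [('magic', 3), ('dragon', 1)]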


def tokenize_tag_text(description):
    """Removes some punctuation, tags each word by part of speech, and
    generates keywords and keyword phrases based on noun-phrase patterns
    using a regexp chunker."""
    sentence_re = r'''(?x)           # set flag to allow verbose regexps
          ([A-Z])(\.[A-Z])+\.?       # abbreviations, e.g. U.S.A.
        | \w+(-\w+)*                 # words with optional internal hyphens
        | \$?\d+(\.\d+)?%?           # currency and percentages
        | \.\.\.                     # ellipsis
        | [][.,;"?():-_`]            # these are separate tokens
    '''
    grammar = r"""
        NBAR:
            {<NN.*|JJ>*<NN.*>}             # nouns and adjectives, terminated with a noun
            {<NNP|NNPS>+<IN>?<NNP|NNPS>+}  # proper nouns, optionally joined by a preposition
            {<DT|PP\$>?<JJ>*<NN|NNS>}      # optional determiner or possessive, adjectives, then a noun
            {<NN>+}                        # a sequence of one or more nouns
        NP:
            {<NBAR>}
            {<NBAR><IN><NBAR>}             # two NBARs joined by a preposition
    """
    chunker = nltk.RegexpParser(grammar)
    toks = nltk.regexp_tokenize(description, sentence_re)
    postoks = nltk.tag.pos_tag(toks)
    tree = chunker.parse(postoks)
    return tree
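
# Rough sketch of what the chunker produces (tags are illustrative, not exact):
#   tokenize_tag_text("A young wizard attends a school of magic.")
# returns a chunk tree in which runs like "young/JJ wizard/NN" and
# "school/NN of/IN magic/NN" are grouped under NP nodes.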


def get_terms(tree):
    """Yields acceptable, lemmatized keyword terms from the noun-phrase leaves."""
    for leaf in leaves(tree):
        # Keep proper-noun phrases as-is; lowercase and lemmatize everything else.
        if any(tag in ('NNP', 'NNPS') for word, tag in leaf):
            term = [word for word, tag in leaf]
        else:
            term = [normalise(word) for word, tag in leaf if acceptable_word(word)]
        yield term
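
# Each leaf is a list of (word, tag) pairs, e.g. [('young', 'JJ'), ('wizard', 'NN')];
# for that leaf, get_terms yields ['young', 'wizard'] after normalisation.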


def leaves(tree):
    """Finds NP (noun phrase) leaf nodes of a chunk tree."""
    for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
        yield subtree.leaves()


def normalise(word):
    """Normalises a word: lowercases and lemmatizes it."""
    word = word.lower()
    word = LEMMATIZER.lemmatize(word)
    return word
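
# e.g. normalise("Dragons") -> "dragon": lowercased first, then lemmatized
# (WordNet's lemmatizer treats input as a noun unless told otherwise).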


def acceptable_word(word):
    """Checks conditions for an acceptable word: length and stopword status."""
    return 2 <= len(word) <= 40 and word.lower() not in STOPWORDS
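
# e.g. acceptable_word("the") -> False (stopword), acceptable_word("a") -> False
# (too short), acceptable_word("wizard") -> True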


if __name__ == "__main__":
    # The original referenced `app` and `list_of_book_objects` without
    # defining either; these imports are assumptions about the project layout.
    from server import app   # assumed: Flask app lives in server.py
    from model import Book   # assumed: Book model lives in model.py

    connect_to_db(app)
    db.create_all()
    extracting_keywords_from_text(Book.query.all())
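
# To run this module end to end, the NLTK data packages below must be
# downloaded once (standard NLTK package names; adjust if your version differs):
#   python -c "import nltk; nltk.download('averaged_perceptron_tagger'); \
#              nltk.download('wordnet'); nltk.download('stopwords')"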