import os
import string
import sys
from math import log

import nltk

# NLTK's tokenizer and stopword list rely on the `punkt` and `stopwords`
# data packages: nltk.download("punkt") and nltk.download("stopwords")

# Number of top-ranked files and sentences to return for each query
FILE_MATCHES = 1
SENTENCE_MATCHES = 1


def main():

    # Check command-line arguments
    if len(sys.argv) != 2:
        sys.exit("Usage: python questions.py corpus")

    # Calculate IDF values across files
    files = load_files(sys.argv[1])
    file_words = {
        filename: tokenize(files[filename])
        for filename in files
    }
    file_idfs = compute_idfs(file_words)

    # Prompt user for query
    query = set(tokenize(input("Query: ")))

    # Determine top file matches according to TF-IDF
    filenames = top_files(query, file_words, file_idfs, n=FILE_MATCHES)

    # Extract sentences from top files
    sentences = dict()
    for filename in filenames:
        for passage in files[filename].split("\n"):
            for sentence in nltk.sent_tokenize(passage):
                tokens = tokenize(sentence)
                if tokens:
                    sentences[sentence] = tokens

    # Compute IDF values across sentences
    idfs = compute_idfs(sentences)

    # Determine top sentence matches
    matches = top_sentences(query, sentences, idfs, n=SENTENCE_MATCHES)
    for match in matches:
        print(match)
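

# Example session (the directory name and query are purely illustrative;
# any directory containing .txt files will do):
#
#   $ python questions.py corpus
#   Query: What are the types of supervised learning?
#   <best-matching sentence from the corpus printed here>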


def load_files(directory):
    """
    Given a directory name, return a dictionary mapping the filename of each
    `.txt` file inside that directory to the file's contents as a string.
    """
    # List the entries in the given directory and keep only the files
    # ending with .txt
    filenames = os.listdir(directory)
    txtfiles = [filename for filename in filenames
                if filename.endswith('.txt')]

    # Read each text file's contents as a string, and map the filename
    # to that string in a dictionary
    files = dict()
    for afile in txtfiles:
        path = os.path.join(directory, afile)
        with open(path, 'r', encoding='utf8') as txt:
            files[afile] = txt.read()
    return files


def tokenize(document):
    """
    Given a document (represented as a string), return a list of all of the
    words in that document, in order.
    Process document by converting all words to lowercase, and removing any
    punctuation or English stopwords.
    """
    # Tokenize the document, lowercase each token, and keep it only if it
    # is neither punctuation nor an English stopword (the stopword list is
    # loaded once into a set so membership tests stay cheap)
    stopwords = set(nltk.corpus.stopwords.words("english"))
    tokens = nltk.word_tokenize(document)
    words = []
    for token in tokens:
        token = token.lower()
        if token not in string.punctuation and token not in stopwords:
            words.append(token)
    return words
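
# For example, with NLTK's default English models,
# tokenize("The cat sat on the mat!") returns ['cat', 'sat', 'mat']:
# "the" and "on" are stopwords and "!" is punctuation.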


def compute_idfs(documents):
    """
    Given a dictionary of `documents` that maps names of documents to a list
    of words, return a dictionary that maps words to their IDF values.
    Any word that appears in at least one of the documents should be in the
    resulting dictionary.
    """
    # Define two dictionaries: one to store the IDF value of each unique
    # word across all documents, and one to count the number of documents
    # in which that word appears
    idf = dict()
    count = dict()
    for filename in documents:
        # Get the unique words in the file
        bag_of_words = set(documents[filename])
        for word in bag_of_words:
            # If the word is already present in the dictionary, increase
            # its count; otherwise create a new entry initialized to 1
            if word in count:
                count[word] += 1
            else:
                count[word] = 1

    # Calculate each word's IDF as the natural log of the total number of
    # documents divided by the number of documents containing the word
    num_of_documents = len(documents)
    for word in count:
        idf[word] = log(num_of_documents / count[word])
    return idf
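
# A quick sanity check on a hypothetical two-document corpus:
#
#   compute_idfs({"a.txt": ["apple", "banana"], "b.txt": ["apple"]})
#   # -> {"apple": log(2/2) = 0.0, "banana": log(2/1) ≈ 0.693}
#
# Words appearing in every document get an IDF of 0, so they contribute
# nothing to the rankings below.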


def top_files(query, files, idfs, n):
    """
    Given a `query` (a set of words), `files` (a dictionary mapping names of
    files to a list of their words), and `idfs` (a dictionary mapping words
    to their IDF values), return a list of the filenames of the `n` top
    files that match the query, ranked according to tf-idf.
    """
    # For each document, calculate the corresponding tf-idf value
    tfidf = dict()
    for doc in files:
        # 'value' sums the tf-idf values of the query words for this
        # particular document; 'words' holds all the words in the document
        value = 0
        words = files[doc]
        for word in query:
            # Count the number of times the query word appears in the
            # current document
            tf = words.count(word)
            # A query word may not appear in any document at all, in which
            # case it is missing from `idfs`; only look up the IDF when the
            # word actually occurs here (if tf == 0 the product is 0 anyway)
            if tf > 0:
                idf = idfs[word]
            else:
                idf = 0
            # Sum up the product of tf and idf for the current word
            value += tf * idf
        tfidf[doc] = value

    # Rank the documents by tf-idf value in descending order
    ranking = [document for document, val in
               sorted(tfidf.items(), key=lambda item: item[1], reverse=True)]
    # Get the top n documents from the ranking
    top_n = ranking[:n]
    return top_n
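
# For example, for the query {"machine", "learning"}, a document in which
# "machine" appears 3 times (IDF 1.0) and "learning" appears twice
# (IDF 0.5) scores 3 * 1.0 + 2 * 0.5 = 4.0. The counts and IDF values are
# hypothetical, chosen only to illustrate the tf-idf sum above.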


def top_sentences(query, sentences, idfs, n):
    """
    Given a `query` (a set of words), `sentences` (a dictionary mapping
    sentences to a list of their words), and `idfs` (a dictionary mapping words
    to their IDF values), return a list of the `n` top sentences that match
    the query, ranked according to idf. If there are ties, preference should
    be given to sentences that have a higher query term density.
    """
    # IDF gets first priority and query term density second: the summed
    # IDF values are taken at full weight, while the density (which is at
    # most 1) is added with a weight of 0.9, so it mostly serves to break
    # near-ties between sentences with similar IDF scores.
    # score = sum of matching IDFs + 0.9 * query_term_density
    weight = 0.9
    scores = dict()
    for sentence in sentences:
        # 'count' tracks how many query words occur in the sentence (for
        # the query term density); 'score' accumulates their IDF values
        count = 0
        score = 0
        words = sentences[sentence]
        length = len(set(words))
        for word in query:
            if word not in words:
                continue
            # The query word occurs in the current sentence: increment
            # the count and add the word's IDF value
            count += 1
            score += idfs[word]
        query_term_density = count / length
        scores[sentence] = score + weight * query_term_density

    # Rank the sentences by overall score in descending order
    ranking = [sent for sent, val in
               sorted(scores.items(), key=lambda item: item[1], reverse=True)]
    # Get the top n sentences from the ranking
    top_n = ranking[:n]
    return top_n
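
# For example, if 2 of the query's words occur among a sentence's 8 unique
# words, the query term density is 2 / 8 = 0.25, adding 0.9 * 0.25 = 0.225
# on top of the summed IDF values for that sentence.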


if __name__ == "__main__":
    main()