-
Notifications
You must be signed in to change notification settings - Fork 1
/
question.py
91 lines (72 loc) · 2.86 KB
/
question.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/usr/bin/python
# -*- coding: cp1252 -*-
### Imports ###
import nltk
import re
import sys
import os
import pickle
###############
if(len(sys.argv) != 1):
print "Please call question.py as follows: python question.py <article.txt>!"
exit(1)
### Globals ###
# Fallback POS tagger: assigns tags from simple suffix/shape heuristics.
# The final catch-all pattern makes 'NN' the default tag.
regexp_tagger = nltk.RegexpTagger(
    [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),       # cardinal numbers
     (r'([Tt]h(e)|(at))|([Aa]n?)$', 'DT'),  # articles
     (r'.*able$', 'JJ'),                    # adjectives
     (r'.*ness$', 'NN'),                    # nouns formed from adjectives
     (r'.*ly$', 'RB'),                      # adverbs
     (r'.*s$', 'NNS'),                      # plural nouns
     (r'.*ing$', 'VBG'),                    # gerunds
     (r'.*ed$', 'VBD'),                     # past tense verbs
     (r'.*', 'NN'),                         # nouns (default)
     ])

def _load_tagger(path):
    """Unpickle a pre-trained tagger from *path*.

    Opens the file in binary mode ("rb") -- the original used text mode,
    which corrupts pickle streams on platforms that translate newlines --
    and closes the handle, which the original leaked.
    """
    # NOTE(security): pickle.load can execute arbitrary code from the
    # file; only load tagger files from a trusted source.
    f = open(path, "rb")
    try:
        return pickle.load(f)
    finally:
        f.close()

# Pre-trained n-gram taggers produced offline and shipped alongside the
# script as pickle files.
unigram_tagger = _load_tagger("unigram_tagger.bin")
bigram_tagger = _load_tagger("bigram_tagger.bin")
trigram_tagger = _load_tagger("trigram_tagger.bin")

# Punkt sentence and word tokenizers.
stokenizer = nltk.PunktSentenceTokenizer()
wtokenizer = nltk.PunktWordTokenizer()
###############
def question():
"""Tags and tokenizes a file and attempts to return a list of easy questions."""
articleFile = "article.txt.sample" #sys.argv[1]
### Read the article ###
article = open(articleFile, "r").read()
sentences = stokenizer.tokenize(article)
words = [wtokenizer.tokenize(sentence) for sentence in sentences]
stags = [trigram_tagger.tag(word) for word in words]
r1 = re.compile("([^; ]*) DT;([^ ]*) NN;([^ ]*) VBD;([^ ]*) IN;([^ ]*) NNP")
for stag in stags:
s = ";".join([" ".join(st) for st in stag])
m = r1.findall(s)
if len(m)>0:
print ' '.join(['Did',m[0][0],m[0][1],m[0][2][0:-1],m[0][3],m[0][4]])+'?'
#cfgFile = "grammar.rules"
#cfg = open(cfgFile, "r").read()
#grammarstags = []
#for stag in stags:
# for tag in stag:
# grammarstags += [tag]
#grammarstags = list(set(grammarstags))
#for (word, tag) in grammarstags:
# cfg += tag + ' -> "' + word + '"\n'
#print cfg
#nltk.cfg._PARSE_CFG_RE = re.compile(
# r'''^\s*([\w,]+(?:[-/]\w+)?)\s*(?:[-=]+>)\s*(?:("[^"]+"|'[^']+'|[,\w-]+(?:/[\w-]+)?|\|)\s*)*$'''
# ,re.VERBOSE)
#nltk.cfg._SPLIT_CFG_RE = re.compile(r'''([\w,]+(?:[/-]\w+)?|[-=]+>|"[^"]+"|'[^']+'|\|)''')
#rd_parser = nltk.RecursiveDescentParser(nltk.parse_cfg(cfg))
#print words[0]
#print rd_parser.nbest_parse(words[0])
### Print the length of the article in words (just as a sanity check here) ###
#print "DEBUG: " + str(len(article)) + " characters; " + str(len(stags)) + " sentences; " +
### Search the article for our regex ###
#answer = re.findall(regex, article, re.IGNORECASE)
#if(answer[0].lower() == regex.lower()):
# print "Yes"
#else:
# print "No"
### Console-script entry point ###
# Run the question generator only when executed directly, not on import.
if __name__ == "__main__":
    question()