-
Notifications
You must be signed in to change notification settings - Fork 0
/
annotate.py
86 lines (70 loc) · 2.76 KB
/
annotate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import json
import argparse
import pickle
import os
import time
import logging
from spotlight import SpotlightException
from requests import RequestException
from twittersql.database import tweets_without_concepts, update_tweet_concepts
from twittersql.spotlight import clean_tweet, get_annotation
def parse_args():
    """Parse the command-line options of the annotation script.

    Returns:
        argparse.Namespace: namespace with a boolean ``quiet`` attribute,
        ``True`` when ``--quiet`` was passed and ``False`` otherwise.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('--quiet', action='store_true')
    options = cli.parse_args()
    return options
def read_regions(path='regions.json'):
    """Load the regions JSON file.

    Args:
        path: Location of the JSON file; defaults to ``regions.json``
            relative to the current working directory.

    Returns:
        The decoded JSON document (whatever ``json.load`` produces for it).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)
_NRF_PICKLE_PATH = 'no_resources_found.pickle'


def _configure_logging(quiet):
    """Set up the root logger: always log to annotate.log, and to the console unless *quiet*."""
    log_formatter = logging.Formatter("%(asctime)s [%(levelname)-5.5s] %(message)s")
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    file_handler = logging.FileHandler("annotate.log")
    file_handler.setFormatter(log_formatter)
    logger.addHandler(file_handler)

    if not quiet:
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(log_formatter)
        logger.addHandler(console_handler)

    return logger


def _load_no_resources_found(logger):
    """Return the pickled set of tweet ids to skip, or a new empty set if the file cannot be read."""
    try:
        # NOTE: pickle executes code on load; this file is a cache written by
        # this script itself and must not be replaced by untrusted data.
        with open(_NRF_PICKLE_PATH, 'rb') as f:
            return pickle.load(f)
    except OSError:  # includes FileNotFoundError
        logger.warning("Setting up new NRF pickle ")
        # Pause kept from the original; presumably so the warning is noticed.
        time.sleep(5)
        return set()


def _save_no_resources_found(no_resources_found):
    """Write the set of skipped tweet ids back to disk for later runs."""
    with open(_NRF_PICKLE_PATH, 'wb') as f:
        pickle.dump(no_resources_found, f)


def main():
    """Annotate all tweets that have no concepts yet, one region at a time.

    For every region in the regions file, the tweets returned by
    ``tweets_without_concepts`` are cleaned, sent to ``get_annotation`` with
    the region's language, and the response is stored through
    ``update_tweet_concepts``.

    Tweet ids for which ``get_annotation`` raised ``SpotlightException`` are
    collected in a set that is pickled to ``no_resources_found.pickle``; ids
    in that set are skipped. A ``RequestException`` is only logged, so the
    tweet is neither stored nor added to the skip set.
    """
    args = parse_args()
    logger = _configure_logging(args.quiet)

    regions = read_regions()

    # set of tweet ids where resources were not found
    no_resources_found = _load_no_resources_found(logger)

    for region, settings in regions.items():
        logger.info(region)
        language = settings['language']

        # Materialize the result exactly once. Counting with len(list(...))
        # and then iterating the same object again would either leave nothing
        # to iterate (one-shot iterator) or fetch the rows twice.
        tweets = list(tweets_without_concepts(region))
        amount = len(tweets)
        logger.info("{} tweets to annotate".format(amount))
        time.sleep(1)

        # NOTE(review): the original used a dangling ``else`` to dump the
        # pickle after the loop. try/finally keeps that save and also runs
        # it when the loop is interrupted, so the skip set is not lost.
        try:
            # annotate each tweet and write the JSON response back into the db
            for index, tweet in enumerate(tweets):
                tweet_id = tweet.tweet_id
                if tweet_id in no_resources_found:
                    continue

                text = tweet.tweet_body.get('text')
                clean = clean_tweet(text)
                try:
                    r = get_annotation(language=language, text=clean)
                except SpotlightException as e:
                    logger.warning(e)
                    no_resources_found.add(tweet_id)
                except RequestException as e:
                    logger.error(e)
                else:
                    update_tweet_concepts(tweet_id, r)
                    forms = ', '.join(t['surfaceForm'] for t in r)
                    logger.info("{}/{} {} ({}) - tweet_id: {} - forms: {}".format(
                        index, amount, region, language, tweet_id, forms))
        finally:
            _save_no_resources_found(no_resources_found)


if __name__ == '__main__':
    main()