forked from DocNow/twarc
-
Notifications
You must be signed in to change notification settings - Fork 0
/
discover_ids.py
executable file
·76 lines (60 loc) · 2.19 KB
/
discover_ids.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/env python
"""
This is an imperfect way of discovering tweet ids that match a particular
query using the infinite scroll window on Twitter's website. It doesn't
yield complete or quick results; so really it can only be used for smallish
queries. Once you have the tweet ids for a given query you can hydrate
them with twarc.py. For example:
discover_ids.py '#code4lib' > ids.txt
twarc.py --hydrate ids.txt > tweets.json
"""
from __future__ import print_function
import re
import json
import time
import random
import logging
import argparse
import calendar
import requests
def main():
    """
    Command-line entry point.

    Reads the search query from the command line, sends log output to
    discover.log, and prints every discovered tweet id on its own line.
    """
    cli = argparse.ArgumentParser("discover_ids")
    cli.add_argument("query", action="store", help="tweets to search for")
    options = cli.parse_args()

    # Log to a file so that stdout carries nothing but tweet ids.
    logging.basicConfig(filename="discover.log", level=logging.INFO)

    for tweet_id in discover_ids(options.query):
        print(tweet_id)
def discover_ids(query, session=None):
    """
    Yield the ids of tweets matching *query*, scraped from the JSON endpoint
    that backs the infinite-scroll search page on twitter.com.

    Result pages are fetched one at a time. After each page the generator
    sleeps a random 3-8 seconds before asking for the next one. Iteration
    ends when a page contains no tweet links, or when the response carries
    no cursor pointing at a following page.

    :param query: search string, sent as the ``q`` request parameter
    :param session: optional object exposing a requests-compatible ``get``
        method (for example a ``requests.Session``); the ``requests`` module
        itself is used when omitted
    :returns: generator of tweet ids as strings of digits, in the order they
        appear in the returned HTML; ids are not de-duplicated
    """
    http = requests if session is None else session
    url = 'https://twitter.com/i/search/timeline?'
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36"
    }
    params = {
        "q": query,
        'f': 'realtime',
        "src": "typd",
        "include_available_features": 1,
        "include_entities": 1,
        "oldest_unread_id": 0
    }
    # [^"]+ keeps each match inside a single href attribute. A greedy ".+"
    # can run across several anchors on one line and drop all but the last id.
    status_link = re.compile(r'<a href="/[^"]+/status/(\d+)')

    cursor = None
    while True:
        logging.info("collecting tweet ids with cursor=%s", cursor)
        params["last_note_ts"] = calendar.timegm(time.gmtime())
        if cursor:
            params["scroll_cursor"] = cursor

        # Without a timeout a stalled connection would hang the script forever.
        response = http.get(url, headers=headers, params=params, timeout=30)
        # Response.json() decodes the body itself; json.loads() on the raw
        # bytes fails on Python 3 versions older than 3.6.
        payload = response.json()
        html = payload["items_html"]

        tweet_ids = status_link.findall(html)
        logging.info("discovered tweet ids: %s", tweet_ids)
        if not tweet_ids:
            logging.debug("no more tweet ids: %s", html)
            # A plain return ends the generator. Raising StopIteration here
            # is converted into RuntimeError on Python 3.7+ (PEP 479).
            return

        for tweet_id in tweet_ids:
            yield tweet_id

        cursor = payload.get("scroll_cursor")
        if not cursor:
            # Without a new cursor the stale one would stay in params and the
            # same page would be requested again and again.
            logging.debug("no scroll cursor in response; stopping")
            return

        # seems to fetch more tweets when we sleep a random amount of time?
        seconds = random.randint(3, 8)
        logging.debug("sleeping for %s", seconds)
        time.sleep(seconds)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()