Commit
updated scraper to use new scroll_cursor param. fixes #7
edsu committed Dec 19, 2013
1 parent 8241cd7 commit ac14b9d
Showing 2 changed files with 34 additions and 20 deletions.
8 changes: 5 additions & 3 deletions README.md
@@ -24,9 +24,11 @@ How To Use
Scrape Mode
-----------

If you pass the --scrape option to twarc it will use [search.twitter.com](http://search.twitter.com)
to discover tweet ids, and then use the Twitter REST API to fetch the JSON for
each tweet.
The first time you fetch tweets for a query, if you pass the --scrape option,
twarc will use [search.twitter.com](http://search.twitter.com) to discover tweet
ids, and then use the Twitter REST API to fetch the JSON for each tweet. This
is expensive because each id needs to be fetched from the API, and each fetch
counts as a request against your quota.

[Twitter Search](http://search.twitter.com) [now supports](http://blog.twitter.com/2013/02/now-showing-older-tweets-in-search.html) drilling backwards in time, past the week cutoff of the REST API. Since individual tweets are still retrieved with the REST API, rate limits apply--so this is quite a slow process. Still, if you are willing to let it run for a while it can be useful to query for older tweets, until the official search REST API supports a more historical perspective.
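
To make that cost concrete, the sketch below shows roughly what scrape mode boils down to: once tweet ids have been discovered, each one is hydrated with its own call to the REST API. This is only an illustration, not twarc's actual code; `client` stands in for the OAuth-signed `oauth2.Client` that twarc builds from its config, and twarc's rate limiting is reduced to a simple sleep here.

```python
import json
import time

def hydrate(tweet_ids, client, sleep=1):
    """Yield the full JSON for each scraped tweet id, one API request per id."""
    for tweet_id in tweet_ids:
        url = "https://api.twitter.com/1.1/statuses/show.json?id=%s" % tweet_id
        resp, content = client.request(url)  # each call counts against your quota
        yield json.loads(content)
        time.sleep(sleep)
```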

46 changes: 29 additions & 17 deletions twarc.py
@@ -18,6 +18,7 @@
token = oauth2.Token(config.access_token, config.access_token_secret)
client = oauth2.Client(consumer, token)


class RateLimiter:

def __init__(self):
@@ -59,7 +60,8 @@ def _ping(self):

logging.info("new rate limit remaining=%s and reset=%s", self.remaining, self.reset)

def search(q, since_id=None, max_id=None, scrape=True):

def search(q, since_id=None, max_id=None, scrape=True, only_ids=False):
"""returns a generator for *all* search results. If you supply scrape,
twarc will attempt to dig back further in time by scraping search.twitter.com
and looking up individual tweets.
@@ -72,10 +74,11 @@ def search(q, since_id=None, max_id=None, scrape=True):
for status in results:
yield status

if scrape:
if scrape and not since_id:
for status in scrape_tweets(q, max_id=max_id):
yield status
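# Illustrative usage sketch, not part of twarc.py: the generator returned by
# search() can be consumed directly, and with scrape=True it yields REST
# search results first and then older, scraped tweets, e.g.
#
#   for status in search("some query", scrape=True):
#       print status["id_str"]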


def search_result(q, since_id=None, max_id=None):
"""returns a single page of search results
"""
@@ -100,6 +103,7 @@ def search_result(q, since_id=None, max_id=None):

return statuses, new_max_id


def fetch(url, tries=5):
logging.info("fetching %s", url)
if tries == 0:
Expand All @@ -117,7 +121,8 @@ def fetch(url, tries=5):
time.sleep(secs)

return fetch(url, tries - 1)



def most_recent_id(q):
since_id = None
last_archive_file = last_archive(q)
@@ -127,6 +132,7 @@ def most_recent_id(q):
since_id = json.loads(line)["id_str"]
return since_id


def last_archive(q):
other_archive_files = []
for filename in os.listdir("."):
Expand All @@ -139,6 +145,7 @@ def last_archive(q):
return f
return None


def archive(q, statuses):
t = time.strftime("%Y%m%d%H%M%S", time.localtime())
archive_filename = "%s-%s.json" % (q, t)
Expand All @@ -151,33 +158,39 @@ def archive(q, statuses):
fh.write(json.dumps(status))
fh.write("\n")


def scrape_tweets(query, max_id=None, sleep=1):
"""
A kinda sneaky and slow way to retrieve older tweets, now that search on
the Twitter website extends back in time.
"""
for tweet_id in scrape_tweet_ids(query, sleep=5):
for tweet_id in scrape_tweet_ids(query, max_id, sleep=1):
rate_limiter.check()
url = "https://api.twitter.com/1.1/statuses/show.json?id=%s" % tweet_id
resp, content = fetch(url)
yield json.loads(content)

def scrape_tweet_ids(query, max_id=None, sleep=1):

def scrape_tweet_ids(query, max_id, sleep=1):
cursor = None
url = 'https://twitter.com/i/search/timeline?'
q = {
"type": "recent",
"src": "typd",
"q": query,
'f': 'realtime',
"include_available_features": 1,
"include_entities": 1,
"type": "recent",
"q": query
"last_note_ts": 0,
"oldest_unread_id": 0
}

#https://twitter.com/i/search/timeline?q=edsu%20dchud&f=realtime&include_available_features=1&include_entities=1&last_note_ts=0&oldest_unread_id=0&scroll_cursor=TWEET-390334032648884224-413331703495929856

while True:
logging.info("scraping tweets with id < %s", max_id)
if max_id:
q["max_id"] = max_id
if cursor:
q["scroll_cursor"] = cursor

logging.info("scraping %s", url + "?" + urllib.urlencode(q))
r = requests.get(url, params=q, headers={'User-agent': USER_AGENT})
s = json.loads(r.content)

@@ -187,15 +200,14 @@ def scrape_tweet_ids(query, max_id=None, sleep=1):
if len(tweet_ids) == 0:
raise StopIteration

# don't repeat the max_id
if max_id and max_id == tweet_ids[0]:
tweet_ids.pop(0)

for tweet_id in tweet_ids:
yield tweet_id

if not s['has_more_items']:
raise StopIteration

time.sleep(sleep)
max_id = tweet_ids[-1]
cursor = s['scroll_cursor']

logging.basicConfig(filename="twarc.log", level=logging.INFO)
rate_limiter = RateLimiter()
@@ -210,4 +222,4 @@ def scrape_tweet_ids(query, max_id=None, sleep=1):
since_id = most_recent_id(args.query)
max_id = None

archive(args.query, search(args.query, since_id=since_id, max_id=args.maxid, scrape=args.scrape))
archive(args.query, search(args.query, since_id=since_id, max_id=args.maxid, scrape=args.scrape))
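
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of twarc.py: the scroll_cursor pagination
# pattern this commit adopts, pulled out of the diff above into one place.
# has_more_items and scroll_cursor are the response fields the real code
# reads; extract_tweet_ids() is a hypothetical stand-in for twarc's own
# parsing of the response body, and the User-agent header is omitted.
# ---------------------------------------------------------------------------
import json
import time

import requests

TIMELINE_URL = "https://twitter.com/i/search/timeline"


def extract_tweet_ids(payload):
    # hypothetical helper: pull tweet ids out of the decoded response
    raise NotImplementedError


def paginate_ids(query, max_id=None, sleep=1):
    """Yield tweet ids page by page, following scroll_cursor until the
    endpoint says there are no more items."""
    cursor = None
    while True:
        params = {"q": query, "f": "realtime"}
        if max_id:
            params["max_id"] = max_id
        if cursor:
            params["scroll_cursor"] = cursor

        r = requests.get(TIMELINE_URL, params=params)
        payload = json.loads(r.content)

        for tweet_id in extract_tweet_ids(payload):
            yield tweet_id

        if not payload["has_more_items"]:
            break

        cursor = payload["scroll_cursor"]
        time.sleep(sleep)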
