Commit
updated scraper to use new scroll_cursor param. fixes #7
edsu committed Dec 19, 2013
1 parent 8241cd7 commit ac14b9d
Showing 2 changed files with 34 additions and 20 deletions.
8 changes: 5 additions & 3 deletions README.md
@@ -24,9 +24,11 @@ How To Use
Scrape Mode
-----------

If you pass the --scrape option to twarc it will use [search.twitter.com](http://search.twitter.com)
to discover tweet ids, and then use the Twitter REST API to fetch the JSON for
each tweet.
The first time you fetch tweets for a query, if you pass the --scrape option,
twarc will use [search.twitter.com](http://search.twitter.com) to discover tweet
ids, and then use the Twitter REST API to fetch the JSON for each tweet. This
is expensive because each id needs to be fetched from the API, and each fetch
counts as a request against your quota.

[Twitter Search](http://search.twitter.com) [now supports](http://blog.twitter.com/2013/02/now-showing-older-tweets-in-search.html) drilling backwards in time, past the week cutoff of the REST API. Since individual tweets are still retrieved with the REST API, rate limits apply--so this is quite a slow process. Still, if you are willing to let it run for a while it can be useful to query for older tweets, until the official search REST API supports a more historical perspective.
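
To make that cost concrete, the sketch below shows roughly what scrape mode boils down to: once tweet ids have been discovered, each one is hydrated with its own call to the REST API. This is only an illustration, not twarc's actual code; `client` stands in for the OAuth-signed `oauth2.Client` that twarc builds from its config, and twarc's rate limiting is reduced to a simple sleep here.

```python
import json
import time

def hydrate(tweet_ids, client, sleep=1):
    """Yield the full JSON for each scraped tweet id, one API request per id."""
    for tweet_id in tweet_ids:
        url = "https://api.twitter.com/1.1/statuses/show.json?id=%s" % tweet_id
        resp, content = client.request(url)  # each call counts against your quota
        yield json.loads(content)
        time.sleep(sleep)
```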

46 changes: 29 additions & 17 deletions twarc.py
@@ -18,6 +18,7 @@
token = oauth2.Token(config.access_token, config.access_token_secret)
client = oauth2.Client(consumer, token)


class RateLimiter:

def __init__(self):
@@ -59,7 +60,8 @@ def _ping(self):

logging.info("new rate limit remaining=%s and reset=%s", self.remaining, self.reset)

def search(q, since_id=None, max_id=None, scrape=True):

def search(q, since_id=None, max_id=None, scrape=True, only_ids=False):
"""returns a generator for *all* search results. If you supply scrape,
twarc will attempt to dig back further in time by scraping search.twitter.com
and looking up individual tweets.
@@ -72,10 +74,11 @@ def search(q, since_id=None, max_id=None, scrape=True):
for status in results:
yield status

if scrape:
if scrape and not since_id:
for status in scrape_tweets(q, max_id=max_id):
yield status
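# Illustrative usage sketch, not part of twarc.py: the generator returned by
# search() can be consumed directly, and with scrape=True it yields REST
# search results first and then older, scraped tweets, e.g.
#
#   for status in search("some query", scrape=True):
#       print status["id_str"]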


def search_result(q, since_id=None, max_id=None):
"""returns a single page of search results
"""
@@ -100,6 +103,7 @@ def search_result(q, since_id=None, max_id=None):

return statuses, new_max_id


def fetch(url, tries=5):
logging.info("fetching %s", url)
if tries == 0:
Expand All @@ -117,7 +121,8 @@ def fetch(url, tries=5):
time.sleep(secs)

return fetch(url, tries - 1)



def most_recent_id(q):
since_id = None
last_archive_file = last_archive(q)
@@ -127,6 +132,7 @@ def most_recent_id(q):
since_id = json.loads(line)["id_str"]
return since_id


def last_archive(q):
other_archive_files = []
for filename in os.listdir("."):
Expand All @@ -139,6 +145,7 @@ def last_archive(q):
return f
return None


def archive(q, statuses):
t = time.strftime("%Y%m%d%H%M%S", time.localtime())
archive_filename = "%s-%s.json" % (q, t)
Expand All @@ -151,33 +158,39 @@ def archive(q, statuses):
fh.write(json.dumps(status))
fh.write("\n")


def scrape_tweets(query, max_id=None, sleep=1):
"""
A kinda sneaky and slow way to retrieve older tweets, now that search on
the Twitter website extends back in time.
"""
for tweet_id in scrape_tweet_ids(query, sleep=5):
for tweet_id in scrape_tweet_ids(query, max_id, sleep=1):
rate_limiter.check()
url = "https://api.twitter.com/1.1/statuses/show.json?id=%s" % tweet_id
resp, content = fetch(url)
yield json.loads(content)

def scrape_tweet_ids(query, max_id=None, sleep=1):

def scrape_tweet_ids(query, max_id, sleep=1):
cursor = None
url = 'https://twitter.com/i/search/timeline?'
q = {
"type": "recent",
"src": "typd",
"q": query,
'f': 'realtime',
"include_available_features": 1,
"include_entities": 1,
"type": "recent",
"q": query
"last_note_ts": 0,
"oldest_unread_id": 0
}

#https://twitter.com/i/search/timeline?q=edsu%20dchud&f=realtime&include_available_features=1&include_entities=1&last_note_ts=0&oldest_unread_id=0&scroll_cursor=TWEET-390334032648884224-413331703495929856

while True:
logging.info("scraping tweets with id < %s", max_id)
if max_id:
q["max_id"] = max_id
if cursor:
q["scroll_cursor"] = cursor

logging.info("scraping %s", url + "?" + urllib.urlencode(q))
r = requests.get(url, params=q, headers={'User-agent': USER_AGENT})
s = json.loads(r.content)

@@ -187,15 +200,14 @@ def scrape_tweet_ids(query, max_id=None, sleep=1):
if len(tweet_ids) == 0:
raise StopIteration

# don't repeat the max_id
if max_id and max_id == tweet_ids[0]:
tweet_ids.pop(0)

for tweet_id in tweet_ids:
yield tweet_id

if not s['has_more_items']:
raise StopIteration

time.sleep(sleep)
max_id = tweet_ids[-1]
cursor = s['scroll_cursor']

logging.basicConfig(filename="twarc.log", level=logging.INFO)
rate_limiter = RateLimiter()
@@ -210,4 +222,4 @@ def scrape_tweet_ids(query, max_id=None, sleep=1):
since_id = most_recent_id(args.query)
max_id = None

archive(args.query, search(args.query, since_id=since_id, max_id=args.maxid, scrape=args.scrape))
archive(args.query, search(args.query, since_id=since_id, max_id=args.maxid, scrape=args.scrape))
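
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of twarc.py: the scroll_cursor pagination
# pattern this commit adopts, pulled out of the diff above into one place.
# has_more_items and scroll_cursor are the response fields the real code
# reads; extract_tweet_ids() is a hypothetical stand-in for twarc's own
# parsing of the response body, and the User-agent header is omitted.
# ---------------------------------------------------------------------------
import json
import time

import requests

TIMELINE_URL = "https://twitter.com/i/search/timeline"


def extract_tweet_ids(payload):
    # hypothetical helper: pull tweet ids out of the decoded response
    raise NotImplementedError


def paginate_ids(query, max_id=None, sleep=1):
    """Yield tweet ids page by page, following scroll_cursor until the
    endpoint says there are no more items."""
    cursor = None
    while True:
        params = {"q": query, "f": "realtime"}
        if max_id:
            params["max_id"] = max_id
        if cursor:
            params["scroll_cursor"] = cursor

        r = requests.get(TIMELINE_URL, params=params)
        payload = json.loads(r.content)

        for tweet_id in extract_tweet_ids(payload):
            yield tweet_id

        if not payload["has_more_items"]:
            break

        cursor = payload["scroll_cursor"]
        time.sleep(sleep)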
