## I chose three cruelty-free and vegan makeup brands on Twitter as my target brands.

In [0]:
from pathlib import Path

from google.colab import drive

# Mount Google Drive at /content/drive (Colab only).
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
try:
    import birdy
except ModuleNotFoundError:
    !pip install birdy
try:
    import ratelimiter
except ModuleNotFoundError:
    !pip install ratelimiter

Collecting birdy
  Downloading https://files.pythonhosted.org/packages/cc/30/3f825b8d4248ebd9de9d218ba4b931c93be664e077c328c4b6dd19eb9d8a/birdy-0.3.2.tar.gz
Building wheels for collected packages: birdy
  Building wheel for birdy (setup.py) ... [?25l[?25hdone
  Created wheel for birdy: filename=birdy-0.3.2-cp36-none-any.whl size=10853 sha256=50a0fe13b93807e1c1a993ba49e9630900982fd9de001b43348b448dffcce0ce
  Stored in directory: /root/.cache/pip/wheels/ad/f9/a7/928ef99a65cfa8182e42fb0a052b0a61faa69b7d085fae2723
Successfully built birdy
Installing collected packages: birdy
Successfully installed birdy-0.3.2
Collecting ratelimiter
  Downloading https://files.pythonhosted.org/packages/51/80/2164fa1e863ad52cc8d870855fba0fbb51edd943edffd516d54b5f6f8ff8/ratelimiter-1.2.0.post0-py3-none-any.whl
Installing collected packages: ratelimiter
Successfully installed ratelimiter-1.2.0.post0


In [0]:
import getpass
import json, os, sys, time
from zipfile import ZipFile

from birdy.twitter import AppClient, UserClient, TwitterRateLimitError
from ratelimiter import RateLimiter


# Your API credentials are listed under the "Keys and tokens" tab of your
# application, which you can select from:
#
#     https://developer.twitter.com/en/apps/

# Default cap on the number of tweets stored for a single search query.
DEFAULT_MAX_TWEETS = 10_000

def limited(until):
    """Report how long the rate limiter is about to pause.

    Used as the ``callback`` of the ``RateLimiter`` decorator. This function
    only prints the message; it does not sleep itself.

    Args:
        until: Epoch timestamp (same clock as ``time.time()``) at which
            calls may resume.
    """
    seconds_left = int(round(until - time.time()))
    print(f'Rate limited, sleeping for {seconds_left:d} seconds')


class TwitterSearcher():
    """Page backwards through Twitter search results and store them as JSON.

    Every tweet is saved as ``<tweet id>.json``, either inside a zip archive
    or as a plain file, in a per-query sub-directory of ``output_dir``.
    A search can be resumed: tweets that are already stored are used to
    decide where to continue from.

    Args:
        consumer_key: Twitter application consumer key.
        consumer_secret: Twitter application consumer secret.
        output_dir: Directory under which one sub-directory per query is
            created.
        max_tweets: Maximum number of tweets stored per query.
    """

    def __init__(self, consumer_key, consumer_secret,
                 output_dir, max_tweets=DEFAULT_MAX_TWEETS):
        self._consumer_key = consumer_key
        self._consumer_secret = consumer_secret
        self._output_dir = output_dir
        self._max_tweets = max_tweets
        self._client = None   # created lazily by client()
        self._max_id = None   # highest tweet ID the next request may return

    def client(self):
        """Return the authenticated API client, creating it on first use.

        A first ``AppClient`` is used only to obtain an access token; the
        client that is cached and returned is built with that token.
        """
        if self._client is None:
            _cl = AppClient(self._consumer_key, self._consumer_secret)
            access_token = _cl.get_access_token()
            self._client = AppClient(
                self._consumer_key, self._consumer_secret, access_token)
        return self._client

    # Client-side throttle: at most 440 calls per 15-minute window; `limited`
    # is the callback handed to the rate limiter.
    @RateLimiter(max_calls=440, period=60*15, callback=limited)
    def fetch_tweets(self, query):
        """Fetch one page (up to 100 tweets) of results for ``query``.

        Only tweets with an ID no greater than the current ``_max_id`` are
        requested. Afterwards ``_max_id`` is lowered to one below the smallest
        ID received, so that the next call returns the next older page.

        Returns:
            The list of tweet dicts, or ``None`` when the page is empty.

        Raises:
            SystemExit: when the API reports that the rate limit was hit.
        """
        print(f'Fetching: "{query}" TO MAX ID: {self._max_id}')
        client = self.client()
        try:
            tweets = client.api.search.tweets.get(
                q=query,
                count=100,
                max_id=self._max_id).data['statuses']
        except TwitterRateLimitError:
            print("You've reached your Twitter API rate limit. "\
                "Wait 15 minutes before trying again")
            sys.exit()
        if not tweets:
            return None
        oldest_id = min(tweet['id'] for tweet in tweets)
        if self._max_id is None or oldest_id <= self._max_id:
            # max_id is treated as inclusive, hence the "- 1".
            self._max_id = oldest_id - 1
        return tweets

    def initialize_max_id(self, file_list):
        """Resume below the oldest tweet that has already been stored.

        Args:
            file_list: Names of stored tweets, expected as ``<id>.json``.
                Names whose stem is not an integer are ignored.
        """
        stored_ids = []
        for fn in file_list:
            try:
                stored_ids.append(int(fn.split('.')[0]))
            except ValueError:
                # Not a tweet file written by this class; skip it.
                continue
        if stored_ids:
            # Compare the *candidate* max_id (oldest stored ID - 1), not the
            # raw ID: otherwise a stored ID equal to the current max_id would
            # be left in range and that tweet would be requested again.
            candidate = min(stored_ids) - 1
            if self._max_id is None or candidate < self._max_id:
                self._max_id = candidate
        if self._max_id is not None:
            print('Found previously fetched tweets. '\
                  'Setting max_id to %d' % self._max_id)

    def halt(self, _id):
        """Explain that an already stored tweet was reached.

        This method only prints; ``search`` stops right after calling it.
        """
        print('Reached historically fetched ID: %d' % _id)
        print('In order to re-fetch older tweets, ' \
            'remove tweets from the output directory or output zip file.')
        print('\n!!IMPORTANT: Tweets older than 7 days will not be re-fetched')

    def search(self, query, dozip=True, verbose=False):
        """Fetch and store tweets matching ``query``.

        The search stops when the API returns no more results, when
        ``max_tweets`` tweets are stored, or when a tweet that is already
        stored comes back.

        Args:
            query: Search string; also used (whitespace replaced by ``_``)
                as the name of the output sub-directory and zip file.
            dozip: Store tweets inside ``<query>.zip`` when true, otherwise
                as individual ``.json`` files in the output sub-directory.
            verbose: Print the ID of every tweet that is processed.

        Raises:
            SystemExit: on ``KeyboardInterrupt`` or an API rate-limit error.
        """
        name = '_'.join(query.split())
        output_dir = os.path.join(self._output_dir, name)
        self._max_id = None
        os.makedirs(output_dir, exist_ok=True)
        outzip = None
        try:
            # The archive is opened inside the try block so that it is
            # closed even if reading the existing entries fails.
            if dozip:
                outzip = ZipFile(os.path.join(output_dir, '%s.zip' % name), 'a')
                existing = outzip.namelist()
            else:
                existing = os.listdir(output_dir)
            file_list = [f for f in existing if f.endswith('.json')]
            self.initialize_max_id(file_list)
            stored = set(file_list)   # set: constant-time duplicate checks

            # When resuming, the limit may already be reached; do not fetch.
            if len(stored) >= self._max_tweets:
                print('Reached maximum tweet limit of: %d' % self._max_tweets)
                return

            while True:
                tweets = self.fetch_tweets(query)
                if tweets is None:
                    print('Search Completed')
                    return
                for tweet in tweets:
                    if verbose:
                        print(tweet['id'])
                    fn = '%d.json' % tweet['id']
                    if fn in stored:
                        # Continuing here could loop forever on the same page
                        # (max_id would no longer decrease), so stop.
                        self.halt(tweet['id'])
                        return
                    payload = json.dumps(tweet, indent=4)
                    if dozip:
                        outzip.writestr(fn, payload)
                    else:
                        with open(os.path.join(output_dir, fn), 'w') as outfile:
                            outfile.write(payload)
                    stored.add(fn)
                    if len(stored) >= self._max_tweets:
                        print('Reached maximum tweet limit of: %d' % self._max_tweets)
                        return
        except KeyboardInterrupt:
            print('Search interrupted. Re-run to continue.')
            sys.exit()
        finally:
            if outzip is not None:
                outzip.close()

In [0]:
# Credentials are never stored in the notebook: they are read from the
# environment, with an interactive (non-echoing) prompt as a fallback.
# NOTE: the key/secret that used to be hardcoded in this cell were published
# together with the notebook and should be revoked and regenerated.
CONSUMER_KEY = (os.environ.get('TWITTER_CONSUMER_KEY')
                or getpass.getpass('Twitter consumer key: '))
CONSUMER_SECRET = (os.environ.get('TWITTER_CONSUMER_SECRET')
                   or getpass.getpass('Twitter consumer secret: '))
OUTPUT_DIR = 'drive/My Drive/Colab Notebooks/twitter'

# One search per target brand.
# NOTE(review): '@Urban Decay' contains a space, so it is not a single
# @-mention -- confirm whether the brand's handle (without a space) was meant.
QUERIES = ('@tartecosmetics', '@Urban Decay', '@Glossier')

searcher = TwitterSearcher(CONSUMER_KEY, CONSUMER_SECRET, OUTPUT_DIR)
for query in QUERIES:
    searcher.search(query, dozip=True)

Fetching: "@tartecosmetics" TO MAX ID: None
Fetching: "@tartecosmetics" TO MAX ID: 1229628506306277375
Fetching: "@tartecosmetics" TO MAX ID: 1229602131902156799
Fetching: "@tartecosmetics" TO MAX ID: 1229596037221883904
Fetching: "@tartecosmetics" TO MAX ID: 1229524790312132615
Fetching: "@tartecosmetics" TO MAX ID: 1229386299884417023
Fetching: "@tartecosmetics" TO MAX ID: 1229192858567151616
Fetching: "@tartecosmetics" TO MAX ID: 1229112010379534335
Fetching: "@tartecosmetics" TO MAX ID: 1229072571049684991
Fetching: "@tartecosmetics" TO MAX ID: 1229046034149576703
Fetching: "@tartecosmetics" TO MAX ID: 1228867539612184576
Fetching: "@tartecosmetics" TO MAX ID: 1228761744392118271
Fetching: "@tartecosmetics" TO MAX ID: 1228544582603943937
Fetching: "@tartecosmetics" TO MAX ID: 1228439555772207109
Fetching: "@tartecosmetics" TO MAX ID: 1228369299707793407
Fetching: "@tartecosmetics" TO MAX ID: 1228289929710637055
Fetching: "@tartecosmetics" TO MAX ID: 1228162087354019839
Fetching: "@