In [2]:
import os
import datetime
from time import sleep

import json
import tqdm

import requests
from twitterscraper import query_tweets

Read the ticker list.

In [4]:
# Load the ticker symbols: one per line, with surrounding whitespace removed.
with open('./tickers.txt', 'r') as tickers_file:
    ticker_list = [line.strip() for line in tickers_file]

The positive examples (the word IS a ticker) are retrieved from https://stocktwits.com/ .

Stocktwits is the largest social network for investors and traders.

In [8]:
class UnknownStocktwitsError(RuntimeError):
    """Signals a non-200 Stocktwits API response received before any twit was collected."""


class Stocktwits:
    """Resumable downloader of Stocktwits messages ("twits") for a list of tickers.

    Twits are written to the ./stocktwits_{name}/ directory so that the
    download can be aborted and continued later. Two pieces of state make
    that possible:
        * log file (log.json): for every ticker, the number of downloaded
          twits and the last pagination cursor
        * ticker list: tickers are removed from it once they are complete
    """

    # Stocktwits.com returns twits in batches sized 30
    BATCH_SIZE = 30
    STREAM_URL = 'https://api.stocktwits.com/api/2/streams/symbol/{}.json?'
    LOG_NAME = 'log.json'

    def __init__(self, count, ticker_list, name):
        """initial parameters:
             * count - how many twits for one ticker are needed,
             * ticker_list - tickers to download (copied, the caller's
               sequence is never modified),
             * name - suffix of the directory used for saving twits
        """
        self.total_count = count
        self.ticker_list = list(ticker_list)
        self.path = './stocktwits_{}/'.format(name)
        # exist_ok avoids the check-then-create race of exists() + mkdir()
        os.makedirs(self.path, exist_ok=True)
        if not os.path.exists(self.path + self.LOG_NAME):
            self._write_log({})

    def _read_log(self):
        """Return the progress log (dict keyed by ticker) stored in log.json."""
        with open(self.path + self.LOG_NAME, 'r') as f:
            return json.load(f)

    def _write_log(self, log):
        """Overwrite log.json with `log`."""
        with open(self.path + self.LOG_NAME, 'w') as f:
            json.dump(log, f)

    def _drop_ticker(self, ticker):
        """Remove `ticker` from the pending list; a no-op if it is already gone."""
        if ticker in self.ticker_list:
            self.ticker_list.remove(ticker)

    def _fetch(self, ticker, scrolling_arg):
        """Request one batch of twits for `ticker` and return the decoded JSON body.

        `scrolling_arg`, when truthy, is sent as the `max` query parameter
        to continue from a previous batch.
        """
        params = {'filter': 'all'}
        if scrolling_arg:
            params['max'] = scrolling_arg
        response = requests.get(self.STREAM_URL.format(ticker), params=params)
        return json.loads(response.text)

    def where_to_start(self):
        """gets the start parameters.
           input: None
           output:
                * ticker - ticker name (None when no ticker is left)
                * scrolling_arg - parameter for url address to continue download
                * count - number of twits left to get (0 when no ticker is left)

           Tickers whose logged count already reaches `total_count` are
           removed from the pending list along the way.
        """
        log = self._read_log()

        # Iterative on purpose: a long run of finished tickers must not
        # exhaust the recursion limit, and an emptied list must not be indexed.
        while self.ticker_list:
            ticker = self.ticker_list[0]
            entry = log.get(ticker)
            if entry is None:
                return ticker, None, self.total_count
            if entry['count'] < self.total_count:
                return ticker, entry['scrolling_arg'], self.total_count - entry['count']
            self.ticker_list.pop(0)

        return None, None, 0

    def download_data(self):
        """downloads twits for the first unfinished ticker
            input: None
            output:
                * ticker - ticker name (None when nothing is left to download)
                * texts - twits as a list of dicts (json format)
                * scrolling_arg - parameter for url address used the last
            raises:
                UnknownStocktwitsError - the API answered with a non-200
                status before any twit was collected; the exception carries
                the status code in its `status` attribute
        """
        ticker, scrolling_arg, count = self.where_to_start()
        texts = []
        if ticker is None:
            return ticker, texts, scrolling_arg

        # n = number of requests needed (ceiling division, so an exact
        # multiple of the batch size does not cost an extra request)
        n = -(-count // self.BATCH_SIZE)

        # One request per iteration: requests are rate limited, so nothing
        # is fetched that will not be consumed.
        for _ in tqdm.tqdm(range(n)):
            content = self._fetch(ticker, scrolling_arg)
            status = content['response']['status']

            if status != 200:
                errors = content.get('errors') or [{}]
                message = errors[0].get('message')
                print('ticker', ticker)
                print('status: ', status)
                print('error: ', message)

                # if no twits left or there are no twits with demanded ticker
                if status == 404:
                    self._drop_ticker(ticker)
                if texts:
                    return ticker, texts, scrolling_arg
                # here mostly goes "requests limit exceeded" error
                error = UnknownStocktwitsError(message)
                error.status = status
                raise error

            texts.extend(content['messages'])

            # scrolling parameters
            scrolling_arg = content['cursor']['max']
            if not content['cursor']['more']:
                # the stream is exhausted, this ticker is done
                self._drop_ticker(ticker)
                break

        return ticker, texts, scrolling_arg

    def save_results(self, ticker, texts, scrolling_arg):
        """save twits as a json file to the `self.path` directory and write log to log.json
            input: ticker, texts, scrolling_arg (see `self.download_data`)
            output: None
        """
        full_path = '{}{}_{}_{}.json'.format(
            self.path,
            ticker,
            scrolling_arg,
            datetime.datetime.now()
        )

        with open(full_path, 'w') as f:
            json.dump(texts, f)

        log = self._read_log()

        if ticker in log:
            log[ticker]['scrolling_arg'] = scrolling_arg
            log[ticker]['count'] += len(texts)
        else:
            log[ticker] = {
                'scrolling_arg': scrolling_arg,
                'count': len(texts)
            }

        # download_data may already have dropped the ticker (stream
        # exhausted), so the removal has to tolerate a missing entry.
        if log[ticker]['count'] >= self.total_count:
            self._drop_ticker(ticker)

        self._write_log(log)

    def try_to_download(self):
        """This function downloads all needed data.
        Stocktwits.com allows only 200 requests per hour,
        that's why after failure the program goes to sleep for 15 minutes.
        A 404 answer (unknown or exhausted ticker) is not a rate-limit
        failure, so the loop moves on to the next ticker without sleeping."""
        while self.ticker_list:
            try:
                ticker, texts, scrolling_arg = self.download_data()
            except UnknownStocktwitsError as error:
                if getattr(error, 'status', None) == 404:
                    # download_data already dropped the ticker
                    continue
                print('sleeping at ', datetime.datetime.now().time())
                sleep(15 * 60)
                continue

            if ticker is None:
                # every remaining ticker turned out to be complete
                break

            self.save_results(ticker, texts, scrolling_arg)
            print(ticker, 'saved')

In [80]:
# Target 3000 twits per ticker; files go to ./stocktwits_hope/
stktwts = Stocktwits(count=3000, ticker_list=ticker_list, name='hope')

In [None]:
# Start (or resume) the download; progress is tracked in log.json inside the download directory.
stktwts.try_to_download()

The negative examples (the word is NOT a ticker) are retrieved from https://twitter.com/ using twitterscraper https://github.com/taspinar/twitterscraper.

In [None]:
# gets about 3000 (parameter 'limit') tweets per ticker, starting from 01/01/2018 (parameter 'begindate')
# The output directory is created up front so that a long scrape is not lost
# to a missing-folder error on the first write.
os.makedirs('./twitter', exist_ok=True)

for query_word in tqdm.tqdm(ticker_list):
    found = query_tweets(
        query_word,
        limit=3000,
        begindate=datetime.date(2018, 1, 1),
        lang='en'
    )
    # one tweet per line in the output file, so inner newlines are flattened
    tweets = [tweet.text.replace('\n', ' ') for tweet in found]
    print(query_word, len(tweets))

    # Tweets routinely contain emoji and other non-ASCII characters; an explicit
    # encoding keeps the write independent of the platform default.
    out_path = './twitter/{}_{}'.format(query_word, datetime.datetime.now())
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(tweets))