In [2]:
import tweepy as tw
import pandas as pd
import pickle
from searchtweets import ResultStream, gen_request_parameters, load_credentials, read_config
from datetime import datetime, timedelta
import time
import pickle
import argparse
from datetime import timedelta

In [3]:
# define date range function
def daterange(start_date, end_date):
    """Yield consecutive days from ``start_date`` up to, but excluding, ``end_date``.

    The number of days is ``(end_date - start_date).days``; when that is zero
    or negative nothing is yielded.
    """
    span_in_days = int((end_date - start_date).days)
    offset = 0
    while offset < span_in_days:
        yield start_date + timedelta(days=offset)
        offset += 1

# Save pickle file function
def save_object(obj, filename):
    """Pickle ``obj`` to ``filename``, replacing any existing file.

    The file is opened in ``'wb'`` mode so that a re-run overwrites the
    previous dump. Append mode would leave the old object at the front of the
    file, and a single ``pickle.load`` would keep returning that stale object.

    Parameters
    ----------
    obj : any picklable object
    filename : path of the pickle file to (re)create
    """
    with open(filename, 'wb') as outp:  # Overwrites any existing file.
        pickle.dump(obj, outp, pickle.HIGHEST_PROTOCOL)

In [4]:
# Get Daily Tweet Counts
def get_tweet_counts(query, start, end):
    """Return the per-day tweet volume for ``query`` as a DataFrame.

    Calls ``client.get_all_tweets_count`` with ``granularity='day'`` and turns
    every bucket in the response's ``data`` into one row.

    NOTE(review): ``client`` is a module-level name that is not defined in the
    visible cells; presumably a ``tweepy.Client`` -- confirm it is created
    before this function is called.

    Parameters
    ----------
    query : search query forwarded to the counts endpoint
    start, end : forwarded as ``start_time`` / ``end_time``

    Returns
    -------
    pandas.DataFrame with columns ``"Day"`` (each bucket's ``"start"`` value)
    and ``"Tweet Volume"`` (its ``"tweet_count"``). The frame is empty when
    the response carries no data.
    """
    counts = client.get_all_tweets_count(query=query, end_time=end,
                                         start_time=start, granularity='day')

    # Collect plain records and build the frame once; concatenating inside the
    # loop copies the whole frame on every iteration. ``or []`` covers a
    # response whose ``data`` is None.
    rows = [{"Day": bucket["start"], "Tweet Volume": bucket["tweet_count"]}
            for bucket in (counts.data or [])]
    return pd.DataFrame(rows, columns=["Day", "Tweet Volume"])

In [5]:
def _log_query_event(message, log_path):
    """Print ``message`` and append it, followed by a space and newline, to ``log_path``."""
    print(message)
    with open(log_path, 'a') as f:
        f.write(f'{message} \n')


def get_tweets(query, prefix, start, end, max_tries=10, wait_seconds=15,
               log_path='../Logs/Queries.txt'):
    """Download tweets matching ``query`` day by day and pickle each day's result.

    For every day produced by ``daterange(start, end)`` a request covering that
    day up to the next day is built, streamed into a list, and handed to
    ``save_object`` under the name ``prefix + <ISO date> + '.pkl'``.

    Reads the module-level names ``tweet_fields``, ``user_fields`` and
    ``search_creds``; they must be defined before the first call.

    NOTE(review): ``results_per_call=500`` together with ``max_pages=1`` may
    limit each day to a single page of results -- confirm against the
    installed searchtweets version.

    Parameters
    ----------
    query : search query string
    prefix : string prepended to the ISO date to form the output file name
    start, end : bounds passed to ``daterange``
    max_tries : number of attempts per day before the last error is re-raised
    wait_seconds : pause between attempts, in seconds
    log_path : text file that receives a copy of every progress message

    Raises
    ------
    Whatever ``rs.stream()`` raised on the final attempt. Errors from writing
    the log file propagate immediately and are not retried.
    """
    for single_date in daterange(start, end):
        # one request window per day: the day itself up to the next day
        start_ts = single_date
        end_ts = single_date + timedelta(days=1)

        # payload rules for v2 api
        rule = gen_request_parameters(query=query,
                                      results_per_call=500,
                                      start_time=start_ts.isoformat(),
                                      end_time=end_ts.isoformat(),
                                      tweet_fields=tweet_fields,
                                      user_fields=user_fields,
                                      expansions='author_id',
                                      granularity=None,
                                      stringify=False)

        # result stream from twitter v2 api
        rs = ResultStream(request_parameters=rule,
                          max_results=1000000,
                          max_pages=1,
                          max_tweets=10000,
                          **search_creds)

        # remaining attempts for this day
        tries = max_tries

        while True:
            tries -= 1

            # Logged before every attempt, but outside the try block so that a
            # failure to write the log is not mistaken for a connection error.
            _log_query_event(
                f'[INFO] - Retrieving tweets from {str(start_ts)}, for querry: {query}',
                log_path)
            try:
                # materialise the streamed response
                tweets = list(rs.stream())
                break
            except Exception:
                # `<=` so a zero or negative budget cannot loop forever
                if tries <= 0:
                    raise
                _log_query_event(
                    f'[INFO] - Got connection error, waiting {wait_seconds} seconds '
                    f'and trying again. {tries} tries left.',
                    log_path)
                time.sleep(wait_seconds)

        # persist the raw list of results for this day as a pickle
        _log_query_event(f'[INFO] - Saving tweets from {str(start_ts)}', log_path)
        file_prefix_w_date = prefix + start_ts.isoformat()
        save_object(tweets, f'{file_prefix_w_date}.pkl')

In [11]:
# --- Search configuration ---
# ISO dates bounding the download. The end date itself is not fetched,
# because daterange() stops one day short of it.
date_start = "2019-01-01"
date_end = "2022-01-01"

# Comma-separated field lists; get_tweets passes them to gen_request_parameters.
user_fields = ",".join(('public_metrics',))
tweet_fields = ",".join((
    'created_at',
    'geo',
    'id',
    'lang',
    'public_metrics',
    'source',
    'text',
))

# Wait time setting (not read by any code visible in this notebook).
waittime = 45

# Twitter API credentials, read from the local YAML key file.
search_creds = load_credentials(
    '.twitter_keys.yaml',
    yaml_key='search_tweets_v2',
    env_overwrite=False,
)

# Parse both bounds into date objects for the day-by-day loop.
start_date, end_date = (
    datetime.strptime(bound, '%Y-%m-%d').date()
    for bound in (date_start, date_end)
)

In [12]:
# Query terms, one row per company: (company name, CEO, ticker symbol).
# The rows are transposed into three parallel lists used by the download loops.
companies, ceos, tickers = map(list, zip(
    ("Apple", "Tim Cook", "AAPL"),
    ("Amazon", "Jeff Bezos", "AMZN"),
    ("Tesla", "Elon Musk", "TSLA"),
))

In [13]:
# Fetch original English tweets (no retweets, no replies) for each ticker
# symbol; the files land in that ticker's own folder.
for ticker in tickers:
    get_tweets(
        query=f"{ticker} -is:retweet -is:reply lang:en",
        prefix=f"../Data/RawTweets/Tickers/{ticker}/",
        start=start_date,
        end=end_date,
    )

[INFO] - Retrieving tweets from 2021-12-31, for querry: AAPL -is:retweet -is:reply lang:en
[INFO] - Saving tweets from 2021-12-31
[INFO] - Retrieving tweets from 2021-12-31, for querry: AMZN -is:retweet -is:reply lang:en
[INFO] - Saving tweets from 2021-12-31
[INFO] - Retrieving tweets from 2021-12-31, for querry: TSLA -is:retweet -is:reply lang:en
[INFO] - Saving tweets from 2021-12-31


In [14]:
# Fetch original English tweets (no retweets, no replies) for each CEO name;
# the files land in that CEO's own folder.
for ceo in ceos:
    get_tweets(
        query=f"{ceo} -is:retweet -is:reply lang:en",
        prefix=f"../Data/RawTweets/Ceos/{ceo}/",
        start=start_date,
        end=end_date,
    )


[INFO] - Retrieving tweets from 2021-12-31, for querry: Tim Cook -is:retweet -is:reply lang:en
[INFO] - Saving tweets from 2021-12-31
[INFO] - Retrieving tweets from 2021-12-31, for querry: Jeff Bezos -is:retweet -is:reply lang:en
[INFO] - Saving tweets from 2021-12-31
[INFO] - Retrieving tweets from 2021-12-31, for querry: Elon Musk -is:retweet -is:reply lang:en
[INFO] - Saving tweets from 2021-12-31


In [None]:
# Company-name queries were left disabled in this notebook (the reason is not
# recorded). The loop is kept as real code behind a flag instead of inside a
# string literal, so it stays syntax-checked and is not echoed as cell output.
# Set the flag to True to run it.
RUN_COMPANY_QUERIES = False

if RUN_COMPANY_QUERIES:
    for company in companies:
        search_words = f"{company} -is:retweet -is:reply lang:en"
        filename_prefix = f"../Data/RawTweets/Companies/{company}/"

        get_tweets(search_words, filename_prefix, start_date, end_date)
