In [None]:
# import modules
import os
import csv
import tweepy as tw
import pandas as pd

# pw.py contains your secret api keys
# these values should NEVER be shared with others
# this information could be used to hijack your account and post anything
# we're importing the values we need from the pw.py file; they are not encoded or protected
# if you compile this notebook and then share it with someone, they could access those values
# make SURE that you "restart the notebook kernel and clear all outputs" or redefine the values before sharing the notebook

from pw import tw_consumer_key, tw_consumer_secret, tw_access_token, tw_access_token_secret

In [None]:
# authenticate: the consumer key/secret pair goes to the handler first,
# then the account's access token and its secret are attached to it
auth = tw.OAuthHandler(consumer_key = tw_consumer_key,
                       consumer_secret = tw_consumer_secret)
auth.set_access_token(key = tw_access_token,
                      secret = tw_access_token_secret)

# build the API client used by every search in this notebook
# wait_on_rate_limit = True: per the tweepy docs, wait for the rate limit to reset instead of failing
api = tw.API(auth, wait_on_rate_limit = True)

## Note: Rate Limits

To protect their servers, Twitter limits the number and size of requests each account can make through the API.

The full details are here: https://developer.twitter.com/en/docs/twitter-api/v1/rate-limits

For our purposes today, I've already made some searches and saved them to csv files (detailed below). If you're working through this demo asynchronously, you'll have to collect your own tweets to work through.

For more in-depth projects, you'll likely want to utilize the Twitter streaming API which allows continuous queries over longer periods of time. (see an example here: http://docs.tweepy.org/en/latest/streaming_how_to.html).



# Tweepy Basic Search

The example below shows a typical search using Tweepy's Cursor method. In this example, we're searching for the first *1000 tweets* (as defined in the .items() method) that match our search criteria.

The criteria are:
1. Contains the word 'snow' and is not a retweet (the query, passed as the `q` argument)
1. Is tagged as English
1. Was posted since Oct 20, 2020

The Tweepy search parameters and Twitter query are quite powerful and versatile. Here are some links to useful resources!

- Twitter's search queries syntax [https://developer.twitter.com/en/docs/labs/recent-search/guides/search-queries]
    - this goes into the `q` argument of a tweepy call
- Tweepy's various search parameters [http://docs.tweepy.org/en/latest/api.html#search-methods]

In [None]:
# ask for the first 1000 tweets that contain the term 'snow', are not retweets,
# are in English, and date from Oct 20 2020 onwards
snow_tweets = tw.Cursor(
    api.search,
    q = 'snow -filter:retweets',
    lang = 'en',
    since = '2020-10-20',
).items(1000)

# pull every result into a plain list so it can be indexed and looped over repeatedly
snow_list = [result for result in snow_tweets]

In [None]:
# pick one entry from the list and print a handful of its fields:
# text, user id, user name, tweet id, and creation time
tweet = snow_list[1]
print([tweet.text, tweet.user.id_str, tweet.user.name, tweet.id_str, tweet.created_at])

# show every attribute stored on that same entry
vars(tweet)

##  Method 1: Saving Specific Information

The tweets returned from the API contain a wealth of information, and may be too detailed depending on your specific question.

If you know which fields you'd like to retain, you can save just those particular attributes as columns in a csv file as below:

In [None]:
## export certain information in tweets to csv file

output_path = './saved_searches/snow_tweets.csv'

# make sure the target folder exists so that open() does not fail on a fresh checkout
os.makedirs(os.path.dirname(output_path), exist_ok = True)

# the file is opened in append mode, so the header is only written when the file is new or empty;
# otherwise re-running this cell would insert another header row in the middle of the data
write_header = not os.path.exists(output_path) or os.path.getsize(output_path) == 0

# newline = '' is what the csv module expects (no blank rows on Windows, embedded newlines stay intact)
# encoding = 'utf-8' keeps emoji and other non-ASCII tweet text from failing where the default encoding is narrower
with open(output_path, 'a', newline = '', encoding = 'utf-8') as output_file:
    csvWriter = csv.writer(output_file)
    if write_header:
        csvWriter.writerow(['text','user.id_str','user','id_str','created_at'])
    # iterate over list of tweets and save text, user id, user name, tweet id, and creation time
    for tweet in snow_list:
        csvWriter.writerow([tweet.text, tweet.user.id_str, tweet.user.name, tweet.id_str, tweet.created_at])


## Method 2: Saving all .json information in a csv file

If you'd like to save all of the various attributes for each tweet, you can utilize the `_json` attribute of each tweepy object. JSON is a widely used text format for storing structured data. All of the attributes of a tweet (e.g., its content, author, time created, etc.) are stored in JSON format in the tweet's `_json` attribute.

We'll convert these json entries into pandas dataframe representation, and then save the entire dataframe to a csv.




In [None]:
## convert entire tweet entry into pandas data frame (and save that)

# collect the raw json dictionary of every tweepy object in our `snow_list`
rows = [tweet._json for tweet in snow_list]

# flatten all of the dictionaries into one dataframe in a single call (one row per tweet);
# unlike building one dataframe per tweet and concatenating them, this also works when the list is empty
snow_tweets_df = pd.json_normalize(rows)

# make sure the output folder exists before writing to it
os.makedirs('./saved_searches', exist_ok = True)

# save the dataframe to a csv file for later
snow_tweets_df.to_csv('./saved_searches/snow_tweets_extensive.csv', index = False, quoting=csv.QUOTE_ALL)

## Additional Searches

The following contain additional searches that we'll be using for different purposes in the workshop.

Try interpreting the types of tweets we're filtering for based on the code below:

In [None]:
# search for the first 10000 tweets containing the term 'pillow', ignoring retweets, in English
pillow_tweets = tw.Cursor(api.search,
                         q = 'pillow -filter:retweets',
                         lang = 'en').items(10000)

# convert to list so we can manipulate it more easily
pillow_list = list(pillow_tweets)

# export certain information in tweets to csv file

output_path = './saved_searches/pillow_tweets.csv'

# make sure the target folder exists so that open() does not fail on a fresh checkout
os.makedirs(os.path.dirname(output_path), exist_ok = True)

# the file is opened in append mode, so the header is only written when the file is new or empty;
# otherwise re-running this cell would insert another header row in the middle of the data
write_header = not os.path.exists(output_path) or os.path.getsize(output_path) == 0

# newline = '' is what the csv module expects (no blank rows on Windows, embedded newlines stay intact)
# encoding = 'utf-8' keeps emoji and other non-ASCII tweet text from failing where the default encoding is narrower
with open(output_path, 'a', newline = '', encoding = 'utf-8') as output_file:
    csvWriter = csv.writer(output_file)
    if write_header:
        csvWriter.writerow(['text','user.id_str','user','id_str','created_at'])
    # iterate over list of tweets and save text, user id, user name, tweet id, and creation time
    for tweet in pillow_list:
        csvWriter.writerow([tweet.text, tweet.user.id_str, tweet.user.name, tweet.id_str, tweet.created_at])


In [None]:
# search for the first 5000 tweets containing the term 'beach', ignoring retweets, in English,
# restricted by the geocode argument (latitude,longitude,radius)
beach_tweets = tw.Cursor(api.search,
                        q = 'beach -filter:retweets',
                        lang = 'en',
                        geocode = '39.833,-98.583,1660mi').items(5000)

beach_list_full = list(beach_tweets)

# filter out only those tweets that have specific geocodes attached
beach_list = [t for t in beach_list_full if t.geo is not None]

## convert entire tweet entry into pandas data frame (and save that)

# collect the raw json dictionary of every tweepy object in our `beach_list`
rows = [tweet._json for tweet in beach_list]

# flatten all of the dictionaries into one dataframe in a single call (one row per tweet);
# the geo filter above can leave `beach_list` empty, and unlike concatenating per-tweet
# dataframes this call returns an empty dataframe instead of raising
beach_tweets_df = pd.json_normalize(rows)

# make sure the output folder exists before writing to it
os.makedirs('./saved_searches', exist_ok = True)

# save the dataframe to a csv file for later
beach_tweets_df.to_csv('./saved_searches/beach_tweets_extensive.csv', index = False, quoting=csv.QUOTE_ALL)

In [None]:
# first 500 English, non-retweet tweets containing the exact phrase "Donald Trump",
# requested with result_type = 'recent'
trump_mentions = tw.Cursor(
    api.search,
    q = '"Donald Trump" -filter:retweets',
    lang = 'en',
    result_type = 'recent',
).items(500)
trump_list = [mention for mention in trump_mentions]

# same search settings, this time for the exact phrase "Joe Biden"
biden_mentions = tw.Cursor(
    api.search,
    q = '"Joe Biden" -filter:retweets',
    lang = 'en',
    result_type = 'recent',
).items(500)
biden_list = [mention for mention in biden_mentions]

In [None]:
# collect the raw json dictionary of every tweepy object in our `trump_list`
rows = [tweet._json for tweet in trump_list]

# flatten all of the dictionaries into one dataframe in a single call (one row per tweet);
# unlike building one dataframe per tweet and concatenating them, this also works when the list is empty
trump_tweets_df = pd.json_normalize(rows)

# make sure the output folder exists before writing to it
os.makedirs('./saved_searches', exist_ok = True)

# save the dataframe to a csv file for later
trump_tweets_df.to_csv('./saved_searches/trump_tweets_extensive.csv', index = False, quoting=csv.QUOTE_ALL)

In [None]:
# collect the raw json dictionary of every tweepy object in our `biden_list`
rows = [tweet._json for tweet in biden_list]

# flatten all of the dictionaries into one dataframe in a single call (one row per tweet);
# unlike building one dataframe per tweet and concatenating them, this also works when the list is empty
biden_tweets_df = pd.json_normalize(rows)

# make sure the output folder exists before writing to it
os.makedirs('./saved_searches', exist_ok = True)

# save the dataframe to a csv file for later
biden_tweets_df.to_csv('./saved_searches/biden_tweets_extensive.csv', index = False, quoting=csv.QUOTE_ALL)