## Name: Deanna Schneider
The following three cells perform the setup needed for collecting tweets.

In [1]:
# Imports
import tweepy
import csv
import pandas as pd
import json
import re
import datetime

In [3]:
# The API credentials live in a separate local csv file that is left out of the git push,
# so they stay on this machine.
keys = pd.read_csv('keys.csv')

# Each credential has its own column; take the value stored under row label 0 of each one.
con_key, con_secret, acc_token, acc_secret = (
    keys[column][0]
    for column in ('con_key', 'con_secret', 'acc_token', 'acc_secret')
)


In [4]:
#Use tweepy.OAuthHandler to create an authentication using the given key and secret
auth = tweepy.OAuthHandler(consumer_key=con_key, consumer_secret=con_secret)
auth.set_access_token(acc_token, acc_secret)

#Connect to the Twitter API using the authentication.
#wait_on_rate_limit makes tweepy sleep until the rate-limit window resets instead of
#raising an error, so a long collection run is paused rather than ended early.
api = tweepy.API(auth, wait_on_rate_limit=True)

## Code Rationale

The majority of the work of collecting and storing tweets is done by the following two functions. While I could have written a wrapper class for all of this, it didn't seem necessary, given that there are only two functions.

In [5]:
def get_tweets(number, hashtag):
    """
    Collect original, text-only English tweets for a single hashtag.

    Searches through the module-level ``api`` client, 100 tweets per request.
    Every request after the first is capped with ``max_id`` set just below the
    id of the last tweet on the previous page, so no tweet is fetched twice.

    Parameters
    ----------
    number : int
        Maximum number of tweets to return.
    hashtag : str
        The hashtag to search for, without the leading hash.

    Returns
    -------
    list
        At most ``number`` tweets, in the order the search returned them.
        The list is shorter when the search runs out of results or when a
        ``tweepy.TweepError`` interrupts the collection; the tweets gathered
        up to that point are still returned.
    """
    #https://stackoverflow.com/questions/27941940/how-to-exclude-retweets-and-replies-in-a-search-api
    #we are only looking for text-based original tweets, so filtering out links, images and videos
    query = '%23' + hashtag + ' AND lang:en AND -filter:retweets AND -filter:replies AND  -Filter:Links AND -Filter:Media'

    tweet_list = []
    last_id = None # id of the last tweet seen; None until the first page has arrived
    while len(tweet_list) < number:
        search_args = {'q': query, 'count': 100, 'tweet_mode': 'extended'}
        if last_id is not None:
            # max_id is inclusive, so go one below the last id to skip the tweet we already have
            search_args['max_id'] = str(last_id - 1)

        try:
            new_tweets = api.search(**search_args)
        except tweepy.TweepError as e:
            # keep what has been collected so far instead of losing it to the error
            print("Error", e)
            break

        if not new_tweets:
            print("Could not find any more tweets!")
            break

        tweet_list.extend(new_tweets)
        last_id = new_tweets[-1].id

    # the final page can overshoot the request, so trim to the number asked for
    return tweet_list[:number]




In [6]:
def write_Tweets(filename, tweet_list):
    """
    Save tweets to a pipe-delimited text file, one tweet per line.

    filename: path of the file to create; an existing file is overwritten.
    tweet_list: tweets to write. Each one must provide author, entities,
        created_at, favorite_count, full_text, source and retweet_count.

    The first line of the file is a header naming the twelve columns.
    The Language column is filled from the author's lang attribute.
    Pipes and line breaks inside the screen name, text and source are replaced
    so they cannot break the row layout; the text and source also lose their
    single and double quote characters.
    """

    def flatten(value, drop_quotes=False):
        # swap out the delimiter and any line breaks so the value stays in one column on one line
        value = value.replace('|','-').replace('\n', ' ').replace('\r', ' ')
        if drop_quotes:
            value = value.replace('"', '').replace("'", "")
        return value

    header = "AuthorID|Author.Screen_Name|Followers.Count|Friends.Count|Statuses.Count|Language|Created_At|Favorite_count|Text|Source|retweet_count|Hashtags\n"
    row_template = "%(author_id)s|%(authorscreename)s|%(followers_count)s|%(friends_count)s|%(statuses_count)s|%(language)s|%(created_at)s|%(favorite_count)s|%(text)s|%(source)s|%(retweetcount)s|%(hashtags)s\n"

    with open(filename,"w", newline="", encoding='utf-8') as outfile:
        outfile.write(header)
        for status in tweet_list:
            # the hashtag column holds the printed form of a list of the tag texts
            tags = [tag["text"] for tag in status.entities["hashtags"]]
            user = status.author

            record = {
                'author_id': user.id_str,
                'authorscreename': flatten(user.screen_name),
                'followers_count': user.followers_count,
                'friends_count': user.friends_count,
                'statuses_count': user.statuses_count,
                'language': user.lang,
                'created_at': status.created_at,
                'favorite_count': status.favorite_count,
                'text': flatten(status.full_text, drop_quotes=True),
                'source': flatten(status.source, drop_quotes=True),
                'retweetcount': status.retweet_count,
                'hashtags': tags,
            }
            outfile.write(row_template % record)


## Collecting Tweets
Tweets were collected at two points in time: October and December 2017. Initially, I hard-coded the file name as hashtag_static.txt. When I realized I needed to collect more tweets, I added a dynamic date component to the file name so that I wouldn't accidentally overwrite previous files and so that I could track when each collection took place. The initial tweets are still stored in the repository under the static file name.

In [None]:
#MeToo - a passive voice tweet

#get the date
now = datetime.datetime.now()
#set the filename, stamped with the collection date so a run on a later day does not overwrite this one
filename = "metoo_static_%d_%d_%d.txt" %(now.year, now.month, now.day)

#get the metoo tweets and save them
tweet_list = get_tweets(9000, 'metoo')
write_Tweets(filename, tweet_list)

#show how many tweets were collected; only the last expression of a cell is displayed
len(tweet_list)


In [17]:
#TakeaKnee - an active voice tweet

#get the date
now = datetime.datetime.now()
#set the filename, stamped with the collection date so a run on a later day does not overwrite this one
filename = "takeaknee_static_%d_%d_%d.txt" %(now.year, now.month, now.day)

#get the takeaknee tweets and save them
tweet_list = get_tweets(3159, 'takeaknee')
write_Tweets(filename, tweet_list)

#show how many tweets were collected; only the last expression of a cell is displayed
len(tweet_list)
