## This notebook downloads tweets for use in generative tweet project.##

In [2]:
# Run this cell to set up your notebook
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import zipfile
import json
from pprint import pprint
import tweepy
from pathlib import Path

# Ensure that Pandas shows at least 280 characters in columns, so we can see full tweets
pd.set_option('max_colwidth', 280)

%matplotlib inline
plt.style.use('fivethirtyeight')
import seaborn as sns
sns.set()
sns.set_context("talk")
import re

## Utility Functions: ##

        These functions are used to import data from Twitter api. They use the given keys from keys.json,
        which are specific to every twitter dev account. 

In [3]:
def load_keys(path):
    """Loads your Twitter authentication keys from a file on disk.
    
    Args:
        path (str): The path to your key file.  The file should
          be in JSON format and look like this (but filled in):
            {
                "consumer_key": "<your Consumer Key here>",
                "consumer_secret":  "<your Consumer Secret here>",
                "access_token": "<your Access Token here>",
                "access_token_secret": "<your Access Token Secret here>"
            }
    
    Returns:
        dict: A dictionary mapping key names (like "consumer_key") to
          key values.
    """
    import logging
    from tweepy import TweepError
    
    with open(path) as f:
        keys = json.load(f)
    
    try:
        auth = tweepy.OAuthHandler(keys["consumer_key"], keys["consumer_secret"])
        auth.set_access_token(keys["access_token"], keys["access_token_secret"])
        api = tweepy.API(auth)
    except TweepError as e:
        logging.warning("There was a Tweepy error. Double check your API keys and try again.")
        logging.warning(e)
        
    return keys

In [4]:
def download_recent_tweets_by_user(user_account_name, keys):
    """Downloads tweets by one Twitter user.

    Args:
        user_account_name (str): The name of the Twitter account
          whose tweets will be downloaded.
        keys (dict): A Python dictionary with Twitter authentication
          keys (strings), like this (but filled in):
            {
                "consumer_key": "<your Consumer Key here>",
                "consumer_secret":  "<your Consumer Secret here>",
                "access_token": "<your Access Token here>",
                "access_token_secret": "<your Access Token Secret here>"
            }

    Returns:
        list: A list of Dictonary objects, each representing one tweet.
    """
    
    # Authenticate the api.
    auth = tweepy.OAuthHandler(keys["consumer_key"], keys["consumer_secret"])
    auth.set_access_token(keys["access_token"], keys["access_token_secret"])
    api = tweepy.API(auth)
    
    tweets = [t._json for t in tweepy.Cursor(api.user_timeline, id=user_account_name, 
                                             tweet_mode='extended').items()]
    
    return tweets

In [5]:
def save_tweets(tweets, path):
    """Saves a list of tweets to a file in the local filesystem.
    
    This function makes no guarantee about the format of the saved
    tweets, **except** that calling load_tweets(path) after
    save_tweets(tweets, path) will produce the same list of tweets
    and that only the file at the given path is used to store the
    tweets.  (That means you can implement this function however
    you want, as long as saving and loading works!)

    Args:
        tweets (list): A list of tweet objects (of type Dictionary) to
          be saved.
        path (str): The place where the tweets will be saved.

    Returns:
        None
    """
    with open(path, "w") as f:        
        json.dump(tweets, f)
        
    return

In [6]:
def load_tweets(path):
    """Loads tweets that have previously been saved.
    
    Calling load_tweets(path) after save_tweets(tweets, path)
    will produce the same list of tweets.
    
    Args:
        path (str): The place where the tweets were be saved.

    Returns:
        list: A list of Dictionary objects, each representing one tweet.
    """
    
    with open(path, "r") as f:
        tweets = json.load(f)
        
    return tweets

In [14]:
def get_tweets_with_cache(user_account_name, keys_path):
    """Get recent tweets from one user, loading from a disk cache if available.
    
    The first time you call this function, it will download tweets by
    a user.  Subsequent calls will not re-download the tweets; instead
    they'll load the tweets from a save file in your local filesystem.
    All this is done using the functions you defined in the previous cell.
    This has benefits and drawbacks that often appear when you cache data:
    
    +: Using this function will prevent extraneous usage of the Twitter API.
    +: You will get your data much faster after the first time it's called.
    -: If you really want to re-download the tweets (say, to get newer ones,
       or because you screwed up something in the previous cell and your
       tweets aren't what you wanted), you'll have to find the save file
       (which will look like <something>_recent_tweets.pkl) and delete it.
    
    Args:
        user_account_name (str): The Twitter handle of a user, without the @.
        keys_path (str): The path to a JSON keys file in your filesystem.
    """
    
    assert user_account_name.find('@', 0) == -1, "Use twitter handle without the leading @."
    
    file_path = 'data/Obama_tweets.json'
    if not Path(file_path).is_file():
        
        keys = load_keys(keys_path)
        tweets = download_recent_tweets_by_user(user_account_name, keys)
        save_tweets(tweets, file_path)
        
    else:
        tweets = load_tweets(file_path)
        
    return tweets

In [8]:
def tweets_to_list(tweets, ID, TEXT, RETWEET):
    """Extracts the id, text, and retweet status of tweets.
    
    This function will take in a list of dictionaries and
    extract the id, text, and retweet fields, putting them 
    in their respective lists, passed in as arguments. We 
    should note that text can be under multiple keys, namely
    text or full text. The ith element in ID, TEXT, AND RETWEET
    corresponds to a single tweet.
    
    Args:
        tweets - list of dictionary objects containing tweets.
        ID - list to place the respective tweet id's.
        TEXT - list to place the respective text.
        RETWEET - list to place the respective retweet status.
        
    Returns:
        tuple - (ID, TEXT, RETWEET)
    """
    
    for tweet in tweets:
        ID.append(tweet['id'])
        RETWEET.append(tweet['retweeted'])
        try:
            text = tweet['text']
        except KeyError:
            text = tweet['full_text']
        TEXT.append(text)
    return ID, TEXT, RETWEET


In [9]:
def to_dataframe(tweets_lst):
    """Creates a dataframe from the corresponding lists.
    
    Args:
        tweets_lst - list of .json files containing tweets
        
    Returns:
        Pandas DataFrame object with columns: [ID, TEXT, RETWEET]
            + ID - The unique id of the tweet. type(integer)
            + Text - The actual text of the tweet. type(string)
            + Retweet - A boolean value indicating wether or not the tweet is a retweet. type(boolean)
    """
    ID = []
    TEXT = []
    RETWEET = []
    
    for file in tweets_lst:
        ID, TEXT, RETWEET = tweets_to_list(file, ID, TEXT, RETWEET)

    data = pd.DataFrame({
        'ID': ID,
        'RETWEET': RETWEET,
        'TEXT': TEXT
    })
    return data

In [10]:
def make_unique(data):
    """Drops repeat tweets and retweets.
    
    This function takes in a pandas dataframe, and returns
    a new dataframe that has only uniqe tweets. Also prints
    out the size of the dataframe for visual confirmation of
    functions validity.
    
    Args:
        data - Pandas DataFrame of tweets, with possible repeats.
    
    Returns:
        Pandas DataFrame of the same shape with no repeats or retweets.
    """
    
    unique_tweet_df = data.drop_duplicates(subset='ID', keep='first')
    unique_tweet_df = unique_tweet_df[~unique_tweet_df['RETWEET']]
    
    print("Number of unique tweets: ", unique_tweet_df.shape[0])
    print("Number of False retweets: ", unique_tweet_df['RETWEET'].value_counts().sum())
    print("Number of Unique id's: ", unique_tweet_df['ID'].value_counts().sum())
    
    return unique_tweet_df
    

In [11]:
def filter_string(data):
    """Take the \n and weblinks out of the tweets
    
    Filters the string. 
    Also drops any rows with the empty string after filtering.
    
    Args:
        pandas dataframe
    
    Retruns:
        new pandas datafram with filtered string.
    """
    
    data['TEXT'] = (
                    data['TEXT']
#                     .str.replace("(go to|register|read more):\s{0,}(https\S*)", "\\2", flags=re.IGNORECASE)
                    .str.replace("(https\S*)", "[URL]")
                    .str.replace("\\n", "")
                )
    data = data[~((data['TEXT'] == '') | (data['TEXT'] == '[URL]'))]
    return data



## Executable Section ##

        Here we execute the code above, and actually import the data, if it has not been imported already.
        We also should note, that this code, will not run, unless keys.json is set up with appropriate 
        keys from twitter developer account.
        
        Imports the data from .json format, and returns a pandas DataFrame with the structure:
        
        Columns: [ID, TEXT, RETWEET]
            + ID - The unique id of the tweet. type(integer)
            + Text - The actual text of the tweet. type(string)
            + Retweet - A boolean value indicating wether or not the tweet is a retweet. type(boolean)
            

In [11]:
my_zip = zipfile.ZipFile('data/old_trump_tweets.json.zip', 'r')
with my_zip.open("old_trump_tweets.json", "r") as f:
    old_trump_tweets = json.load(f)
print("Number of tweets downloaded/imported:", len(old_trump_tweets))  

Number of tweets downloaded/imported: 6738


In [12]:
with open('data/realdonaldtrump_recent_tweets.json') as f:
    medium_age_trump_tweets = json.load(f)
print("Number of tweets downloaded/imported:", len(medium_age_trump_tweets))  

Number of tweets downloaded/imported: 3214


In [26]:
key_file = 'keys_copy.json'
new_trump_tweets = get_tweets_with_cache("BarackObama", key_file)
print("Number of tweets downloaded/imported:", len(new_trump_tweets))

Number of tweets downloaded/imported: 3217


In [14]:
with open('data/Tweets_save_file.json') as f:
    trump_tweets = json.load(f)
print("Number of tweets downloaded/imported:", len(medium_age_trump_tweets)) 

Number of tweets downloaded/imported: 3214


In [15]:
# Make the dataframe from the .json files of tweets

tweets_lst = [old_trump_tweets, medium_age_trump_tweets, new_trump_tweets, trump_tweets]
tweet_df = to_dataframe(tweets_lst)
print("shape: ", tweet_df.shape)
tweet_df.head(10)   


shape:  (14709, 3)


Unnamed: 0,ID,RETWEET,TEXT
0,786204978629185536,False,PAY TO PLAY POLITICS. \n#CrookedHillary https://t.co/wjsl8ITVvk
1,786201435486781440,False,Very little pick-up by the dishonest media of incredible information provided by WikiLeaks. So dishonest! Rigged system!
2,786189446274248704,False,Crooked Hillary Clinton likes to talk about the things she will do but she has been there for 30 years - why didn't she do them?
3,786054986534969344,False,Thank you Florida- a MOVEMENT that has never been seen before and will never be seen again. Lets get out &amp;… https://t.co/t9XM9wFDZI
4,786007502639038464,False,"Join me Thursday in Florida &amp; Ohio!\nWest Palm Beach, FL at noon:\nhttps://t.co/jwbZnQhxg9\nCincinnati, OH this 7:30pm:\nhttps://t.co/5w2UhalPIx"
5,785979396620324865,False,"Wow, @CNN Town Hall questions were given to Crooked Hillary Clinton in advance of big debates against Bernie Sanders. Hillary &amp; CNN FRAUD!"
6,785957064480653313,False,Thank you Texas! If you haven't registered to VOTE- today is your last day. Go to: https://t.co/HfihPEA3Sp &amp; get ou… https://t.co/gxbDRD4x2k
7,785947219216125952,False,VOTER REGISTRATION DEADLINES TODAY. You can register now at: https://t.co/HfihPEA3Sp and get out to… https://t.co/LlFdF0DRX4
8,785913754194104320,False,DON'T LET HER FOOL US AGAIN. https://t.co/3QSoADFh7S
9,785910334427058177,False,"Crooked's State Dept gave special attention to ""Friends of Bill"" after the Haiti Earthquake. Unbelievable! https://t.co/opP2l8ln0J"


In [28]:
# Make the dataframe from the .json files of tweets

tweets_lst = [new_trump_tweets]
tweet_df = to_dataframe(tweets_lst)
print("shape: ", tweet_df.shape)
tweet_df.head(10)   


shape:  (3217, 3)


Unnamed: 0,ID,RETWEET,TEXT
0,1117886698568830976,False,"Notre Dame is one of the world’s great treasures, and we’re thinking of the people of France in your time of grief. It’s in our nature to mourn when we see history lost – but it’s also in our nature to rebuild for tomorrow, as strong as we can. https://t.co/SpMEvv1BzB"
1,1117498965417656321,False,"Congratulations, Tiger! To come back and win the Masters after all the highs and lows is a testament to excellence, grit, and determination."
2,1116840506506514433,False,"Another good story worth sharing: From one ""kid from Akron"" to a new generation of Akron kids, some remarkable early achievements at @IPROMISESchool. Great work, @KingJames—and even better work by those students. Proud to be a witness to their success: https://t.co/ctWfKnA5GL"
3,1116130925396602880,False,"From a big NBA fan, congrats to future Hall of Famers Dwyane Wade and Dirk Nowitzki—not just all-time greats but class acts, too."
4,1114517185483898880,False,"In just a few minutes, I’m taking the stage at the @ObamaFoundation Town Hall Europe for a conversation about how to best support civic leaders carrying out good work across the continent. I hope you’ll tune in live at https://t.co/swMRHuB5Y4. https://t.co/Ds4I8wFFM1"
5,1114202323767701504,False,A voice everybody should hear. https://t.co/0u7HUlQ8id
6,1113884881807990789,False,Here’s a story about people doing good that's worth sharing: @BradPaisley and his wife @Kimwilliamspais saw food insecurity in Nashville and decided to do something about it. They just broke ground on a free grocery store to help families in need: https://t.co/o6pcAApZsm
7,1113518349483421696,False,"Great to see Chicago’s historic mayoral race between two highly qualified candidates. Congrats to our next mayor, Lori Lightfoot—and Toni Preckwinkle campaigned hard and did us proud. I know that with our city’s heart and Lori’s leadership, Chicago’s best days are still ahead."
8,1113065751748009984,False,"Valerie is one of my oldest friends and advisors – she was by my side when I first decided to run for office and for every major moment of the presidency. @ValerieJarrett's voice has often inspired me and I know “Finding My Voice” will inspire others to lift their voices, too..."
9,1110610860513349633,False,"Last night I had the chance to meet with first-term Democrats in Congress—it's a young, diverse class, stocked with a bunch of my campaign and administration alums who’ve taken the torch. This group is going to be driving progress for a long time to come. https://t.co/W5dDlDtNzE"


In [16]:
# Filter out any Duplicate tweets, and any retweets.

unique_tweet_df = make_unique(tweet_df)
unique_tweet_df.head()


Number of unique tweets:  10706
Number of False retweets:  10706
Number of Unique id's:  10706


Unnamed: 0,ID,RETWEET,TEXT
0,786204978629185536,False,PAY TO PLAY POLITICS. \n#CrookedHillary https://t.co/wjsl8ITVvk
1,786201435486781440,False,Very little pick-up by the dishonest media of incredible information provided by WikiLeaks. So dishonest! Rigged system!
2,786189446274248704,False,Crooked Hillary Clinton likes to talk about the things she will do but she has been there for 30 years - why didn't she do them?
3,786054986534969344,False,Thank you Florida- a MOVEMENT that has never been seen before and will never be seen again. Lets get out &amp;… https://t.co/t9XM9wFDZI
4,786007502639038464,False,"Join me Thursday in Florida &amp; Ohio!\nWest Palm Beach, FL at noon:\nhttps://t.co/jwbZnQhxg9\nCincinnati, OH this 7:30pm:\nhttps://t.co/5w2UhalPIx"


In [17]:
unique_tweet_df.to_csv(path_or_buf="./data/original_tweets.csv", index=False)

Unnamed: 0,ID,RETWEET,TEXT
0,786204978629185536,False,PAY TO PLAY POLITICS. \n#CrookedHillary https://t.co/wjsl8ITVvk
1,786201435486781440,False,Very little pick-up by the dishonest media of incredible information provided by WikiLeaks. So dishonest! Rigged system!
2,786189446274248704,False,Crooked Hillary Clinton likes to talk about the things she will do but she has been there for 30 years - why didn't she do them?
3,786054986534969344,False,Thank you Florida- a MOVEMENT that has never been seen before and will never be seen again. Lets get out &amp;… https://t.co/t9XM9wFDZI
4,786007502639038464,False,"Join me Thursday in Florida &amp; Ohio!\nWest Palm Beach, FL at noon:\nhttps://t.co/jwbZnQhxg9\nCincinnati, OH this 7:30pm:\nhttps://t.co/5w2UhalPIx"


In [17]:
# Filter out the \n and weblinks in the tweets.

unique_tweet_df = filter_string(unique_tweet_df)
print("Shape: ", unique_tweet_df.shape)
unique_tweet_df.head()

Shape:  (10622, 3)


Unnamed: 0,ID,RETWEET,TEXT
0,786204978629185536,False,PAY TO PLAY POLITICS. #CrookedHillary [URL]
1,786201435486781440,False,Very little pick-up by the dishonest media of incredible information provided by WikiLeaks. So dishonest! Rigged system!
2,786189446274248704,False,Crooked Hillary Clinton likes to talk about the things she will do but she has been there for 30 years - why didn't she do them?
3,786054986534969344,False,Thank you Florida- a MOVEMENT that has never been seen before and will never be seen again. Lets get out &amp;… [URL]
4,786007502639038464,False,"Join me Thursday in Florida &amp; Ohio!West Palm Beach, FL at noon:[URL]Cincinnati, OH this 7:30pm:[URL]"


In [18]:
# Final pandas dataframe
print("Shape: ", unique_tweet_df.shape)
unique_tweet_df.head(10)

Shape:  (10622, 3)


Unnamed: 0,ID,RETWEET,TEXT
0,786204978629185536,False,PAY TO PLAY POLITICS. #CrookedHillary [URL]
1,786201435486781440,False,Very little pick-up by the dishonest media of incredible information provided by WikiLeaks. So dishonest! Rigged system!
2,786189446274248704,False,Crooked Hillary Clinton likes to talk about the things she will do but she has been there for 30 years - why didn't she do them?
3,786054986534969344,False,Thank you Florida- a MOVEMENT that has never been seen before and will never be seen again. Lets get out &amp;… [URL]
4,786007502639038464,False,"Join me Thursday in Florida &amp; Ohio!West Palm Beach, FL at noon:[URL]Cincinnati, OH this 7:30pm:[URL]"
5,785979396620324865,False,"Wow, @CNN Town Hall questions were given to Crooked Hillary Clinton in advance of big debates against Bernie Sanders. Hillary &amp; CNN FRAUD!"
6,785957064480653313,False,Thank you Texas! If you haven't registered to VOTE- today is your last day. Go to: [URL] &amp; get ou… [URL]
7,785947219216125952,False,VOTER REGISTRATION DEADLINES TODAY. You can register now at: [URL] and get out to… [URL]
8,785913754194104320,False,DON'T LET HER FOOL US AGAIN. [URL]
9,785910334427058177,False,"Crooked's State Dept gave special attention to ""Friends of Bill"" after the Haiti Earthquake. Unbelievable! [URL]"


In [21]:
# Save the dataframe as a csv for use in the model.
unique_tweet_df.to_csv(path_or_buf="./data/tweet_data.csv", index=False)