# Project 2 - GeoTweet

@Author Jeffery Brown (daddyjab)<br>
@Date 3/25/19<br>
@File ETL_for_GeoTweet


In [1]:
# Dependencies
import tweepy
import json
import time

# Twitter API Keys
import os
from api_config import *

# Other
import pandas as pd
from datetime import datetime 
from dateutil import tz
import requests

from pprint import pprint

# Twitter API
# key_twitter_tweetquestor_consumer_api_key
# key_twitter_tweetquestor_consumer_api_secret_key
# key_twitter_tweetquestor_access_token
# key_twitter_tweetquestor_access_secret_token

# Yahoo! API
# key_yahoo_infoquestor_app_id
# key_yahoo_infoquestor_client_id
# key_yahoo_infoquestor_client_secret

# Flickr API
# key_flicker_infoquestor_key
# key_flicker_infoquestor_secret



In [2]:
# RATE LIMITS
# Tweepy raises RateLimitError when an API call fails because a Twitter rate limit
# was hit, so callers can simply watch for that exception - see: http://docs.tweepy.org/en/v3.5.0/api.html


In [3]:
# Authenticate with Twitter through Tweepy.
# The key_twitter_tweetquestor_* credentials are expected to come from the api_config import.
auth = tweepy.OAuthHandler(
    key_twitter_tweetquestor_consumer_api_key,
    key_twitter_tweetquestor_consumer_api_secret_key,
)
auth.set_access_token(
    key_twitter_tweetquestor_access_token,
    key_twitter_tweetquestor_access_secret_token,
)

# Build the API client with Tweepy's JSON parser; the rest of the notebook
# indexes the results like plain dictionaries and lists
api = tweepy.API(auth, parser=tweepy.parsers.JSONParser())

In [340]:
def api_calls_remaining( a_api, a_type = "place"):
    """Return the number of Twitter API calls remaining for a trends endpoint.

    Args:
        a_api: API object whose rate_limit_status() method returns the Twitter
            rate limit information as a nested dictionary.
        a_type: Type of trends query to check:
            'place': Top 10 trending topics for a WOEID
            'closest': Locations near a specified lat/long for which Twitter has trending topic info
            'available': Locations for which Twitter has topic info

    Returns:
        The 'remaining' value reported for "/trends/<a_type>", or "" if a_type
        is not a valid type of trends query.
    """
    # Get Twitter rate limit information using the Tweepy API
    rate_limits = a_api.rate_limit_status()

    # Focus on the rate limits for trends calls
    trends_limits = rate_limits['resources']['trends']

    # Only a missing/malformed entry means "not a valid type"; anything else
    # (e.g. KeyboardInterrupt) must propagate rather than be swallowed
    try:
        return trends_limits[ f"/trends/{a_type}" ]['remaining']
    except (KeyError, TypeError):
        return ""

In [341]:
def api_time_before_reset( a_api, a_type = "place"):
    """Return the number of minutes until the Twitter rate limit resets for a trends endpoint.

    Also prints the time remaining together with the reset time and the
    current time, both expressed in the local time zone.

    Args:
        a_api: API object whose rate_limit_status() method returns the Twitter
            rate limit information as a nested dictionary.
        a_type: Type of trends query to check:
            'place': Top 10 trending topics for a WOEID
            'closest': Locations near a specified lat/long for which Twitter has trending topic info
            'available': Locations for which Twitter has topic info

    Returns:
        Minutes (float) between now and the reset time, or -1 if a_type is
        not a valid type of trends query.
    """
    # Local import: the notebook's import cell only brings in the datetime class
    from datetime import timezone

    # Get Twitter rate limit information using the Tweepy API
    rate_limits = a_api.rate_limit_status()

    # Focus on the rate limits for trends calls
    trends_limits = rate_limits['resources']['trends']

    # Look up the reset time (a POSIX timestamp) for the requested query type
    try:
        reset_ts = trends_limits[ f"/trends/{a_type}" ]['reset']
    except (KeyError, TypeError):
        return -1

    # Build timezone-aware UTC datetimes directly instead of creating naive
    # ones and attaching tzinfo afterwards
    reset_utc = datetime.fromtimestamp(reset_ts, tz=timezone.utc)
    current_utc = datetime.now(timezone.utc)

    # Time remaining in minutes (negative if the reset time has already passed)
    time_before_reset = (reset_utc - current_utc).total_seconds() / 60.0

    # astimezone() with no argument converts to the system's local time zone
    reset_local = reset_utc.astimezone()
    current_local = current_utc.astimezone()
    print(f"Time Before Reset: {time_before_reset:.1f}: Reset Time: {reset_local.strftime('%Y-%m-%d %H:%M:%S')}, Local Time: {current_local.strftime('%Y-%m-%d %H:%M:%S')}")

    # Return the time before reset (in minutes)
    return time_before_reset

In [344]:
# Check how many "/trends/place" calls are still available before the rate limit is hit
calls_remaining = api_calls_remaining( api, "place" )
calls_remaining

75

In [343]:
# Check how many minutes remain until the "/trends/place" rate limit resets
time_before_reset = api_time_before_reset( api, "place")
time_before_reset

Time Before Reset: 15.0: Reset Time: 2019-03-26 23:16:06, Local Time: 2019-03-26 23:01:04


15.020642466666667

In [328]:
def get_trends_available_to_df( ):
    """Build a dataframe of the U.S. locations for which Twitter has trends data.

    Calls api.trends_available() on the notebook-level Tweepy `api` object,
    keeps only the entries whose countryCode is "US", flattens the nested
    placeType information and renames the fields.

    Returns:
        A dataframe with the columns twitter_country, tritter_country_code,
        twitter_name, twitter_parentid, woeid and twitter_type, or False if
        the Twitter API call failed.
    """
    # Obtain the WOEID locations for which Twitter Trends info is available
    try:
        trends_avail = api.trends_available()

    except tweepy.TweepError as e:
        # The list of locations could not be retrieved, return False
        print("Error obtaining locations with available trends: ", e)
        return False

    # Import trend availability info into a dataframe
    all_locations_df = pd.DataFrame.from_dict(trends_avail, orient='columns')

    # Each step below returns a new dataframe, so the filtered slice is never
    # modified in place
    trends_avail_df = (
        all_locations_df[ all_locations_df['countryCode'] == "US" ]  # U.S. locations only
        .reset_index(drop=True)
        # Flatten the placeType dictionary by keeping just its name
        .assign( twitter_type = lambda df: df['placeType'].map( lambda x: x['name'] ) )
        # Remove unneeded fields
        .drop(columns=['placeType', 'url'])
        # NOTE(review): 'tritter_country_code' looks like a typo for 'twitter_country_code';
        # kept as-is because the column name is part of the returned schema
        .rename(columns={
            'woeid': 'woeid',
            'country': 'twitter_country',
            'countryCode': 'tritter_country_code',
            'name': 'twitter_name',
            'parentid': 'twitter_parentid' })
    )

    return trends_avail_df

In [329]:
# Get the U.S. locations that have Twitter Trends data as a flattened dataframe
# (get_trends_available_to_df() returns False instead if the API call fails)
trends_available_df = get_trends_available_to_df()

In [330]:
# Preview the first few locations
trends_available_df.head()

Unnamed: 0,twitter_country,tritter_country_code,twitter_name,twitter_parentid,woeid,twitter_type
0,United States,US,Albuquerque,23424977,2352824,Town
1,United States,US,Atlanta,23424977,2357024,Town
2,United States,US,Austin,23424977,2357536,Town
3,United States,US,Baltimore,23424977,2358820,Town
4,United States,US,Baton Rouge,23424977,2359991,Town


In [240]:
def _extract_admin_area( a_place, a_source_key, a_prefix ):
    """Flatten one administrative area (county, region or country) of a Flickr place.

    Returns a dictionary with the keys '<a_prefix>_name', '<a_prefix>_name_only'
    and '<a_prefix>_woeid'; all three are None if the area is missing or malformed.
    """
    try:
        area = a_place[a_source_key]
        full_name = area['_content']
        return {
            f'{a_prefix}_name': full_name,
            f'{a_prefix}_name_only': full_name.split(",")[0].strip(),
            f'{a_prefix}_woeid': int(area['woeid']),
        }
    except (KeyError, TypeError, ValueError, AttributeError):
        return {
            f'{a_prefix}_name': None,
            f'{a_prefix}_name_only': None,
            f'{a_prefix}_woeid': None,
        }


def get_location_info( a_woeid, a_timeout = 30 ):
    """Use a Flickr API call to get location information associated with a Yahoo! WOEID.

    Note: Yahoo! no longer supports this type of lookup! :(

    Args:
        a_woeid: The WOEID to look up (converted to a string for the request).
        a_timeout: Seconds to wait for the Flickr API before giving up.

    Returns:
        A flattened dictionary with a useful subset of the location info:
        woeid, name_woe, name_full, name_only, place_type, latitude, longitude
        (omitted if the basic information is not in the response), plus
        timezone and the county_*, state_* and country_* fields (None when
        not available). Returns False if the request failed or the response
        was not valid JSON.
    """
    # Setup the Flickr API URL and query parameters; requests takes care of
    # building and encoding the query string
    flickr_api_url = "https://api.flickr.com/services/rest/"
    flickr_api_params = {
        'method': 'flickr.places.getInfo',
        'api_key': key_flicker_infoquestor_key,
        'format': 'json',
        'nojsoncallback': 1,
        'woe_id': str(a_woeid),
    }

    try:
        # Get the REST response, which will be in JSON format
        response = requests.get(url=flickr_api_url, params=flickr_api_params, timeout=a_timeout)

    except requests.exceptions.RequestException as e:
        print(f"Error obtaining location information for WOEID {a_woeid}: ", e)
        return False

    # Parse the json
    try:
        location_data = response.json()

    except ValueError as e:
        print(f"Error parsing location information for WOEID {a_woeid}: ", e)
        return False

    # Check for failure to locate the information; processing continues so
    # that the caller still receives a dictionary (with None placeholders)
    if location_data.get('stat') == 'fail':
        print(f"Error finding location WOEID {a_woeid}: {location_data.get('message')}")

    # The location details live under 'place' (None if Flickr found nothing)
    place = location_data.get('place')

    # Return just a useful subset of the location info as flattened dictionary
    key_location_info = {}

    # Basic information that should be present for any location;
    # it is added all-or-nothing
    try:
        key_location_info.update( {
            'woeid': int(place['woeid']),
            'name_woe': place['woe_name'],
            'name_full': place['name'],
            'name_only': place['name'].split(",")[0].strip(),
            'place_type': place['place_type'],
            'latitude': float(place['latitude']),
            'longitude': float(place['longitude']),
        })

    except (KeyError, TypeError, ValueError, AttributeError) as e:
        print(f"Error - basic location information not returned for WOEID {a_woeid}: ", repr(e))

    # Timezone associated with the location - if available
    try:
        location_timezone = place['timezone']
    except (KeyError, TypeError):
        location_timezone = None
    key_location_info['timezone'] = location_timezone

    # County, State (Flickr calls it 'region') and Country associated with
    # the location - if available
    for source_key, prefix in ( ('county', 'county'), ('region', 'state'), ('country', 'country') ):
        key_location_info.update( _extract_admin_area( place, source_key, prefix ) )

    return key_location_info

In [345]:
# Use the get_location_info() function to add location info (from Flickr)
# for each location (Twitter WOEID) that has trend info
# NOTE(review): the [0:2] slice limits the lookup to the first two locations -
# presumably a development shortcut; confirm before running on the full list
loc_info_list =  list( trends_available_df['woeid'][0:2].apply( get_location_info ) )

In [346]:
# Inspect the raw list of flattened location dictionaries
print(loc_info_list)

[{'woeid': 2352824, 'name_woe': 'Albuquerque', 'name_full': 'Albuquerque, New Mexico, United States', 'name_only': 'Albuquerque', 'place_type': 'locality', 'latitude': 35.105, 'longitude': -106.647, 'timezone': 'America/Denver', 'county_name': 'Bernalillo County, New Mexico, United States', 'county_name_only': 'Bernalillo County', 'county_woeid': 12589279, 'state_name': 'New Mexico, United States', 'state_name_only': 'New Mexico', 'state_woeid': 2347590, 'country_name': 'United States', 'country_name_only': 'United States', 'country_woeid': 23424977}, {'woeid': 2357024, 'name_woe': 'Atlanta', 'name_full': 'Atlanta, Georgia, United States', 'name_only': 'Atlanta', 'place_type': 'locality', 'latitude': 33.763, 'longitude': -84.423, 'timezone': 'America/New_York', 'county_name': 'Fulton County, Georgia, United States', 'county_name_only': 'Fulton County', 'county_woeid': 12587929, 'state_name': 'Georgia, United States', 'state_name_only': 'Georgia', 'state_woeid': 2347569, 'country_name':

In [347]:
# Create a DataFrame from the location info list (one row per location dictionary)
loc_info_df = pd.DataFrame.from_dict(loc_info_list)

In [348]:
# Display the location info dataframe
loc_info_df

Unnamed: 0,country_name,country_name_only,country_woeid,county_name,county_name_only,county_woeid,latitude,longitude,name_full,name_only,name_woe,place_type,state_name,state_name_only,state_woeid,timezone,woeid
0,United States,United States,23424977,"Bernalillo County, New Mexico, United States",Bernalillo County,12589279,35.105,-106.647,"Albuquerque, New Mexico, United States",Albuquerque,Albuquerque,locality,"New Mexico, United States",New Mexico,2347590,America/Denver,2352824
1,United States,United States,23424977,"Fulton County, Georgia, United States",Fulton County,12587929,33.763,-84.423,"Atlanta, Georgia, United States",Atlanta,Atlanta,locality,"Georgia, United States",Georgia,2347569,America/New_York,2357024


In [349]:
# Merge the Twitter trend location available dataframe with the
# location info dataframe to create a master list of all
# Twitter Trend locations and associated location information.
# The inner join on 'woeid' keeps only locations present in both dataframes.
twitter_trend_locations_df = trends_available_df.merge(loc_info_df, how='inner', on='woeid')

In [350]:
# Display the merged Twitter trend locations dataframe
twitter_trend_locations_df

Unnamed: 0,twitter_country,tritter_country_code,twitter_name,twitter_parentid,woeid,twitter_type,country_name,country_name_only,country_woeid,county_name,...,latitude,longitude,name_full,name_only,name_woe,place_type,state_name,state_name_only,state_woeid,timezone
0,United States,US,Albuquerque,23424977,2352824,Town,United States,United States,23424977,"Bernalillo County, New Mexico, United States",...,35.105,-106.647,"Albuquerque, New Mexico, United States",Albuquerque,Albuquerque,locality,"New Mexico, United States",New Mexico,2347590,America/Denver
1,United States,US,Atlanta,23424977,2357024,Town,United States,United States,23424977,"Fulton County, Georgia, United States",...,33.763,-84.423,"Atlanta, Georgia, United States",Atlanta,Atlanta,locality,"Georgia, United States",Georgia,2347569,America/New_York


In [306]:
def get_trends_for_loc( a_woeid ):
    """Get the top Twitter trends for a location specified by a WOEID.

    Calls api.trends_place() on the notebook-level Tweepy `api` object and
    flattens the first element of the response.

    Args:
        a_woeid: The WOEID of the location to get trends for.

    Returns:
        A list with one dictionary per trend. Each dictionary repeats the
        location-level fields (woeid, twitter_name, twitter_created_at,
        twitter_as_of) and adds the trend-level fields (twitter_tweet_name,
        twitter_tweet_promoted_content, twitter_tweet_query,
        twitter_tweet_volume, twitter_tweet_url). Returns False if the top
        trends could not be obtained.
    """
    # Request the top trends; the useful content is the first element of the response
    try:
        top_trends = api.trends_place( a_woeid )[0]

    except tweepy.TweepError as e:
        # No top trends info available for this WOEID, return False
        print(f"Error obtaining top trends for WOEID {a_woeid}: ", e)
        return False

    except (IndexError, KeyError, TypeError) as e:
        # The response did not contain a first element, return False
        print(f"Error obtaining top trends for WOEID {a_woeid}: ", repr(e))
        return False

    # Repeat some information that is common for all elements in the trends list
    common_info = {}

    # Basic information that should be present for any location
    # 'as_of': '2019-03-26T21:22:42Z',
    # 'created_at': '2019-03-26T21:17:18Z',
    # 'locations': [{'name': 'Atlanta', 'woeid': 2357024}]
    try:
        common_info.update( {
            'woeid': int(top_trends['locations'][0]['woeid']),
            'twitter_name': top_trends['locations'][0]['name'],
            'twitter_created_at': top_trends['created_at'],
            'twitter_as_of': top_trends['as_of']
        })

    except (KeyError, IndexError, TypeError, ValueError) as e:
        print(f"Error - basic location information not returned for WOEID {a_woeid}: ", repr(e))

    # Loop through all of the trends and store in an array of dictionary elements
    # 'name': 'Jussie Smollett'
    # 'promoted_content': None
    # 'query': '%22Jussie+Smollett%22'
    # 'tweet_volume': 581331
    # 'url': 'http://twitter.com/search?q=%22Jussie+Smollett%22'

    # Return the trends as an array of flattened dictionaries
    trend_info = []

    for ti in top_trends['trends']:

        # Put the trend info into a dictionary, starting with the common info
        this_trend = common_info.copy()

        # Details of this trend; if any field is missing, every trend-level
        # field is recorded as None
        try:
            this_trend.update( {
                'twitter_tweet_name': ti['name'],
                'twitter_tweet_promoted_content': ti['promoted_content'],
                'twitter_tweet_query': ti['query'],
                'twitter_tweet_volume': ti['tweet_volume'],
                'twitter_tweet_url': ti['url']
            })

        except (KeyError, TypeError):
            this_trend.update( {
                'twitter_tweet_name': None,
                'twitter_tweet_promoted_content': None,
                'twitter_tweet_query': None,
                'twitter_tweet_volume': None,
                'twitter_tweet_url': None
            })

        # Append this trend to the list
        trend_info.append( this_trend )

    return trend_info

In [316]:
# Get trend info for a WOEID location (2357024 is Atlanta in the trends-available list)
t_info = get_trends_for_loc(2357024)

In [318]:
# Load the list of flattened trend dictionaries into a dataframe (one row per trend)
t_info_df = pd.DataFrame.from_dict(t_info)

In [319]:
# Display the trends dataframe
t_info_df

Unnamed: 0,twitter_as_of,twitter_created_at,twitter_name,twitter_tweet_name,twitter_tweet_promoted_content,twitter_tweet_query,twitter_tweet_url,twitter_tweet_volume,woeid
0,2019-03-27T03:47:05Z,2019-03-27T03:41:52Z,Atlanta,#ThisIsUs,,%23ThisIsUs,http://twitter.com/search?q=%23ThisIsUs,22189.0,2357024
1,2019-03-27T03:47:05Z,2019-03-27T03:41:52Z,Atlanta,#SDLive,,%23SDLive,http://twitter.com/search?q=%23SDLive,80388.0,2357024
2,2019-03-27T03:47:05Z,2019-03-27T03:41:52Z,Atlanta,Randall,,Randall,http://twitter.com/search?q=Randall,11992.0,2357024
3,2019-03-27T03:47:05Z,2019-03-27T03:41:52Z,Atlanta,Beth,,Beth,http://twitter.com/search?q=Beth,31493.0,2357024
4,2019-03-27T03:47:05Z,2019-03-27T03:41:52Z,Atlanta,Asuka,,Asuka,http://twitter.com/search?q=Asuka,63745.0,2357024
5,2019-03-27T03:47:05Z,2019-03-27T03:41:52Z,Atlanta,#USMNT,,%23USMNT,http://twitter.com/search?q=%23USMNT,,2357024
6,2019-03-27T03:47:05Z,2019-03-27T03:41:52Z,Atlanta,#THWg,,%23THWg,http://twitter.com/search?q=%23THWg,,2357024
7,2019-03-27T03:47:05Z,2019-03-27T03:41:52Z,Atlanta,Special Olympics,,%22Special+Olympics%22,http://twitter.com/search?q=%22Special+Olympic...,202027.0,2357024
8,2019-03-27T03:47:05Z,2019-03-27T03:41:52Z,Atlanta,Jussie Smollett,,%22Jussie+Smollett%22,http://twitter.com/search?q=%22Jussie+Smollett%22,813153.0,2357024
9,2019-03-27T03:47:05Z,2019-03-27T03:41:52Z,Atlanta,#TemptationIsland,,%23TemptationIsland,http://twitter.com/search?q=%23TemptationIsland,,2357024


In [320]:
# Get trend info for a second location (2352824 is Albuquerque in the trends-available list)
t_info = get_trends_for_loc(2352824)

In [323]:
# Add the trends just retrieved to the combined trends dataframe.
# pd.concat() is used because DataFrame.append() was removed in pandas 2.0.
t_info_df = pd.concat( [ t_info_df, pd.DataFrame.from_dict(t_info) ], ignore_index = True)

In [324]:
# Display the combined trends dataframe
t_info_df

Unnamed: 0,twitter_as_of,twitter_created_at,twitter_name,twitter_tweet_name,twitter_tweet_promoted_content,twitter_tweet_query,twitter_tweet_url,twitter_tweet_volume,woeid
0,2019-03-27T03:47:05Z,2019-03-27T03:41:52Z,Atlanta,#ThisIsUs,,%23ThisIsUs,http://twitter.com/search?q=%23ThisIsUs,22189.0,2357024
1,2019-03-27T03:47:05Z,2019-03-27T03:41:52Z,Atlanta,#SDLive,,%23SDLive,http://twitter.com/search?q=%23SDLive,80388.0,2357024
2,2019-03-27T03:47:05Z,2019-03-27T03:41:52Z,Atlanta,Randall,,Randall,http://twitter.com/search?q=Randall,11992.0,2357024
3,2019-03-27T03:47:05Z,2019-03-27T03:41:52Z,Atlanta,Beth,,Beth,http://twitter.com/search?q=Beth,31493.0,2357024
4,2019-03-27T03:47:05Z,2019-03-27T03:41:52Z,Atlanta,Asuka,,Asuka,http://twitter.com/search?q=Asuka,63745.0,2357024
5,2019-03-27T03:47:05Z,2019-03-27T03:41:52Z,Atlanta,#USMNT,,%23USMNT,http://twitter.com/search?q=%23USMNT,,2357024
6,2019-03-27T03:47:05Z,2019-03-27T03:41:52Z,Atlanta,#THWg,,%23THWg,http://twitter.com/search?q=%23THWg,,2357024
7,2019-03-27T03:47:05Z,2019-03-27T03:41:52Z,Atlanta,Special Olympics,,%22Special+Olympics%22,http://twitter.com/search?q=%22Special+Olympic...,202027.0,2357024
8,2019-03-27T03:47:05Z,2019-03-27T03:41:52Z,Atlanta,Jussie Smollett,,%22Jussie+Smollett%22,http://twitter.com/search?q=%22Jussie+Smollett%22,813153.0,2357024
9,2019-03-27T03:47:05Z,2019-03-27T03:41:52Z,Atlanta,#TemptationIsland,,%23TemptationIsland,http://twitter.com/search?q=%23TemptationIsland,,2357024
