# Project 2 - GeoTweet

@Author Jeffery Brown (daddyjab)<br>
@Date 3/25/19<br>
@File ETL_for_GeoTweet


In [1]:
# Dependencies
import tweepy
import json
import time

# Twitter API Keys
import os
from api_config import *

# Other
import pandas as pd
from datetime import datetime 
from dateutil import tz
import requests

from pprint import pprint

# Twitter API
# key_twitter_tweetquestor_consumer_api_key
# key_twitter_tweetquestor_consumer_api_secret_key
# key_twitter_tweetquestor_access_token
# key_twitter_tweetquestor_access_secret_token

# Yahoo! API
# key_yahoo_infoquestor_app_id
# key_yahoo_infoquestor_client_id
# key_yahoo_infoquestor_client_secret

# Flickr API
# key_flicker_infoquestor_key
# key_flicker_infoquestor_secret



In [2]:
# Rate limiting
# Watch for the RateLimitError raised by a Tweepy API call when the rate limit is exceeded - see: http://docs.tweepy.org/en/v3.5.0/api.html


In [3]:
# Authenticate with Twitter using the consumer key/secret and access token/secret
# (the key_twitter_* names are expected to come from api_config), then build a
# Tweepy client whose calls are parsed with Tweepy's JSONParser
auth = tweepy.OAuthHandler(
    key_twitter_tweetquestor_consumer_api_key,
    key_twitter_tweetquestor_consumer_api_secret_key,
)
auth.set_access_token(
    key_twitter_tweetquestor_access_token,
    key_twitter_tweetquestor_access_secret_token,
)
api = tweepy.API(auth, parser=tweepy.parsers.JSONParser())

In [4]:
def api_calls_remaining( a_api, a_type = "place"):
    """Return the number of Twitter API calls remaining for a trends query type.

    Args:
        a_api: API object exposing rate_limit_status(); the result is indexed
            as result['resources']['trends'].
        a_type: Trends query type, looked up under the key "/trends/<a_type>":
            'place': Trending topics for a WOEID
            'closest': Locations near a specified lat/long for which Twitter has trending topic info
            'available': Locations for which Twitter has topic info

    Returns:
        The 'remaining' value reported for the requested query type,
        or "" if a_type is not a valid type.
    """

    # Get Twitter rate limit information using the Tweepy API
    rate_limits = a_api.rate_limit_status()

    # Focus on the rate limits for trends calls
    trends_limits = rate_limits['resources']['trends']

    # Only a failed lookup means "not a valid type"; any other error
    # (e.g. an interrupt) is allowed to propagate instead of being hidden
    try:
        return trends_limits[ f"/trends/{a_type}" ]['remaining']
    except (KeyError, TypeError):
        return ""

In [5]:
def api_time_before_reset( a_api, a_type = "place"):
    """Return the number of minutes until the Twitter API rate limit is reset.

    Also prints the time remaining, the reset time, and the current time
    (both shown in the local time zone).

    Args:
        a_api: API object exposing rate_limit_status(); the result is indexed
            as result['resources']['trends'].
        a_type: Trends query type, looked up under the key "/trends/<a_type>":
            'place': Trending topics for a WOEID
            'closest': Locations near a specified lat/long for which Twitter has trending topic info
            'available': Locations for which Twitter has topic info

    Returns:
        Minutes (float) before the rate limit for the requested query type
        is reset, or -1 if a_type is not a valid type.
    """

    # Get Twitter rate limit information using the Tweepy API
    rate_limits = a_api.rate_limit_status()

    # Focus on the rate limits for trends calls
    trends_limits = rate_limits['resources']['trends']

    # Get the reset time (a POSIX timestamp) for the requested type of
    # trends query; only a failed lookup means "not a valid type"
    try:
        reset_ts = trends_limits[ f"/trends/{a_type}" ]['reset']
    except (KeyError, TypeError):
        return -1

    # Work directly on POSIX timestamps so no naive/aware datetime
    # conversions are needed to compute the difference
    current_ts = time.time()

    # Calculate the number of minutes remaining
    # Assumption: reset time will be >= current time
    time_before_reset = (reset_ts - current_ts) / 60.0

    # fromtimestamp() without a tz argument yields local time,
    # which is what gets displayed
    reset_local = datetime.fromtimestamp(reset_ts)
    current_local = datetime.fromtimestamp(current_ts)
    print(f"Time Before Reset: {time_before_reset:.1f}: Reset Time: {reset_local.strftime('%Y-%m-%d %H:%M:%S')}, Local Time: {current_local.strftime('%Y-%m-%d %H:%M:%S')}")

    # Return the time before reset (in minutes)
    return time_before_reset

In [6]:
calls_remaining = api_calls_remaining( api, "place" )
calls_remaining

75

In [7]:
time_before_reset = api_time_before_reset( api, "place")
time_before_reset

Time Before Reset: 15.0: Reset Time: 2019-03-26 12:09:39, Local Time: 2019-03-26 11:54:38


15.007081316666667

In [8]:
# Obtain the WOEID locations for which Twitter Trends info is available
results = api.trends_available()

In [9]:
results[1]

{'name': 'Winnipeg',
 'placeType': {'code': 7, 'name': 'Town'},
 'url': 'http://where.yahooapis.com/v1/place/2972',
 'parentid': 23424775,
 'country': 'Canada',
 'woeid': 2972,
 'countryCode': 'CA'}

In [232]:
def get_trends_available_to_df( a_results ):
    """Flatten the result of an api.trends_available() call into a dataframe.

    Only locations whose 'countryCode' is "US" are retained.

    Args:
        a_results: List of location dictionaries, each with the keys
            'name', 'placeType' (a dict with a 'name' entry), 'url',
            'parentid', 'country', 'woeid' and 'countryCode'.

    Returns:
        A new DataFrame (index reset to 0..n-1) with the columns
        'twitter_country', 'tritter_country_code', 'twitter_name',
        'twitter_parentid', 'woeid' and 'twitter_type'.
    """

    # Mapping from Twitter field names to dataframe column names
    # NOTE(review): 'tritter_country_code' looks like a typo for 'twitter_country_code';
    # it is kept as-is because later cells display and may rely on this column name
    column_names = {
        'woeid': 'woeid',
        'country': 'twitter_country',
        'countryCode': 'tritter_country_code',
        'name': 'twitter_name',
        'parentid': 'twitter_parentid' }

    # Nothing to flatten: return an empty dataframe with the expected columns
    if a_results is None or len(a_results) == 0:
        return pd.DataFrame( columns = list(column_names.values()) + ['twitter_type'] )

    # Import trend availability info into a dataframe
    # (built from the argument, not from a notebook-level variable)
    trends_avail_df = pd.DataFrame(a_results)

    # Retain only locations in the U.S. and reset the index
    us_trends_df = trends_avail_df[ trends_avail_df['countryCode'] == "US" ].reset_index(drop=True)

    # Flatten the dataframe by unpacking the placeType name into its own column
    us_trends_df = us_trends_df.assign(
        twitter_type = us_trends_df['placeType'].map( lambda x: x['name'] ) )

    # Remove unneeded fields and rename the rest
    return ( us_trends_df
             .drop(['placeType', 'url' ], axis='columns')
             .rename(columns=column_names) )

In [236]:
# Flatten the Twitter Trends results and populate in a Dataframe
trends_available_df = get_trends_available_to_df( results )

In [237]:
trends_available_df.head()

Unnamed: 0,twitter_country,tritter_country_code,twitter_name,twitter_parentid,woeid,twitter_type
0,United States,US,Albuquerque,23424977,2352824,Town
1,United States,US,Atlanta,23424977,2357024,Town
2,United States,US,Austin,23424977,2357536,Town
3,United States,US,Baltimore,23424977,2358820,Town
4,United States,US,Baton Rouge,23424977,2359991,Town


In [240]:
def _place_level_fields( a_place, a_level_key, a_prefix ):
    # Flatten one nested level of a Flickr place record (e.g. 'county') into
    # '<prefix>_name', '<prefix>_name_only' and '<prefix>_woeid' entries;
    # all three are None if any part of that level is unavailable
    try:
        level = a_place[a_level_key]
        full_name = level['_content']
        return {
            f'{a_prefix}_name': full_name,
            f'{a_prefix}_name_only': full_name.split(",")[0].strip(),
            f'{a_prefix}_woeid': int(level['woeid']),
        }
    except (KeyError, TypeError, ValueError, AttributeError):
        return {
            f'{a_prefix}_name': None,
            f'{a_prefix}_name_only': None,
            f'{a_prefix}_woeid': None,
        }


def get_location_info( a_woeid, a_timeout = 10 ):
    """Use a Flickr API call to get location information associated with a Yahoo! WOEID.

    Note: Yahoo! no longer supports this type of lookup! :(

    Args:
        a_woeid: WOEID of the location to look up (converted to a string for the request).
        a_timeout: Seconds to wait for the Flickr API before giving up.

    Returns:
        False if the request failed or the response was not JSON.
        Otherwise a flattened dictionary with:
          - 'woeid', 'name_woe', 'name_full', 'name_only', 'place_type',
            'latitude', 'longitude' (omitted, with an error printed, if the
            basic place information is not in the response)
          - 'timezone' (None if unavailable)
          - 'county_*', 'state_*', 'country_*' name / name_only / woeid
            entries (None if unavailable)
    """

    # Setup the Flickr API REST request
    flickr_api_url = "https://api.flickr.com/services/rest/"
    flickr_api_params = {
        'method': 'flickr.places.getInfo',
        'api_key': key_flicker_infoquestor_key,
        'format': 'json',
        'nojsoncallback': 1,
        'woe_id': str(a_woeid),
    }

    try:
        # Get the REST response and parse the JSON; the timeout keeps a stalled
        # connection from hanging the notebook
        response = requests.get(url=flickr_api_url, params=flickr_api_params, timeout=a_timeout)
        location_data = response.json()

    except requests.exceptions.RequestException as e:
        print(f"Error obtaining location information for WOEID {a_woeid}: ", e)
        return False

    except ValueError as e:
        # The response body was not valid JSON
        print(f"Error parsing location information for WOEID {a_woeid}: ", e)
        return False

    # Guard against a JSON payload that is not an object
    if not isinstance(location_data, dict):
        location_data = {}

    # Check for failure to locate the information
    if location_data.get('stat') == 'fail':
        print(f"Error finding location WOEID {a_woeid}: {location_data.get('message')}")

    place = location_data.get('place')

    # Return just a useful subset of the location info as flattened dictionary
    key_location_info = {}

    # Basic information that should be present for any location;
    # the dictionary is built completely before update() so either all
    # of the basic fields are added or none are
    try:
        key_location_info.update( {
            'woeid': int(place['woeid']),
            'name_woe': place['woe_name'],
            'name_full': place['name'],
            'name_only': place['name'].split(",")[0].strip(),
            'place_type': place['place_type'],
            'latitude': float(place['latitude']),
            'longitude': float(place['longitude']),
        })

    except (KeyError, TypeError, ValueError, AttributeError) as e:
        print(f"Error - basic location information not returned for WOEID {a_woeid}: ", repr(e))

    # Timezone associated with the location - if available
    try:
        key_location_info['timezone'] = place['timezone']
    except (KeyError, TypeError):
        key_location_info['timezone'] = None

    # County, state and country associated with the location - if available
    # (Flickr reports the state under the 'region' key)
    key_location_info.update( _place_level_fields( place, 'county', 'county' ) )
    key_location_info.update( _place_level_fields( place, 'region', 'state' ) )
    key_location_info.update( _place_level_fields( place, 'country', 'country' ) )

    return key_location_info

In [241]:
# Get sample location information
city_loc_info = get_location_info(2464592)

In [242]:
pprint(city_loc_info)

{'country_name': 'United States',
 'country_name_only': 'United States',
 'country_woeid': 23424977,
 'county_name': 'Oklahoma County, OK, United States',
 'county_name_only': 'Oklahoma County',
 'county_woeid': 12589669,
 'latitude': 35.466,
 'longitude': -97.514,
 'name_full': 'Oklahoma City, Oklahoma, United States',
 'name_only': 'Oklahoma City',
 'name_woe': 'Oklahoma City',
 'place_type': 'locality',
 'state_name': 'Oklahoma, United States',
 'state_name_only': 'Oklahoma',
 'state_woeid': 2347595,
 'timezone': 'America/Chicago',
 'woeid': 2464592}


In [243]:
# Look up location details for the first two WOEIDs only (keeps the number of Flickr calls small)
loc_info_list = [ get_location_info( woeid ) for woeid in trends_available_df['woeid'][0:2] ]

In [244]:
print(loc_info_list)

[{'woeid': 2352824, 'name_woe': 'Albuquerque', 'name_full': 'Albuquerque, New Mexico, United States', 'name_only': 'Albuquerque', 'place_type': 'locality', 'latitude': 35.105, 'longitude': -106.647, 'timezone': 'America/Denver', 'county_name': 'Bernalillo County, New Mexico, United States', 'county_name_only': 'Bernalillo County', 'county_woeid': 12589279, 'state_name': 'New Mexico, United States', 'state_name_only': 'New Mexico', 'state_woeid': 2347590, 'country_name': 'United States', 'country_name_only': 'United States', 'country_woeid': 23424977}, {'woeid': 2357024, 'name_woe': 'Atlanta', 'name_full': 'Atlanta, Georgia, United States', 'name_only': 'Atlanta', 'place_type': 'locality', 'latitude': 33.763, 'longitude': -84.423, 'timezone': 'America/New_York', 'county_name': 'Fulton County, Georgia, United States', 'county_name_only': 'Fulton County', 'county_woeid': 12587929, 'state_name': 'Georgia, United States', 'state_name_only': 'Georgia', 'state_woeid': 2347569, 'country_name':

In [245]:
# Build a dataframe with one row per location dictionary
loc_info_df = pd.DataFrame(loc_info_list)

In [246]:
loc_info_df

Unnamed: 0,country_name,country_name_only,country_woeid,county_name,county_name_only,county_woeid,latitude,longitude,name_full,name_only,name_woe,place_type,state_name,state_name_only,state_woeid,timezone,woeid
0,United States,United States,23424977,"Bernalillo County, New Mexico, United States",Bernalillo County,12589279,35.105,-106.647,"Albuquerque, New Mexico, United States",Albuquerque,Albuquerque,locality,"New Mexico, United States",New Mexico,2347590,America/Denver,2352824
1,United States,United States,23424977,"Fulton County, Georgia, United States",Fulton County,12587929,33.763,-84.423,"Atlanta, Georgia, United States",Atlanta,Atlanta,locality,"Georgia, United States",Georgia,2347569,America/New_York,2357024


In [247]:
trends_available_df.dtypes

twitter_country         object
tritter_country_code    object
twitter_name            object
twitter_parentid         int64
woeid                    int64
twitter_type            object
dtype: object

In [248]:
# Combine the Twitter trend locations with their location details, matching rows on WOEID
abc_df = pd.merge(trends_available_df, loc_info_df, how='inner', on='woeid')

In [249]:
abc_df

Unnamed: 0,twitter_country,tritter_country_code,twitter_name,twitter_parentid,woeid,twitter_type,country_name,country_name_only,country_woeid,county_name,...,latitude,longitude,name_full,name_only,name_woe,place_type,state_name,state_name_only,state_woeid,timezone
0,United States,US,Albuquerque,23424977,2352824,Town,United States,United States,23424977,"Bernalillo County, New Mexico, United States",...,35.105,-106.647,"Albuquerque, New Mexico, United States",Albuquerque,Albuquerque,locality,"New Mexico, United States",New Mexico,2347590,America/Denver
1,United States,US,Atlanta,23424977,2357024,Town,United States,United States,23424977,"Fulton County, Georgia, United States",...,33.763,-84.423,"Atlanta, Georgia, United States",Atlanta,Atlanta,locality,"Georgia, United States",Georgia,2347569,America/New_York


In [287]:
# Get information on the top 50 twitter trends for a specified WOEID
# trends_location = api.trends_place( 2357024 )
trends_location = api.trends_place( 2357024)[0]

In [288]:
pprint(trends_location)

{'as_of': '2019-03-26T21:22:42Z',
 'created_at': '2019-03-26T21:17:18Z',
 'locations': [{'name': 'Atlanta', 'woeid': 2357024}],
 'trends': [{'name': 'Jussie Smollett',
             'promoted_content': None,
             'query': '%22Jussie+Smollett%22',
             'tweet_volume': 581331,
             'url': 'http://twitter.com/search?q=%22Jussie+Smollett%22'},
            {'name': 'Cardi',
             'promoted_content': None,
             'query': 'Cardi',
             'tweet_volume': 79039,
             'url': 'http://twitter.com/search?q=Cardi'},
            {'name': 'Shuri',
             'promoted_content': None,
             'query': 'Shuri',
             'tweet_volume': 44845,
             'url': 'http://twitter.com/search?q=Shuri'},
            {'name': 'Chicago PD',
             'promoted_content': None,
             'query': '%22Chicago+PD%22',
             'tweet_volume': 37636,
             'url': 'http://twitter.com/search?q=%22Chicago+PD%22'},
            {'name': '#Ave

In [290]:
'as_of' not in trends_location

False

In [None]:
def get_trends_for_loc( a_woeid ):
    """Get the top Twitter trends for a location specified by a WOEID.

    TODO: CONTINUE DEVELOPING THIS FUNCTION - flatten the trends data and
    return it as a list of dictionaries. For now the first element of the
    api.trends_place() response is returned unchanged.

    Uses the notebook-level `api` object.

    Args:
        a_woeid: WOEID of the location for which to get trends.

    Returns:
        The first element of the api.trends_place( a_woeid ) response,
        or False if no top trends info could be obtained for this WOEID.
    """

    try:
        top_trends = api.trends_place( a_woeid )[0]

    except (tweepy.TweepError, IndexError) as e:
        # No top trends info available for this WOEID (API error or an
        # empty response), return False
        print(f"Error obtaining top trends for WOEID {a_woeid}: ", e)
        return False

    return top_trends