In [1]:
import pandas as pd
import tweepy
import requests
import os
from dotenv import load_dotenv
from pathlib import Path

In [2]:
env_path = Path('.') / '.env'
load_dotenv(dotenv_path=env_path)

True

In [3]:
#Instantiate tweepy object

In [4]:
consumer_key = os.getenv('consumer_key')
consumer_secret = os.getenv('consumer_secret')
access_token = os.getenv('access_token')
access_secret = os.getenv('access_secret')

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

api = tweepy.API(auth)

### Gather

In [5]:
# Gather locally provided data on dog rating

In [6]:
dog_ratings_df = pd.read_csv('twitter-archive-enhanced.csv')

In [7]:
# Gather data programatically from a url using requests

In [8]:
response = requests.get('https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv')

In [9]:
column_headers = response.text.split('\n')[0].split('\t')
data_body = response.text.split('\n')[1:]
response_list = []
twitter_list = []
%time
for row in data_body:
    response_list.append(row.split('\t'))

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.01 µs


In [10]:
predictions_df = pd.DataFrame(response_list, columns=column_headers)

In [11]:
predictions_df.sample()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
492,675706639471788032,https://pbs.twimg.com/media/CWCXj35VEAIFvtk.jpg,1,English_springer,0.9903,True,Welsh_springer_spaniel,0.00207991,True,cocker_spaniel,0.00201378,True


In [12]:
predictions_df.shape

(2076, 12)

In [13]:
twitter_list

[]

In [14]:
#Gather data from Twitter using tweepy API

##### Mental note
The O(N<sup>2</sup>) is not efficient enough computationally for tweepy.
However, the CPU times are much less than wall times - suggesting communication overhead.

In [15]:
twitter_list = []
for row in data_body:
    row_data = row.split('\t')
    tweet = None
    try:
        tweet = api.get_status(row_data[0])._json
    except:
        tweet = {'retweet_count': 0, 'favorite_count': 0}
#     %time
    twitter_list.append({'tweet_id': row_data[0],'retweets': tweet['retweet_count'], 'likes': tweet['favorite_count']})

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 10 µs
CPU times: user 9 µs, sys: 1 µs, total: 10 µs
Wall time: 18.8 µs
CPU times: user 12 µs, sys: 2 µs, total: 14 µs
Wall time: 26.2 µs
CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 9.54 µs
CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 8.82 µs
CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 9.78 µs
CPU times: user 5 µs, sys: 1 µs, total: 6 µs
Wall time: 11 µs
CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 7.39 µs
CPU times: user 5 µs, sys: 1 µs, total: 6 µs
Wall time: 10.7 µs
CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 7.15 µs
CPU times: user 5 µs, sys: 1 µs, total: 6 µs
Wall time: 9.3 µs
CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 9.06 µs
CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.44 µs
CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 4.77 µs
CPU times: user 10 µs, sys: 0 ns, total: 10 µs
Wall time: 17.2 µs
CPU times: user 4 µs, sys: 0 ns, total: 

In [24]:
twitter_df = pd.DataFrame(twitter_list)

In [31]:
twitter_df = pd.DataFrame(twitter_list)

In [32]:
twitter_df.head()

Unnamed: 0,retweet_count,likes
0,461,2411
1,42,121
2,41,112
3,132,271
4,39,96


In [None]:
twitter_df.head()

### Assess

In [33]:
dog_ratings_df.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


In [34]:
predictions_df.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.0614285,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.0741916999999999,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.1385839999999999,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


In [35]:
twitter_df.head()

Unnamed: 0,retweet_count,likes
0,461,2411
1,42,121
2,41,112
3,132,271
4,39,96


### Clean