In [1]:
# Import required libraries

import requests
import tweepy
import pandas as pd
import json
import os
import sys

## Gathering

In [3]:
# Download the tweet image predictions TSV file via the requests library.
# The local file name is the last path segment of the URL
# ('image-predictions.tsv').

url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, timeout = 30)
# Fail loudly on an HTTP error (4xx/5xx) instead of silently saving an error
# page under the .tsv name.
response.raise_for_status()
with open(url.split('/')[-1], mode = 'wb') as file:
    file.write(response.content)

In [10]:
# Load the downloaded image predictions file (tab-separated) into a DataFrame

image_predictions = pd.read_csv(
    'image-predictions.tsv',
    sep='\t',
)

In [2]:
# Load `twitter-archive-enhanced.csv`; its tweet ids drive the API lookups

twitter_archive = pd.read_csv('twitter-archive-enhanced.csv')

# Keep just the `tweet_id` column (a Series) for use with the API
tweet_id = twitter_archive.loc[:, 'tweet_id']

In [5]:
# DON'T RUN THIS CELL!
# Authenticate with the Twitter API through Tweepy

# Credentials are read from environment variables so that real keys never have
# to be pasted into (and saved with) the notebook. An unset variable falls back
# to the empty string.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', '')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', '')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', '')
access_secret = os.environ.get('TWITTER_ACCESS_SECRET', '')

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
# wait_on_rate_limit makes the client sleep until the rate-limit window resets
# (printing a notice) instead of failing; JSONParser makes API calls return
# parsed JSON rather than Tweepy model objects, so results can be dumped with
# the json module.
api = tweepy.API(auth, wait_on_rate_limit = True, wait_on_rate_limit_notify = True,
                 parser=tweepy.parsers.JSONParser())

In [6]:
# DON'T RUN THIS CELL!
# Get tweet JSON data using tweet ID via Tweepy

tweet_json = []   # parsed JSON of every tweet that could be fetched
error_list = []   # tweet ids whose lookup failed
for i in tweet_id:
    try:
        tweet = api.get_status(i, tweet_mode = 'extended')
    except tweepy.TweepError:
        # Only Tweepy/API failures are recorded and skipped. A bare `except:`
        # would also swallow KeyboardInterrupt (making the loop impossible to
        # stop during a rate-limit sleep) and hide programming errors.
        error_list.append(i)
    else:
        tweet_json.append(tweet)

Rate limit reached. Sleeping for: 170
Rate limit reached. Sleeping for: 27


In [9]:
# DON'T RUN THIS CELL!
# Save the collected tweet data to tweet_json.txt.
# Note: the whole list is serialised as a single pretty-printed JSON array
# (one-space indent), not as one JSON document per line.

with open('tweet_json.txt', mode='w') as json_file:
    json.dump(tweet_json, json_file, indent=1)

In [13]:
# Load tweet_json.txt into a pandas DataFrame.
# Note: this rebinds `tweet_json` (the list built in the gathering cell, if
# that cell was run) to the resulting DataFrame.

tweet_json = pd.read_json(
    'tweet_json.txt',
    orient='columns',
)

In [14]:
# Keep only the columns needed later: the tweet id (stored in `id`),
# favorite_count and retweet_count. The result is saved as tweet_json_trim.

needed_columns = ['id', 'favorite_count', 'retweet_count']
tweet_json_trim = tweet_json[needed_columns]

## Assessing

### Quality
##### `twitter_archive` table
- missing ids in 'in_reply_to_status_id' column
- missing ids in 'in_reply_to_user_id' column
- missing ids in 'retweeted_status_id' column
- missing ids in 'retweeted_status_user_id' column
- missing values in 'retweeted_status_timestamp' column
- not every value in the `name` column is an actual name; some are adjectives or articles

##### `image_predictions` table
- not all images are for dogs! some images are for other animals
- missing records because some tweets are without images (270 missing)

##### `tweet_json_trim` table
- missing tweet ids for invalid or deleted ones (11 missing)

### Tidiness
##### `twitter_archive` table
- `doggo`, `floofer`, `pupper` and `puppo` columns should be one column
- **TODO**: list any further tidiness issues found in this table

##### `image_predictions` table
- not all images are for dogs! some images are for other animals
- missing records because some tweets are without images (270 missing)

##### `tweet_json_trim` table
- missing tweet ids for invalid or deleted ones (11 missing)

In [49]:
# Preview the first five rows of the archive table
twitter_archive.head(n=5)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


In [39]:
# First five rows of twitter_archive (same preview as the previous cell)
twitter_archive.head(n=5)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


In [35]:
# Count fully duplicated rows in image_predictions (False = not a duplicate)
image_predictions.duplicated(keep='first').value_counts()

False    2075
dtype: int64

In [27]:
# Count fully duplicated rows in tweet_json_trim (False = not a duplicate)
tweet_json_trim.duplicated(keep='first').value_counts()

False    2345
dtype: int64