In [2]:
# Import required libraries

import requests
import tweepy
import pandas as pd
import json
import os
import sys

## Gather

In [3]:
# Download the tweet image predictions tsv file via requests library

url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
# A timeout keeps the cell from hanging forever if the server never answers
response = requests.get(url, timeout = 30)
# Fail loudly on an HTTP error instead of saving an error page as the tsv file
response.raise_for_status()
# The local file name is the last path segment of the url
with open(url.split('/')[-1], mode = 'wb') as file:
    file.write(response.content)

In [3]:
# Load the downloaded tab-separated image predictions file into a DataFrame

image_predictions = pd.read_csv('image-predictions.tsv', delimiter='\t')

In [4]:
# Load `twitter-archive-enhanced.csv`; its tweet ids drive the API requests

twitter_archive = pd.read_csv(filepath_or_buffer='twitter-archive-enhanced.csv')

# Keep only the `tweet_id` column as a Series for the API lookups
tweet_id = twitter_archive.loc[:, 'tweet_id']

In [5]:
# DON'T RUN THIS CELL!
# Authenticate Tweepy API

# Credentials are read from environment variables so that secrets are never
# stored in the notebook; each one falls back to an empty string when unset
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', '')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', '')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', '')
access_secret = os.environ.get('TWITTER_ACCESS_SECRET', '')

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
# JSONParser is passed so API calls go through tweepy's JSON parser;
# the wait_on_rate_limit flags produce the "Rate limit reached. Sleeping" messages
api = tweepy.API(auth, wait_on_rate_limit = True, wait_on_rate_limit_notify = True,
                 parser=tweepy.parsers.JSONParser())

In [6]:
# DON'T RUN THIS CELL!
# Get tweet JSON data using tweet ID via Tweepy

tweet_json = []   # JSON payload of every tweet that could be fetched
error_list = []   # ids of the tweets whose lookup failed
for status_id in tweet_id:
    try:
        tweet = api.get_status(status_id, tweet_mode = 'extended')
        tweet_json.append(tweet)
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit can still stop this long-running loop
    except Exception:
        error_list.append(status_id)

Rate limit reached. Sleeping for: 170
Rate limit reached. Sleeping for: 27


In [9]:
# DON'T RUN THIS CELL!
# Persist the collected tweet JSON to tweet_json.txt as a single indented JSON array

with open('tweet_json.txt', mode='w') as json_file:
    json.dump(tweet_json, json_file, indent=True)

In [5]:
# Load the saved tweet JSON (tweet_json.txt) into a pandas DataFrame

tweet_json = pd.read_json(path_or_buf='tweet_json.txt', orient='columns')

In [6]:
# Keep only the columns needed for the analysis (id, favorite_count, retweet_count)
# and store the result in tweet_json_trim

needed_columns = ['id', 'favorite_count', 'retweet_count']
tweet_json_trim = tweet_json[needed_columns]

# Assess

### Quality
##### `twitter_archive` table
- missing data in 'in_reply_to_status_id' column
- missing data in 'in_reply_to_user_id' column
- missing data in 'retweeted_status_id' column
- missing data in 'retweeted_status_user_id' column
- missing data in 'retweeted_status_timestamp' column
- `name` content is not always a name; sometimes it's an adjective or an article
- some tweets were deleted or are invalid (2356 instead of 2345)

##### `image_predictions` table
- not all images are of dogs; some images are of other animals
- missing records (2075 instead of 2356) because some tweets have no images, are invalid, or were deleted

##### `tweet_json_trim` table
- `id` column name is not consistent with the `tweet_id` column name used in the other tables

### Tidiness
##### `twitter_archive` table
- `doggo`, `floofer`, `pupper` and `puppo` should be values for a `stage` column, not separate columns
- `timestamp` column should be two separate columns for `date` and `time`
- `in_reply_to_status_id`, `in_reply_to_user_id`, `retweeted_status_id`, `retweeted_status_user_id`
  and `retweeted_status_timestamp` columns are almost full of nulls and don't have our desired information

##### `tweet_json_trim` table
- the whole table should be merged with `twitter_archive` table 

In [7]:
# Work on copies so the originally gathered tables stay untouched while cleaning

twitter_archive_clean, image_predictions_clean, tweet_json_trim_clean = (
    original.copy()
    for original in (twitter_archive, image_predictions, tweet_json_trim)
)

# Clean
### Define
### Quality
##### `twitter_archive` table
- drop 'in_reply_to_status_id', 'in_reply_to_user_id', 'retweeted_status_id',  
  'retweeted_status_user_id' and 'retweeted_status_timestamp' columns

### Code

In [8]:
# drop 'in_reply_to_status_id', 'in_reply_to_user_id', 'retweeted_status_id',
# 'retweeted_status_user_id' and 'retweeted_status_timestamp' columns from `twitter_archive_clean` table

# errors = 'ignore' keeps this cell re-runnable: on a second run the columns are
# already gone and drop() would otherwise raise a KeyError
twitter_archive_clean = twitter_archive_clean.drop(columns = ['in_reply_to_status_id', 'in_reply_to_user_id',
                                                              'retweeted_status_id', 'retweeted_status_user_id',
                                                              'retweeted_status_timestamp'],
                                                   errors = 'ignore')

### Test

In [9]:
# Check programmatically that the reply/retweet columns are gone,
# then preview the first two rows of the cleaned table
dropped_columns = {'in_reply_to_status_id', 'in_reply_to_user_id',
                   'retweeted_status_id', 'retweeted_status_user_id',
                   'retweeted_status_timestamp'}
assert dropped_columns.isdisjoint(twitter_archive_clean.columns), 'reply/retweet columns still present'
twitter_archive_clean.head(2)

Unnamed: 0,tweet_id,timestamp,source,text,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,


### Define
##### `twitter_archive` table
- remove all records with ids in `error_list`

##### `image_predictions` table
- remove all non dog records by keeping only the records where `p1_dog`, `p2_dog`, `p3_dog` are all `True`
- remove all records with ids in `error_list`

##### `tweet_json_trim` table
- rename `id` to `tweet_id` to be consistent with other tables

### Tidiness
##### `twitter_archive` table
- add `stage` column to contain dog stage status
- split `timestamp` column into `date` column for date and `time` column for time
- drop `in_reply_to_status_id`, `in_reply_to_user_id`, `retweeted_status_id`,
  `retweeted_status_user_id` and `retweeted_status_timestamp` columns
  
##### `tweet_json_trim` table
- merge `tweet_json_trim` with `twitter_archive` table on the `tweet_id` column


### Code

### Test

### Define
##### `twitter_archive` table
-  edit `name` contents to ensure they are real dog names