In [1]:
# Import required libraries

import requests
import tweepy
import pandas as pd
import json
from pynlp import Stanford
import os
import sys

## Gather

In [3]:
# Download the tweet image predictions tsv file via requests library
# The file is saved in the working directory under the last path segment of the URL.

url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
response = requests.get(url)
# Fail loudly on an HTTP error instead of silently saving an error page as the TSV
response.raise_for_status()
with open(url.split('/')[-1], mode = 'wb') as file:
    file.write(response.content)

In [134]:
# Load the tab-separated image predictions file into a data frame

predict_dog = pd.read_csv(
    'image-predictions.tsv',
    delimiter = '\t',
)

In [135]:
# Load the `twitter-archive-enhanced.csv` archive; its tweet ids drive the API queries

twitter_df = pd.read_csv('twitter-archive-enhanced.csv')

# keep just the id column for use with the API
tweet_id = twitter_df.tweet_id

In [4]:
# DON'T RUN THIS CELL!
# Authenticate with the Twitter API through Tweepy
#
# The four credentials are read from environment variables so that no secret
# is stored in the notebook; a missing variable raises KeyError right here.
# NOTE: any key that was ever saved in plain text should be treated as
# compromised and regenerated before use.

consumer_key = os.environ['TWITTER_CONSUMER_KEY']
consumer_secret = os.environ['TWITTER_CONSUMER_SECRET']
access_token = os.environ['TWITTER_ACCESS_TOKEN']
access_secret = os.environ['TWITTER_ACCESS_SECRET']

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
# wait_on_rate_limit / wait_on_rate_limit_notify: wait (and print a notice)
# when the rate limit is reached instead of failing the request.
api = tweepy.API(auth, wait_on_rate_limit = True, wait_on_rate_limit_notify = True,
                 parser=tweepy.parsers.JSONParser())

In [5]:
# DON'T RUN THIS CELL!
# Get tweet JSON data using tweet ID via Tweepy
#
# Successful lookups are appended to `tweet_json`; the ids whose lookup fails
# are appended to `error_list` so they can be removed from the archive later.

tweet_json = []
error_list = []
for i in tweet_id:
    try:
        tweet = api.get_status(i, tweet_mode = 'extended')
        tweet_json.append(tweet)
    except tweepy.TweepError:
        # Only Tweepy API failures are recorded as a failed id; anything else
        # (KeyboardInterrupt, programming errors) is allowed to propagate
        # instead of being silently swallowed.
        error_list.append(i)

Rate limit reached. Sleeping for: 348
Rate limit reached. Sleeping for: 242


In [7]:
# DON'T RUN THIS CELL!
# Serialise the collected tweets to tweet_json.txt as one indented JSON array

serialized_tweets = json.dumps(tweet_json, indent = True)
with open('tweet_json.txt', 'w') as outfile:
    outfile.write(serialized_tweets)

In [136]:
# Load the saved tweet JSON (tweet_json.txt) as a pandas data frame

pd_json = pd.read_json(
    'tweet_json.txt',
    orient = 'columns',
)

In [137]:
# Keep only the columns the analysis needs: the tweet id and its
# favorite / retweet counts. The result replaces `tweet_json`.

needed_columns = ['id', 'favorite_count', 'retweet_count']
tweet_json = pd_json[needed_columns]

# Assess

### Quality
##### `twitter_df` table
- some tweets were deleted or are invalid (2356 instead of 2345)
- missing data in `in_reply_to_status_id` column
- missing data in `in_reply_to_user_id` column
- missing data in `retweeted_status_id` column
- missing data in `retweeted_status_user_id` column
- missing data in `retweeted_status_timestamp` column
- `source` and `expanded_urls` columns are not relevant to our analysis
- the `name` value is not always a name; sometimes it is an adjective or an article



##### `predict_dog` table
- missing records (2075 instead of 2356) because some tweets have no image, are invalid or were deleted
- not all images are of dogs; some show other animals

##### `tweet_json` table
- the `id` column name is inconsistent with the `tweet_id` column name used in the other tables

### Tidiness
##### `twitter_df` table
- `doggo`, `floofer`, `pupper` and `puppo` should be values of a single `stage` column, not separate columns
- the `timestamp` column should be split into two separate columns, `date` and `time`

##### `tweet_json` table
- the whole table should be merged with the `twitter_df` table

In [138]:
# Work on independent copies so the originally loaded frames stay untouched

twitter_df_clean, predict_dog_clean, tweet_json_clean = (
    frame.copy() for frame in (twitter_df, predict_dog, tweet_json)
)

# Clean
### Define
### Quality 
##### `twitter_df` table
- remove all records with ids in `error_list`

### Code

In [139]:
# remove all records with ids in `error_list`
# A single vectorized membership test replaces the per-id loop, which
# re-scanned the whole frame once for every failed id.

failed_rows = twitter_df_clean.loc[twitter_df_clean['tweet_id'].isin(error_list)].index
twitter_df_clean.drop(index = failed_rows, inplace = True)

### Test

In [140]:
# No id from error_list should remain: the counts below should show only False

still_failed = twitter_df_clean.tweet_id.isin(error_list)
still_failed.value_counts()

False    2344
Name: tweet_id, dtype: int64

### Define
### Quality
##### `twitter_df` table
- drop `in_reply_to_status_id`, `in_reply_to_user_id`, `retweeted_status_id`,  
  `retweeted_status_user_id`, `retweeted_status_timestamp`, `source` and `expanded_urls` columns

### Code

In [141]:
# Remove the reply / retweet metadata columns plus `source` and `expanded_urls`
# from `twitter_df_clean`

unneeded_columns = [
    'in_reply_to_status_id',
    'in_reply_to_user_id',
    'retweeted_status_id',
    'retweeted_status_user_id',
    'retweeted_status_timestamp',
    'source',
    'expanded_urls',
]
twitter_df_clean.drop(columns = unneeded_columns, inplace = True)

### Test

In [142]:
# Show the last five rows to confirm the columns are gone
twitter_df_clean.tail(5)

Unnamed: 0,tweet_id,timestamp,text,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
2351,666049248165822465,2015-11-16 00:24:50 +0000,Here we have a 1949 1st generation vulpix. Enj...,5,10,,,,,
2352,666044226329800704,2015-11-16 00:04:52 +0000,This is a purebred Piers Morgan. Loves to Netf...,6,10,a,,,,
2353,666033412701032449,2015-11-15 23:21:54 +0000,Here is a very happy pup. Big fan of well-main...,9,10,a,,,,
2354,666029285002620928,2015-11-15 23:05:30 +0000,This is a western brown Mitsubishi terrier. Up...,7,10,a,,,,
2355,666020888022790149,2015-11-15 22:32:08 +0000,Here we have a Japanese Irish Setter. Lost eye...,8,10,,,,,


### Define
### Quality
##### `twitter_df` table
-  edit `name` contents to ensure they are real dog names

### Code

In [145]:
# Collect the tweet ids from `predict_dog_clean` where at least one of the three
# predictions is flagged as not a dog; they are matched against `twitter_df_clean` next

dog_flags = predict_dog_clean[['p1_dog', 'p2_dog', 'p3_dog']]
any_not_dog = dog_flags.eq(False).any(axis = 1)
non_dogs = predict_dog_clean.loc[any_not_dog, 'tweet_id']

In [147]:
# remove all non-dog records in twitter_df_clean
# A single vectorized membership test replaces the per-id loop, which
# re-scanned the whole frame once for every non-dog id.

non_dog_rows = twitter_df_clean.loc[twitter_df_clean['tweet_id'].isin(non_dogs)].index
twitter_df_clean.drop(index = non_dog_rows, inplace = True)

### Test

In [158]:
# No id from non_dogs should remain: the counts below should show only False

still_non_dog = twitter_df_clean.tweet_id.isin(non_dogs)
still_non_dog.value_counts()

False    1515
Name: tweet_id, dtype: int64

### Define
### Quality 
##### `predict_dog` table
- remove all non dog records by keeping only the records where `p1_dog`, `p2_dog`, `p3_dog` are all `True`


### Code

In [160]:
# Keep only the rows where `p1_dog`, `p2_dog` and `p3_dog` are all `True`
# by dropping every row in which any of the three flags is False

flags_not_dog = predict_dog_clean[['p1_dog', 'p2_dog', 'p3_dog']].eq(False)
rows_to_drop = predict_dog_clean.loc[flags_not_dog.any(axis = 1)].index
predict_dog_clean.drop(rows_to_drop, inplace = True)

### Test

In [161]:
# No id from non_dogs should remain in the predictions: expect only False below

remaining_non_dog = predict_dog_clean.tweet_id.isin(non_dogs)
remaining_non_dog.value_counts()

False    1243
Name: tweet_id, dtype: int64

### Define
### Quality 
##### `tweet_json` table
- rename `id` column to `tweet_id` for consistency with other tables

### Code

### Test