In [122]:
# Import required libraries

import requests
import tweepy
import pandas as pd
import json
import datetime 

## Gather

In [3]:
# Download the tweet image predictions tsv file via requests library

url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
response = requests.get(url)
with open(url.split('/')[-1], mode = 'wb') as file:
    file.write(response.content)

In [123]:
# Read `image predictions tsv file`

predict_dog = pd.read_csv('image-predictions.tsv', sep = '\t')

In [124]:
# Read `twitter archive enhanced.csv` file to get tweet ids for API

twitter_df = pd.read_csv('twitter-archive-enhanced.csv')
# extract tweet ids only for use in API

tweet_id = twitter_df['tweet_id']

In [126]:
# Authenticate Tweepy PI

consumer_key = '8UqkSZEX9dQ8eeVjTulXO3hrs'
consumer_secret = '6sswY9LumipSfGRrMC4K0O5eRdB0FEHGmzJPJsXN8ivAQhursG'
access_token = '1009804608540200960-Z4DNl7IqVa4CNS0us678UUu5WGyqJU'
access_secret = 'pHgvsF4YSNcePNCBXqTfgmjeWVHQmAzfLZaOyiaOO5sFT'

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
api = tweepy.API(auth, wait_on_rate_limit = True, wait_on_rate_limit_notify = True,
                 parser=tweepy.parsers.JSONParser())

In [127]:
# Get tweet JSON data using tweet ID via Tweepy 

tweet_json = []
error_list = []
for i in tweet_id:
    try:
        tweet = api.get_status(i, tweet_mode = 'extended')
        tweet_json.append(tweet)
    except:
        error_list.append(i)
        continue

Rate limit reached. Sleeping for: 355
Rate limit reached. Sleeping for: 373


In [128]:
# Write JSON data to tweet_json.txt file with each tweet's JSON data on its own line

with open('tweet_json.txt', 'w') as outfile:
    json.dump(tweet_json, outfile, indent = True)

In [129]:
# Read tweet_json.txt file into a pandas data frame 

pd_json = pd.read_json('tweet_json.txt', orient = 'columns')

In [130]:
# Extract only needed columns (tweet_id, favorite_count, retweet_count)
# Save it to tweet_json

tweet_json = pd_json[['id','favorite_count','retweet_count']]

# Assess

### Quality
##### `twitter_df` table
- some tweets was deleted or invalid (2356 instead of 2345)
- missing data in `in_reply_to_status_id` column
- missing data in `in_reply_to_user_id` column
- missing data in `retweeted_status_id` column
- missing data in `retweeted_status_user_id` column
- missing data in `retweeted_status_timestamp` column
- `source` and `expanded_urls` columns are not relevant to our analysis
- `name` content is not always a name, sometimes it's an adjective or articles



##### `predict_dog` table
- missing records because some tweets are without images (2075 instead of 2356) or invalid or deleted
- not all images are for dogs! some images are for other animals
- `jpg_url` and `img_num` columns are not releveant to our analysis

##### `tweet_json` table
- `id` column name is not consistent with other tables same column names

### Tidiness
##### `twitter_df` table
- `doggo`, `floofer`, `pupper` and `puppo` should be values for a `stage` column not seperate columns
- `time stamp` column should be two separate columns for `date` and `time`

##### `tweet_json` table
- the whole table should be merged with `twitter_archive` table 

In [234]:
# Make a copy of the datasets to work with and clean

twitter_df_clean = twitter_df.copy()
predict_dog_clean = predict_dog.copy()
tweet_json_clean = tweet_json.copy()

# Clean
### Define
### Quality 
##### `twitter_df_clean` table
- remove all records with ids in `error_list`

### Code

In [235]:
# remove all records with ids in `error_list`

for record in error_list:
    twitter_df_clean.drop(twitter_df_clean[twitter_df_clean.tweet_id == record].index, inplace = True)

### Test

In [236]:
# check if any record with id from error_list still exists!

twitter_df_clean['tweet_id'].isin(error_list).value_counts()

False    2344
Name: tweet_id, dtype: int64

### Define
### Quality
##### `twitter_df_clean` table
- drop `in_reply_to_status_id`, `in_reply_to_user_id`, `retweeted_status_id`,  
  `retweeted_status_user_id`, `retweeted_status_timestamp`, `source` and `expanded_urls` columns

### Code

In [237]:
# drop 'in_reply_to_status_id', 'in_reply_to_user_id', 'retweeted_status_id',
# 'retweeted_status_user_id' and 'retweeted_status_timestamp' columns from `twitter_df_clean` table

twitter_df_clean.drop(columns = ['in_reply_to_status_id', 'in_reply_to_user_id',
                                 'retweeted_status_id', 'retweeted_status_user_id',
                                 'retweeted_status_timestamp', 
                                 'source', 'expanded_urls'], inplace = True)

### Test

In [238]:
# look at the data frame to see the columns is removed

twitter_df_clean.tail()

Unnamed: 0,tweet_id,timestamp,text,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
2351,666049248165822465,2015-11-16 00:24:50 +0000,Here we have a 1949 1st generation vulpix. Enj...,5,10,,,,,
2352,666044226329800704,2015-11-16 00:04:52 +0000,This is a purebred Piers Morgan. Loves to Netf...,6,10,a,,,,
2353,666033412701032449,2015-11-15 23:21:54 +0000,Here is a very happy pup. Big fan of well-main...,9,10,a,,,,
2354,666029285002620928,2015-11-15 23:05:30 +0000,This is a western brown Mitsubishi terrier. Up...,7,10,a,,,,
2355,666020888022790149,2015-11-15 22:32:08 +0000,Here we have a Japanese Irish Setter. Lost eye...,8,10,,,,,


### Define
### Quality
##### `twitter_df_clean` table
-  remove non-dog contents to ensure they are dog only records

### Code

In [239]:
# make sure all records are for dogs by matching the `predict_dog_clean` non-dog ids with ones in `twitter_df_clean`

non_dogs = predict_dog_clean[(predict_dog_clean.p1_dog == False)
                                         | (predict_dog_clean.p2_dog == False)
                                         | (predict_dog_clean.p3_dog == False)].tweet_id

In [240]:
# remove all non-dog records in twitter_df_clean

for record in non_dogs:
    twitter_df_clean.drop(twitter_df_clean[twitter_df_clean.tweet_id == record].index, inplace = True)

### Test

In [241]:
# check if any record with id from non_dogs still exists!

twitter_df_clean['tweet_id'].isin(non_dogs).value_counts()

False    1515
Name: tweet_id, dtype: int64

### Define
### Quality 
##### `predict_dog_clean` table
- remove all non dog records by keeping only the records where `p1_dog`, `p2_dog`, `p3_dog` are all `True`

### Code

In [242]:
# remove all non-dog records by keeping only the records where `p1_dog`, `p2_dog`, `p3_dog` are all `True`

predict_dog_clean.drop(predict_dog_clean[(predict_dog_clean.p1_dog == False)
                                         | (predict_dog_clean.p2_dog == False)
                                         | (predict_dog_clean.p3_dog == False)].index, inplace = True)

### Test

In [243]:
# Check if any non-dog record exists!

predict_dog_clean['tweet_id'].isin(non_dogs).value_counts()

False    1243
Name: tweet_id, dtype: int64

In [244]:
# reset all indeces in all tables to keep every thing in a consistent manner

twitter_df_clean = twitter_df_clean.reset_index()
predict_dog_clean = predict_dog_clean.reset_index()

In [245]:
# drop all `index` columns in all tables for consistency

twitter_df_clean.drop(columns = ['index'], inplace = True)
predict_dog_clean.drop(columns = ['index'], inplace = True)

### Define
### Quality 
##### `tweet_json_clean` table
- rename `id` column to `tweet_id` for consistency with other tables

### Code

In [246]:
# rename `id` column to `tweet_id` for consistency with other tables

tweet_json_clean = tweet_json_clean.rename(columns = {'id' : 'tweet_id'})

### Test

In [247]:
# check the column name is changed 

tweet_json_clean.head()

Unnamed: 0,tweet_id,favorite_count,retweet_count
0,892420643555336193,38671,8555
1,892177421306343426,33129,6287
2,891815181378084864,24943,4167
3,891689557279858688,42045,8683
4,891327558926688256,40193,9442


### Define
### Quality
##### `predict_dog_clean` table
- drop `jpg_url` and `img_num` columns

### Code

In [262]:
# drop `jpg_url` and `img_num` columns from predict_dog_clean table

predict_dog_clean.drop(columns = ['jpg_url', 'img_num'], inplace = True)

### Define
### Tidiness
##### `twitter_df_clean` table
- melt `doggo`, `floofer`, `pupper` and `puppo` as values for a `stage` column 

### Code

In [248]:
# turn `doggo`, `floofer`, `pupper` and `puppo` as values for a `stage` column 

twitter_df_clean['stage'] = 'None'

for r in range(len(twitter_df_clean)-1):
    
    if twitter_df_clean['doggo'][r] == 'doggo':
        
        twitter_df_clean['stage'][r] = 'doggo'
    
    elif twitter_df_clean['floofer'][r] == 'floofer':
        
        twitter_df_clean['stage'][r] = 'floofer'

    elif twitter_df_clean['pupper'][r] == 'pupper':
        
        twitter_df_clean['stage'][r] = 'pupper'
        
    elif twitter_df_clean['puppo'][r] == 'puppo':
        
        twitter_df_clean['stage'][r] = 'puppo'
        
    else:
        twitter_df_clean['stage'][r] = 'None'    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [249]:
# drop `doggo`, `floofer`, `pupper` and `puppo` columns

twitter_df_clean.drop(columns = ['doggo', 'floofer',
                                 'pupper', 'puppo'], inplace = True)

### Test

In [250]:
# check stage values are in place

twitter_df_clean['stage'].value_counts()

None       1260
pupper      156
doggo        67
puppo        24
floofer       8
Name: stage, dtype: int64

In [251]:
# check un needed columns are dropped 

twitter_df_clean.head(20)

Unnamed: 0,tweet_id,timestamp,text,rating_numerator,rating_denominator,name,stage
0,892177421306343426,2017-08-01 00:17:27 +0000,This is Tilly. She's just checking pup on you....,13,10,Tilly,
1,891815181378084864,2017-07-31 00:18:03 +0000,This is Archie. He is a rare Norwegian Pouncin...,12,10,Archie,
2,891327558926688256,2017-07-29 16:00:24 +0000,This is Franklin. He would like you to stop ca...,12,10,Franklin,
3,890729181411237888,2017-07-28 00:22:40 +0000,When you watch your owner call another dog a g...,13,10,,
4,890609185150312448,2017-07-27 16:25:51 +0000,This is Zoey. She doesn't want to be one of th...,13,10,Zoey,
5,890240255349198849,2017-07-26 15:59:51 +0000,This is Cassie. She is a college pup. Studying...,14,10,Cassie,doggo
6,890006608113172480,2017-07-26 00:31:25 +0000,This is Koda. He is a South Australian decksha...,13,10,Koda,
7,889665388333682689,2017-07-25 01:55:32 +0000,Here's a puppo that seems to be on the fence a...,13,10,,puppo
8,889638837579907072,2017-07-25 00:10:02 +0000,This is Ted. He does his best. Sometimes that'...,12,10,Ted,
9,889531135344209921,2017-07-24 17:02:04 +0000,This is Stuart. He's sporting his favorite fan...,13,10,Stuart,puppo


### Define
### Tidiness
##### `twitter_df_clean` table
- split `time stamp` column to two separate columns `date` and `time`

### Code

In [252]:
# First, convert the `timestamp` column type to datetime

twitter_df_clean['timestamp'] = pd.to_datetime(twitter_df_clean['timestamp'])

In [253]:
# second, split `time stamp` column to two separate columns `date` and `time`

twitter_df_clean['date'] = [d.date() for d in twitter_df_clean['timestamp']]
twitter_df_clean['time'] = [d.time() for d in twitter_df_clean['timestamp']]

In [254]:
# finally, drop the `timestamp` column as it is no longer needed

twitter_df_clean.drop(columns = ['timestamp'], inplace = True)

### Test

In [255]:
# check type of `timestamp` column is datetime

twitter_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1515 entries, 0 to 1514
Data columns (total 8 columns):
tweet_id              1515 non-null int64
text                  1515 non-null object
rating_numerator      1515 non-null int64
rating_denominator    1515 non-null int64
name                  1515 non-null object
stage                 1515 non-null object
date                  1515 non-null object
time                  1515 non-null object
dtypes: int64(3), object(5)
memory usage: 94.8+ KB


In [256]:
# check the columns are created correctly

twitter_df_clean.head()

Unnamed: 0,tweet_id,text,rating_numerator,rating_denominator,name,stage,date,time
0,892177421306343426,This is Tilly. She's just checking pup on you....,13,10,Tilly,,2017-08-01,00:17:27
1,891815181378084864,This is Archie. He is a rare Norwegian Pouncin...,12,10,Archie,,2017-07-31,00:18:03
2,891327558926688256,This is Franklin. He would like you to stop ca...,12,10,Franklin,,2017-07-29,16:00:24
3,890729181411237888,When you watch your owner call another dog a g...,13,10,,,2017-07-28,00:22:40
4,890609185150312448,This is Zoey. She doesn't want to be one of th...,13,10,Zoey,,2017-07-27,16:25:51


In [257]:
# check `timestamp` column is dropped 

twitter_df_clean.tail()

Unnamed: 0,tweet_id,text,rating_numerator,rating_denominator,name,stage,date,time
1510,666049248165822465,Here we have a 1949 1st generation vulpix. Enj...,5,10,,,2015-11-16,00:24:50
1511,666044226329800704,This is a purebred Piers Morgan. Loves to Netf...,6,10,a,,2015-11-16,00:04:52
1512,666033412701032449,Here is a very happy pup. Big fan of well-main...,9,10,a,,2015-11-15,23:21:54
1513,666029285002620928,This is a western brown Mitsubishi terrier. Up...,7,10,a,,2015-11-15,23:05:30
1514,666020888022790149,Here we have a Japanese Irish Setter. Lost eye...,8,10,,,2015-11-15,22:32:08


### Define
### Tidiness
##### `tweet_json_clean` table
- merge `tweet_json_clean` table with `twitter_df_clean` table 

### Code

In [258]:
# merge `tweet_json_clean` table with `twitter_df_clean` table 

twitter_df_clean = pd.merge(twitter_df_clean, tweet_json_clean, on = 'tweet_id')

### Test

In [264]:
# check the merge is done correctly

twitter_df_clean.head()

Unnamed: 0,tweet_id,text,rating_numerator,rating_denominator,name,stage,date,time,favorite_count,retweet_count
0,892177421306343426,This is Tilly. She's just checking pup on you....,13,10,Tilly,,2017-08-01,00:17:27,33129,6287
1,891815181378084864,This is Archie. He is a rare Norwegian Pouncin...,12,10,Archie,,2017-07-31,00:18:03,24943,4167
2,891327558926688256,This is Franklin. He would like you to stop ca...,12,10,Franklin,,2017-07-29,16:00:24,40193,9442
3,890729181411237888,When you watch your owner call another dog a g...,13,10,,,2017-07-28,00:22:40,65311,18964
4,890609185150312448,This is Zoey. She doesn't want to be one of th...,13,10,Zoey,,2017-07-27,16:25:51,27705,4278


In [263]:
predict_dog_clean.head()

Unnamed: 0,tweet_id,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


In [None]:
# writing cleaned data to a master csv file

twitter_df_cleaned.to_csv('twitter_archive_master.csv', encoding='utf-8', index=False)