In [1]:
import pandas as pd
import tweepy
import requests
import os
from dotenv import load_dotenv
from pathlib import Path

In [2]:
env_path = Path('.') / '.env'
load_dotenv(dotenv_path=env_path)

True

In [3]:
#Instantiate tweepy object

In [4]:
consumer_key = os.getenv('consumer_key')
consumer_secret = os.getenv('consumer_secret')
access_token = os.getenv('access_token')
access_secret = os.getenv('access_secret')

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

api = tweepy.API(auth)

### Gather

In [5]:
# Gather locally provided data on dog rating

In [6]:
dog_ratings_df = pd.read_csv('twitter-archive-enhanced.csv')

In [7]:
# Gather data programatically from a url using requests

In [8]:
response = requests.get('https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv')

In [9]:
column_headers = response.text.split('\n')[0].split('\t')
data_body = response.text.split('\n')[1:]
response_list = []
twitter_list = []
for row in data_body:
    response_list.append(row.split('\t'))

In [10]:
predictions_df = pd.DataFrame(response_list, columns=column_headers)

In [11]:
predictions_df.sample()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
667,682962037429899265,https://pbs.twimg.com/media/CXpeVzQW8AApKYb.jpg,1,dingo,0.2786,False,Chihuahua,0.1552069999999999,True,loupe,0.153598,False


In [12]:
predictions_df.shape

(2076, 12)

In [13]:
#Gather data from Twitter using tweepy API

In [14]:
twitter_list = []
for row in data_body:
    row_data = row.split('\t')
    tweet = None
    try:
        tweet = api.get_status(row_data[0])._json
    except:
        tweet = {'retweet_count': 0, 'favorite_count': 0}
    twitter_list.append({'tweet_id': row_data[0],'retweets': tweet['retweet_count'], 'likes': tweet['favorite_count']})

In [15]:
twitter_df = pd.DataFrame(twitter_list)

### Assess

In [16]:
dog_ratings_df.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


In [17]:
predictions_df.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.0614285,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.0741916999999999,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.1385839999999999,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


In [18]:
twitter_df.head()

Unnamed: 0,tweet_id,retweets,likes
0,666020888022790149,460,2412
1,666029285002620928,42,121
2,666033412701032449,41,112
3,666044226329800704,131,272
4,666049248165822465,40,96


In [19]:
dog_ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

In [20]:
dog_ratings_df.describe()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,retweeted_status_id,retweeted_status_user_id,rating_numerator,rating_denominator
count,2356.0,78.0,78.0,181.0,181.0,2356.0,2356.0
mean,7.427716e+17,7.455079e+17,2.014171e+16,7.7204e+17,1.241698e+16,13.126486,10.455433
std,6.856705e+16,7.582492e+16,1.252797e+17,6.236928e+16,9.599254e+16,45.876648,6.745237
min,6.660209e+17,6.658147e+17,11856340.0,6.661041e+17,783214.0,0.0,0.0
25%,6.783989e+17,6.757419e+17,308637400.0,7.186315e+17,4196984000.0,10.0,10.0
50%,7.196279e+17,7.038708e+17,4196984000.0,7.804657e+17,4196984000.0,11.0,10.0
75%,7.993373e+17,8.257804e+17,4196984000.0,8.203146e+17,4196984000.0,12.0,10.0
max,8.924206e+17,8.862664e+17,8.405479e+17,8.87474e+17,7.874618e+17,1776.0,170.0


In [21]:
dog_ratings_df.isnull().sum()

tweet_id                         0
in_reply_to_status_id         2278
in_reply_to_user_id           2278
timestamp                        0
source                           0
text                             0
retweeted_status_id           2175
retweeted_status_user_id      2175
retweeted_status_timestamp    2175
expanded_urls                   59
rating_numerator                 0
rating_denominator               0
name                             0
doggo                            0
floofer                          0
pupper                           0
puppo                            0
dtype: int64

In [23]:
dog_ratings_df['tweet_id'].nunique()

2356

In [25]:
predictions_df.columns

Index(['tweet_id', 'jpg_url', 'img_num', 'p1', 'p1_conf', 'p1_dog', 'p2',
       'p2_conf', 'p2_dog', 'p3', 'p3_conf', 'p3_dog'],
      dtype='object')

In [27]:
predictions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2076 entries, 0 to 2075
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   tweet_id  2076 non-null   object
 1   jpg_url   2075 non-null   object
 2   img_num   2075 non-null   object
 3   p1        2075 non-null   object
 4   p1_conf   2075 non-null   object
 5   p1_dog    2075 non-null   object
 6   p2        2075 non-null   object
 7   p2_conf   2075 non-null   object
 8   p2_dog    2075 non-null   object
 9   p3        2075 non-null   object
 10  p3_conf   2075 non-null   object
 11  p3_dog    2075 non-null   object
dtypes: object(12)
memory usage: 194.8+ KB


In [28]:
predictions_df.describe()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
count,2076,2075,2075,2075,2075.0,2075,2075,2075.0,2075,2075,2075.0,2075
unique,2076,2009,4,378,2006.0,2,405,2004.0,2,408,2006.0,2
top,775842724423557120,https://pbs.twimg.com/media/CU3mITUWIAAfyQS.jpg,1,golden_retriever,0.581403,True,Labrador_retriever,0.0693617,True,Labrador_retriever,0.0362507,True
freq,1,2,1780,150,2.0,1532,104,3.0,1553,79,2.0,1499


In [29]:
predictions_df['tweet_id'].nunique()

2076

In [31]:
predictions_df.duplicated('tweet_id').sum()

0

In [32]:
twitter_df.columns

Index(['tweet_id', 'retweets', 'likes'], dtype='object')

In [33]:
twitter_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2076 entries, 0 to 2075
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   tweet_id  2076 non-null   object
 1   retweets  2076 non-null   int64 
 2   likes     2076 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 48.8+ KB


In [34]:
twitter_df.describe()

Unnamed: 0,retweets,likes
count,2076.0,2076.0
mean,2561.215318,7904.015414
std,4496.566689,12009.226246
min,0.0,0.0
25%,541.0,1477.0
50%,1210.5,3475.5
75%,2948.0,9797.25
max,77687.0,155880.0


In [35]:
twitter_df['tweet_id'].nunique()

2076

In [39]:
twitter_df.to_csv('twitter_data.csv', index=False)

# Issues in Data

## Missing Data

### Dog Ratings

1. in_reply_to_status_id column has 2278 rows with missing values
2. in_reply_to_user_id column has 2278 missing values
3. retweeted_status_id column has 2175 missig values
4. retweeted_status_user_id column has 2175 missing values
5. expanded urls column has 59 missing values

### Predictions

1. tweet_id column has one more data item than all the other rows

## Data tidiness

In [38]:
dog_ratings_df.columns

Index(['tweet_id', 'in_reply_to_status_id', 'in_reply_to_user_id', 'timestamp',
       'source', 'text', 'retweeted_status_id', 'retweeted_status_user_id',
       'retweeted_status_timestamp', 'expanded_urls', 'rating_numerator',
       'rating_denominator', 'name', 'doggo', 'floofer', 'pupper', 'puppo'],
      dtype='object')

### Dog Ratings

1. in_reply_to_status_id, in_reply_to_user_id, timestamp, retweeted_status_id, retweeted_user_id, retweeted_status_timetstamp should not be part of dog_ratings_df 

### Predictions

1. image_num column has the same data for all rows; it adds no statistical value

## Data cleanliness

### Clean