In [451]:
# dependencies
import pandas as pd
import numpy as np
import requests
import tweepy
from tweepy import OAuthHandler
import json
import os
import json
import timeit
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings('ignore')

# options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Gathering Data for this Project

#### 1. WeRateDogs Twitter Archive

In [452]:
df_1 = pd.read_csv('twitter-archive-enhanced.csv')

#### 2. Tweet Image Predictions

In [453]:
# download tsv using Requests library
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
r = requests.get(url, allow_redirects = True)
open('image_predictions.tsv', 'wb').write(r.content)

# import the tsv file containing ANN image predictions
df_2 = pd.read_csv('image_predictions.tsv', sep='\t')

#### 3. Twitter API Query
Unfortunately I was denied by the Twitter to have a developers account. Seemed like a bot was stuck in a loop contacting me. I sent verbatim the recommended script by Udacity for approval but they responded asking for "more" information, although they just asked for the same information again. I retried, this time following the same ideas but in my own words and again they responded they needed more information. After the third attempt they said I was denied developers access.

So below is the code and data supplied by Udacity, although it's dissapointing taking this way out. A recommendation for Udacity is to give a heads up that something like this may be required for a project, like when the studen starts the module. Only concerning from a time perspective, I would have tried setting up the Twitter developers acount earlier when I had more time. The initial request took 18 hours to get a response. In total it was 48 hours between all the back and forth. If I was more pressed for time this could have been a problem for a student closing in a on due date.

```python
# Query Twitter API for each tweet in the Twitter archive and save JSON in a text file
# These are hidden to comply with Twitter's API terms and conditions
consumer_key = 'HIDDEN'
consumer_secret = 'HIDDEN'
access_token = 'HIDDEN'
access_secret = 'HIDDEN'

auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

api = tweepy.API(auth, wait_on_rate_limit=True)

# NOTE TO STUDENT WITH MOBILE VERIFICATION ISSUES:
# df_1 is a DataFrame with the twitter_archive_enhanced.csv file. You may have to
# change line 17 to match the name of your DataFrame with twitter_archive_enhanced.csv
# NOTE TO REVIEWER: this student had mobile verification issues so the following
# Twitter API code was sent to this student from a Udacity instructor
# Tweet IDs for which to gather additional data via Twitter's API
tweet_ids = df_1.tweet_id.values
len(tweet_ids)

# Query Twitter's API for JSON data for each tweet ID in the Twitter archive
count = 0
fails_dict = {}
start = timer()
# Save each tweet's returned JSON as a new line in a .txt file
with open('tweet_json.txt', 'w') as outfile:
    # This loop will likely take 20-30 minutes to run because of Twitter's rate limit
    for tweet_id in tweet_ids:
        count += 1
        print(str(count) + ": " + str(tweet_id))
        try:
            tweet = api.get_status(tweet_id, tweet_mode='extended')
            print("Success")
            json.dump(tweet._json, outfile)
            outfile.write('\n')
        except tweepy.TweepError as e:
            print("Fail")
            fails_dict[tweet_id] = e
            pass
end = timer()
print(end - start)
print(fails_dict)
```

In [454]:
# convert the JSON data in tweet_json.txt to a pandas dataframe
tweets_data = []
tweet_file = open('tweet_json.txt', "r")
for line in tweet_file:
    try:
        tweet = json.loads(line)
        tweets_data.append(tweet)
    except:
        continue   
tweet_file.close()
tweet_info = pd.DataFrame()

# populate the dataframe with desired data columns
def simple_tweet(tweet):
    """extract fields of interest from tweet dict"""
    return {key: tweet[key]
            for key in ['id', 'retweet_count', 'favorite_count']}
tweets = [simple_tweet(tweet) for tweet in tweets_data]
df_3 = pd.DataFrame(tweets)

## Assessing Data for this Project

#### WeRateDogs Twitter Archive

In [455]:
print(df_1.shape)
print('-' * 20)
print(df_1.info())
print('-' * 20)
print(df_1.head())

(2356, 17)
--------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object


In [456]:
df_1.describe()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,retweeted_status_id,retweeted_status_user_id,rating_numerator,rating_denominator
count,2356.0,78.0,78.0,181.0,181.0,2356.0,2356.0
mean,7.427716e+17,7.455079e+17,2.014171e+16,7.7204e+17,1.241698e+16,13.126486,10.455433
std,6.856705e+16,7.582492e+16,1.252797e+17,6.236928e+16,9.599254e+16,45.876648,6.745237
min,6.660209e+17,6.658147e+17,11856340.0,6.661041e+17,783214.0,0.0,0.0
25%,6.783989e+17,6.757419e+17,308637400.0,7.186315e+17,4196984000.0,10.0,10.0
50%,7.196279e+17,7.038708e+17,4196984000.0,7.804657e+17,4196984000.0,11.0,10.0
75%,7.993373e+17,8.257804e+17,4196984000.0,8.203146e+17,4196984000.0,12.0,10.0
max,8.924206e+17,8.862664e+17,8.405479e+17,8.87474e+17,7.874618e+17,1776.0,170.0


In [457]:
# check there are no duplicates
df_1['tweet_id'].nunique()

2356

In [458]:
df_1['rating_denominator'].unique()

array([ 10,   0,  15,  70,   7,  11, 150, 170,  20,  50,  90,  80,  40,
       130, 110,  16, 120,   2], dtype=int64)

In [459]:
# seeing if denominator feature is really necessary
rows = df_1.shape[0]
EQ, LT, GT = 0, 0, 0
for index, row in df_1.iterrows():
    if df_1['rating_numerator'][index] == df_1['rating_denominator'][index]:
        EQ += 1
    if df_1['rating_numerator'][index] < df_1['rating_denominator'][index]:
        LT += 1   
    if df_1['rating_numerator'][index] > df_1['rating_denominator'][index]:
        GT += 1 
print('total rows: ' + str(rows))
print(f'Equals: {EQ} / Less than: {LT} / Greater than: {GT}')
print(f'Sum of counts: {EQ+LT+GT}')
# at first I thought all the ratings would be greater than one, but it's not the case. The denominator is necessary.

total rows: 2356
Equals: 463 / Less than: 442 / Greater than: 1451
Sum of counts: 2356


In [460]:
df_1.sample(10)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
2320,666437273139982337,,,2015-11-17 02:06:42 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here we see a lone northeastern Cumberbatch. H...,,,,https://twitter.com/dog_rates/status/666437273...,7,10,,,,,
1576,687460506001633280,,,2016-01-14 02:25:31 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Kramer. He's a Picasso Tortellini. Tie...,,,,https://twitter.com/dog_rates/status/687460506...,10,10,Kramer,,,,
1127,729838605770891264,7.291135e+17,4196984000.0,2016-05-10 01:00:58 +0000,"<a href=""http://twitter.com/download/iphone"" r...","""Challenge completed"" \n(pupgraded to 12/10) h...",,,,https://twitter.com/dog_rates/status/729838605...,12,10,,,,,
259,843235543001513987,,,2017-03-18 22:59:54 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tycho. She just had new wheels install...,,,,https://twitter.com/dog_rates/status/843235543...,13,10,Tycho,,,,
2184,668988183816871936,,,2015-11-24 03:03:06 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Honor to rate this dog. Lots of fur on him. Tw...,,,,https://twitter.com/dog_rates/status/668988183...,7,10,,,,,
452,818614493328580609,,,2017-01-10 00:24:38 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Bear. He's a passionate believer of th...,,,,https://twitter.com/dog_rates/status/818614493...,12,10,Bear,,,,
152,862722525377298433,,,2017-05-11 17:34:13 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Dave. He passed the h*ck out. It's bar...,,,,https://twitter.com/dog_rates/status/862722525...,11,10,Dave,,,,
947,752519690950500352,,,2016-07-11 15:07:30 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Hopefully this puppo on a swing will help get ...,,,,https://twitter.com/dog_rates/status/752519690...,11,10,,,,,puppo
1955,673636718965334016,,,2015-12-06 22:54:44 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is a Lofted Aphrodisiac Terrier named Kip...,,,,https://twitter.com/dog_rates/status/673636718...,10,10,a,,,,
498,813130366689148928,8.131273e+17,4196984000.0,2016-12-25 21:12:41 +0000,"<a href=""http://twitter.com/download/iphone"" r...",I've been informed by multiple sources that th...,,,,,12,10,,,,,


In [461]:
count = df_1.groupby('doggo').doggo.count()
print('doggo count ' + str(count[1]))
count = df_1.groupby('floofer').floofer.count()
print('floofer count  ' + str(count[1]))
count = df_1.groupby('pupper').pupper.count()
print('pupper count  ' + str(count[1]))
count = df_1.groupby('puppo').puppo.count()
print('puppo count  ' + str(count[1]))

doggo count 97
floofer count  10
pupper count  257
puppo count  30


In [462]:
df_1['source'].nunique()

4

In [463]:
df_1['expanded_urls'].nunique()

2218

In [464]:
# check the dog type catagory 
cats = ['doggo', 
        'floofer',
        'pupper',
        'puppo',]

for c in cats:
    print(df_1[c].count())
    print(df_1[c].nunique())
    print(df_1[c].unique())

2356
2
['None' 'doggo']
2356
2
['None' 'floofer']
2356
2
['None' 'pupper']
2356
2
['None' 'puppo']


### Tweet Image Predictions

In [465]:
print(df_2.shape)
print('-' * 20)
print(df_2.info())
print('-' * 20)
print(df_2.head())

(2075, 12)
--------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB
None
--------------------
             tweet_id                                          jpg_url  \
0  666020888022790149  https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg   
1  666029285002620928  https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg   
2  666033412701032449  https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg   
3  666044226329800704  https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg   
4

In [466]:
# check there are no duplicates
df_2['tweet_id'].nunique()

2075

In [467]:
df_2['img_num'].nunique()

4

In [468]:
df_2.describe()

Unnamed: 0,tweet_id,img_num,p1_conf,p2_conf,p3_conf
count,2075.0,2075.0,2075.0,2075.0,2075.0
mean,7.384514e+17,1.203855,0.594548,0.1345886,0.06032417
std,6.785203e+16,0.561875,0.271174,0.1006657,0.05090593
min,6.660209e+17,1.0,0.044333,1.0113e-08,1.74017e-10
25%,6.764835e+17,1.0,0.364412,0.05388625,0.0162224
50%,7.119988e+17,1.0,0.58823,0.118181,0.0494438
75%,7.932034e+17,1.0,0.843855,0.1955655,0.09180755
max,8.924206e+17,4.0,1.0,0.488014,0.273419


### Twitter API Query

In [469]:
print(df_3.shape)
print('-' * 20)
print(df_3.info())
print('-' * 20)
print(df_3.head())

(838, 3)
--------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 838 entries, 0 to 837
Data columns (total 3 columns):
favorite_count    838 non-null int64
id                838 non-null int64
retweet_count     838 non-null int64
dtypes: int64(3)
memory usage: 19.7 KB
None
--------------------
   favorite_count                  id  retweet_count
0           39467  892420643555336193           8853
1           33819  892177421306343426           6514
2           25461  891815181378084864           4328
3           42908  891689557279858688           8964
4           41048  891327558926688256           9774


In [470]:
df_3.describe()

Unnamed: 0,favorite_count,id,retweet_count
count,838.0,838.0,838.0
mean,15188.225537,8.245384e+17,5736.091885
std,15236.248317,3.469656e+16,6515.024337
min,0.0,7.671914e+17,0.0
25%,5900.75,7.944983e+17,2351.25
50%,11913.0,8.222039e+17,3890.0
75%,21572.0,8.521307e+17,6542.75
max,132810.0,8.924206e+17,56625.0


In [471]:
df_3.sample(10)

Unnamed: 0,favorite_count,id,retweet_count
746,21252,779834332596887552,8237
428,10645,821107785811234820,2487
436,21979,820078625395449857,7246
552,49774,804026241225523202,18876
527,12595,808501579447930884,3007
447,49960,819006400881917954,21794
20,30779,887705289381826560,5609
237,23108,847116187444137987,3583
466,13367,817120970343411712,3011
166,16105,859851578198683649,3780


In [472]:
# check there are no duplicates
df_3['id'].nunique()

838

## Laundry List

#### WeRateDogs Twitter Archive
- 'timestamp' is the incorrect datatype.
- 'in_replay_to_status_id', 'in_reply_user_id','retweeted_status_id', 'retweeted_status_user_id',                      'retweeted_status_timestamp' are incomplete and should not be included in the data set.
- remove rows with 'rating_numerator' or 'rating_denominator' at 0, will cause issues with math functions.
- 'source' and 'expanded URLs' don't add anything to reporting the data.
- rename 'text' column to 'description'
- normalize the rating columns
- 'doggo', 'floofer', 'pupper', 'puppo' are catagories

#### Tweet Image Predictions
- 'img_num' should be renamed and changed to catagorical
- 'p1_dog', 'p2_dog', and 'p3_dog' add no value and should be dropped
- all dog breed need to be formatted for capitolized letters or not
- rename 'p1', 'p2', 'p3' to something that makes more sense
- remove '\_' and '-' in breed name
- remove mislabeled breeds like 'dishwasher' and consolidate breeds into a single column of the highest probability
    - if all three values in the prediction column are not dogs the row will be dropped

#### Twitter API Query
- There is only one cleaning activity for this data set, the match the feature name 'id' with the other two dataframes
    - the dataframe is complete
    - looking at the 'describe()' function, none of the values seem out of line, although it is strange some images were retweeted often but never favorited. This does not seem like a reason to reject the observations.
    - the features are all appropriate data types
    - there are no duplicate tweet ID numbers

## Cleaning Data for this Project

#### WeRateDogs Twitter Archive

In [473]:
# change 'timestamp' column to datetime datatype
df_1['timestamp'] = pd.to_datetime(df_1['timestamp'])
print(df_1['timestamp'].dtypes)
print(np.dtype('datetime64[ns]') == np.dtype('<M8[ns]'))

datetime64[ns]
True


In [474]:
# drop any rows containing values for 'in_replay_to_status_id', 'in_reply_user_id',
    # 'retweeted_status_id', 'retweeted_status_user_id', 'retweeted_status_timestamp'
# drop columns that are empty from above as well as 'source' and 'expanded URLs'
print(df_1.shape[0])
for index, row in df_1.iterrows():
    if str(row[1]) != 'nan': df_1.drop(index, inplace = True)
    if str(row[6]) != 'nan': df_1.drop(index, inplace = True)
print(df_1.shape[0])
cats = ['in_reply_to_status_id',
        'in_reply_to_user_id',
        'retweeted_status_id', 
        'retweeted_status_user_id', 
        'retweeted_status_timestamp',
        'expanded_urls',
        'source',]
for c in cats:
    df_1.drop(c, axis=1, inplace = True) 

2356
2097


In [475]:
# drop the rows with a numerator of 0
df_1 = df_1.drop(df_1.index[df_1.rating_numerator == 0])
df_1 = df_1.drop(df_1.index[df_1.rating_denominator == 0])
df_1.describe()

Unnamed: 0,tweet_id,rating_numerator,rating_denominator
count,2096.0,2096.0,2096.0
mean,7.365123e+17,12.195134,10.44895
std,6.708321e+16,40.37375,6.64664
min,6.660209e+17,1.0,2.0
25%,6.768177e+17,10.0,10.0
50%,7.097095e+17,11.0,10.0
75%,7.874779e+17,12.0,10.0
max,8.924206e+17,1776.0,170.0


In [476]:
# show the unique values in numerator and denominator columns
print(df_1['rating_numerator'].unique())
print(df_1['rating_denominator'].unique())

[  13   12   14    5   11    6   10   84   24   75   27    3    7    8
    9    4  165 1776  204   50   99   80   45   60   44  121   26    2
  144   88    1  420]
[ 10  70   7 150  11 170  20  50  90  80  40 110 120   2]


In [477]:
df_1.sample(3)

Unnamed: 0,tweet_id,timestamp,text,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
2320,666437273139982337,2015-11-17 02:06:42,Here we see a lone northeastern Cumberbatch. H...,7,10,,,,,
2192,668932921458302977,2015-11-23 23:23:31,This is Herald. He likes to swing. Subtle tong...,9,10,Herald,,,,
2152,669680153564442624,2015-11-26 00:52:45,This is Shawwn. He's a Turkish Gangrene Robitu...,8,10,Shawwn,,,,


In [478]:
# normalize the rating columns to values between 0 and 1
df_1['numerator_norm'] = ((df_1['rating_numerator']-df_1['rating_numerator'].min())/(df_1['rating_numerator'].max()-df_1['rating_numerator'].min()))
df_1['denominator_norm'] = ((df_1['rating_denominator']-df_1['rating_denominator'].min())/(df_1['rating_denominator'].max()-df_1['rating_denominator'].min()))
df_1['normalized_rating'] = np.nan

df_1['normalized_rating'] = df_1['numerator_norm'] / df_1['denominator_norm']

In [479]:
# test the results
df_1.describe()

Unnamed: 0,tweet_id,rating_numerator,rating_denominator,numerator_norm,denominator_norm,normalized_rating
count,2096.0,2096.0,2096.0,2096.0,2096.0,2095.0
mean,7.365123e+17,12.195134,10.44895,0.006307,0.050291,0.126559
std,6.708321e+16,40.37375,6.64664,0.022746,0.039563,0.469434
min,6.660209e+17,1.0,2.0,0.0,0.0,0.0
25%,6.768177e+17,10.0,10.0,0.00507,0.047619,0.106479
50%,7.097095e+17,11.0,10.0,0.005634,0.047619,0.11831
75%,7.874779e+17,12.0,10.0,0.006197,0.047619,0.130141
max,8.924206e+17,1776.0,170.0,1.0,1.0,21.0


In [480]:
# remove extreme outliers
print(df_1.shape[0])
df_1 = df_1.drop(df_1.index[df_1.normalized_rating == 0])
df_1 = df_1.drop(df_1.index[df_1.numerator_norm == 0])
df_1 = df_1.drop(df_1.index[df_1.denominator_norm == 0])
df_1 = df_1.drop(df_1.index[df_1.normalized_rating > 1])
df_1.dropna(subset=['normalized_rating'], inplace = True)
print(df_1.shape[0])

2096
2089


In [481]:
df_1.describe()

Unnamed: 0,tweet_id,rating_numerator,rating_denominator,numerator_norm,denominator_norm,normalized_rating
count,2089.0,2089.0,2089.0,2089.0,2089.0,2089.0
mean,7.366985e+17,11.182384,10.454284,0.005737,0.050323,0.114497
std,6.709821e+16,8.036421,6.655158,0.004528,0.039614,0.031312
min,6.660209e+17,2.0,7.0,0.000563,0.029762,0.011831
25%,6.76917e+17,10.0,10.0,0.00507,0.047619,0.106479
50%,7.099188e+17,11.0,10.0,0.005634,0.047619,0.11831
75%,7.878106e+17,12.0,10.0,0.006197,0.047619,0.130141
max,8.924206e+17,204.0,170.0,0.114366,1.0,0.875493


In [482]:
df_1['normalized_rating'].unique()

array([0.14197183, 0.13014085, 0.15380282, 0.04732394, 0.11830986,
       0.05915493, 0.10647887, 0.1155261 , 0.43538028, 0.87549296,
       0.30760563, 0.02366197, 0.07098592, 0.0828169 , 0.09464789,
       0.03549296, 0.10488009, 0.08413146, 0.1143662 , 0.01577465,
       0.09661972, 0.10540333, 0.09586132, 0.08676056, 0.11633803,
       0.10710156, 0.10516432, 0.06309859, 0.29577465, 0.01183099,
       0.11470041, 0.1055688 ])

In [483]:
df_1.rename(columns={'text': 'description'}, inplace = True)

In [484]:
for col in ['doggo', 'floofer', 'pupper', 'puppo']:
    df_1[col] = df_1[col].astype('category')
df_1.drop(['rating_numerator', 'rating_denominator', 'numerator_norm', 'denominator_norm'], axis=1, inplace = True)
print(df_1.sample(3))
print(df_1.info())

                tweet_id           timestamp  \
2271  667495797102141441 2015-11-20 00:12:54   
588   799422933579902976 2016-11-18 01:24:14   
371   828408677031882754 2017-02-06 01:03:14   

                                            description        name doggo  \
2271  This is Philippe from Soviet Russia. Commandin...    Philippe  None   
588   This is Longfellow (prolly sophisticated). He'...  Longfellow  None   
371   This is Bear. He went outside to play in the s...        Bear  None   

     floofer pupper puppo  normalized_rating  
2271    None   None  None           0.094648  
588     None   None  None           0.130141  
371     None   None  None           0.130141  
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2089 entries, 0 to 2355
Data columns (total 9 columns):
tweet_id             2089 non-null int64
timestamp            2089 non-null datetime64[ns]
description          2089 non-null object
name                 2089 non-null object
doggo                2089 non

#### Tweet Image Predictions

In [485]:
# change 'img_num' column name
df_2.rename(columns={'img_num': 'tweet_image_number'}, inplace = True)
# change this to a catagorical data type
df_2['tweet_image_number'] = df_2['tweet_image_number'].astype('category')

In [486]:
# drop columns
df_2.drop(['p1_dog', 'p2_dog', 'p3_dog'], axis=1, inplace=True)

In [487]:
df_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 9 columns):
tweet_id              2075 non-null int64
jpg_url               2075 non-null object
tweet_image_number    2075 non-null category
p1                    2075 non-null object
p1_conf               2075 non-null float64
p2                    2075 non-null object
p2_conf               2075 non-null float64
p3                    2075 non-null object
p3_conf               2075 non-null float64
dtypes: category(1), float64(3), int64(1), object(4)
memory usage: 132.0+ KB


In [488]:
# cleanup the 'p1', 'p2', and 'p3' columns
for col in ['p1', 'p2', 'p3']:
    df_2[col] = df_2[col].str.replace('_',' ')
    df_2[col] = df_2[col].str.replace('-',' ') 
    df_2[col] = df_2[col].str.title()
df_2.rename(columns={'p1': 'breed_probability_1', 'p2': 'breed_probability_2', 'p3': 'breed_probability_3'}, inplace = True)

In [489]:
df_2.sample(3)

Unnamed: 0,tweet_id,jpg_url,tweet_image_number,breed_probability_1,p1_conf,breed_probability_2,p2_conf,breed_probability_3,p3_conf
487,675501075957489664,https://pbs.twimg.com/media/CV_cnjHWUAADc-c.jpg,1,Dough,0.806757,Bakery,0.027907,French Loaf,0.018189
1665,812503143955202048,https://pbs.twimg.com/media/C0aXTLqXEAADxBi.jpg,2,Loupe,0.546856,Web Site,0.345298,Bubble,0.010528
102,667801013445750784,https://pbs.twimg.com/media/CUSBemVUEAAn-6V.jpg,1,Flat Coated Retriever,0.508392,Chesapeake Bay Retriever,0.262239,Curly Coated Retriever,0.04892


##### Scrape a list of dog breeds from [dogtime.com](https://dogtime.com/dog-breeds/profiles) to compare with df_2['p1', 'p2', 'p3']

As a note, I realize this scraped list from dogtime.com is not comprehensive and because of this there are some inconsistancies between the list and the data set the comparison is not perfect. It was more for the sake of a scraping exercise to gather more data because I could not do the Twitter API portion and to show that I idenified the errors in the Tweet Image Predictions data set. This would obviously be scrubbed more thoroughly in a real world activity.

In [490]:
#specify the URL
page = requests.get('https://dogtime.com/dog-breeds/profiles')
# check the response
page

<Response [200]>

In [491]:
# check that it worked
page.status_code

200

In [492]:
soup = BeautifulSoup(page.content, 'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<!--[if IE 8]> <html class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!-->
<html class="no-js">
 <!--<![endif]-->
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
  <meta content="bc06c90d1acb18ed0abe8e9b9c02db20" name="p:domain_verify">
   <meta content="width=device-width, initial-scale=1, maximum-scale=1, minimal-ui" name="viewport"/>
   <title>
    All Dog Breeds - Complete List of Dog Profiles
   </title>
   <style>
    *,.top-nav .menu{box-sizing:border-box}.coverpanel,.coverpanel-wrapper,.ir,.toggle-menu,.top-logo,.top-nav .menu .menu-item,svg:not(:root){overflow:hidden}.search-field,img{vertical-align:middle}.fb_reset,.top-logo,a{text-decoration:none}.home .content{padding:0 8px 8px}.home.main .widget_pb_coverpanel_widget{margin:0 -8px;min-height:314px}.sidebar-top,.top-300-promo,.top-300-promo .promo{min-height:250px}.coverpanel{position:relative;width:100%;max-width:414px;margin:0 auto;max-height:370px}.ho

In [493]:
# create the list of dog breeds
breeds = []
soup_dump = soup.findAll('a', class_ = 'post-title')
for breed in soup_dump:
    breed = str(breed).split('>', 1)
    breed = breed[1].split('<', 1)
    breeds.append(breed[0])
breeds

['Affenpinscher',
 'Afghan Hound',
 'Airedale Terrier',
 'Akita',
 'Alaskan Klee Kai',
 'Alaskan Malamute',
 'American Bulldog',
 'American English Coonhound',
 'American Eskimo Dog',
 'American Foxhound',
 'American Pit Bull Terrier',
 'American Staffordshire Terrier',
 'American Water Spaniel',
 'Anatolian Shepherd Dog',
 'Appenzeller Sennenhunde',
 'Australian Cattle Dog',
 'Australian Kelpie',
 'Australian Shepherd',
 'Australian Terrier',
 'Azawakh',
 'Barbet',
 'Basenji',
 'Basset Hound',
 'Beagle',
 'Bearded Collie',
 'Bedlington Terrier',
 'Belgian Malinois',
 'Belgian Sheepdog',
 'Belgian Tervuren',
 'Berger Picard',
 'Bernedoodle',
 'Bernese Mountain Dog',
 'Bichon Frise',
 'Black and Tan Coonhound',
 'Black Mouth Cur',
 'Black Russian Terrier',
 'Bloodhound',
 'Blue Lacy',
 'Bluetick Coonhound',
 'Boerboel',
 'Bolognese',
 'Border Collie',
 'Border Terrier',
 'Borzoi',
 'Boston Terrier',
 'Bouvier des Flandres',
 'Boxer',
 'Boykin Spaniel',
 'Bracco Italiano',
 'Briard',
 'B

In [494]:
df_2.shape

(2075, 9)

In [495]:
# check the predicted dog breed values again the scraped list and mark as 'NaN' if not matches,
# then drop the rows that all three are 'NaN'
for index, row in df_2.iterrows():
    if not any(breed == row[3] for breed in breeds):
        row[3] = 'NaN'
    df_2.loc[index, 'breed_probability_1'] = row[3]
    if not any(breed == row[5] for breed in breeds):
        row[5] = 'NaN'
    df_2.loc[index, 'breed_probability_2'] = row[5]
    if not any(breed == row[7] for breed in breeds):
        row[7] = 'NaN'
    df_2.loc[index, 'breed_probability_3'] = row[7]
    if df_2['breed_probability_1'][index] and df_2['breed_probability_2'][index] and df_2['breed_probability_3'][index] == 'NaN': df_2.drop(index, inplace=True)

In [496]:
df_2.head(10)

Unnamed: 0,tweet_id,jpg_url,tweet_image_number,breed_probability_1,p1_conf,breed_probability_2,p2_conf,breed_probability_3,p3_conf
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh Springer Spaniel,0.465074,Collie,0.156665,Shetland Sheepdog,0.061428
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,,0.506826,Miniature Pinscher,0.074192,Rhodesian Ridgeback,0.07201
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,,0.596461,,0.138584,Bloodhound,0.116197
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian Ridgeback,0.408143,,0.360687,Miniature Pinscher,0.222752
5,666050758794694657,https://pbs.twimg.com/media/CT5Jof1WUAEuVxN.jpg,1,Bernese Mountain Dog,0.651137,,0.263788,Greater Swiss Mountain Dog,0.016199
8,666057090499244032,https://pbs.twimg.com/media/CT5PY90WoAAQGLo.jpg,1,,0.962465,,0.014594,Golden Retriever,0.007959
9,666058600524156928,https://pbs.twimg.com/media/CT5Qw94XAAA_2dP.jpg,1,,0.201493,Komondor,0.192305,Soft Coated Wheaten Terrier,0.082086
10,666063827256086533,https://pbs.twimg.com/media/CT5Vg_wXIAAXfnj.jpg,1,Golden Retriever,0.77593,Tibetan Mastiff,0.093718,Labrador Retriever,0.072427
12,666073100786774016,https://pbs.twimg.com/media/CT5d9DZXAAALcwe.jpg,1,,0.260857,English Foxhound,0.175382,Ibizan Hound,0.097471
13,666082916733198337,https://pbs.twimg.com/media/CT5m4VGWEAAtKc8.jpg,1,Pug,0.489814,,0.404722,French Bulldog,0.04896


In [497]:
df_2.shape

(954, 9)

In [498]:
df_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 954 entries, 0 to 2073
Data columns (total 9 columns):
tweet_id               954 non-null int64
jpg_url                954 non-null object
tweet_image_number     954 non-null category
breed_probability_1    954 non-null object
p1_conf                954 non-null float64
breed_probability_2    954 non-null object
p2_conf                954 non-null float64
breed_probability_3    954 non-null object
p3_conf                954 non-null float64
dtypes: category(1), float64(3), int64(1), object(4)
memory usage: 68.2+ KB


In [499]:
# create new columns with 'most probable' dog breed and and confidence level. Drop the originals
df_2['most_probable_breed'] = ''
df_2['breed_confidence'] = ''
length = len(df_2)
for index, row in df_2.iterrows():
    if df_2['breed_probability_1'][index] != 'NaN':
        df_2.loc[index, 'most_probable_breed'] = row[3]
        df_2.loc[index, 'breed_confidence'] = row[4]
    else:
        if df_2['breed_probability_2'][index] != 'NaN':
            df_2.loc[index, 'most_probable_breed'] = row[5]
            df_2.loc[index, 'breed_confidence'] = row[6]
        else:
            df_2.loc[index, 'most_probable_breed'] = row[7]
            df_2.loc[index, 'breed_confidence'] = row[8]
# drop columns
df_2.drop(['breed_probability_1',
           'p1_conf',
           'breed_probability_2',
           'p2_conf',
           'breed_probability_3',
           'p3_conf'], axis=1, inplace=True)
# change the breed_confidence feature to a float
df_2['breed_confidence'] = df_2['breed_confidence'].astype('float')
df_2.head(10)

Unnamed: 0,tweet_id,jpg_url,tweet_image_number,most_probable_breed,breed_confidence
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh Springer Spaniel,0.465074
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,Miniature Pinscher,0.074192
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,Bloodhound,0.116197
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian Ridgeback,0.408143
5,666050758794694657,https://pbs.twimg.com/media/CT5Jof1WUAEuVxN.jpg,1,Bernese Mountain Dog,0.651137
8,666057090499244032,https://pbs.twimg.com/media/CT5PY90WoAAQGLo.jpg,1,Golden Retriever,0.007959
9,666058600524156928,https://pbs.twimg.com/media/CT5Qw94XAAA_2dP.jpg,1,Komondor,0.192305
10,666063827256086533,https://pbs.twimg.com/media/CT5Vg_wXIAAXfnj.jpg,1,Golden Retriever,0.77593
12,666073100786774016,https://pbs.twimg.com/media/CT5d9DZXAAALcwe.jpg,1,English Foxhound,0.175382
13,666082916733198337,https://pbs.twimg.com/media/CT5m4VGWEAAtKc8.jpg,1,Pug,0.489814


#### Twitter API Query

In [500]:
# rename 'id' column for merging
df_3.rename(columns={'id': 'tweet_id'}, inplace = True)
df_3.sample(1)

Unnamed: 0,favorite_count,tweet_id,retweet_count
144,15935,863553081350529029,4489


## Merging

In [501]:
# check the df shapes
print(df_1.shape)
print(df_2.shape)
print(df_3.shape)

(2089, 9)
(954, 5)
(838, 3)


In [502]:
# check the dataframes are complete
print(df_1.isnull().values.any())
print(df_2.isnull().values.any())
print(df_3.isnull().values.any())

False
False
False


In [503]:
# perform the merging
df = pd.merge(df_2, df_1, on='tweet_id')
print(df.shape)
df = pd.merge(df, df_3, on='tweet_id')
print(df.shape)

(908, 13)
(298, 15)


In [504]:
print(df.isnull().values.any())

False


In [505]:
df.head(5)

Unnamed: 0,tweet_id,jpg_url,tweet_image_number,most_probable_breed,breed_confidence,timestamp,description,name,doggo,floofer,pupper,puppo,normalized_rating,favorite_count,retweet_count
0,767500508068192258,https://pbs.twimg.com/media/Cqa1ofnXEAAG0yn.jpg,1,Golden Retriever,0.165063,2016-08-21 23:15:55,This is Louie. He's making quite a h*ckin mess...,Louie,,,,,0.130141,8295,2688
1,767754930266464257,https://pbs.twimg.com/media/CqedCQWWgAIab9L.jpg,1,Vizsla,0.307794,2016-08-22 16:06:54,This is Philbert. His toilet broke and he does...,Philbert,,,,,0.11831,17814,6221
2,768596291618299904,https://pbs.twimg.com/media/CqqaPjqWIAAOyNL.jpg,1,Great Pyrenees,0.729745,2016-08-24 23:50:10,Say hello to Oakley and Charlie. They're convi...,Oakley,,,,,0.130141,5592,1473
3,768609597686943744,https://pbs.twimg.com/media/CqqmWa7WcAAIM-n.jpg,1,Basenji,0.183283,2016-08-25 00:43:02,This is Lou. His sweater is too small and he a...,Lou,,,,,0.106479,4580,1382
4,768855141948723200,https://pbs.twimg.com/media/CquFrCKWAAAr32m.jpg,1,Rottweiler,0.055114,2016-08-25 16:58:45,This is Jesse. He really wants a belly rub. Wi...,Jesse,,,,,0.11831,4660,1034


## Storing, Analyzing, and Visualizing Data for this Project

In [506]:
# save the dataframe
df.to_csv('twitter_archive_master.csv')

In [507]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 298 entries, 0 to 297
Data columns (total 15 columns):
tweet_id               298 non-null int64
jpg_url                298 non-null object
tweet_image_number     298 non-null category
most_probable_breed    298 non-null object
breed_confidence       298 non-null float64
timestamp              298 non-null datetime64[ns]
description            298 non-null object
name                   298 non-null object
doggo                  298 non-null category
floofer                298 non-null category
pupper                 298 non-null category
puppo                  298 non-null category
normalized_rating      298 non-null float64
favorite_count         298 non-null int64
retweet_count          298 non-null int64
dtypes: category(5), datetime64[ns](1), float64(2), int64(3), object(4)
memory usage: 27.6+ KB


In [508]:
df.describe()

Unnamed: 0,tweet_id,breed_confidence,normalized_rating,favorite_count,retweet_count
count,298.0,298.0,298.0,298.0,298.0
mean,8.254651e+17,0.423249,0.132942,19514.647651,5190.130872
std,3.50669e+16,0.344811,0.022503,15657.117271,5611.708759
min,7.675005e+17,5.6e-05,0.023662,3005.0,546.0
25%,7.962085e+17,0.086408,0.130141,10098.5,2390.25
50%,8.21826e+17,0.356173,0.130141,14587.5,3551.0
75%,8.539478e+17,0.738836,0.141972,24539.75,5662.0
max,8.921774e+17,0.999828,0.43538,132810.0,48265.0


In [509]:
count = df.groupby('doggo').doggo.count()
print('doggo count ' + str(count[1]))
count = df.groupby('floofer').floofer.count()
print('floofer count  ' + str(count[1]))
count = df.groupby('pupper').pupper.count()
print('pupper count  ' + str(count[1]))
count = df.groupby('puppo').puppo.count()
print('puppo count  ' + str(count[1]))

doggo count 24
floofer count  4
pupper count  15
puppo count  7


In [520]:
# remaining observations with non NaN values in dog 'stage' descriptions vs how many we started with
count_1 = df.groupby('doggo').doggo.count()[1] + df.groupby('floofer').floofer.count()[1] + df.groupby('pupper').pupper.count()[1] + df.groupby('puppo').puppo.count()[1]
print((df.shape[1]/count_1)*100)
count_2 = df_1.groupby('doggo').doggo.count()[1] + df_1.groupby('floofer').floofer.count()[1] + df_1.groupby('pupper').pupper.count()[1] + df_1.groupby('puppo').puppo.count()[1]
print((df_1.shape[1]/count_2)*100)

30.0
2.5936599423631126
