In [1]:
import os
import io
import json
import pandas as pd
import numpy as np
import matplotlib as plt
import requests
import tweepy

In [2]:
consumer_key = os.environ['api_key']
consumer_secret = os.environ['api_secret_key']
access_token = os.environ['access_token']
access_secret = os.environ['access_token_secret']

In [2]:
csv_file_name = 'twitter-archive-enhanced.csv'

In [3]:
udacity_url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'

## Gather

In [4]:
df_tweets = pd.read_csv(csv_file_name)

In [5]:
image_data = requests.get(udacity_url).content

In [6]:
df_images = pd.read_csv(io.StringIO(image_data.decode('utf-8')), sep='\t')

In [4]:
# Set up tweepy with credentials.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

In [45]:
with open('tweet_json.txt', 'w') as file:
    for id in df_tweets['tweet_id']:
        try:
            tweet_info = api.get_status(id, tweet_mode='extended')
            json.dump(tweet_info._json, file)
            # ensure each entry on a new line
            file.write('\n')
        except tweepy.TweepError:
            # if it is not possible to find a tweet do nothing
            pass

Rate limit reached. Sleeping for: 508


In [7]:
tweets_data = []
with open('tweet_json.txt') as file:
    for line in file:
        # get rid of \n when adding to a list of dicts
        tweets_data.append(json.loads(line.replace('\n', '')))

In [8]:
# load twitter api data to Pandas
df_tweets_data = pd.DataFrame(tweets_data)

In [9]:
len(df_tweets_data), len(df_tweets)

(2340, 2356)

We've downloaded via API data on almost all of the tweets in the original DataFrame.

## Assess

Let's check what we've got in df_tweets:

In [9]:
df_tweets.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


In [40]:
df_tweets.describe()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,retweeted_status_id,retweeted_status_user_id,rating_numerator,rating_denominator
count,2356.0,78.0,78.0,181.0,181.0,2356.0,2356.0
mean,7.427716e+17,7.455079e+17,2.014171e+16,7.7204e+17,1.241698e+16,13.126486,10.455433
std,6.856705e+16,7.582492e+16,1.252797e+17,6.236928e+16,9.599254e+16,45.876648,6.745237
min,6.660209e+17,6.658147e+17,11856340.0,6.661041e+17,783214.0,0.0,0.0
25%,6.783989e+17,6.757419e+17,308637400.0,7.186315e+17,4196984000.0,10.0,10.0
50%,7.196279e+17,7.038708e+17,4196984000.0,7.804657e+17,4196984000.0,11.0,10.0
75%,7.993373e+17,8.257804e+17,4196984000.0,8.203146e+17,4196984000.0,12.0,10.0
max,8.924206e+17,8.862664e+17,8.405479e+17,8.87474e+17,7.874618e+17,1776.0,170.0


Let's check if datatypes are apropriate:

In [10]:
df_tweets.dtypes

tweet_id                        int64
in_reply_to_status_id         float64
in_reply_to_user_id           float64
timestamp                      object
source                         object
text                           object
retweeted_status_id           float64
retweeted_status_user_id      float64
retweeted_status_timestamp     object
expanded_urls                  object
rating_numerator                int64
rating_denominator              int64
name                           object
doggo                          object
floofer                        object
pupper                         object
puppo                          object
dtype: object

Let's check what we have in non null rows in replies:

In [11]:
df_tweets[~df_tweets['in_reply_to_status_id'].isnull()]

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
30,886267009285017600,8.862664e+17,2.281182e+09,2017-07-15 16:51:35 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@NonWhiteHat @MayhewMayhem omg hello tanner yo...,,,,,12,10,,,,,
55,881633300179243008,8.816070e+17,4.738443e+07,2017-07-02 21:58:53 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@roushfenway These are good dogs but 17/10 is ...,,,,,17,10,,,,,
64,879674319642796034,8.795538e+17,3.105441e+09,2017-06-27 12:14:36 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@RealKentMurphy 14/10 confirmed,,,,,14,10,,,,,
113,870726314365509632,8.707262e+17,1.648776e+07,2017-06-02 19:38:25 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@ComplicitOwl @ShopWeRateDogs &gt;10/10 is res...,,,,,10,10,,,,,
148,863427515083354112,8.634256e+17,7.759620e+07,2017-05-13 16:15:35 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@Jack_Septic_Eye I'd need a few more pics to p...,,,,,12,10,,,,,
149,863079547188785154,6.671522e+17,4.196984e+09,2017-05-12 17:12:53 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Ladies and gentlemen... I found Pipsy. He may ...,,,,https://twitter.com/dog_rates/status/863079547...,14,10,,,,,
179,857214891891077121,8.571567e+17,1.806710e+08,2017-04-26 12:48:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@Marc_IRL pixelated af 12/10,,,,,12,10,,,,,
184,856526610513747968,8.558181e+17,4.196984e+09,2017-04-24 15:13:52 +0000,"<a href=""http://twitter.com/download/iphone"" r...","THIS IS CHARLIE, MARK. HE DID JUST WANT TO SAY...",,,,https://twitter.com/dog_rates/status/856526610...,14,10,,,,,
186,856288084350160898,8.562860e+17,2.792810e+08,2017-04-23 23:26:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@xianmcguire @Jenna_Marbles Kardashians wouldn...,,,,,14,10,,,,,
188,855862651834028034,8.558616e+17,1.943518e+08,2017-04-22 19:15:32 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@dhmontgomery We also gave snoop dogg a 420/10...,,,,,420,10,,,,,


Looks like 'source' is type of tweeter client used:

In [12]:
df_tweets['source'].value_counts()

<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>     2221
<a href="http://vine.co" rel="nofollow">Vine - Make a Scene</a>                          91
<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>                       33
<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>      11
Name: source, dtype: int64

Let's check some expanded_urls:

In [13]:
df_tweets.iloc[10]['expanded_urls']

'https://twitter.com/dog_rates/status/890006608113172480/photo/1,https://twitter.com/dog_rates/status/890006608113172480/photo/1'

It seems that in some cases url is duplicated.

Checking 'dog stage' columns:

In [14]:
columns = ['doggo', 'floofer', 'pupper', 'puppo']

In [15]:
for column in columns:
    print(df_tweets[column].value_counts())

None     2259
doggo      97
Name: doggo, dtype: int64
None       2346
floofer      10
Name: floofer, dtype: int64
None      2099
pupper     257
Name: pupper, dtype: int64
None     2326
puppo      30
Name: puppo, dtype: int64


Let's check names:

In [16]:
df_tweets['name'].value_counts()

None           745
a               55
Charlie         12
Lucy            11
Cooper          11
Oliver          11
Tucker          10
Lola            10
Penny           10
Bo               9
Winston          9
the              8
Sadie            8
Buddy            7
an               7
Daisy            7
Toby             7
Bailey           7
Jack             6
Koda             6
Jax              6
Scout            6
Milo             6
Stanley          6
Leo              6
Bella            6
Oscar            6
Rusty            6
Dave             6
Phil             5
              ... 
Chesterson       1
Chef             1
Scott            1
such             1
Fiji             1
Maisey           1
Pawnd            1
Julio            1
Carll            1
Chase            1
Sweets           1
General          1
Obie             1
Christoper       1
Ruffles          1
Aubie            1
Aldrick          1
William          1
Charl            1
Lilli            1
Geoff            1
Shakespeare 

Dog names like 'a', 'the', 'an' are not valid names. Also absent data was marked as a string 'None' which is also not apropriate.

Now let's check ratings:

In [28]:
df_tweets['rating_numerator'].value_counts()

12      558
11      464
10      461
13      351
9       158
8       102
7        55
14       54
5        37
6        32
3        19
4        17
1         9
2         9
420       2
0         2
15        2
75        2
80        1
20        1
24        1
26        1
44        1
50        1
60        1
165       1
84        1
88        1
144       1
182       1
143       1
666       1
960       1
1776      1
17        1
27        1
45        1
99        1
121       1
204       1
Name: rating_numerator, dtype: int64

In [33]:
df_tweets['rating_denominator'].value_counts()

10     2333
11        3
50        3
80        2
20        2
2         1
16        1
40        1
70        1
15        1
90        1
110       1
120       1
130       1
150       1
170       1
7         1
0         1
Name: rating_denominator, dtype: int64

Because WeRateDogs uses quite an unique ratings system it is very hard to assess it's correctness however we can assume that denominator larger than 10 is a mistake, for example this [tweet](https://twitter.com/dog_rates/status/740373189193256964/photo/1) has rating 9/11 however this is actually a date and true rating is 14/10.

In conclusion let's check if we have duplicated rows:

In [39]:
len(df_tweets[df_tweets.duplicated()])

0

Let's check df_images:

In [24]:
df_images.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [18]:
df_images.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


In [19]:
df_images.dtypes

tweet_id      int64
jpg_url      object
img_num       int64
p1           object
p1_conf     float64
p1_dog         bool
p2           object
p2_conf     float64
p2_dog         bool
p3           object
p3_conf     float64
p3_dog         bool
dtype: object

Let's check what breeds we have:

In [42]:
columns = ['p1', 'p2', 'p3']
for column in columns:
    print(df_images[column].value_counts())

golden_retriever             150
Labrador_retriever           100
Pembroke                      89
Chihuahua                     83
pug                           57
chow                          44
Samoyed                       43
toy_poodle                    39
Pomeranian                    38
cocker_spaniel                30
malamute                      30
French_bulldog                26
miniature_pinscher            23
Chesapeake_Bay_retriever      23
seat_belt                     22
Staffordshire_bullterrier     20
Siberian_husky                20
German_shepherd               20
web_site                      19
Cardigan                      19
beagle                        18
Eskimo_dog                    18
Shetland_sheepdog             18
teddy                         18
Maltese_dog                   18
Lakeland_terrier              17
Rottweiler                    17
Shih-Tzu                      17
kuvasz                        16
Italian_greyhound             16
          

With dog breed we got some other items like 'lifeboat', 'barbershop' etc but this is to be expected it seems that image recognition model is not limited to dogs. What we can notice here is that some dog breeds capitalised and some are not i.e. 'miniature_poodle' and 'Siberian_husky'.

Let's check how image classifer actually worked:

In [23]:
df_images[df_images['p1_dog'] == False]

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
6,666051853826850816,https://pbs.twimg.com/media/CT5KoJ1WoAAJash.jpg,1,box_turtle,0.933012,False,mud_turtle,4.588540e-02,False,terrapin,1.788530e-02,False
8,666057090499244032,https://pbs.twimg.com/media/CT5PY90WoAAQGLo.jpg,1,shopping_cart,0.962465,False,shopping_basket,1.459380e-02,False,golden_retriever,7.958960e-03,True
17,666104133288665088,https://pbs.twimg.com/media/CT56LSZWoAAlJj2.jpg,1,hen,0.965932,False,cock,3.391940e-02,False,partridge,5.206580e-05,False
18,666268910803644416,https://pbs.twimg.com/media/CT8QCd1WEAADXws.jpg,1,desktop_computer,0.086502,False,desk,8.554740e-02,False,bookcase,7.947970e-02,False
21,666293911632134144,https://pbs.twimg.com/media/CT8mx7KW4AEQu8N.jpg,1,three-toed_sloth,0.914671,False,otter,1.525000e-02,False,great_grey_owl,1.320720e-02,False
22,666337882303524864,https://pbs.twimg.com/media/CT9OwFIWEAMuRje.jpg,1,ox,0.416669,False,Newfoundland,2.784070e-01,True,groenendael,1.026430e-01,True
25,666362758909284353,https://pbs.twimg.com/media/CT9lXGsUcAAyUFt.jpg,1,guinea_pig,0.996496,False,skunk,2.402450e-03,False,hamster,4.608630e-04,False
29,666411507551481857,https://pbs.twimg.com/media/CT-RugiWIAELEaq.jpg,1,coho,0.404640,False,barracouta,2.714850e-01,False,gar,1.899450e-01,False
33,666430724426358785,https://pbs.twimg.com/media/CT-jNYqW4AAPi2M.jpg,1,llama,0.505184,False,Irish_terrier,1.041090e-01,True,dingo,6.207120e-02,False
43,666776908487630848,https://pbs.twimg.com/media/CUDeDoWUYAAD-EM.jpg,1,seat_belt,0.375057,False,miniature_pinscher,1.671750e-01,True,Chihuahua,8.695060e-02,True


It seems that WeRateDogs posts not only dogs but other animals too here is a [turtle](https://twitter.com/dog_rates/status/666051853826850816)

Anyway for the purpose of this project out of three columns we need only one with actually dogs in it.

In [43]:
df_images.describe()

Unnamed: 0,tweet_id,img_num,p1_conf,p2_conf,p3_conf
count,2075.0,2075.0,2075.0,2075.0,2075.0
mean,7.384514e+17,1.203855,0.594548,0.1345886,0.06032417
std,6.785203e+16,0.561875,0.271174,0.1006657,0.05090593
min,6.660209e+17,1.0,0.044333,1.0113e-08,1.74017e-10
25%,6.764835e+17,1.0,0.364412,0.05388625,0.0162224
50%,7.119988e+17,1.0,0.58823,0.118181,0.0494438
75%,7.932034e+17,1.0,0.843855,0.1955655,0.09180755
max,8.924206e+17,4.0,1.0,0.488014,0.273419


In [45]:
columns = ['p1_dog', 'p2_dog', 'p3_dog']
for column in columns:
    print(df_images[column].value_counts())

True     1532
False     543
Name: p1_dog, dtype: int64
True     1553
False     522
Name: p2_dog, dtype: int64
True     1499
False     576
Name: p3_dog, dtype: int64


Let's check for duplicates:

In [47]:
len(df_images[df_images.duplicated()])

0

Let's see what we've abtained via twitter-API:

In [11]:
df_tweets_data.columns

Index(['contributors', 'coordinates', 'created_at', 'display_text_range',
       'entities', 'extended_entities', 'favorite_count', 'favorited',
       'full_text', 'geo', 'id', 'id_str', 'in_reply_to_screen_name',
       'in_reply_to_status_id', 'in_reply_to_status_id_str',
       'in_reply_to_user_id', 'in_reply_to_user_id_str', 'is_quote_status',
       'lang', 'place', 'possibly_sensitive', 'possibly_sensitive_appealable',
       'quoted_status', 'quoted_status_id', 'quoted_status_id_str',
       'quoted_status_permalink', 'retweet_count', 'retweeted',
       'retweeted_status', 'source', 'truncated', 'user'],
      dtype='object')

In [15]:
df_tweets_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2340 entries, 0 to 2339
Data columns (total 32 columns):
contributors                     0 non-null object
coordinates                      0 non-null object
created_at                       2340 non-null object
display_text_range               2340 non-null object
entities                         2340 non-null object
extended_entities                2067 non-null object
favorite_count                   2340 non-null int64
favorited                        2340 non-null bool
full_text                        2340 non-null object
geo                              0 non-null object
id                               2340 non-null int64
id_str                           2340 non-null object
in_reply_to_screen_name          77 non-null object
in_reply_to_status_id            77 non-null float64
in_reply_to_status_id_str        77 non-null object
in_reply_to_user_id              77 non-null float64
in_reply_to_user_id_str          77 non-null obj

We got a bunch of data on individual tweets, but most interesting to us is retweet_count and favorite_count:

In [17]:
df_tweets_data[['id', 'retweet_count','favorite_count']].sample(5)

Unnamed: 0,id,retweet_count,favorite_count
1894,674416750885273600,146,700
38,884876753390489601,5491,27416
544,803692223237865472,8170,0
81,876838120628539392,3283,20338
1147,723688335806480385,3167,8029


In [18]:
df_tweets_data[['retweet_count', 'favorite_count']].describe()

Unnamed: 0,retweet_count,favorite_count
count,2340.0,2340.0
mean,2928.622222,7960.525214
std,4933.774449,12328.813228
min,0.0,0.0
25%,587.75,1371.75
50%,1367.5,3460.5
75%,3412.75,9740.0
max,83635.0,164249.0


df_tweets_data seems to be a clean and tidy dataset atleast in parts which matter for this project.

### Quality

<ul>
    <li><b>For the tweet archive:</b></li>
    <li>Timestamp is a string, should be datetime object</li>
    <li>Id's in in_reply_to_status_id, in_reply_to_user_id, retweeted_status_id and retweeted_status_user_id are floats should be integers</li>
    <li>Tags in source column, do not contribute usefull info, needs to be changed to iphone, vine, web, tweetdeck</li>
    <li>Some names are obviously invalid like 'a', 'the', 'an'</li>
    <li>Missing values in names column labeled as string 'None'</li>
    <li>Rating denominators should be not larger than 10</li>
    <li>Missing values in 'dog stages' labeled as string 'None'</li>
    <li><b>For image prediction DataFrame:</b></li>
    <li>Some dog breed capitalised and some are not</li>
    <li>Out of three columns we need only one - actually with a dog breed in it</li>
</ul>

### Tidiness

<ul>
    <li><b>For the tweet archive:</b></li>
    <li>In expanded urls some of the url's are duplicated</li>
    <li>Columns 'doggo', 'floofer', 'pupper', 'puppo' are realy just one variable: dog stage as per Dogtionary</li>
    <li>Replies and retweets are present</li>
    <li>Since we have a single unit of observation - tweet we should put all info in one DataFrame (merging all three DataFrames together)</li>
</ul>

## Clean

First we have to deal with tidiness issues than move to quality issues.

### Define

Merge all three data sets together.

### Clean

In [39]:
df_tweets_data_clean = df_tweets_data.copy()
# Rename id column for merging.
df_tweets_data_clean.rename(index=str, columns={'id': 'tweet_id'}, inplace=True)

In [49]:
# merge with data from twitter API:
df_twitter_archive_master = pd.merge(df_tweets,
                                     df_tweets_data_clean[['tweet_id', 'retweet_count', 'favorite_count']],
                                     how='left')

### Test

In [52]:
df_twitter_archive_master.tail()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo,retweet_count,favorite_count
2351,666049248165822465,,,2015-11-16 00:24:50 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here we have a 1949 1st generation vulpix. Enj...,,,,https://twitter.com/dog_rates/status/666049248...,5,10,,,,,,41.0,107.0
2352,666044226329800704,,,2015-11-16 00:04:52 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is a purebred Piers Morgan. Loves to Netf...,,,,https://twitter.com/dog_rates/status/666044226...,6,10,a,,,,,139.0,292.0
2353,666033412701032449,,,2015-11-15 23:21:54 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here is a very happy pup. Big fan of well-main...,,,,https://twitter.com/dog_rates/status/666033412...,9,10,a,,,,,43.0,123.0
2354,666029285002620928,,,2015-11-15 23:05:30 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is a western brown Mitsubishi terrier. Up...,,,,https://twitter.com/dog_rates/status/666029285...,7,10,a,,,,,47.0,126.0
2355,666020888022790149,,,2015-11-15 22:32:08 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here we have a Japanese Irish Setter. Lost eye...,,,,https://twitter.com/dog_rates/status/666020888...,8,10,,,,,,502.0,2537.0


Looks like successful merge If we check about 16 tweets are missing data because they were deleted at some point.

In [53]:
len(df_twitter_archive_master[df_twitter_archive_master['retweet_count'].isnull()])

16

Let's drop deleted tweets:

In [61]:
df_twitter_archive_master = df_twitter_archive_master.dropna(subset=['retweet_count', 'favorite_count']).reset_index()

But for some reason ints in favorite_count and retweet_count were converted to floats, let's convert them back to ints.

In [63]:
df_twitter_archive_master['retweet_count'] = df_twitter_archive_master['retweet_count'].astype(int)
df_twitter_archive_master['favorite_count'] = df_twitter_archive_master['favorite_count'].astype(int)

In [65]:
df_twitter_archive_master.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2340 entries, 0 to 2339
Data columns (total 20 columns):
index                         2340 non-null int64
tweet_id                      2340 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2340 non-null object
source                        2340 non-null object
text                          2340 non-null object
retweeted_status_id           167 non-null float64
retweeted_status_user_id      167 non-null float64
retweeted_status_timestamp    167 non-null object
expanded_urls                 2281 non-null object
rating_numerator              2340 non-null int64
rating_denominator            2340 non-null int64
name                          2340 non-null object
doggo                         2340 non-null object
floofer                       2340 non-null object
pupper                        2340 non-null object
puppo                         234

### Define

Convert timestamp string to datetime object.

### Clean

In [58]:
df_tweets_clean['timestamp'] = pd.to_datetime(df_tweets_clean['timestamp'])

### Test

In [64]:
df_tweets_clean['timestamp'].dtype

dtype('<M8[ns]')

### Define

Since we are not interested in replies or retweets let's delete them from dataframe completely and drop these columns.

### Clean

In [104]:
columns = ['in_reply_to_status_id', 'in_reply_to_user_id', 'retweeted_status_id', 'retweeted_status_user_id']
for column in columns:
    df_tweets_clean = df_tweets_clean[df_tweets_clean[column].isnull()]
df_tweets_clean.drop(columns, axis=1, inplace=True)

### Test

In [105]:
# check that we didn't drop everything
df_tweets_clean.head()

Unnamed: 0,tweet_id,timestamp,source,text,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


In [108]:
# How many rows we've dropped?
len(df_tweets) - len(df_tweets_clean)

259

## Analyse