# Project: Wrangling and Analyze Data

In [1]:
# Import necessary python libraries.
import pandas as pd
import requests 
import os
import matplotlib.pyplot as plt

## Data Gathering
In the cell below, gather **all** three pieces of data for this project and load them in the notebook. **Note:** the methods required to gather each data are different.
1. Directly download the WeRateDogs Twitter archive data (twitter_archive_enhanced.csv)

In [2]:
twitter_archive = pd.read_csv('data/twitter-archive-enhanced.csv')

In [3]:
twitter_archive.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


2. Use the Requests library to download the tweet image prediction (image_predictions.tsv)

In [4]:
# Save file to download folder
folder_name = 'data'

url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
response = requests.get(url)
with open(os.path.join(folder_name, url.split('/')[-1]), 'wb') as file:
    file.write(response.content)

In [5]:
image_prediction = pd.read_csv('data/image-predictions.tsv', sep='\t')
image_prediction.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


3. Use the Tweepy library to query additional data via the Twitter API (tweet_json.txt)

In [6]:
import tweepy
from tweepy import OAuthHandler
import json
from timeit import default_timer as timer

# Query Twitter API for each tweet in the Twitter archive and save JSON in a text file
# These are hidden to comply with Twitter's API terms and conditions
consumer_key = 'HIDDEN'
consumer_secret = 'HIDDEN'
access_token = 'HIDDEN'
access_secret = 'HIDDEN'

auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

api = tweepy.API(auth, wait_on_rate_limit=True)

# NOTE TO STUDENT WITH MOBILE VERIFICATION ISSUES:
# df_1 is a DataFrame with the twitter_archive_enhanced.csv file. You may have to
# change line 17 to match the name of your DataFrame with twitter_archive_enhanced.csv
# NOTE TO REVIEWER: this student had mobile verification issues so the following
# Twitter API code was sent to this student from a Udacity instructor
# Tweet IDs for which to gather additional data via Twitter's API
tweet_ids = twitter_archive_enhanced.tweet_id.values
len(tweet_ids)

# Query Twitter's API for JSON data for each tweet ID in the Twitter archive
count = 0
fails_dict = {}
start = timer()
# Save each tweet's returned JSON as a new line in a .txt file
with open('tweet_json.txt', 'w') as outfile:
    # This loop will likely take 20-30 minutes to run because of Twitter's rate limit
    for tweet_id in tweet_ids:
        count += 1
        print(str(count) + ": " + str(tweet_id))
        try:
            tweet = api.get_status(tweet_id, tweet_mode='extended')
            print("Success")
            json.dump(tweet._json, outfile)
            outfile.write('\n')
        except tweepy.TweepError as e:
            print("Fail")
            fails_dict[tweet_id] = e
            pass
end = timer()
print(end - start)
print(fails_dict)

NameError: name 'twitter_archive_enhanced' is not defined

In [7]:
folder_name = 'data'
df_list = []

all_tweet = [json.loads(line) for line in open(os.path.join(folder_name, 'tweet-json.txt'))]
for tweet in all_tweet:
    tweet_id = tweet['id']
    text = tweet['full_text']
    only_url = text[text.find('https'):] 
    retweet_count = tweet['retweet_count']
    favorite_count = tweet['favorite_count']
    followers_count = tweet['user']['followers_count']
    friends_count = tweet['user']['friends_count']
    whole_source = tweet['source']
    source=whole_source[whole_source.find('rel="nofollow">') + 15:-4]
    retweeted = tweet.get('retweeted', 'This is a retweet')
    if retweeted == False:
        retweeted_status = 'Original tweet'
    else:
        retweeted_status = retweeted
    

    
        
    df_list.append({'tweet_id': tweet_id,
                    'url': only_url,
                    'retweet_count': retweet_count,
                    'favorite_count': favorite_count,
                    'followers_count': followers_count,
                    'friends_count': friends_count,
                    'source': source,
                    'retweeted_status': retweeted_status})
        
tweet_json = pd.DataFrame(df_list, columns = ['tweet_id', 'retweet_count', 'favorite_count', 'followers_count',
                                              'friends_count', 'source', 'retweeted_status', 'url'])

# List of dictionaries to build file by file and later convert to a DataFrame
folder_name = 'data'
#all_tweet = []
df_list = []

all_tweet = [json.loads(line) for line in open(os.path.join(folder_name, 'tweet-json.txt'))]
#with open(os.path.join(folder_name, 'tweet-json.txt'), encoding='utf-8') as json_file:
    #print(type(json_file))
    #all_tweet = json.load(json_file)
for tweet in all_tweet:
    tweet_id = tweet['id']
    text = tweet['full_text']
    only_url = text[text.find('https'):] 
    retweet_count = tweet['retweet_count']
    favorite_count = tweet['favorite_count']
    followers_count = tweet['user']['followers_count']
    friends_count = tweet['user']['friends_count']
    whole_source = tweet['source']
    source=whole_source[whole_source.find('rel="nofollow">') + 15:-4]
    retweeted_status = tweet['retweeted_status'] = tweet.get('retweeted_status', 'Orignal tweet')
    re = tweet['retweeted_status']
    print(re)
    if retweeted_status == 'Original tweet':
        url = only_url
        print(retweeted_status)
    else:
        retweeted_status = 'This is a retweet'
        url = only_url
        print(retweeted_status)
    #print(retweeted_status)
    #break
        
        
        df_list.append({'tweet_id': tweet_id,
                        'url': url,
                        'retweet_count': retweet_count,
                        'favorite_count': favorite_count,
                        'followers_count': followers_count,
                        'friends_count': friends_count,
                        'source': source,
                        'retweeted_status': retweeted_status})
        
tweet_json = pd.DataFrame(df_list, columns = ['tweet_id', 'retweet_count', 'favorite_count', 'followers_count',
                                              'friends_count', 'source', 'retweeted_status', 'url'])

## Assessing Data
In this section, detect and document at least **eight (8) quality issues and two (2) tidiness issue**. You must use **both** visual assessment
programmatic assessement to assess the data.

**Note:** pay attention to the following key points when you access the data.

* You only want original ratings (no retweets) that have images. Though there are 5000+ tweets in the dataset, not all are dog ratings and some are retweets.
* Assessing and cleaning the entire dataset completely would require a lot of time, and is not necessary to practice and demonstrate your skills in data wrangling. Therefore, the requirements of this project are only to assess and clean at least 8 quality issues and at least 2 tidiness issues in this dataset.
* The fact that the rating numerators are greater than the denominators does not need to be cleaned. This [unique rating system](http://knowyourmeme.com/memes/theyre-good-dogs-brent) is a big part of the popularity of WeRateDogs.
* You do not need to gather the tweets beyond August 1st, 2017. You can, but note that you won't be able to gather the image predictions for these tweets since you don't have access to the algorithm used.



In [8]:
# increasing the column width so that the whole text in the 'text' column is visible
pd.set_option('display.max_colwidth', None)

* ##### `Visual assessment`: 
Each piece of gathered data is displayed for visual assessment purposes.

In [9]:
twitter_archive

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU,,,,https://twitter.com/dog_rates/status/892420643555336193/photo/1,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boops, the whole bit. 13/10 https://t.co/0Xxu71qeIV",,,,https://twitter.com/dog_rates/status/892177421306343426/photo/1,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Archie. He is a rare Norwegian Pouncing Corgo. Lives in the tall grass. You never know when one may strike. 12/10 https://t.co/wUnZnhtVJB,,,,https://twitter.com/dog_rates/status/891815181378084864/photo/1,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Darla. She commenced a snooze mid meal. 13/10 happens to the best of us https://t.co/tD36da7qLQ,,,,https://twitter.com/dog_rates/status/891689557279858688/photo/1,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","This is Franklin. He would like you to stop calling him ""cute."" He is a very fierce shark and should be respected as such. 12/10 #BarkWeek https://t.co/AtUZn91f7f",,,,"https://twitter.com/dog_rates/status/891327558926688256/photo/1,https://twitter.com/dog_rates/status/891327558926688256/photo/1",12,10,Franklin,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2351,666049248165822465,,,2015-11-16 00:24:50 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",Here we have a 1949 1st generation vulpix. Enjoys sweat tea and Fox News. Cannot be phased. 5/10 https://t.co/4B7cOc1EDq,,,,https://twitter.com/dog_rates/status/666049248165822465/photo/1,5,10,,,,,
2352,666044226329800704,,,2015-11-16 00:04:52 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is a purebred Piers Morgan. Loves to Netflix and chill. Always looks like he forgot to unplug the iron. 6/10 https://t.co/DWnyCjf2mx,,,,https://twitter.com/dog_rates/status/666044226329800704/photo/1,6,10,a,,,,
2353,666033412701032449,,,2015-11-15 23:21:54 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",Here is a very happy pup. Big fan of well-maintained decks. Just look at that tongue. 9/10 would cuddle af https://t.co/y671yMhoiR,,,,https://twitter.com/dog_rates/status/666033412701032449/photo/1,9,10,a,,,,
2354,666029285002620928,,,2015-11-15 23:05:30 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is a western brown Mitsubishi terrier. Upset about leaf. Actually 2 dogs here. 7/10 would walk the shit out of https://t.co/r7mOb2m0UI,,,,https://twitter.com/dog_rates/status/666029285002620928/photo/1,7,10,a,,,,


In [10]:
image_prediction

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.072010,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True
...,...,...,...,...,...,...,...,...,...,...,...,...
2070,891327558926688256,https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg,2,basset,0.555712,True,English_springer,0.225770,True,German_short-haired_pointer,0.175219,True
2071,891689557279858688,https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg,1,paper_towel,0.170278,False,Labrador_retriever,0.168086,True,spatula,0.040836,False
2072,891815181378084864,https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg,1,Chihuahua,0.716012,True,malamute,0.078253,True,kelpie,0.031379,True
2073,892177421306343426,https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,1,Chihuahua,0.323581,True,Pekinese,0.090647,True,papillon,0.068957,True


In [11]:
tweet_json

Unnamed: 0,tweet_id,retweet_count,favorite_count,followers_count,friends_count,source,retweeted_status,url
0,892420643555336193,8853,39467,3200889,104,Twitter for iPhone,Original tweet,https://t.co/MgUWQ76dJU
1,892177421306343426,6514,33819,3200889,104,Twitter for iPhone,Original tweet,https://t.co/0Xxu71qeIV
2,891815181378084864,4328,25461,3200889,104,Twitter for iPhone,Original tweet,https://t.co/wUnZnhtVJB
3,891689557279858688,8964,42908,3200889,104,Twitter for iPhone,Original tweet,https://t.co/tD36da7qLQ
4,891327558926688256,9774,41048,3200889,104,Twitter for iPhone,Original tweet,https://t.co/AtUZn91f7f
...,...,...,...,...,...,...,...,...
2349,666049248165822465,41,111,3201018,104,Twitter for iPhone,Original tweet,https://t.co/4B7cOc1EDq
2350,666044226329800704,147,311,3201018,104,Twitter for iPhone,Original tweet,https://t.co/DWnyCjf2mx
2351,666033412701032449,47,128,3201018,104,Twitter for iPhone,Original tweet,https://t.co/y671yMhoiR
2352,666029285002620928,48,132,3201018,104,Twitter for iPhone,Original tweet,https://t.co/r7mOb2m0UI


* #### `Programmatic assessment`: 
Pandas' functions and/or methods are used to assess the data.

In [12]:
twitter_archive.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

In [13]:
image_prediction.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   int64  
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [14]:
tweet_json.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2354 entries, 0 to 2353
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   tweet_id          2354 non-null   int64 
 1   retweet_count     2354 non-null   int64 
 2   favorite_count    2354 non-null   int64 
 3   followers_count   2354 non-null   int64 
 4   friends_count     2354 non-null   int64 
 5   source            2354 non-null   object
 6   retweeted_status  2354 non-null   object
 7   url               2354 non-null   object
dtypes: int64(5), object(3)
memory usage: 147.2+ KB


#### Twitter Archive Assessment

In [15]:
twitter_archive.rating_numerator.value_counts()

12      558
11      464
10      461
13      351
9       158
8       102
7        55
14       54
5        37
6        32
3        19
4        17
1         9
2         9
420       2
0         2
15        2
75        2
80        1
20        1
24        1
26        1
44        1
50        1
60        1
165       1
84        1
88        1
144       1
182       1
143       1
666       1
960       1
1776      1
17        1
27        1
45        1
99        1
121       1
204       1
Name: rating_numerator, dtype: int64

### numerator rating above 100 and 

In [16]:
print(twitter_archive.loc[twitter_archive.rating_numerator == 420, 'text'])
print(twitter_archive.loc[twitter_archive.rating_numerator == 165, 'text'])
print(twitter_archive.loc[twitter_archive.rating_numerator == 144, 'text'])
print(twitter_archive.loc[twitter_archive.rating_numerator == 182, 'text'])
print(twitter_archive.loc[twitter_archive.rating_numerator == 143, 'text']) 
print(twitter_archive.loc[twitter_archive.rating_numerator == 666, 'text']) 
print(twitter_archive.loc[twitter_archive.rating_numerator == 960, 'text']) 
print(twitter_archive.loc[twitter_archive.rating_numerator == 1776, 'text'])
print(twitter_archive.loc[twitter_archive.rating_numerator == 121, 'text'])
print(twitter_archive.loc[twitter_archive.rating_numerator == 204, 'text'])
print(twitter_archive.loc[twitter_archive.rating_numerator == 0, 'text'])

188     @dhmontgomery We also gave snoop dogg a 420/10 but I think that predated your research
2074       After so many requests... here you go.\n\nGood dogg. 420/10 https://t.co/yfAAo1gdeY
Name: text, dtype: object
902    Why does this never happen at my front door... 165/150 https://t.co/HmwrdfEfUE
Name: text, dtype: object
1779    IT'S PUPPERGEDDON. Total of 144/120 ...I think https://t.co/ZanVtAtvIq
Name: text, dtype: object
290    @markhoppus 182/10
Name: text, dtype: object
1634    Two sneaky puppers were not initially seen, moving the rating to 143/130. Please forgive us. Thank you https://t.co/kRK51Y5ac3
Name: text, dtype: object
189    @s8n You tried very hard to portray this good boy as not so good, but you have ultimately failed. His goodness shines through. 666/10
Name: text, dtype: object
313    @jonnysun @Lin_Manuel ok jomny I know you're excited but 960/00 isn't a valid rating, 13/10 is tho
Name: text, dtype: object
979    This is Atticus. He's quite simply America af. 1

In [17]:
# Print the whole text to verify numerators
# no picture, this will be ignored when cleaning data
print(twitter_archive['text'][188])
print(twitter_archive['text'][189])
print(twitter_archive['text'][290])

# just a tweet to explain actual ratings, this will be ignored when cleaning
print(twitter_archive['text'][313])
print('*' * 25)

print(twitter_archive['text'][902])
print(twitter_archive['text'][1779])

print(twitter_archive['text'][1634])


print(twitter_archive['text'][979])
print(twitter_archive['text'][1635])
print(twitter_archive['text'][1120])
print(twitter_archive['text'][315])
print(twitter_archive['text'][1016])

@dhmontgomery We also gave snoop dogg a 420/10 but I think that predated your research
@s8n You tried very hard to portray this good boy as not so good, but you have ultimately failed. His goodness shines through. 666/10
@markhoppus 182/10
@jonnysun @Lin_Manuel ok jomny I know you're excited but 960/00 isn't a valid rating, 13/10 is tho
*************************
Why does this never happen at my front door... 165/150 https://t.co/HmwrdfEfUE
IT'S PUPPERGEDDON. Total of 144/120 ...I think https://t.co/ZanVtAtvIq
Two sneaky puppers were not initially seen, moving the rating to 143/130. Please forgive us. Thank you https://t.co/kRK51Y5ac3
This is Atticus. He's quite simply America af. 1776/10 https://t.co/GRXwMxLBkh
Someone help the girl is being mugged. Several are distracting her while two steal her shoes. Clever puppers 121/110 https://t.co/1zfnTJLt55
Say hello to this unbelievably well behaved squad of doggos. 204/170 would try to pet all at once https://t.co/yGQI3He3xv
When you're so b

In [18]:
twitter_archive.rating_denominator.value_counts()

10     2333
11        3
50        3
80        2
20        2
2         1
16        1
40        1
70        1
15        1
90        1
110       1
120       1
130       1
150       1
170       1
7         1
0         1
Name: rating_denominator, dtype: int64

In [19]:
print(twitter_archive.loc[twitter_archive.rating_denominator == 110, 'text']) 
print(twitter_archive.loc[twitter_archive.rating_denominator == 120, 'text']) 
print(twitter_archive.loc[twitter_archive.rating_denominator == 130, 'text']) 
print(twitter_archive.loc[twitter_archive.rating_denominator == 150, 'text'])
print(twitter_archive.loc[twitter_archive.rating_denominator == 170, 'text'])
print(twitter_archive.loc[twitter_archive.rating_denominator == 0, 'text'])

1635    Someone help the girl is being mugged. Several are distracting her while two steal her shoes. Clever puppers 121/110 https://t.co/1zfnTJLt55
Name: text, dtype: object
1779    IT'S PUPPERGEDDON. Total of 144/120 ...I think https://t.co/ZanVtAtvIq
Name: text, dtype: object
1634    Two sneaky puppers were not initially seen, moving the rating to 143/130. Please forgive us. Thank you https://t.co/kRK51Y5ac3
Name: text, dtype: object
902    Why does this never happen at my front door... 165/150 https://t.co/HmwrdfEfUE
Name: text, dtype: object
1120    Say hello to this unbelievably well behaved squad of doggos. 204/170 would try to pet all at once https://t.co/yGQI3He3xv
Name: text, dtype: object
313    @jonnysun @Lin_Manuel ok jomny I know you're excited but 960/00 isn't a valid rating, 13/10 is tho
Name: text, dtype: object


In [20]:
# Print the whole text to verify denominators

#retweet - it will be deleted when delete all retweets
print(twitter_archive['text'][1635]) 
#actual rating 14/10 need to change manually
print(twitter_archive['text'][1779]) 
#actual rating 10/10 need to change manually
print(twitter_archive['text'][1634]) 
#actual rating 9/10 need to change manually
print(twitter_archive['text'][902]) 
#tweet to explain rating
print(twitter_archive['text'][1120]) 
# this tweet of 0 denominator will be neglected
print(twitter_archive['text'][313]) 

Someone help the girl is being mugged. Several are distracting her while two steal her shoes. Clever puppers 121/110 https://t.co/1zfnTJLt55
IT'S PUPPERGEDDON. Total of 144/120 ...I think https://t.co/ZanVtAtvIq
Two sneaky puppers were not initially seen, moving the rating to 143/130. Please forgive us. Thank you https://t.co/kRK51Y5ac3
Why does this never happen at my front door... 165/150 https://t.co/HmwrdfEfUE
Say hello to this unbelievably well behaved squad of doggos. 204/170 would try to pet all at once https://t.co/yGQI3He3xv
@jonnysun @Lin_Manuel ok jomny I know you're excited but 960/00 isn't a valid rating, 13/10 is tho


The tweet with zero denominator will be corrected to be 13/10

In [21]:
twitter_archive.name.value_counts()

None       745
a           55
Charlie     12
Oliver      11
Lucy        11
          ... 
Ambrose      1
Buckley      1
Anna         1
Gunner       1
Dex          1
Name: name, Length: 957, dtype: int64

In [22]:
# Check for duplicate
twitter_archive[twitter_archive.tweet_id.duplicated()]

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo


#### Image prediction Assessment

In [23]:
image_prediction

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.072010,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True
...,...,...,...,...,...,...,...,...,...,...,...,...
2070,891327558926688256,https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg,2,basset,0.555712,True,English_springer,0.225770,True,German_short-haired_pointer,0.175219,True
2071,891689557279858688,https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg,1,paper_towel,0.170278,False,Labrador_retriever,0.168086,True,spatula,0.040836,False
2072,891815181378084864,https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg,1,Chihuahua,0.716012,True,malamute,0.078253,True,kelpie,0.031379,True
2073,892177421306343426,https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,1,Chihuahua,0.323581,True,Pekinese,0.090647,True,papillon,0.068957,True


In [24]:
image_prediction.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   int64  
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [25]:
# Check for duplicate
image_prediction[image_prediction.tweet_id.duplicated()]

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog


#### Tweet json Assessment

In [26]:
tweet_json

Unnamed: 0,tweet_id,retweet_count,favorite_count,followers_count,friends_count,source,retweeted_status,url
0,892420643555336193,8853,39467,3200889,104,Twitter for iPhone,Original tweet,https://t.co/MgUWQ76dJU
1,892177421306343426,6514,33819,3200889,104,Twitter for iPhone,Original tweet,https://t.co/0Xxu71qeIV
2,891815181378084864,4328,25461,3200889,104,Twitter for iPhone,Original tweet,https://t.co/wUnZnhtVJB
3,891689557279858688,8964,42908,3200889,104,Twitter for iPhone,Original tweet,https://t.co/tD36da7qLQ
4,891327558926688256,9774,41048,3200889,104,Twitter for iPhone,Original tweet,https://t.co/AtUZn91f7f
...,...,...,...,...,...,...,...,...
2349,666049248165822465,41,111,3201018,104,Twitter for iPhone,Original tweet,https://t.co/4B7cOc1EDq
2350,666044226329800704,147,311,3201018,104,Twitter for iPhone,Original tweet,https://t.co/DWnyCjf2mx
2351,666033412701032449,47,128,3201018,104,Twitter for iPhone,Original tweet,https://t.co/y671yMhoiR
2352,666029285002620928,48,132,3201018,104,Twitter for iPhone,Original tweet,https://t.co/r7mOb2m0UI


In [27]:
tweet_json.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2354 entries, 0 to 2353
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   tweet_id          2354 non-null   int64 
 1   retweet_count     2354 non-null   int64 
 2   favorite_count    2354 non-null   int64 
 3   followers_count   2354 non-null   int64 
 4   friends_count     2354 non-null   int64 
 5   source            2354 non-null   object
 6   retweeted_status  2354 non-null   object
 7   url               2354 non-null   object
dtypes: int64(5), object(3)
memory usage: 147.2+ KB


In [28]:
# Check for duplicate
tweet_json[tweet_json.tweet_id.duplicated()]

Unnamed: 0,tweet_id,retweet_count,favorite_count,followers_count,friends_count,source,retweeted_status,url


### Quality issues

1. Timestamp is not in correct datetime format

2. Drop unnecessary columns (in_reply_to_status_id, in_reply_to_user_id, retweeted_status_id, retweeted_status_user_id, retweeted_status_timestamp, expanded_urls)

3. Erroneous datatype tweet_id for tweet (combined twitter_archive, tweet_json)

4. Source column should be categorical datatype

5. Wrong name like `a` and `None` (naming issues)

6. Erroneous datatype tweet_id for image_prediction

7. p1_conf, p2_conf and p3_conf is decimal in image prediction table

8. Some name in the p1, p2, p3 columns are capitalize

### Tidiness issues
1. Merge the `twitter archive`, and `tweet json` dataframe

2. Two columns in `twitter archive` table (rating_numerator and rating_denominator) combine to one (ratings)

3. Two variable in the timestamp column (date and time).

4. Create new dog type column with doggo,floofer,pupper,puppo as its values

## Cleaning Data


In [29]:
# Make copies of original pieces of data
twitter_archive_clean = twitter_archive.copy()
image_prediction_clean = image_prediction.copy()
tweet_json_clean = tweet_json.copy()

### Tidiness

### Issue #1: 
Merge the `twitter archive`, and `tweet json` dataframe

#### Define:
Merge the twitter archive and tweet json to a single dataframe

#### Code

In [30]:
tweet_clean = pd.merge(twitter_archive_clean, tweet_json_clean, on='tweet_id', how='inner')

#### Test

In [31]:
tweet_clean.sample(3)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source_x,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,...,floofer,pupper,puppo,retweet_count,favorite_count,followers_count,friends_count,source_y,retweeted_status,url
43,883838122936631299,,,2017-07-09 00:00:04 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Noah. He can't believe someone made this mess. Got the vacuum out for you though. Offered to help clean pup. 12/10 super good boy https://t.co/V85xujjDDY,,,,https://twitter.com/dog_rates/status/883838122936631299/photo/1,...,,,,3586,22349,3200890,104,Twitter for iPhone,Original tweet,https://t.co/V85xujjDDY
526,808733504066486276,,,2016-12-13 18:01:07 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",Here's a pupper in a onesie. Quite pupset about it. Currently plotting revenge. 12/10 would rescue https://t.co/xQfrbNK3HD,,,,https://twitter.com/dog_rates/status/808733504066486276/photo/1,...,,pupper,,2509,8784,3200896,104,Twitter for iPhone,Original tweet,https://t.co/xQfrbNK3HD
1047,743222593470234624,,,2016-06-15 23:24:09 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is a very rare Great Alaskan Bush Pupper. Hard to stumble upon without spooking. 12/10 would pet passionately https://t.co/xOBKCdpzaa,,,,https://twitter.com/dog_rates/status/743222593470234624/photo/1,...,,pupper,,2164,6792,3200943,104,Twitter for iPhone,Original tweet,https://t.co/xOBKCdpzaa


In [32]:
tweet_clean.columns

Index(['tweet_id', 'in_reply_to_status_id', 'in_reply_to_user_id', 'timestamp',
       'source_x', 'text', 'retweeted_status_id', 'retweeted_status_user_id',
       'retweeted_status_timestamp', 'expanded_urls', 'rating_numerator',
       'rating_denominator', 'name', 'doggo', 'floofer', 'pupper', 'puppo',
       'retweet_count', 'favorite_count', 'followers_count', 'friends_count',
       'source_y', 'retweeted_status', 'url'],
      dtype='object')

### Issue #2: 
Two columns in `twitter archive` table (rating_numerator and rating_denominator) combine to one (ratings)

#### Define:
create a new rating column from rating_numerator and rating_denominator, and drop the rating_numerator and rating_denominator

#### Code

In [33]:
tweet_clean['rating'] =  (tweet_clean.rating_numerator / tweet_clean.rating_denominator)

In [34]:
tweet_clean = tweet_clean.drop(['rating_numerator', 'rating_denominator'], axis=1)

#### Test

In [35]:
tweet_clean.sample(3)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source_x,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,...,pupper,puppo,retweet_count,favorite_count,followers_count,friends_count,source_y,retweeted_status,url,rating
108,871166179821445120,,,2017-06-04 00:46:17 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",RT @dog_rates: This is Dawn. She's just checking pup on you. Making sure you're doing okay. 12/10 she's here if you need her https://t.co/X…,8.41077e+17,4196984000.0,2017-03-13 00:02:39 +0000,https://twitter.com/dog_rates/status/841077006473256960/photo/1,...,,,5991,0,3200891,104,Twitter for iPhone,Original tweet,https://t.co/X…,1.2
1104,734787690684657664,,,2016-05-23 16:46:51 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This dog is more successful than I will ever be. 13/10 absolute legend https://t.co/BPoaHySYwA,,,,"https://twitter.com/dog_rates/status/734787690684657664/photo/1,https://twitter.com/dog_rates/status/734787690684657664/photo/1,https://twitter.com/dog_rates/status/734787690684657664/photo/1,https://twitter.com/dog_rates/status/734787690684657664/photo/1",...,,,7102,13745,3200944,104,Twitter for iPhone,Original tweet,https://t.co/BPoaHySYwA,1.3
1643,683849932751646720,,,2016-01-04 03:18:23 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Jiminy. He's not the brightest dog. Needs to lay off the kibble. 5/10 still petable https://t.co/omln4LOy1x,,,,https://twitter.com/dog_rates/status/683849932751646720/photo/1,...,,,1116,2896,3200860,104,Twitter for iPhone,Original tweet,https://t.co/omln4LOy1x,0.5


### Issue #3: 
Two variable in the timestamp column (date and time).


#### Define:
Extract the date from the timestamp column, and drop the timestamp column.

#### Code

In [36]:
tweet_clean['date'] = pd.to_datetime(tweet_clean['timestamp']).dt.date

In [37]:
tweet_clean = tweet_clean.drop('timestamp', axis=1)

#### Test

In [38]:
tweet_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2354 entries, 0 to 2353
Data columns (total 23 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2354 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   source_x                    2354 non-null   object 
 4   text                        2354 non-null   object 
 5   retweeted_status_id         179 non-null    float64
 6   retweeted_status_user_id    179 non-null    float64
 7   retweeted_status_timestamp  179 non-null    object 
 8   expanded_urls               2295 non-null   object 
 9   name                        2354 non-null   object 
 10  doggo                       2354 non-null   object 
 11  floofer                     2354 non-null   object 
 12  pupper                      2354 non-null   object 
 13  puppo                       2354 

### Issue #4:
Combine four columns (doggo,floofer,pupper,puppo) into one (dog type) `twitter archive`

#### Define:
Melt the doggo, floofer, pupper, and puppo columns to a dog type column

#### Code

In [39]:
# Create dog type column.
tweet_clean['dog_type'] = tweet_clean.text.str.extract('(doggo|floofer|pupper|puppo)')

#### Test

In [40]:
tweet_clean.sample(9)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,source_x,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,name,...,retweet_count,favorite_count,followers_count,friends_count,source_y,retweeted_status,url,rating,date,dog_type
1200,716439118184652801,,,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Bluebert. He just saw that both #FinalFur match ups are split 50/50. Amazed af. 11/10 https://t.co/Kky1DPG4iq,,,,https://twitter.com/dog_rates/status/716439118184652801/photo/1,Bluebert,...,247,2574,3200945,104,Twitter for iPhone,Original tweet,https://t.co/Kky1DPG4iq,1.0,2016-04-03,
1642,683852578183077888,,,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","Say hello to Tiger. He's a penbroke (little dog pun for ya, no need to applaud I know it was good) 10/10 good dog https://t.co/Yei0HzS3JN",,,,https://twitter.com/dog_rates/status/683852578183077888/photo/1,Tiger,...,397,2111,3200952,104,Twitter for iPhone,Original tweet,https://t.co/Yei0HzS3JN,1.0,2016-01-04,
406,823699002998870016,,,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Winston. The goggles make him a superhero. Protects the entire city from criminals unless they rub his belly really well. 12/10 https://t.co/yCydYURYEL,,,,https://twitter.com/dog_rates/status/823699002998870016/photo/1,Winston,...,2772,13826,3200894,104,Twitter for iPhone,Original tweet,https://t.co/yCydYURYEL,1.2,2017-01-24,
155,861383897657036800,,,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Hobbes. He's never seen bubbles before. 13/10 deep breaths buddy https://t.co/QFRlbZw4Z1,,,,https://twitter.com/dog_rates/status/861383897657036800/photo/1,Hobbes,...,11528,37744,3200891,104,Twitter for iPhone,Original tweet,https://t.co/QFRlbZw4Z1,1.3,2017-05-08,
443,819238181065359361,,,"<a href=""http://twitter.com"" rel=""nofollow"">Twitter Web Client</a>",Some happy pupper news to share. 10/10 for everyone involved \nhttps://t.co/MefMAZX2uv,,,,http://us.blastingnews.com/news/2017/01/200-dogs-saved-from-south-korean-dog-meat-industry-001385441.html?sbdht=_pM1QUzk3wsfscF9XF2WEd9KoWDpsQlMUjfh1HxxUq0u5mMbiu2B0kw2_,,...,462,2550,3200894,104,Twitter Web Client,Original tweet,https://t.co/MefMAZX2uv,1.0,2017-01-11,pupper
2259,667549055577362432,,,"<a href=""http://twitter.com"" rel=""nofollow"">Twitter Web Client</a>",Never seen dog like this. Breathes heavy. Tilts head in a pattern. No bark. Shitty at fetch. Not even cordless. 1/10 https://t.co/i9iSGNn3fx,,,,https://twitter.com/dog_rates/status/667549055577362432/photo/1,,...,2454,6138,3201016,104,Twitter Web Client,Original tweet,https://t.co/i9iSGNn3fx,0.1,2015-11-20,
1886,674781762103414784,,,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",Bedazzled pup here. Fashionable af. Super yellow. Looks hella fluffy. Webbed paws for efficient fetching. 8/10 https://t.co/ot8yMUGodj,,,,https://twitter.com/dog_rates/status/674781762103414784/photo/1,,...,1335,2169,3201004,104,Twitter for iPhone,Original tweet,https://t.co/ot8yMUGodj,0.8,2015-12-10,
1708,680583894916304897,,,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Penny. Her tennis ball slowly rolled down her cone and into the pool. 8/10 bad things happen to good puppers https://t.co/YNWU7LeFgg,,,,"https://twitter.com/dog_rates/status/680583894916304897/photo/1,https://twitter.com/dog_rates/status/680583894916304897/photo/1,https://twitter.com/dog_rates/status/680583894916304897/photo/1,https://twitter.com/dog_rates/status/680583894916304897/photo/1",Penny,...,1514,3939,3200952,104,Twitter for iPhone,Original tweet,https://t.co/YNWU7LeFgg,0.8,2015-12-26,pupper
248,845306882940190720,,,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Pickles. She's a silly pupper. Thinks she's a dish. 12/10 would dry https://t.co/7mPCF4ZwEk,,,,https://twitter.com/dog_rates/status/845306882940190720/photo/1,Pickles,...,6039,25225,3200891,104,Twitter for iPhone,Original tweet,https://t.co/7mPCF4ZwEk,1.2,2017-03-24,pupper


In [41]:
tweet_clean.dog_type.value_counts()

pupper     264
doggo       93
puppo       37
floofer      4
Name: dog_type, dtype: int64

In [42]:
tweet_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2354 entries, 0 to 2353
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2354 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   source_x                    2354 non-null   object 
 4   text                        2354 non-null   object 
 5   retweeted_status_id         179 non-null    float64
 6   retweeted_status_user_id    179 non-null    float64
 7   retweeted_status_timestamp  179 non-null    object 
 8   expanded_urls               2295 non-null   object 
 9   name                        2354 non-null   object 
 10  doggo                       2354 non-null   object 
 11  floofer                     2354 non-null   object 
 12  pupper                      2354 non-null   object 
 13  puppo                       2354 

### Quality

### Issue #1:
Timestamp(date) is not in correct datetime format

#### Define:
Convert the extracted date columns to datetime format.

#### Code

In [43]:
tweet_clean.date = pd.to_datetime(tweet_clean.date)

#### Test

In [44]:
tweet_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2354 entries, 0 to 2353
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   tweet_id                    2354 non-null   int64         
 1   in_reply_to_status_id       78 non-null     float64       
 2   in_reply_to_user_id         78 non-null     float64       
 3   source_x                    2354 non-null   object        
 4   text                        2354 non-null   object        
 5   retweeted_status_id         179 non-null    float64       
 6   retweeted_status_user_id    179 non-null    float64       
 7   retweeted_status_timestamp  179 non-null    object        
 8   expanded_urls               2295 non-null   object        
 9   name                        2354 non-null   object        
 10  doggo                       2354 non-null   object        
 11  floofer                     2354 non-null   object      

### Issue #2:
Drop unnecessary columns (in_reply_to_status_id, in_reply_to_user_id, retweeted_status_id, retweeted_status_user_id, retweeted_status_timestamp, expanded_urls, source_x)

#### Define:
Drop the in_reply_to_status_id, in_reply_to_user_id, retweeted_status_id, retweeted_status_user_id, retweeted_status_timestamp, expanded_urls, source_x columns and rename the other source_y to source.

#### Code

In [45]:
tweet_clean = tweet_clean.drop(['in_reply_to_status_id', 'in_reply_to_user_id', 'retweeted_status_id',
                                'retweeted_status_user_id', 'retweeted_status_timestamp', 'expanded_urls', 
                                'source_x'], axis=1)

In [46]:
# Rename the source_y to column.
tweet_clean = tweet_clean.rename(columns={'source_y': 'source'})

#### Test

In [47]:
tweet_clean.columns

Index(['tweet_id', 'text', 'name', 'doggo', 'floofer', 'pupper', 'puppo',
       'retweet_count', 'favorite_count', 'followers_count', 'friends_count',
       'source', 'retweeted_status', 'url', 'rating', 'date', 'dog_type'],
      dtype='object')

### Issue #3:
Erroneous datatype tweet_id for tweet (combined twitter_archive, tweet_json)

#### Define:
Change tweet_id to object for the tweet table(combined twitter archive and tweet json)

#### Code

In [48]:
# Change tweet table tweet id to object
tweet_clean.tweet_id = tweet_clean.tweet_id.astype(object)

#### Test

In [49]:
tweet_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2354 entries, 0 to 2353
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   tweet_id          2354 non-null   object        
 1   text              2354 non-null   object        
 2   name              2354 non-null   object        
 3   doggo             2354 non-null   object        
 4   floofer           2354 non-null   object        
 5   pupper            2354 non-null   object        
 6   puppo             2354 non-null   object        
 7   retweet_count     2354 non-null   int64         
 8   favorite_count    2354 non-null   int64         
 9   followers_count   2354 non-null   int64         
 10  friends_count     2354 non-null   int64         
 11  source            2354 non-null   object        
 12  retweeted_status  2354 non-null   object        
 13  url               2354 non-null   object        
 14  rating            2354 n

### Issue #4:
Source column should be categorical datatype

#### Define:
Convert source column to categorical datatype.

#### Code

In [50]:
tweet_clean.source = tweet_clean.source.astype('category')

#### Test

In [51]:
tweet_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2354 entries, 0 to 2353
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   tweet_id          2354 non-null   object        
 1   text              2354 non-null   object        
 2   name              2354 non-null   object        
 3   doggo             2354 non-null   object        
 4   floofer           2354 non-null   object        
 5   pupper            2354 non-null   object        
 6   puppo             2354 non-null   object        
 7   retweet_count     2354 non-null   int64         
 8   favorite_count    2354 non-null   int64         
 9   followers_count   2354 non-null   int64         
 10  friends_count     2354 non-null   int64         
 11  source            2354 non-null   category      
 12  retweeted_status  2354 non-null   object        
 13  url               2354 non-null   object        
 14  rating            2354 n

### Issue #5:
wrong name like `a` and `None` (naming issues)

#### Define:
Drop names like `a` and `none`.

#### Code

In [52]:
# Drop rows that have `a` in the name column
tweet_clean.name = tweet_clean.name[tweet_clean.name != 'a']

In [53]:
# Drop rows that have `None` in the name column
tweet_clean.name = tweet_clean.name[tweet_clean.name != 'None']

#### Test

In [54]:
tweet_clean.name.value_counts()

Charlie     12
Cooper      11
Oliver      11
Lucy        11
Lola        10
            ..
Dietrich     1
Milky        1
Ole          1
Buckley      1
Dex          1
Name: name, Length: 955, dtype: int64

### Issue #6:
Erroneous datatype tweet_id for image_prediction

#### Define:
Change tweet_id to object for image prediction

#### Code

In [55]:
# Change image prediction table tweet id to object
image_prediction_clean.tweet_id = image_prediction_clean.tweet_id.astype(object)

#### Test

In [56]:
image_prediction_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   object 
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(1), object(5)
memory usage: 152.1+ KB


### Issue #7:
p1_conf, p2_conf and p3_conf is decimal in image prediction table

#### Define:
Change the p1_conf, p2_conf and p3_conf columns to percentage 

#### Code

In [57]:
# Using applu, multiplying 100 to each rows in the columns.
image_prediction_clean.p1_conf = image_prediction_clean.p1_conf.apply(lambda x: round(x * 100, 2))
image_prediction_clean.p2_conf = image_prediction_clean.p2_conf.apply(lambda x: round(x * 100, 2))
image_prediction_clean.p3_conf = image_prediction_clean.p3_conf.apply(lambda x: round(x * 100, 2))

#### Test

In [58]:
image_prediction_clean.sample(3)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
788,690400367696297985,https://pbs.twimg.com/media/CZTLeBuWIAAFkeR.jpg,1,Pembroke,42.65,True,papillon,31.74,True,Shetland_sheepdog,7.76,True
1066,715696743237730304,https://pbs.twimg.com/media/Ce6qZC2WAAAcSoI.jpg,1,Staffordshire_bullterrier,42.78,True,pug,22.14,True,French_bulldog,13.21,True
1943,861383897657036800,https://pbs.twimg.com/media/C_RAFTxUAAAbXjV.jpg,1,Cardigan,77.1,True,Pembroke,13.72,True,French_bulldog,6.33,True


### Issue #8:
Some name in the p1, p2, p3 columns are capitalize

#### Define:
Change all name first letter in the p1, p2, p3 columns to capital

#### Code

In [59]:
# Capitalize using the str.title method.
image_prediction_clean.p1 = image_prediction_clean.p1.str.title()
image_prediction_clean.p2 = image_prediction_clean.p2.str.title()
image_prediction_clean.p3 = image_prediction_clean.p3.str.title()

#### Test

In [60]:
image_prediction.sample(3)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
1274,750041628174217216,https://pbs.twimg.com/media/CmfssOtXYAAKa_Z.jpg,1,Labrador_retriever,0.252031,True,Maltese_dog,0.18809,True,golden_retriever,0.133017,True
525,676606785097199616,https://pbs.twimg.com/media/CWPKSGpWcAQN6mw.jpg,1,Loafer,0.202999,False,doormat,0.200411,False,malinois,0.1423,True
978,707059547140169728,https://pbs.twimg.com/media/Cc_64zVWEAAeXs7.jpg,1,Samoyed,0.897312,True,Great_Pyrenees,0.03918,True,kuvasz,0.019516,True


## Storing Data
Save gathered, assessed, and cleaned master dataset to a CSV file named "twitter_archive_master.csv".

In [61]:
tweet_clean.to_csv('data/twitter_archive_master.csv', index=False)
image_prediction_clean.to_csv('data/image_prediction_master.csv', index=False)

## Analyzing and Visualizing Data
In this section, analyze and visualize your wrangled data. You must produce at least **three (3) insights and one (1) visualization.**

In [62]:
# read the cleaned twitter archive files for analysis.
twitter = pd.read_csv('data/twitter_archive_master.csv')
image_prediction = pd.read_csv('data/image_prediction_master.csv')

In [63]:
twitter.head()

Unnamed: 0,tweet_id,text,name,doggo,floofer,pupper,puppo,retweet_count,favorite_count,followers_count,friends_count,source,retweeted_status,url,rating,date,dog_type
0,892420643555336193,This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU,Phineas,,,,,8853,39467,3200889,104,Twitter for iPhone,Original tweet,https://t.co/MgUWQ76dJU,1.3,2017-08-01,
1,892177421306343426,"This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boops, the whole bit. 13/10 https://t.co/0Xxu71qeIV",Tilly,,,,,6514,33819,3200889,104,Twitter for iPhone,Original tweet,https://t.co/0Xxu71qeIV,1.3,2017-08-01,
2,891815181378084864,This is Archie. He is a rare Norwegian Pouncing Corgo. Lives in the tall grass. You never know when one may strike. 12/10 https://t.co/wUnZnhtVJB,Archie,,,,,4328,25461,3200889,104,Twitter for iPhone,Original tweet,https://t.co/wUnZnhtVJB,1.2,2017-07-31,
3,891689557279858688,This is Darla. She commenced a snooze mid meal. 13/10 happens to the best of us https://t.co/tD36da7qLQ,Darla,,,,,8964,42908,3200889,104,Twitter for iPhone,Original tweet,https://t.co/tD36da7qLQ,1.3,2017-07-30,
4,891327558926688256,"This is Franklin. He would like you to stop calling him ""cute."" He is a very fierce shark and should be respected as such. 12/10 #BarkWeek https://t.co/AtUZn91f7f",Franklin,,,,,9774,41048,3200889,104,Twitter for iPhone,Original tweet,https://t.co/AtUZn91f7f,1.2,2017-07-29,


In [64]:
image_prediction.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_Springer_Spaniel,46.51,True,Collie,15.67,True,Shetland_Sheepdog,6.14,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,Redbone,50.68,True,Miniature_Pinscher,7.42,True,Rhodesian_Ridgeback,7.2,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_Shepherd,59.65,True,Malinois,13.86,True,Bloodhound,11.62,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_Ridgeback,40.81,True,Redbone,36.07,True,Miniature_Pinscher,22.28,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,Miniature_Pinscher,56.03,True,Rottweiler,24.37,True,Doberman,15.46,True


### Insights:
1.

2.

3.

### Visualization

In [73]:
twitter.dog_type.value_counts()

pupper     264
doggo       93
puppo       37
floofer      4
Name: dog_type, dtype: int64

In [None]:
def create_bar_plot(df, feature):
    fig, ax  = plt.subplots(figsize=(6,10))
    sns.set_theme(style='whitegrid')
    sns.barplot(x=value_cnt_norm_cal(df, feature).index, y=value_cnt_norm_cal(df,feature).values[:,0])
    plt.xlabel('{}'.format(feature))
    plt.ylabel('Count')
    plt.title('{} count'.format(feature), fontsize=20)
    return plt.show()

In [None]:
# creating the bar plot
dog_type = twitter.value_counts().index()
count = twitter.value_counts().values()

plt.bar(dog_type, count, color ='maroon',
        width = 0.4)
 
plt.xlabel("Dog types")
plt.ylabel("Counts of dog type")
plt.title("Dog type")
plt.show()