# Project: Wrangling and Analyze Data

In [1]:
import pandas as pd
import numpy as np
import requests
import json
import os

pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_columns', 50)

## Data Gathering


#### Data source #1

In [2]:
tweet_archive = pd.read_csv("resources/twitter-archive-enhanced.csv")


#### Data source #2

In [3]:
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'

response=requests.get(url)
folder_name = 'resources'

with open(os.path.join(folder_name, url.split('/')[-1]), mode='wb') as file:
    file.write(response.content)

In [4]:
os.listdir(folder_name)

['image-predictions.tsv', 'tweet-json.txt', 'twitter-archive-enhanced.csv']

In [5]:
image_predictions = pd.read_csv(os.path.join(folder_name, 'image-predictions.tsv'), sep='\t')
# image_predictions.to_csv('image_predictions.csv', index=False)

#### Data source #3

In [6]:
tweet_list = []
with open(os.path.join(folder_name, 'tweet-json.txt'), mode='r') as file:
    for line in file:
        data=json.loads(line)
        tweet_list.append(data)
additional_data=pd.DataFrame(tweet_list, columns=['id', 'retweet_count', 'favorite_count', 'quoted_status_id'])
additional_data.sample()
# pd.DataFrame(tweet_list).to_csv('additional_data.csv', index=False)

Unnamed: 0,id,retweet_count,favorite_count,quoted_status_id
1860,675432746517426176,627,1623,


## Assessing Data


In [7]:
print(tweet_archive.shape)
print(tweet_archive.info())
tweet_archive.head(10)

(2356, 17)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                 

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU,,,,https://twitter.com/dog_rates/status/892420643555336193/photo/1,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boops, the whole bit. 13/10 https://t.co/0Xxu71qeIV",,,,https://twitter.com/dog_rates/status/892177421306343426/photo/1,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Archie. He is a rare Norwegian Pouncing Corgo. Lives in the tall grass. You never know when one may strike. 12/10 https://t.co/wUnZnhtVJB,,,,https://twitter.com/dog_rates/status/891815181378084864/photo/1,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Darla. She commenced a snooze mid meal. 13/10 happens to the best of us https://t.co/tD36da7qLQ,,,,https://twitter.com/dog_rates/status/891689557279858688/photo/1,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","This is Franklin. He would like you to stop calling him ""cute."" He is a very fierce shark and should be respected as such. 12/10 #BarkWeek https://t.co/AtUZn91f7f",,,,"https://twitter.com/dog_rates/status/891327558926688256/photo/1,https://twitter.com/dog_rates/status/891327558926688256/photo/1",12,10,Franklin,,,,
5,891087950875897856,,,2017-07-29 00:08:17 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",Here we have a majestic great white breaching off South Africa's coast. Absolutely h*ckin breathtaking. 13/10 (IG: tucker_marlo) #BarkWeek https://t.co/kQ04fDDRmh,,,,https://twitter.com/dog_rates/status/891087950875897856/photo/1,13,10,,,,,
6,890971913173991426,,,2017-07-28 16:27:12 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",Meet Jax. He enjoys ice cream so much he gets nervous around it. 13/10 help Jax enjoy more things by clicking below\n\nhttps://t.co/Zr4hWfAs1H https://t.co/tVJBRMnhxl,,,,"https://gofundme.com/ydvmve-surgery-for-jax,https://twitter.com/dog_rates/status/890971913173991426/photo/1",13,10,Jax,,,,
7,890729181411237888,,,2017-07-28 00:22:40 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",When you watch your owner call another dog a good boy but then they turn back to you and say you're a great boy. 13/10 https://t.co/v0nONBcwxq,,,,"https://twitter.com/dog_rates/status/890729181411237888/photo/1,https://twitter.com/dog_rates/status/890729181411237888/photo/1",13,10,,,,,
8,890609185150312448,,,2017-07-27 16:25:51 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Zoey. She doesn't want to be one of the scary sharks. Just wants to be a snuggly pettable boatpet. 13/10 #BarkWeek https://t.co/9TwLuAGH0b,,,,https://twitter.com/dog_rates/status/890609185150312448/photo/1,13,10,Zoey,,,,
9,890240255349198849,,,2017-07-26 15:59:51 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Cassie. She is a college pup. Studying international doggo communication and stick theory. 14/10 so elegant much sophisticate https://t.co/t1bfwz5S2A,,,,https://twitter.com/dog_rates/status/890240255349198849/photo/1,14,10,Cassie,doggo,,,


In [8]:
print(image_predictions.shape)
print(image_predictions.info())
image_predictions.head(10)

(2075, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   int64  
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB
None


Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True
5,666050758794694657,https://pbs.twimg.com/media/CT5Jof1WUAEuVxN.jpg,1,Bernese_mountain_dog,0.651137,True,English_springer,0.263788,True,Greater_Swiss_Mountain_dog,0.016199,True
6,666051853826850816,https://pbs.twimg.com/media/CT5KoJ1WoAAJash.jpg,1,box_turtle,0.933012,False,mud_turtle,0.045885,False,terrapin,0.017885,False
7,666055525042405380,https://pbs.twimg.com/media/CT5N9tpXIAAifs1.jpg,1,chow,0.692517,True,Tibetan_mastiff,0.058279,True,fur_coat,0.054449,False
8,666057090499244032,https://pbs.twimg.com/media/CT5PY90WoAAQGLo.jpg,1,shopping_cart,0.962465,False,shopping_basket,0.014594,False,golden_retriever,0.007959,True
9,666058600524156928,https://pbs.twimg.com/media/CT5Qw94XAAA_2dP.jpg,1,miniature_poodle,0.201493,True,komondor,0.192305,True,soft-coated_wheaten_terrier,0.082086,True


In [9]:
print(additional_data.shape)
print(additional_data.info())
additional_data.head(10)

(2354, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2354 entries, 0 to 2353
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                2354 non-null   int64  
 1   retweet_count     2354 non-null   int64  
 2   favorite_count    2354 non-null   int64  
 3   quoted_status_id  29 non-null     float64
dtypes: float64(1), int64(3)
memory usage: 73.7 KB
None


Unnamed: 0,id,retweet_count,favorite_count,quoted_status_id
0,892420643555336193,8853,39467,
1,892177421306343426,6514,33819,
2,891815181378084864,4328,25461,
3,891689557279858688,8964,42908,
4,891327558926688256,9774,41048,
5,891087950875897856,3261,20562,
6,890971913173991426,2158,12041,
7,890729181411237888,16716,56848,
8,890609185150312448,4429,28226,
9,890240255349198849,7711,32467,


In [10]:
tweet_archive.name.value_counts()[:15]

None       745
a           55
Charlie     12
Cooper      11
Lucy        11
Oliver      11
Tucker      10
Penny       10
Lola        10
Winston      9
Bo           9
Sadie        8
the          8
Daisy        7
Buddy        7
Name: name, dtype: int64

In [11]:
tweet_archive.name.unique()

array(['Phineas', 'Tilly', 'Archie', 'Darla', 'Franklin', 'None', 'Jax',
       'Zoey', 'Cassie', 'Koda', 'Bruno', 'Ted', 'Stuart', 'Oliver',
       'Jim', 'Zeke', 'Ralphus', 'Canela', 'Gerald', 'Jeffrey', 'such',
       'Maya', 'Mingus', 'Derek', 'Roscoe', 'Waffles', 'Jimbo', 'Maisey',
       'Lilly', 'Earl', 'Lola', 'Kevin', 'Yogi', 'Noah', 'Bella',
       'Grizzwald', 'Rusty', 'Gus', 'Stanley', 'Alfy', 'Koko', 'Rey',
       'Gary', 'a', 'Elliot', 'Louis', 'Jesse', 'Romeo', 'Bailey',
       'Duddles', 'Jack', 'Emmy', 'Steven', 'Beau', 'Snoopy', 'Shadow',
       'Terrance', 'Aja', 'Penny', 'Dante', 'Nelly', 'Ginger', 'Benedict',
       'Venti', 'Goose', 'Nugget', 'Cash', 'Coco', 'Jed', 'Sebastian',
       'Walter', 'Sierra', 'Monkey', 'Harry', 'Kody', 'Lassie', 'Rover',
       'Napolean', 'Dawn', 'Boomer', 'Cody', 'Rumble', 'Clifford',
       'quite', 'Dewey', 'Scout', 'Gizmo', 'Cooper', 'Harold', 'Shikha',
       'Jamesy', 'Lili', 'Sammy', 'Meatball', 'Paisley', 'Albus',
       'Nept

In [12]:
not_name=[]
for name in tweet_archive.name.unique():
    if name[0].isupper()==False:
        not_name.append(name)

not_name

['such',
 'a',
 'quite',
 'not',
 'one',
 'incredibly',
 'mad',
 'an',
 'very',
 'just',
 'my',
 'his',
 'actually',
 'getting',
 'this',
 'unacceptable',
 'all',
 'old',
 'infuriating',
 'the',
 'by',
 'officially',
 'life',
 'light',
 'space']

In [13]:
upper_name=[]
for name in tweet_archive.name.unique():
    if name.isupper()==True:
        upper_name.append(name)

upper_name

['O', 'JD']

In [14]:
tweet_archive[tweet_archive.name=='O']

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
775,776201521193218049,,,2016-09-14 23:30:38 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is O'Malley. That is how he sleeps. Doesn't care what you think about it. 10/10 comfy af https://t.co/Pq150LeRaC,,,,https://twitter.com/dog_rates/status/776201521193218049/photo/1,10,10,O,,,,


In [15]:
tweet_archive[tweet_archive.name=='JD']

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
2041,671542985629241344,,,2015-12-01 04:14:59 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","This is JD (stands for ""just dog""). He's like Airbud but with trading card games instead of sports. 10/10 much skill https://t.co/zzueJV9jCF",,,,https://twitter.com/dog_rates/status/671542985629241344/photo/1,10,10,JD,,,,


In [16]:
all_columns = pd.Series(list(tweet_archive) + list(image_predictions) + list(additional_data))
all_columns[all_columns.duplicated()]

17    tweet_id
dtype: object

### Tidiness issues
1. All 3 tables are based on the same entity, therefore they should be merged to 1 table.

2. Dog stages spread across 4 different columns (`doggo`, `floofer`, `pupper`, `puppo`) 

3. Actual tweet `text` column contains a URL that should be on another column.

4. Dog breed and prediction (True/False) not in one Column

### Quality issues 
1. Table contains rows not required in analysis (rows indicating retweets, replies and quotes).

2. Some tweets are not dog ratings.

3.  Tweet `source` cloumn contains HTML script.

4. Dog stages representing `null` values as a string `None`.

5. The `rating_denominator` column contains values not 10.

6. The `rating_numerator` column contains values that are not consistent with the usual ratings.

7. Certain dog `name` values misrepresented. Also, `null` values misrepresented a string `None`.

8. Dog named "O" instead of "O'Malley".

9. Table contains columns not required in analysis.

10. Incorrect data types on some columns (id, twitter_id, timestamp, retweet_count, favorite_count).

## Cleaning Data


In [22]:
# Make copies of original pieces of data
tweet_archive_copy = tweet_archive.copy()
image_predictions_copy = image_predictions.copy()
additional_data_copy = additional_data.copy()

### Issue #1: All 3 tables are based on the same entity, therefore they should be merged to 1 table

#### Define: 
Merge the 3 tables (`tweet_archive_copy`, `image_predictions_copy`, `additional_data_copy`) into one table named `twitter_archive_merged`

#### Code

In [23]:
twitter_archive_merged = tweet_archive_copy.merge(image_predictions_copy, how='left', on='tweet_id'
                        ).merge(additional_data_copy, how='left', left_on='tweet_id', right_on='id')

In [24]:
print(len(twitter_archive_merged.columns))
len(all_columns) # 'tweet_id' column is duplicated here

32


33

#### Test

In [25]:
print(twitter_archive_merged.shape)
print(twitter_archive_merged.columns)
twitter_archive_merged.head()

(2356, 32)
Index(['tweet_id', 'in_reply_to_status_id', 'in_reply_to_user_id', 'timestamp',
       'source', 'text', 'retweeted_status_id', 'retweeted_status_user_id',
       'retweeted_status_timestamp', 'expanded_urls', 'rating_numerator',
       'rating_denominator', 'name', 'doggo', 'floofer', 'pupper', 'puppo',
       'jpg_url', 'img_num', 'p1', 'p1_conf', 'p1_dog', 'p2', 'p2_conf',
       'p2_dog', 'p3', 'p3_conf', 'p3_dog', 'id', 'retweet_count',
       'favorite_count', 'quoted_status_id'],
      dtype='object')


Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog,id,retweet_count,favorite_count,quoted_status_id
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU,,,,https://twitter.com/dog_rates/status/892420643555336193/photo/1,13,10,Phineas,,,,,https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg,1.0,orange,0.097049,False,bagel,0.085851,False,banana,0.07611,False,8.924206e+17,8853.0,39467.0,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boops, the whole bit. 13/10 https://t.co/0Xxu71qeIV",,,,https://twitter.com/dog_rates/status/892177421306343426/photo/1,13,10,Tilly,,,,,https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,1.0,Chihuahua,0.323581,True,Pekinese,0.090647,True,papillon,0.068957,True,8.921774e+17,6514.0,33819.0,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Archie. He is a rare Norwegian Pouncing Corgo. Lives in the tall grass. You never know when one may strike. 12/10 https://t.co/wUnZnhtVJB,,,,https://twitter.com/dog_rates/status/891815181378084864/photo/1,12,10,Archie,,,,,https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg,1.0,Chihuahua,0.716012,True,malamute,0.078253,True,kelpie,0.031379,True,8.918152e+17,4328.0,25461.0,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Darla. She commenced a snooze mid meal. 13/10 happens to the best of us https://t.co/tD36da7qLQ,,,,https://twitter.com/dog_rates/status/891689557279858688/photo/1,13,10,Darla,,,,,https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg,1.0,paper_towel,0.170278,False,Labrador_retriever,0.168086,True,spatula,0.040836,False,8.916896e+17,8964.0,42908.0,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","This is Franklin. He would like you to stop calling him ""cute."" He is a very fierce shark and should be respected as such. 12/10 #BarkWeek https://t.co/AtUZn91f7f",,,,"https://twitter.com/dog_rates/status/891327558926688256/photo/1,https://twitter.com/dog_rates/status/891327558926688256/photo/1",12,10,Franklin,,,,,https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg,2.0,basset,0.555712,True,English_springer,0.22577,True,German_short-haired_pointer,0.175219,True,8.913276e+17,9774.0,41048.0,


### Issue #2: Dog stages spread across 4 different columns (`doggo`, `floofer`, `pupper`, `puppo`) 

#### Define
Join all 4 columns to form a cloumn of `dog_stages`

#### Code

In [26]:
dog_stages=twitter_archive_merged[['tweet_id', 'doggo', 'floofer', 'pupper', 'puppo']]
a=dog_stages.replace('None', '')
twitter_archive_merged['dog_stages']=a.doggo +'-'+ a.floofer +'-'+ a.pupper +'-'+ a.puppo
twitter_archive_merged['dog_stages']=twitter_archive_merged['dog_stages'].replace({'---': np.nan
                                        ,'doggo---':'doggo'
                                        ,'--pupper-':'pupper'
                                        ,'---puppo':'puppo'
                                        ,'-floofer--':'floofer'
                                        ,'doggo---puppo':'doggo-puppo'
                                        ,'doggo-floofer--':'doggo-floofer'
                                        ,'doggo--pupper-':'doggo-pupper'})

#### Test

In [27]:
print(twitter_archive_merged.info())
twitter_archive_merged.sample(2)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2356 entries, 0 to 2355
Data columns (total 33 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog,id,retweet_count,favorite_count,quoted_status_id,dog_stages
80,877316821321428993,,,2017-06-21 00:06:44 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","Meet Dante. At first he wasn't a fan of his new raincoat, then he saw his reflection. H*ckin handsome. 13/10 for water resistant good boy https://t.co/SHRTIo5pxc",,,,"https://twitter.com/dog_rates/status/877316821321428993/photo/1,https://twitter.com/dog_rates/status/877316821321428993/photo/1",13,10,Dante,,,,,https://pbs.twimg.com/media/DCza_vtXkAQXGpC.jpg,1.0,Saluki,0.509967,True,Italian_greyhound,0.090497,True,golden_retriever,0.079406,True,8.773168e+17,5414.0,27907.0,,
1541,689623661272240129,,,2016-01-20 01:41:08 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Lucy. She's terrified of the stuffed billed dog. 10/10 stay strong pupper https://t.co/QnvSjjyh7n,,,,"https://twitter.com/dog_rates/status/689623661272240129/photo/1,https://twitter.com/dog_rates/status/689623661272240129/photo/1",10,10,Lucy,,,pupper,,https://pbs.twimg.com/media/CZIJD2SWIAMJgNI.jpg,1.0,toy_poodle,0.279604,True,mashed_potato,0.208564,False,Labrador_retriever,0.077481,True,6.896237e+17,748.0,2467.0,,pupper


### Issue #3: Actual tweet `text` column contains a URL that should be on another column

#### Define
Extract the url into a different column `tweet_url`

#### Code

In [28]:
twitter_archive_merged['tweet_url'] = twitter_archive_merged['text'].str.extract('.+\n*(http.+)')

In [49]:
twitter_archive_merged['tweet_text'] = twitter_archive_merged['text'].str.extract('(.+).*http.+')

#### Test

In [51]:
twitter_archive_merged[['tweet_url', 'tweet_text']]

Unnamed: 0,tweet_url,tweet_text
1,https://t.co/0Xxu71qeIV,"This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boops, the whole bit. 13/10"
2,https://t.co/wUnZnhtVJB,This is Archie. He is a rare Norwegian Pouncing Corgo. Lives in the tall grass. You never know when one may strike. 12/10
3,https://t.co/tD36da7qLQ,This is Darla. She commenced a snooze mid meal. 13/10 happens to the best of us
4,https://t.co/AtUZn91f7f,"This is Franklin. He would like you to stop calling him ""cute."" He is a very fierce shark and should be respected as such. 12/10 #BarkWeek"
5,https://t.co/kQ04fDDRmh,Here we have a majestic great white breaching off South Africa's coast. Absolutely h*ckin breathtaking. 13/10 (IG: tucker_marlo) #BarkWeek
...,...,...
2351,https://t.co/4B7cOc1EDq,Here we have a 1949 1st generation vulpix. Enjoys sweat tea and Fox News. Cannot be phased. 5/10
2352,https://t.co/DWnyCjf2mx,This is a purebred Piers Morgan. Loves to Netflix and chill. Always looks like he forgot to unplug the iron. 6/10
2353,https://t.co/y671yMhoiR,Here is a very happy pup. Big fan of well-maintained decks. Just look at that tongue. 9/10 would cuddle af
2354,https://t.co/r7mOb2m0UI,This is a western brown Mitsubishi terrier. Upset about leaf. Actually 2 dogs here. 7/10 would walk the shit out of


### Issue #4: Table contains rows not required in analysis (rows indicating retweets, replies and quotes)

#### Define
Remove rows containing retweets, replies, and quotes

#### Code

In [52]:
twitter_archive_merged=twitter_archive_merged[~(
                                                (twitter_archive_merged['retweeted_status_id'].notnull())
                                              | (twitter_archive_merged['in_reply_to_status_id'].notnull())
                                              | (twitter_archive_merged['quoted_status_id'].notnull())
                                                )
                                             ]
twitter_archive_merged.sample()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog,id,retweet_count,favorite_count,quoted_status_id,dog_stages,tweet_url,tweet_text,pred_dog_breed,prediction
670,789986466051088384,,,2016-10-23 00:27:05 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Happy. He's a bathtub reviewer. Seems to be pleased with this one. 12/10 https://t.co/Ln89R4FP7v,,,,https://twitter.com/dog_rates/status/789986466051088384/photo/1,12,10,Happy,,,,,https://pbs.twimg.com/media/CvaYgDOWgAEfjls.jpg,1.0,tub,0.479477,False,bathtub,0.325106,False,golden_retriever,0.07853,True,7.899865e+17,2704.0,10369.0,,,https://t.co/Ln89R4FP7v,This is Happy. He's a bathtub reviewer. Seems to be pleased with this one. 12/10,golden_retriever,True


In [53]:
2356-287

2069

#### Test

In [54]:
twitter_archive_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1666 entries, 1 to 2355
Data columns (total 37 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    1666 non-null   int64  
 1   in_reply_to_status_id       0 non-null      float64
 2   in_reply_to_user_id         0 non-null      float64
 3   timestamp                   1666 non-null   object 
 4   source                      1666 non-null   object 
 5   text                        1666 non-null   object 
 6   retweeted_status_id         0 non-null      float64
 7   retweeted_status_user_id    0 non-null      float64
 8   retweeted_status_timestamp  0 non-null      object 
 9   expanded_urls               1666 non-null   object 
 10  rating_numerator            1666 non-null   int64  
 11  rating_denominator          1666 non-null   int64  
 12  name                        1666 non-null   object 
 13  doggo                       1666 

### Issue #5: Dog breed and prediction (True/False) not in one column

#### Define

- Select the first `True` (from p1 to p3) value of an role and the predicted dog (`pred_dog_breed`) of the first true value using `np.select`

- Default the predicted dog (`pred_dog_breed`) breed where `p1=p2=p3=False` to 'undefined'


#### Code

In [55]:
cond_list = [(twitter_archive_merged['p1_dog']==True)
            ,(twitter_archive_merged['p2_dog']==True)
            ,(twitter_archive_merged['p3_dog']==True)
            ,(twitter_archive_merged['p3_dog'].isnull())
             ]
choice_list = [(twitter_archive_merged['p1']), (twitter_archive_merged['p2']), (twitter_archive_merged['p3']), np.nan]

twitter_archive_merged['pred_dog_breed'] = np.select(condlist=cond_list, choicelist=choice_list, default='undefined')

twitter_archive_merged['prediction'] = np.select(condlist=cond_list, choicelist=cond_list, default=False)

#### Test

In [57]:
twitter_archive_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1666 entries, 1 to 2355
Data columns (total 37 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    1666 non-null   int64  
 1   in_reply_to_status_id       0 non-null      float64
 2   in_reply_to_user_id         0 non-null      float64
 3   timestamp                   1666 non-null   object 
 4   source                      1666 non-null   object 
 5   text                        1666 non-null   object 
 6   retweeted_status_id         0 non-null      float64
 7   retweeted_status_user_id    0 non-null      float64
 8   retweeted_status_timestamp  0 non-null      object 
 9   expanded_urls               1666 non-null   object 
 10  rating_numerator            1666 non-null   int64  
 11  rating_denominator          1666 non-null   int64  
 12  name                        1666 non-null   object 
 13  doggo                       1666 

### Issue #6: Some tweets are not dog ratings

#### Define
- Remove rows where `pred_dog_breed` is `undefined`. There are the rows with columns `p1=p2=p3 = False`
- Remove rows where `pred_dog_breed` is `null`. The is done to improve the quality of the data

#### Code

In [58]:
# A quick observation
q=twitter_archive_merged[twitter_archive_merged['prediction']==False]
q

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog,id,retweet_count,favorite_count,quoted_status_id,dog_stages,tweet_url,tweet_text,pred_dog_breed,prediction


From the `q` dataframe above, it was observed visually that some images were erroneously ruled False (i.e., as 'not dogs') in all 3 predictions.
Neververtheless, these rows will be dropped from the dataframe in other to improve the quality of the data.

In [59]:
twitter_archive_merged['pred_dog_breed'].value_counts()

golden_retriever      156
Labrador_retriever    106
Pembroke               94
Chihuahua              90
pug                    62
                     ... 
EntleBucher             1
Japanese_spaniel        1
standard_schnauzer      1
silky_terrier           1
Irish_wolfhound         1
Name: pred_dog_breed, Length: 113, dtype: int64

In [60]:
twitter_archive_merged = twitter_archive_merged[   (twitter_archive_merged['pred_dog_breed']!='undefined') 
                                                & ~(twitter_archive_merged['pred_dog_breed'].isnull())]

#### Test

In [61]:
twitter_archive_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1666 entries, 1 to 2355
Data columns (total 37 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    1666 non-null   int64  
 1   in_reply_to_status_id       0 non-null      float64
 2   in_reply_to_user_id         0 non-null      float64
 3   timestamp                   1666 non-null   object 
 4   source                      1666 non-null   object 
 5   text                        1666 non-null   object 
 6   retweeted_status_id         0 non-null      float64
 7   retweeted_status_user_id    0 non-null      float64
 8   retweeted_status_timestamp  0 non-null      object 
 9   expanded_urls               1666 non-null   object 
 10  rating_numerator            1666 non-null   int64  
 11  rating_denominator          1666 non-null   int64  
 12  name                        1666 non-null   object 
 13  doggo                       1666 

### Issue #7: Tweet source cloumn contains HTML script

#### Define
Extract the content of the HTML tag that represents the actual tweet `source`

#### Code

In [77]:
twitter_archive_merged['source'] = twitter_archive_merged['source'].str.extract('.*>(.+)<.*')

#### Test

In [80]:
twitter_archive_merged['source'].value_counts()

Twitter for iPhone    1635
Twitter Web Client      22
TweetDeck                9
Name: source, dtype: int64

### Issue #8: Dog stages representing `null` values as a string `None`

#### Define
This has been addressed as part of Issue #2

#### Code

#### Test

In [82]:
twitter_archive_merged['dog_stages'].value_counts()

pupper           166
doggo             54
puppo             21
floofer            7
doggo-pupper       7
doggo-puppo        1
doggo-floofer      1
Name: dog_stages, dtype: int64

### Issue #9: The `rating_denominator` column contains values not 10.

#### Define
Visually inspect the roles with `rating_denominator` is not (`!=`) equal to 10, and make corrections where necessary

Some `rating_numerators` will also be fixed here

#### Code

In [88]:
twitter_archive_merged[twitter_archive_merged['rating_denominator'] != 10].shape

(16, 37)

In [112]:
twitter_archive_merged[twitter_archive_merged['rating_denominator'] != 10
                      ][['tweet_id', 'text', 'rating_numerator', 'rating_denominator']]

Unnamed: 0,tweet_id,text,rating_numerator,rating_denominator
433,820690176645140481,The floofs have been released I repeat the floofs have been released. 84/70 https://t.co/NIYC820tmd,84,70
902,758467244762497024,Why does this never happen at my front door... 165/150 https://t.co/HmwrdfEfUE,165,150
1228,713900603437621249,Happy Saturday here's 9 puppers on a bench. 99/90 good work everybody https://t.co/mpvaVxKmc1,99,90
1254,710658690886586372,Here's a brigade of puppers. All look very prepared for whatever happens next. 80/80 https://t.co/0eb7R1Om12,80,80
1274,709198395643068416,"From left to right:\nCletus, Jerome, Alejandro, Burp, &amp; Titson\nNone know where camera is. 45/50 would hug all at once https://t.co/sedre1ivTK",45,50
1351,704054845121142784,Here is a whole flock of puppers. 60/50 I'll take the lot https://t.co/9dpcw6MdWa,60,50
1433,697463031882764288,Happy Wednesday here's a bucket of pups. 44/40 would pet all at once https://t.co/HppvrYuamZ,44,40
1635,684222868335505415,Someone help the girl is being mugged. Several are distracting her while two steal her shoes. Clever puppers 121/110 https://t.co/1zfnTJLt55,121,110
1779,677716515794329600,IT'S PUPPERGEDDON. Total of 144/120 ...I think https://t.co/ZanVtAtvIq,144,120
1843,675853064436391936,Here we have an entire platoon of puppers. Total score: 88/80 would pet all at once https://t.co/y93p6FLvVw,88,80


From the dataframe above, where `rating_denominator` is not (`!=`) equal to 10, we see that of the 16 values, only 6 actually have their ratings misrepresented.

row with `tweet_id = 810984652412424192` will be removed as it is not a twitter dog rating.

In [97]:
def set_rating(tweet_id, correct_numr, correct_denr, numerator='rating_numerator', denominator='rating_denominator'):
    
    row_index = twitter_archive_merged[twitter_archive_merged['tweet_id']==tweet_id].index
    twitter_archive_merged.loc[row_index, numerator]=correct_numr
    twitter_archive_merged.loc[row_index, denominator]=correct_denr
    
    corrected_row = twitter_archive_merged[twitter_archive_merged['tweet_id']==tweet_id]
    return corrected_row

In [110]:
twitter_archive_merged.drop(twitter_archive_merged[twitter_archive_merged['tweet_id']==810984652412424192].index
                            , axis=0, inplace= True)
set_rating(tweet_id=740373189193256964, correct_numr=14, correct_denr=10)
set_rating(tweet_id=722974582966214656, correct_numr=13, correct_denr=10)
set_rating(tweet_id=716439118184652801, correct_numr=11, correct_denr=10)
set_rating(tweet_id=682962037429899265, correct_numr=10, correct_denr=10)
set_rating(tweet_id=666287406224695296, correct_numr=9, correct_denr=10)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog,id,retweet_count,favorite_count,quoted_status_id,dog_stages,tweet_url,tweet_text,pred_dog_breed,prediction
2335,666287406224695296,,,2015-11-16 16:11:11 +0000,Twitter for iPhone,This is an Albanian 3 1/2 legged Episcopalian. Loves well-polished hardwood flooring. Penis on the collar. 9/10 https://t.co/d9NcXFKwLv,,,,https://twitter.com/dog_rates/status/666287406224695296/photo/1,9,10,an,,,,,https://pbs.twimg.com/media/CT8g3BpUEAAuFjg.jpg,1.0,Maltese_dog,0.857531,True,toy_poodle,0.063064,True,miniature_poodle,0.025581,True,6.662874e+17,71.0,152.0,,,https://t.co/d9NcXFKwLv,This is an Albanian 3 1/2 legged Episcopalian. Loves well-polished hardwood flooring. Penis on the collar. 9/10,Maltese_dog,True


#### Test

In [113]:
twitter_archive_merged[twitter_archive_merged['rating_denominator'] != 10
                      ][['tweet_id', 'text', 'rating_numerator', 'rating_denominator']]

Unnamed: 0,tweet_id,text,rating_numerator,rating_denominator
433,820690176645140481,The floofs have been released I repeat the floofs have been released. 84/70 https://t.co/NIYC820tmd,84,70
902,758467244762497024,Why does this never happen at my front door... 165/150 https://t.co/HmwrdfEfUE,165,150
1228,713900603437621249,Happy Saturday here's 9 puppers on a bench. 99/90 good work everybody https://t.co/mpvaVxKmc1,99,90
1254,710658690886586372,Here's a brigade of puppers. All look very prepared for whatever happens next. 80/80 https://t.co/0eb7R1Om12,80,80
1274,709198395643068416,"From left to right:\nCletus, Jerome, Alejandro, Burp, &amp; Titson\nNone know where camera is. 45/50 would hug all at once https://t.co/sedre1ivTK",45,50
1351,704054845121142784,Here is a whole flock of puppers. 60/50 I'll take the lot https://t.co/9dpcw6MdWa,60,50
1433,697463031882764288,Happy Wednesday here's a bucket of pups. 44/40 would pet all at once https://t.co/HppvrYuamZ,44,40
1635,684222868335505415,Someone help the girl is being mugged. Several are distracting her while two steal her shoes. Clever puppers 121/110 https://t.co/1zfnTJLt55,121,110
1779,677716515794329600,IT'S PUPPERGEDDON. Total of 144/120 ...I think https://t.co/ZanVtAtvIq,144,120
1843,675853064436391936,Here we have an entire platoon of puppers. Total score: 88/80 would pet all at once https://t.co/y93p6FLvVw,88,80


### Issue #10: The `rating_numerator` column contains values that are not consistent with the usual ratings.

#### Define
The lowest ratings will be visually inspected and appropriate fix will be done

#### Code

In [115]:
twitter_archive_merged['rating_numerator'].unique()

array([ 13,  12,  14,   5,  11,   6,   0,  10,  84,  75,  27,   3,   9,
         8,   7, 165,   4,  99,  80,  45,  60,  44, 121,  26,   2, 144,
        88], dtype=int64)

In [131]:
twitter_archive_merged[twitter_archive_merged['rating_numerator'].isin([0,2,3,4])
                      ][['tweet_id', 'text', 'rating_numerator', 'rating_denominator']]

Unnamed: 0,tweet_id,text,rating_numerator,rating_denominator
315,835152434251116546,When you're so blinded by your systematic plagiarism that you forget what day it is. 0/10 https://t.co/YbEJPkg4Ag,0,10
1004,747816857231626240,Viewer discretion is advised. This is a terrible attack in progress. Not even in water (tragic af). 4/10 bad sherk https://t.co/L3U0j14N5R,4,10
1249,711306686208872448,What hooligan sent in pictures w/out a dog in them? Churlish af. 3/10 just bc that's a neat fluffy bean bag chair https://t.co/wcwoGOkZvz,3,10
1303,707420581654872064,This is Keurig. He's a rare dog. Laughs like an idiot tho. Head is basically a weapon. Poorly maintained goatee 4/10 https://t.co/xOrUyj7K30,4,10
1701,680940246314430465,This is Alice. She's an idiot. 4/10 https://t.co/VQXdwJfkyS,4,10
1764,678424312106393600,This is Crystal. She's a shitty fireman. No sense of urgency. People could be dying Crystal. 2/10 just irresponsible https://t.co/rtMtjSl9pz,2,10
1938,673906403526995968,Guys I'm getting real tired of this. We only rate dogs. Please don't send in other things like this Bulbasaur. 3/10 https://t.co/t5rQHl6W8M,3,10
2070,671122204919246848,Two miniature golden retrievers here. Webbed paws. Don't walk very efficiently. Can't catch a tennis ball. 4/10s https://t.co/WzVLdSHJU7,4,10
2246,667878741721415682,This is Tedrick. He lives on the edge. Needs someone to hit the gas tho. Other than that he's a baller. 10&amp;2/10 https://t.co/LvP1TTYSCN,2,10
2288,667176164155375616,These are strange dogs. All have toupees. Long neck for dogs. In a shed of sorts? Work in groups? 4/10 still petable https://t.co/PZxSarAfSN,4,10


By inspection, some rows above are not dog ratings and will be removed.

In [130]:
twitter_archive_merged.drop([765, 1189, 2183, 1303, 1701, 2070, 2288, 2316], axis=0, inplace= True)

KeyError: '[765, 1189, 2183] not found in axis'

#### Test

In [132]:
twitter_archive_merged[['tweet_id', 'text', 'rating_numerator', 'rating_denominator']]

Unnamed: 0,tweet_id,text,rating_numerator,rating_denominator
1,892177421306343426,"This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boops, the whole bit. 13/10 https://t.co/0Xxu71qeIV",13,10
2,891815181378084864,This is Archie. He is a rare Norwegian Pouncing Corgo. Lives in the tall grass. You never know when one may strike. 12/10 https://t.co/wUnZnhtVJB,12,10
3,891689557279858688,This is Darla. She commenced a snooze mid meal. 13/10 happens to the best of us https://t.co/tD36da7qLQ,13,10
4,891327558926688256,"This is Franklin. He would like you to stop calling him ""cute."" He is a very fierce shark and should be respected as such. 12/10 #BarkWeek https://t.co/AtUZn91f7f",12,10
5,891087950875897856,Here we have a majestic great white breaching off South Africa's coast. Absolutely h*ckin breathtaking. 13/10 (IG: tucker_marlo) #BarkWeek https://t.co/kQ04fDDRmh,13,10
...,...,...,...,...
2351,666049248165822465,Here we have a 1949 1st generation vulpix. Enjoys sweat tea and Fox News. Cannot be phased. 5/10 https://t.co/4B7cOc1EDq,5,10
2352,666044226329800704,This is a purebred Piers Morgan. Loves to Netflix and chill. Always looks like he forgot to unplug the iron. 6/10 https://t.co/DWnyCjf2mx,6,10
2353,666033412701032449,Here is a very happy pup. Big fan of well-maintained decks. Just look at that tongue. 9/10 would cuddle af https://t.co/y671yMhoiR,9,10
2354,666029285002620928,This is a western brown Mitsubishi terrier. Upset about leaf. Actually 2 dogs here. 7/10 would walk the shit out of https://t.co/r7mOb2m0UI,7,10


### Issue #11: Certain dog `name` values misrepresented. Also, `unknown` values misrepresented a string `None`

#### Define
Set obvious dog name errors to `unknown`. Also set all `None` to `unknown`

#### Code

In [134]:
tweet_archive.name.value_counts()

None          745
a              55
Charlie        12
Cooper         11
Lucy           11
             ... 
Dex             1
Ace             1
Tayzie          1
Grizzie         1
Christoper      1
Name: name, Length: 957, dtype: int64

In [None]:
not_name=[]
for name in tweet_archive.name.unique():
    if name[0].isupper()==False:
        not_name.append(name)

not_name.append('None')

In [146]:
twitter_archive_merged['name'] = twitter_archive_merged['name'].replace(not_name, 'unknown')

#### Test

In [147]:
twitter_archive_merged['name'].value_counts()

unknown       480
Lucy           10
Charlie        10
Cooper         10
Tucker          9
             ... 
Klein           1
DonDon          1
Chevy           1
Philbert        1
Christoper      1
Name: name, Length: 831, dtype: int64

### Issue #12: Dog named "O" instead of "O'Malley"

#### Define
Replace "O" with "O'Malley"

#### Code

In [154]:
row_id = twitter_archive_merged[twitter_archive_merged.name=='O'].index
twitter_archive_merged.loc[row_id, 'name']="O'Malley"


#### Test

In [157]:
twitter_archive_merged[twitter_archive_merged.name=='O']

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog,id,retweet_count,favorite_count,quoted_status_id,dog_stages,tweet_url,tweet_text,pred_dog_breed,prediction


In [158]:
twitter_archive_merged[twitter_archive_merged.name=='O\'Malley']

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog,id,retweet_count,favorite_count,quoted_status_id,dog_stages,tweet_url,tweet_text,pred_dog_breed,prediction
775,776201521193218049,,,2016-09-14 23:30:38 +0000,Twitter for iPhone,This is O'Malley. That is how he sleeps. Doesn't care what you think about it. 10/10 comfy af https://t.co/Pq150LeRaC,,,,https://twitter.com/dog_rates/status/776201521193218049/photo/1,10,10,O'Malley,,,,,https://pbs.twimg.com/media/CsWfKadWEAAtmlS.jpg,1.0,Rottweiler,0.502228,True,black-and-tan_coonhound,0.154594,True,bloodhound,0.135176,True,7.762015e+17,2919.0,10681.0,,,https://t.co/Pq150LeRaC,This is O'Malley. That is how he sleeps. Doesn't care what you think about it. 10/10 comfy af,Rottweiler,True


### Issue #13: Table contains columns not required in analysis

#### Define
Drop columnns that are not required or important

#### Code

In [161]:
print(twitter_archive_merged.columns)

Index(['tweet_id', 'in_reply_to_status_id', 'in_reply_to_user_id', 'timestamp',
       'source', 'text', 'retweeted_status_id', 'retweeted_status_user_id',
       'retweeted_status_timestamp', 'expanded_urls', 'rating_numerator',
       'rating_denominator', 'name', 'doggo', 'floofer', 'pupper', 'puppo',
       'jpg_url', 'img_num', 'p1', 'p1_conf', 'p1_dog', 'p2', 'p2_conf',
       'p2_dog', 'p3', 'p3_conf', 'p3_dog', 'id', 'retweet_count',
       'favorite_count', 'quoted_status_id', 'dog_stages', 'tweet_url',
       'tweet_text', 'pred_dog_breed', 'prediction'],
      dtype='object')


In [162]:
drop_col = ['in_reply_to_status_id', 'in_reply_to_user_id', 'text', 'retweeted_status_id', 'retweeted_status_user_id'
            ,'retweeted_status_timestamp', 'expanded_urls', 'doggo', 'floofer', 'pupper', 'puppo','jpg_url'
            ,'p1', 'p1_conf', 'p1_dog', 'p2', 'p2_conf', 'p2_dog', 'p3', 'p3_conf', 'p3_dog', 'id', 'quoted_status_id']

twitter_archive_master = twitter_archive_merged.drop(columns=drop_col)

#### Test

In [163]:
twitter_archive_master.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1662 entries, 1 to 2355
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   tweet_id            1662 non-null   int64  
 1   timestamp           1662 non-null   object 
 2   source              1662 non-null   object 
 3   rating_numerator    1662 non-null   int64  
 4   rating_denominator  1662 non-null   int64  
 5   name                1662 non-null   object 
 6   img_num             1662 non-null   float64
 7   retweet_count       1662 non-null   float64
 8   favorite_count      1662 non-null   float64
 9   dog_stages          257 non-null    object 
 10  tweet_url           1662 non-null   object 
 11  tweet_text          1662 non-null   object 
 12  pred_dog_breed      1662 non-null   object 
 13  prediction          1662 non-null   bool   
dtypes: bool(1), float64(3), int64(3), object(7)
memory usage: 247.9+ KB


### Issue #14: Incorrect data types on some columns (tweet_id, timestamp, retweet_count, favorite_count, img_num)

#### Define
Set the data types appropriately using `.astype()` method

#### Code

In [182]:
twitter_archive_master['tweet_id']=twitter_archive_master['tweet_id'].astype(str)
twitter_archive_master['timestamp']=pd.to_datetime(twitter_archive_master['timestamp'])
twitter_archive_master['retweet_count']=twitter_archive_master['retweet_count'].astype(int)
twitter_archive_master['favorite_count']=twitter_archive_master['favorite_count'].astype(int)
twitter_archive_master['img_num']=twitter_archive_master['img_num'].astype(int)

#### Test

In [179]:
twitter_archive_master.sample()

Unnamed: 0,tweet_id,timestamp,source,rating_numerator,rating_denominator,name,img_num,retweet_count,favorite_count,dog_stages,tweet_url,tweet_text,pred_dog_breed,prediction
2311,666781792255496192,2015-11-18 00:55:42+00:00,Twitter for iPhone,10,10,unknown,1.0,211,404,,https://t.co/uEvsGLOFHa,This is a purebred Bacardi named Octaviath. Can shoot spaghetti out of mouth. 10/10,Italian_greyhound,True


In [183]:
twitter_archive_master.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1662 entries, 1 to 2355
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   tweet_id            1662 non-null   object             
 1   timestamp           1662 non-null   datetime64[ns, UTC]
 2   source              1662 non-null   object             
 3   rating_numerator    1662 non-null   int64              
 4   rating_denominator  1662 non-null   int64              
 5   name                1662 non-null   object             
 6   img_num             1662 non-null   int32              
 7   retweet_count       1662 non-null   int32              
 8   favorite_count      1662 non-null   int32              
 9   dog_stages          257 non-null    object             
 10  tweet_url           1662 non-null   object             
 11  tweet_text          1662 non-null   object             
 12  pred_dog_breed      1662 non-null 

## Storing Data
Save gathered, assessed, and cleaned master dataset to a CSV file named "twitter_archive_master.csv".

In [185]:
twitter_archive_master.to_csv('twitter_archive_master.csv', index=False)

## Analyzing and Visualizing Data
In this section, analyze and visualize your wrangled data. You must produce at least **three (3) insights and one (1) visualization.**

### Insights:
1.

2.

3.

### Visualization