# Data Cleaning
## Gathering

In [712]:
import pandas as pd
import numpy as np
import os
import requests
import tweepy
import json
import re
from scipy import stats
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
from pandas.tools import plotting
from statsmodels.formula.api import ols

In [713]:
pd.set_option('max_colwidth',200)

In [714]:
#load WeRateDogs Twitter data
twitter_WERATEDOGS=pd.read_csv("twitter_archive_enhanced.csv")

In [715]:
#download mechaine learning data
url="https://raw.githubusercontent.com/udacity/new-dand-advanced-china/master/%E6%95%B0%E6%8D%AE%E6%B8%85%E6%B4%97/WeRateDogs%E9%A1%B9%E7%9B%AE/image-predictions.tsv"
if not os.path.exists(url.split('/')[-1]):
    response=requests.get(url)
    with open(os.path.join(url.split('/')[-1]),mode='wb') as file:
        file.write(response.content)
        file.close()

In [716]:
image_predictions=pd.read_csv('image-predictions.tsv',sep='\t')

In [717]:
#load tweet data
file_name="tweet_json.txt"
tweet_data=pd.read_json(file_name,lines=True)

In [718]:
tweet_data.head()

Unnamed: 0,contributors,coordinates,created_at,display_text_range,entities,extended_entities,favorite_count,favorited,full_text,geo,...,possibly_sensitive_appealable,quoted_status,quoted_status_id,quoted_status_id_str,retweet_count,retweeted,retweeted_status,source,truncated,user
0,,,2017-08-01 16:23:56,"[0, 85]","{'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': [], 'media': [{'id': 892420639486877696, 'id_str': '892420639486877696', 'indices': [86, 109], 'media_url': 'http://pbs.twimg.com/media...","{'media': [{'id': 892420639486877696, 'id_str': '892420639486877696', 'indices': [86, 109], 'media_url': 'http://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg', 'media_url_https': 'https://pbs.twimg.com...",39492,False,This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU,,...,0.0,,,,8842,False,,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",False,"{'id': 4196983835, 'id_str': '4196983835', 'name': 'SpookyWeRateDogs™', 'screen_name': 'dog_rates', 'location': 'MERCH↴ DM DOGS. WE WILL RATE', 'description': 'Only Legit Source for Professional ..."
1,,,2017-08-01 00:17:27,"[0, 138]","{'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': [], 'media': [{'id': 892177413194625024, 'id_str': '892177413194625024', 'indices': [139, 162], 'media_url': 'http://pbs.twimg.com/medi...","{'media': [{'id': 892177413194625024, 'id_str': '892177413194625024', 'indices': [139, 162], 'media_url': 'http://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg', 'media_url_https': 'https://pbs.twimg.co...",33786,False,"This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boops, the whole bit. 13/10 https://t.co/0Xxu71qeIV",,...,0.0,,,,6480,False,,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",False,"{'id': 4196983835, 'id_str': '4196983835', 'name': 'SpookyWeRateDogs™', 'screen_name': 'dog_rates', 'location': 'MERCH↴ DM DOGS. WE WILL RATE', 'description': 'Only Legit Source for Professional ..."
2,,,2017-07-31 00:18:03,"[0, 121]","{'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': [], 'media': [{'id': 891815175371796480, 'id_str': '891815175371796480', 'indices': [122, 145], 'media_url': 'http://pbs.twimg.com/medi...","{'media': [{'id': 891815175371796480, 'id_str': '891815175371796480', 'indices': [122, 145], 'media_url': 'http://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg', 'media_url_https': 'https://pbs.twimg.co...",25445,False,This is Archie. He is a rare Norwegian Pouncing Corgo. Lives in the tall grass. You never know when one may strike. 12/10 https://t.co/wUnZnhtVJB,,...,0.0,,,,4301,False,,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",False,"{'id': 4196983835, 'id_str': '4196983835', 'name': 'SpookyWeRateDogs™', 'screen_name': 'dog_rates', 'location': 'MERCH↴ DM DOGS. WE WILL RATE', 'description': 'Only Legit Source for Professional ..."
3,,,2017-07-30 15:58:51,"[0, 79]","{'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': [], 'media': [{'id': 891689552724799489, 'id_str': '891689552724799489', 'indices': [80, 103], 'media_url': 'http://pbs.twimg.com/media...","{'media': [{'id': 891689552724799489, 'id_str': '891689552724799489', 'indices': [80, 103], 'media_url': 'http://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg', 'media_url_https': 'https://pbs.twimg.com...",42863,False,This is Darla. She commenced a snooze mid meal. 13/10 happens to the best of us https://t.co/tD36da7qLQ,,...,0.0,,,,8925,False,,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",False,"{'id': 4196983835, 'id_str': '4196983835', 'name': 'SpookyWeRateDogs™', 'screen_name': 'dog_rates', 'location': 'MERCH↴ DM DOGS. WE WILL RATE', 'description': 'Only Legit Source for Professional ..."
4,,,2017-07-29 16:00:24,"[0, 138]","{'hashtags': [{'text': 'BarkWeek', 'indices': [129, 138]}], 'symbols': [], 'user_mentions': [], 'urls': [], 'media': [{'id': 891327551943041024, 'id_str': '891327551943041024', 'indices': [139, 16...","{'media': [{'id': 891327551943041024, 'id_str': '891327551943041024', 'indices': [139, 162], 'media_url': 'http://pbs.twimg.com/media/DF6hr6AVYAAZ8G8.jpg', 'media_url_https': 'https://pbs.twimg.co...",41016,False,"This is Franklin. He would like you to stop calling him ""cute."" He is a very fierce shark and should be respected as such. 12/10 #BarkWeek https://t.co/AtUZn91f7f",,...,0.0,,,,9721,False,,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",False,"{'id': 4196983835, 'id_str': '4196983835', 'name': 'SpookyWeRateDogs™', 'screen_name': 'dog_rates', 'location': 'MERCH↴ DM DOGS. WE WILL RATE', 'description': 'Only Legit Source for Professional ..."


In [719]:
tweet_data['entities'][100]

{'hashtags': [],
 'symbols': [],
 'user_mentions': [{'screen_name': 'loganamnosis',
   'name': 'michael',
   'id': 154767397,
   'id_str': '154767397',
   'indices': [3, 16]},
  {'screen_name': 'dog_rates',
   'name': 'SpookyWeRateDogs™',
   'id': 4196983835,
   'id_str': '4196983835',
   'indices': [73, 83]}],
 'urls': []}

In [720]:
tweet_data['extended_entities'][1]

{'media': [{'id': 892177413194625024,
   'id_str': '892177413194625024',
   'indices': [139, 162],
   'media_url': 'http://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg',
   'media_url_https': 'https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg',
   'url': 'https://t.co/0Xxu71qeIV',
   'display_url': 'pic.twitter.com/0Xxu71qeIV',
   'expanded_url': 'https://twitter.com/dog_rates/status/892177421306343426/photo/1',
   'type': 'photo',
   'sizes': {'large': {'w': 1407, 'h': 1600, 'resize': 'fit'},
    'thumb': {'w': 150, 'h': 150, 'resize': 'crop'},
    'small': {'w': 598, 'h': 680, 'resize': 'fit'},
    'medium': {'w': 1055, 'h': 1200, 'resize': 'fit'}}}]}

In [721]:
print(re.search(r'\d{18}',tweet_data['extended_entities'][1]['media'][0]['expanded_url']).group())

892177421306343426


## ASSESSING

In [722]:
twitter_WERATEDOGS.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU,,,,https://twitter.com/dog_rates/status/892420643555336193/photo/1,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boops, the whole bit. 13/10 https://t.co/0Xxu71qeIV",,,,https://twitter.com/dog_rates/status/892177421306343426/photo/1,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Archie. He is a rare Norwegian Pouncing Corgo. Lives in the tall grass. You never know when one may strike. 12/10 https://t.co/wUnZnhtVJB,,,,https://twitter.com/dog_rates/status/891815181378084864/photo/1,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Darla. She commenced a snooze mid meal. 13/10 happens to the best of us https://t.co/tD36da7qLQ,,,,https://twitter.com/dog_rates/status/891689557279858688/photo/1,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","This is Franklin. He would like you to stop calling him ""cute."" He is a very fierce shark and should be respected as such. 12/10 #BarkWeek https://t.co/AtUZn91f7f",,,,"https://twitter.com/dog_rates/status/891327558926688256/photo/1,https://twitter.com/dog_rates/status/891327558926688256/photo/1",12,10,Franklin,,,,


In [723]:
twitter_WERATEDOGS.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

In [724]:
twitter_WERATEDOGS[twitter_WERATEDOGS.tweet_id.duplicated()]

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo


In [725]:
twitter_WERATEDOGS.text[5]

"Here we have a majestic great white breaching off South Africa's coast. Absolutely h*ckin breathtaking. 13/10 (IG: tucker_marlo) #BarkWeek https://t.co/kQ04fDDRmh"

In [726]:
twitter_WERATEDOGS.describe()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,retweeted_status_id,retweeted_status_user_id,rating_numerator,rating_denominator
count,2356.0,78.0,78.0,181.0,181.0,2356.0,2356.0
mean,7.427716e+17,7.455079e+17,2.014171e+16,7.7204e+17,1.241698e+16,13.126486,10.455433
std,6.856705e+16,7.582492e+16,1.252797e+17,6.236928e+16,9.599254e+16,45.876648,6.745237
min,6.660209e+17,6.658147e+17,11856340.0,6.661041e+17,783214.0,0.0,0.0
25%,6.783989e+17,6.757419e+17,308637400.0,7.186315e+17,4196984000.0,10.0,10.0
50%,7.196279e+17,7.038708e+17,4196984000.0,7.804657e+17,4196984000.0,11.0,10.0
75%,7.993373e+17,8.257804e+17,4196984000.0,8.203146e+17,4196984000.0,12.0,10.0
max,8.924206e+17,8.862664e+17,8.405479e+17,8.87474e+17,7.874618e+17,1776.0,170.0


In [727]:
twitter_WERATEDOGS.name.value_counts()

None            745
a                55
Charlie          12
Oliver           11
Cooper           11
Lucy             11
Lola             10
Penny            10
Tucker           10
Bo                9
Winston           9
the               8
Sadie             8
Buddy             7
Daisy             7
Bailey            7
an                7
Toby              7
Jack              6
Jax               6
Scout             6
Stanley           6
Oscar             6
Milo              6
Leo               6
Koda              6
Bella             6
Dave              6
Rusty             6
Oakley            5
               ... 
Rhino             1
Schnozz           1
Sparky            1
Obi               1
Pumpkin           1
Lulu              1
Rilo              1
Tove              1
Jessifer          1
Birf              1
Goliath           1
unacceptable      1
Batdog            1
Mary              1
Boston            1
Barclay           1
Chevy             1
Kingsley          1
Shakespeare       1


In [728]:
list(twitter_WERATEDOGS.text[twitter_WERATEDOGS.name=="a"])

['Here is a pupper approaching maximum borkdrive. Zooming at never before seen speeds. 14/10 paw-inspiring af \r\n(IG: puffie_the_chow) https://t.co/ghXBIIeQZF',
 'Here is a perfect example of someone who has their priorities in order. 13/10 for both owner and Forrest https://t.co/LRyMrU7Wfq',
 'Guys this is getting so out of hand. We only rate dogs. This is a Galapagos Speed Panda. Pls only send dogs... 10/10 https://t.co/8lpAGaZRFn',
 'This is a mighty rare blue-tailed hammer sherk. Human almost lost a limb trying to take these. Be careful guys. 8/10 https://t.co/TGenMeXreW',
 'Viewer discretion is advised. This is a terrible attack in progress. Not even in water (tragic af). 4/10 bad sherk https://t.co/L3U0j14N5R',
 'This is a carrot. We only rate dogs. Please only send in dogs. You all really should know this by now ...11/10 https://t.co/9e48aPrBm2',
 'This is a very rare Great Alaskan Bush Pupper. Hard to stumble upon without spooking. 12/10 would pet passionately https://t.co/xOB

In [729]:
twitter_WERATEDOGS[twitter_WERATEDOGS.text.str.contains("named")]

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
603,798628517273620480,,,2016-11-15 20:47:30 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",RT @dog_rates: This a Norwegian Pewterschmidt named Tickles. Ears for days. 12/10 I care deeply for Tickles https://t.co/0aDF62KVP7,6.675094e+17,4196984000.0,2015-11-20 01:06:48 +0000,"https://twitter.com/dog_rates/status/667509364010450944/photo/1,https://twitter.com/dog_rates/status/667509364010450944/photo/1",12,10,,,,,
1853,675706639471788032,,,2015-12-12 15:59:51 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is a Sizzlin Menorah spaniel from Brooklyn named Wylie. Lovable eyes. Chiller as hell. 10/10 and I'm out.. poof https://t.co/7E0AiJXPmI,,,,https://twitter.com/dog_rates/status/675706639471788032/photo/1,10,10,a,,,,
1955,673636718965334016,,,2015-12-06 22:54:44 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is a Lofted Aphrodisiac Terrier named Kip. Big fan of bed n breakfasts. Fits perfectly. 10/10 would pet firmly https://t.co/gKlLpNzIl3,,,,https://twitter.com/dog_rates/status/673636718965334016/photo/1,10,10,a,,,,
2034,671743150407421952,,,2015-12-01 17:30:22 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is a Tuscaloosa Alcatraz named Jacob (Yacōb). Loves to sit in swing. Stellar tongue. 11/10 look at his feet https://t.co/2IslQ8ZSc7,,,,https://twitter.com/dog_rates/status/671743150407421952/photo/1,11,10,a,,,,
2066,671147085991960577,,,2015-11-30 02:01:49 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is a Helvetica Listerine named Rufus. This time Rufus will be ready for the UPS guy. He'll never expect it 9/10 https://t.co/34OhVhMkVr,,,,https://twitter.com/dog_rates/status/671147085991960577/photo/1,9,10,a,,,,
2116,670427002554466305,,,2015-11-28 02:20:27 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is a Deciduous Trimester mix named Spork. Only 1 ear works. No seat belt. Incredibly reckless. 9/10 still cute https://t.co/CtuJoLHiDo,,,,https://twitter.com/dog_rates/status/670427002554466305/photo/1,9,10,a,,,,
2125,670361874861563904,,,2015-11-27 22:01:40 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is a Rich Mahogany Seltzer named Cherokee. Just got destroyed by a snowball. Isn't very happy about it. 9/10 https://t.co/98ZBi6o4dj,,,,https://twitter.com/dog_rates/status/670361874861563904/photo/1,9,10,a,,,,
2128,670303360680108032,,,2015-11-27 18:09:09 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is a Speckled Cauliflower Yosemite named Hemry. He's terrified of intruder dog. Not one bit comfortable. 9/10 https://t.co/yV3Qgjh8iN,,,,https://twitter.com/dog_rates/status/670303360680108032/photo/1,9,10,a,,,,
2146,669923323644657664,,,2015-11-26 16:59:01 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is a spotted Lipitor Rumpelstiltskin named Alphred. He can't wait for the Turkey. 10/10 would pet really well https://t.co/6GUGO7azNX,,,,https://twitter.com/dog_rates/status/669923323644657664/photo/1,10,10,a,,,,
2161,669564461267722241,,,2015-11-25 17:13:02 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is a Coriander Baton Rouge named Alfredo. Loves to cuddle with smaller well-dressed dog. 10/10 would hug lots https://t.co/eCRdwouKCl,,,,https://twitter.com/dog_rates/status/669564461267722241/photo/1,10,10,a,,,,


In [730]:
twitter_WERATEDOGS.rating_numerator.value_counts()

12      558
11      464
10      461
13      351
9       158
8       102
7        55
14       54
5        37
6        32
3        19
4        17
1         9
2         9
420       2
0         2
15        2
75        2
80        1
20        1
24        1
26        1
44        1
50        1
60        1
165       1
84        1
88        1
144       1
182       1
143       1
666       1
960       1
1776      1
17        1
27        1
45        1
99        1
121       1
204       1
Name: rating_numerator, dtype: int64

In [731]:
twitter_WERATEDOGS.text[twitter_WERATEDOGS.rating_numerator==1776]

979    This is Atticus. He's quite simply America af. 1776/10 https://t.co/GRXwMxLBkh
Name: text, dtype: object

In [732]:
h=twitter_WERATEDOGS.text[twitter_WERATEDOGS.rating_numerator>20]
h=list(h)
h

['@dhmontgomery We also gave snoop dogg a 420/10 but I think that predated your research',
 '@s8n You tried very hard to portray this good boy as not so good, but you have ultimately failed. His goodness shines through. 666/10',
 '@markhoppus 182/10',
 "@jonnysun @Lin_Manuel ok jomny I know you're excited but 960/00 isn't a valid rating, 13/10 is tho",
 "RT @dog_rates: This is Logan, the Chow who lived. He solemnly swears he's up to lots of good. H*ckin magical af 9.75/10 https://t.co/yBO5wu…",
 'The floofs have been released I repeat the floofs have been released. 84/70 https://t.co/NIYC820tmd',
 'Meet Sam. She smiles 24/7 &amp; secretly aspires to be a reindeer. \r\nKeep Sam smiling by clicking and sharing this link:\r\nhttps://t.co/98tB8y7y7t https://t.co/LouL5vdvxx',
 "This is Logan, the Chow who lived. He solemnly swears he's up to lots of good. H*ckin magical af 9.75/10 https://t.co/yBO5wuqaPS",
 "This is Sophie. She's a Jubilant Bush Pupper. Super h*ckin rare. Appears at random 

In [733]:
twitter_WERATEDOGS.rating_denominator.value_counts()

10     2333
11        3
50        3
80        2
20        2
2         1
16        1
40        1
70        1
15        1
90        1
110       1
120       1
130       1
150       1
170       1
7         1
0         1
Name: rating_denominator, dtype: int64

In [734]:
list(twitter_WERATEDOGS.text[twitter_WERATEDOGS.rating_denominator<10])

["@jonnysun @Lin_Manuel ok jomny I know you're excited but 960/00 isn't a valid rating, 13/10 is tho",
 'Meet Sam. She smiles 24/7 &amp; secretly aspires to be a reindeer. \r\nKeep Sam smiling by clicking and sharing this link:\r\nhttps://t.co/98tB8y7y7t https://t.co/LouL5vdvxx',
 'This is an Albanian 3 1/2 legged  Episcopalian. Loves well-polished hardwood flooring. Penis on the collar. 9/10 https://t.co/d9NcXFKwLv']

In [735]:
all_columns=pd.Series(list(twitter_WERATEDOGS.doggo)+list(twitter_WERATEDOGS.floofer)+list(twitter_WERATEDOGS.pupper)+list(twitter_WERATEDOGS.puppo))
all_columns.value_counts()

None       9030
pupper      257
doggo        97
puppo        30
floofer      10
dtype: int64

In [736]:
image_predictions.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


In [737]:
image_predictions.describe()

Unnamed: 0,tweet_id,img_num,p1_conf,p2_conf,p3_conf
count,2075.0,2075.0,2075.0,2075.0,2075.0
mean,7.384514e+17,1.203855,0.594548,0.1345886,0.06032417
std,6.785203e+16,0.561875,0.271174,0.1006657,0.05090593
min,6.660209e+17,1.0,0.044333,1.0113e-08,1.74017e-10
25%,6.764835e+17,1.0,0.364412,0.05388625,0.0162224
50%,7.119988e+17,1.0,0.58823,0.118181,0.0494438
75%,7.932034e+17,1.0,0.843855,0.1955655,0.09180755
max,8.924206e+17,4.0,1.0,0.488014,0.273419


In [738]:
image_predictions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [739]:
image_predictions.p1[image_predictions.p1_dog==True].value_counts()

golden_retriever               150
Labrador_retriever             100
Pembroke                        89
Chihuahua                       83
pug                             57
chow                            44
Samoyed                         43
toy_poodle                      39
Pomeranian                      38
malamute                        30
cocker_spaniel                  30
French_bulldog                  26
miniature_pinscher              23
Chesapeake_Bay_retriever        23
Siberian_husky                  20
Staffordshire_bullterrier       20
German_shepherd                 20
Cardigan                        19
Eskimo_dog                      18
beagle                          18
Maltese_dog                     18
Shetland_sheepdog               18
Rottweiler                      17
Shih-Tzu                        17
Lakeland_terrier                17
Italian_greyhound               16
kuvasz                          16
Great_Pyrenees                  14
West_Highland_white_

In [740]:
image_predictions[image_predictions.p1=='seat_belt']

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
43,666776908487630848,https://pbs.twimg.com/media/CUDeDoWUYAAD-EM.jpg,1,seat_belt,0.375057,False,miniature_pinscher,0.167175,True,Chihuahua,0.086951,True
108,667878741721415682,https://pbs.twimg.com/media/CUTILFiWcAE8Rle.jpg,1,seat_belt,0.200373,False,miniature_pinscher,0.106003,True,schipperke,0.104733,True
198,669625907762618368,https://pbs.twimg.com/media/CUr9NjgU8AEpf5w.jpg,1,seat_belt,0.874502,False,golden_retriever,0.055408,True,Labrador_retriever,0.026854,True
235,670427002554466305,https://pbs.twimg.com/media/CU3VzVwWwAAAsst.jpg,1,seat_belt,0.952258,False,toy_terrier,0.038872,True,beagle,0.003226,True
522,676582956622721024,https://pbs.twimg.com/media/CWO0m8tUwAAB901.jpg,1,seat_belt,0.790028,False,Boston_bull,0.196307,True,French_bulldog,0.012429,True
551,677557565589463040,https://pbs.twimg.com/media/CWcrAVQWEAA6QMp.jpg,1,seat_belt,0.277257,False,Shih-Tzu,0.249017,True,Pekinese,0.209213,True
577,678740035362037760,https://pbs.twimg.com/media/CWtede2WIAAF_AJ.jpg,1,seat_belt,0.787164,False,sunglasses,0.045739,False,beagle,0.022525,True
642,681339448655802368,https://pbs.twimg.com/media/CXSanNkWkAAqR9M.jpg,1,seat_belt,0.532441,False,Labrador_retriever,0.094615,True,kuvasz,0.089863,True
657,682303737705140231,https://pbs.twimg.com/media/CXgHoLnWAAA8i52.jpg,1,seat_belt,0.997659,False,Lakeland_terrier,0.001731,True,Airedale,0.000204,True
740,687312378585812992,https://pbs.twimg.com/media/CYnS9VWW8AAeR8m.jpg,1,seat_belt,0.703561,False,Great_Dane,0.139909,True,Weimaraner,0.021112,True


In [741]:
image_predictions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [742]:
list(twitter_WERATEDOGS[(twitter_WERATEDOGS.doggo=='doggo') & (twitter_WERATEDOGS.pupper=='pupper')].text)

['This is Dido. She\'s playing the lead role in "Pupper Stops to Catch Snow Before Resuming Shadow Box with Dried Apple." 13/10 (IG: didodoggo) https://t.co/m7isZrOBX7',
 'Here we have Burke (pupper) and Dexter (doggo). Pupper wants to be exactly like doggo. Both 12/10 would pet at same time https://t.co/ANBpEYHaho',
 'Like doggo, like pupper version 2. Both 11/10 https://t.co/9IxWAXFqze',
 "This is Bones. He's being haunted by another doggo of roughly the same size. 12/10 deep breaths pupper everything's fine https://t.co/55Dqe0SJNj",
 "This is Pinot. He's a sophisticated doggo. You can tell by the hat. Also pointier than your average pupper. Still 10/10 would pet cautiously https://t.co/f2wmLZTPHd",
 'Pupper butt 1, Doggo 0. Both 12/10 https://t.co/WQvcPEpH2u',
 'RT @dog_rates: Like father (doggo), like son (pupper). Both 12/10 https://t.co/pG2inLaOda',
 'RT @dog_rates: This is just downright precious af. 12/10 for both pupper and doggo https://t.co/o5J479bZUC',
 'Meet Maggie &amp; L

In [743]:
tweet_data.head()

Unnamed: 0,contributors,coordinates,created_at,display_text_range,entities,extended_entities,favorite_count,favorited,full_text,geo,...,possibly_sensitive_appealable,quoted_status,quoted_status_id,quoted_status_id_str,retweet_count,retweeted,retweeted_status,source,truncated,user
0,,,2017-08-01 16:23:56,"[0, 85]","{'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': [], 'media': [{'id': 892420639486877696, 'id_str': '892420639486877696', 'indices': [86, 109], 'media_url': 'http://pbs.twimg.com/media...","{'media': [{'id': 892420639486877696, 'id_str': '892420639486877696', 'indices': [86, 109], 'media_url': 'http://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg', 'media_url_https': 'https://pbs.twimg.com...",39492,False,This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU,,...,0.0,,,,8842,False,,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",False,"{'id': 4196983835, 'id_str': '4196983835', 'name': 'SpookyWeRateDogs™', 'screen_name': 'dog_rates', 'location': 'MERCH↴ DM DOGS. WE WILL RATE', 'description': 'Only Legit Source for Professional ..."
1,,,2017-08-01 00:17:27,"[0, 138]","{'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': [], 'media': [{'id': 892177413194625024, 'id_str': '892177413194625024', 'indices': [139, 162], 'media_url': 'http://pbs.twimg.com/medi...","{'media': [{'id': 892177413194625024, 'id_str': '892177413194625024', 'indices': [139, 162], 'media_url': 'http://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg', 'media_url_https': 'https://pbs.twimg.co...",33786,False,"This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boops, the whole bit. 13/10 https://t.co/0Xxu71qeIV",,...,0.0,,,,6480,False,,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",False,"{'id': 4196983835, 'id_str': '4196983835', 'name': 'SpookyWeRateDogs™', 'screen_name': 'dog_rates', 'location': 'MERCH↴ DM DOGS. WE WILL RATE', 'description': 'Only Legit Source for Professional ..."
2,,,2017-07-31 00:18:03,"[0, 121]","{'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': [], 'media': [{'id': 891815175371796480, 'id_str': '891815175371796480', 'indices': [122, 145], 'media_url': 'http://pbs.twimg.com/medi...","{'media': [{'id': 891815175371796480, 'id_str': '891815175371796480', 'indices': [122, 145], 'media_url': 'http://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg', 'media_url_https': 'https://pbs.twimg.co...",25445,False,This is Archie. He is a rare Norwegian Pouncing Corgo. Lives in the tall grass. You never know when one may strike. 12/10 https://t.co/wUnZnhtVJB,,...,0.0,,,,4301,False,,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",False,"{'id': 4196983835, 'id_str': '4196983835', 'name': 'SpookyWeRateDogs™', 'screen_name': 'dog_rates', 'location': 'MERCH↴ DM DOGS. WE WILL RATE', 'description': 'Only Legit Source for Professional ..."
3,,,2017-07-30 15:58:51,"[0, 79]","{'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': [], 'media': [{'id': 891689552724799489, 'id_str': '891689552724799489', 'indices': [80, 103], 'media_url': 'http://pbs.twimg.com/media...","{'media': [{'id': 891689552724799489, 'id_str': '891689552724799489', 'indices': [80, 103], 'media_url': 'http://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg', 'media_url_https': 'https://pbs.twimg.com...",42863,False,This is Darla. She commenced a snooze mid meal. 13/10 happens to the best of us https://t.co/tD36da7qLQ,,...,0.0,,,,8925,False,,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",False,"{'id': 4196983835, 'id_str': '4196983835', 'name': 'SpookyWeRateDogs™', 'screen_name': 'dog_rates', 'location': 'MERCH↴ DM DOGS. WE WILL RATE', 'description': 'Only Legit Source for Professional ..."
4,,,2017-07-29 16:00:24,"[0, 138]","{'hashtags': [{'text': 'BarkWeek', 'indices': [129, 138]}], 'symbols': [], 'user_mentions': [], 'urls': [], 'media': [{'id': 891327551943041024, 'id_str': '891327551943041024', 'indices': [139, 16...","{'media': [{'id': 891327551943041024, 'id_str': '891327551943041024', 'indices': [139, 162], 'media_url': 'http://pbs.twimg.com/media/DF6hr6AVYAAZ8G8.jpg', 'media_url_https': 'https://pbs.twimg.co...",41016,False,"This is Franklin. He would like you to stop calling him ""cute."" He is a very fierce shark and should be respected as such. 12/10 #BarkWeek https://t.co/AtUZn91f7f",,...,0.0,,,,9721,False,,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",False,"{'id': 4196983835, 'id_str': '4196983835', 'name': 'SpookyWeRateDogs™', 'screen_name': 'dog_rates', 'location': 'MERCH↴ DM DOGS. WE WILL RATE', 'description': 'Only Legit Source for Professional ..."


In [744]:
tweet_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2352 entries, 0 to 2351
Data columns (total 31 columns):
contributors                     0 non-null float64
coordinates                      0 non-null float64
created_at                       2352 non-null datetime64[ns]
display_text_range               2352 non-null object
entities                         2352 non-null object
extended_entities                2073 non-null object
favorite_count                   2352 non-null int64
favorited                        2352 non-null bool
full_text                        2352 non-null object
geo                              0 non-null float64
id                               2352 non-null int64
id_str                           2352 non-null int64
in_reply_to_screen_name          78 non-null object
in_reply_to_status_id            78 non-null float64
in_reply_to_status_id_str        78 non-null float64
in_reply_to_user_id              78 non-null float64
in_reply_to_user_id_str          78 n

#### Quality
##### Table `twitter_WERATEDOGS` 
- Retweet Twitters should be removed from table
- Tweet ID shold be string type
- in_reply and retweet ID is float, should change to integer and then transfer to string
- time-stamp is string, should be datetime type
- content source has <ahref=xxxx, > is redendant, and column should be divided into two parts, webSource and appSource
- incorrectly choose 'a','an','such' as dog's name
- miss lot's of dog's names which are after "named xxx"
- Rating numerator value forget to take float into consider, e.g 9.75 is shown 75 in table
- Rating Denominator 2 and 0 are two wrong ratings, chage revise these two rows 
- "/r/n"exists in many text


#### Tidiness
-  In table `twitter_WERATEDOGS`, last 4 columns should be combine to one colume called "dog_stage"
-  In table `image_predictions`, rows only with p1 predeictions should be retained, other rows are redundant
-  In table `tweet_data`, this table contains lots of redundant information, extract tweet_id, favorite_count and retweet_count to form a new table 

## Cleaning

In [745]:
twitter_WERATEDOGS_clean=twitter_WERATEDOGS.copy()
image_predictions_clean=image_predictions.copy()
tweet_data_clean=tweet_data.copy()

### Tidyness

#### In table `twitter_WERATEDOGS`, last 4 columns should be combine to one colume called "dog_stage"

##### Definition
* combine those 4 columns into one column, if any one of four is not 'none', then use that name to represent this row

##### Code

In [746]:
# this is code for tidyness
twitter_WERATEDOGS_clean['dog_stage']=twitter_WERATEDOGS_clean.doggo+twitter_WERATEDOGS_clean.floofer+twitter_WERATEDOGS_clean.pupper+twitter_WERATEDOGS_clean.puppo
twitter_WERATEDOGS_clean.dog_stage=twitter_WERATEDOGS_clean.dog_stage.str.replace("None","")
twitter_WERATEDOGS_clean.dog_stage=twitter_WERATEDOGS_clean.dog_stage.replace('',np.NaN)
twitter_WERATEDOGS_clean.drop(['doggo','floofer','pupper','puppo'],axis=1,inplace=True)

##### Test

In [747]:
twitter_WERATEDOGS_clean.dog_stage.value_counts()

pupper          245
doggo            83
puppo            29
doggopupper      12
floofer           9
doggofloofer      1
doggopuppo        1
Name: dog_stage, dtype: int64

In [748]:
twitter_WERATEDOGS_clean.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,dog_stage
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU,,,,https://twitter.com/dog_rates/status/892420643555336193/photo/1,13,10,Phineas,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boops, the whole bit. 13/10 https://t.co/0Xxu71qeIV",,,,https://twitter.com/dog_rates/status/892177421306343426/photo/1,13,10,Tilly,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Archie. He is a rare Norwegian Pouncing Corgo. Lives in the tall grass. You never know when one may strike. 12/10 https://t.co/wUnZnhtVJB,,,,https://twitter.com/dog_rates/status/891815181378084864/photo/1,12,10,Archie,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Darla. She commenced a snooze mid meal. 13/10 happens to the best of us https://t.co/tD36da7qLQ,,,,https://twitter.com/dog_rates/status/891689557279858688/photo/1,13,10,Darla,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","This is Franklin. He would like you to stop calling him ""cute."" He is a very fierce shark and should be respected as such. 12/10 #BarkWeek https://t.co/AtUZn91f7f",,,,"https://twitter.com/dog_rates/status/891327558926688256/photo/1,https://twitter.com/dog_rates/status/891327558926688256/photo/1",12,10,Franklin,


#### In table image_predictions, rows only with p1 predeictions should be retained, other rows are redundant

#### Definition
use melt to transfer original columns to new one, and drop original columns

#### Code

In [749]:
#transfer p1, p2, p3 to prediction type
x1=pd.melt(image_predictions_clean,id_vars=['tweet_id','jpg_url','img_num'],value_vars=['p1','p2','p3'],var_name='prediction_type',value_name='result')

#transfer p1_conf, p2_conf, p3_conf to prediction type and value, replace p1_conf to p1...
x3=pd.melt(image_predictions_clean,id_vars=['tweet_id','jpg_url','img_num'],value_vars=['p1_dog','p2_dog','p3_dog'],var_name='prediction_type',value_name='isDog?')
x3['prediction_type']=x3['prediction_type'].replace(['p1_dog','p2_dog','p3_dog'],['p1','p2','p3'])

#transfer p1_dog, p2_dog, p3_dog to prediction type and value, replace p1_dog to p1...
x2=pd.melt(image_predictions_clean,id_vars=['tweet_id','jpg_url','img_num'],value_vars=['p1_conf','p2_conf','p3_conf'],var_name='prediction_type',value_name='confidenceLevel')
x2['prediction_type']=x2['prediction_type'].replace(['p1_conf','p2_conf','p3_conf'],['p1','p2','p3'])

#merge x1, x2, x3 to new table
t1=pd.merge(x1,x2,on=['tweet_id','jpg_url','img_num','prediction_type'])
image_predictions_clean=pd.merge(t1,x3,on=['tweet_id','jpg_url','img_num','prediction_type'])

#only retain rows the prediction_type == p1
image_predictions_clean=image_predictions_clean[image_predictions_clean.prediction_type=='p1']

##### Test

In [750]:
image_predictions_clean.sample(10)

Unnamed: 0,tweet_id,jpg_url,img_num,prediction_type,result,confidenceLevel,isDog?
530,676819651066732545,https://pbs.twimg.com/media/CWSL4W8WsAAE4KU.jpg,2,p1,rain_barrel,0.625555,False
2040,885167619883638784,https://pbs.twimg.com/media/DEi_N9qXYAAgEEw.jpg,4,p1,malamute,0.812482,True
1372,762464539388485633,https://pbs.twimg.com/media/CpTRc4DUEAAYTq6.jpg,4,p1,chow,0.999953,True
1545,792050063153438720,https://pbs.twimg.com/media/Cv3tU38WcAASFas.jpg,2,p1,komondor,0.942856,True
1447,776088319444877312,https://pbs.twimg.com/media/CsU4NKkW8AUI5eG.jpg,3,p1,web_site,0.999916,False
1184,738883359779196928,https://pbs.twimg.com/media/CkEKe3QWYAAwoDy.jpg,2,p1,Labrador_retriever,0.691137,True
1366,761672994376806400,https://pbs.twimg.com/ext_tw_video_thumb/761672828462718981/pu/img/R00UYAAWB3GtuHdI.jpg,1,p1,gondola,0.318851,False
615,680176173301628928,https://pbs.twimg.com/media/CXB4nWnWEAAhLTX.jpg,1,p1,Christmas_stocking,0.207547,False
890,699323444782047232,https://pbs.twimg.com/media/CbR-9edXIAEHJKi.jpg,1,p1,Labrador_retriever,0.309696,True
1547,792773781206999040,https://pbs.twimg.com/media/CwB_i-zXEAEiP29.jpg,1,p1,Yorkshire_terrier,0.912804,True


In [751]:
image_predictions_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2075 entries, 0 to 2074
Data columns (total 7 columns):
tweet_id           2075 non-null int64
jpg_url            2075 non-null object
img_num            2075 non-null int64
prediction_type    2075 non-null object
result             2075 non-null object
confidenceLevel    2075 non-null float64
isDog?             2075 non-null bool
dtypes: bool(1), float64(1), int64(2), object(3)
memory usage: 115.5+ KB


##### In table tweet_data, this table contains lots of redundant information, extract tweet_id, favorite_count and retweet_count to form a new table

#### Definition
create a blank list df_list, append required information for each row

#### Code

In [752]:
df_list=[]
for i in range(len(tweet_data)):
    try:
        df_list.append({'tweet_id':re.search(r'\d{18}',tweet_data['extended_entities'][i]['media'][0]['expanded_url']).group(),
                       'favorite_count':tweet_data_clean['favorite_count'][i],
                       'retweet_count':tweet_data_clean['retweet_count'][i]})
    except (RuntimeError, TypeError, NameError):
        pass

In [753]:
tweet_data_clean_new = pd.DataFrame(df_list, columns = ['tweet_id', 'favorite_count', 'retweet_count'])
tweet_data_clean_new.tweet_id=tweet_data_clean_new.tweet_id.astype('str')

#### Test

In [754]:
tweet_data_clean_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2073 entries, 0 to 2072
Data columns (total 3 columns):
tweet_id          2073 non-null object
favorite_count    2073 non-null int64
retweet_count     2073 non-null int64
dtypes: int64(2), object(1)
memory usage: 48.7+ KB


In [755]:
tweet_data_clean_new.head()

Unnamed: 0,tweet_id,favorite_count,retweet_count
0,892420643555336193,39492,8842
1,892177421306343426,33786,6480
2,891815181378084864,25445,4301
3,891689557279858688,42863,8925
4,891327558926688256,41016,9721


### Quality

#### Retweet Twitters should be removed from table

##### Definition
* Remove Twitters when 'retweeted_status_id' not null 

##### Code

In [756]:
twitter_WERATEDOGS_clean=twitter_WERATEDOGS_clean[twitter_WERATEDOGS_clean.retweeted_status_id.isnull()]
twitter_WERATEDOGS_clean = twitter_WERATEDOGS_clean.reset_index(drop=True)

In [757]:
twitter_WERATEDOGS_clean=twitter_WERATEDOGS_clean.drop(['retweeted_status_id','retweeted_status_user_id','retweeted_status_timestamp'],axis=1)

##### Code

In [758]:
twitter_WERATEDOGS_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2175 entries, 0 to 2174
Data columns (total 11 columns):
tweet_id                 2175 non-null int64
in_reply_to_status_id    78 non-null float64
in_reply_to_user_id      78 non-null float64
timestamp                2175 non-null object
source                   2175 non-null object
text                     2175 non-null object
expanded_urls            2117 non-null object
rating_numerator         2175 non-null int64
rating_denominator       2175 non-null int64
name                     2175 non-null object
dog_stage                344 non-null object
dtypes: float64(2), int64(3), object(6)
memory usage: 187.0+ KB



#### Tweet ID shold be string type"

##### Definition
* change Tweet ID type from int to object in both table

##### Code

In [759]:
twitter_WERATEDOGS_clean.tweet_id=twitter_WERATEDOGS_clean.tweet_id.astype(str)
image_predictions_clean.tweet_id=image_predictions_clean.tweet_id.astype(str)

##### Test

In [760]:
twitter_WERATEDOGS_clean.tweet_id.head()

0    892420643555336193
1    892177421306343426
2    891815181378084864
3    891689557279858688
4    891327558926688256
Name: tweet_id, dtype: object

In [761]:
image_predictions_clean.tweet_id.head()

0    666020888022790149
1    666029285002620928
2    666033412701032449
3    666044226329800704
4    666049248165822465
Name: tweet_id, dtype: object

#### in_reply ID is float

##### Definition
* change to integer and then transfer to string

##### Code

In [762]:
#in_reply_to_status_id
twitter_WERATEDOGS_clean.in_reply_to_status_id[twitter_WERATEDOGS_clean.in_reply_to_status_id.notna()]=twitter_WERATEDOGS_clean.in_reply_to_status_id[twitter_WERATEDOGS_clean.in_reply_to_status_id.notna()].astype(int).astype(str)

#in_reply_to_user_id
twitter_WERATEDOGS_clean.in_reply_to_user_id[twitter_WERATEDOGS_clean.in_reply_to_user_id.notna()]=twitter_WERATEDOGS_clean.in_reply_to_user_id[twitter_WERATEDOGS_clean.in_reply_to_user_id.notna()].astype(int).astype(str)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


##### Test

In [763]:
twitter_WERATEDOGS_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2175 entries, 0 to 2174
Data columns (total 11 columns):
tweet_id                 2175 non-null object
in_reply_to_status_id    78 non-null object
in_reply_to_user_id      78 non-null object
timestamp                2175 non-null object
source                   2175 non-null object
text                     2175 non-null object
expanded_urls            2117 non-null object
rating_numerator         2175 non-null int64
rating_denominator       2175 non-null int64
name                     2175 non-null object
dog_stage                344 non-null object
dtypes: int64(2), object(9)
memory usage: 187.0+ KB


In [764]:
twitter_WERATEDOGS_clean.sample(10)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,expanded_urls,rating_numerator,rating_denominator,name,dog_stage
1487,682697186228989953,,,2015-12-31 22:57:47 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",NAAAAAAA ZAPENYAAAAA MABADI-CHIBAWAAA 12/10 https://t.co/Ny4iM6FDtz,https://twitter.com/dog_rates/status/682697186228989953/photo/1,12,10,,
1616,677314812125323265,,,2015-12-17 02:30:09 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","Meet Tassy &amp; Bee. Tassy is pretty chill, but Bee is convinced the Ruffles are haunted. 10/10 &amp; 11/10 respectively https://t.co/fgORpmTN9C","https://twitter.com/dog_rates/status/677314812125323265/photo/1,https://twitter.com/dog_rates/status/677314812125323265/photo/1",10,10,Tassy,
233,841077006473256960,,,2017-03-13 00:02:39 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Dawn. She's just checking pup on you. Making sure you're doing okay. 12/10 she's here if you need her https://t.co/XKJrmO4fAQ,https://twitter.com/dog_rates/status/841077006473256960/photo/1,12,10,Dawn,
2011,668960084974809088,,,2015-11-24 01:11:27 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",Meet Jaycob. He got scared of the vacuum. Hide &amp; seek champ. Almost better than Kony. Solid shampoo selection. 10/10 https://t.co/952hUV6RiK,https://twitter.com/dog_rates/status/668960084974809088/photo/1,10,10,Jaycob,
1292,693993230313091072,,,2016-02-01 03:04:14 +0000,"<a href=""http://vine.co"" rel=""nofollow"">Vine - Make a Scene</a>",These lil fellas are the best of friends. 12/10 for both. 1 like = 1 friend (vid by @CassieBrookee15) https://t.co/gzRghPC61H,https://vine.co/v/i5ETazP5hrm,12,10,,
476,800751577355128832,,,2016-11-21 17:23:47 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",Say hello to Mauve and Murphy. They're rather h*ckin filthy. Preferred nap over bath. Both 12/10 https://t.co/4UwCTW3lXG,"https://twitter.com/dog_rates/status/800751577355128832/photo/1,https://twitter.com/dog_rates/status/800751577355128832/photo/1,https://twitter.com/dog_rates/status/800751577355128832/photo/1",12,10,Mauve,
6,890971913173991426,,,2017-07-28 16:27:12 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",Meet Jax. He enjoys ice cream so much he gets nervous around it. 13/10 help Jax enjoy more things by clicking below\r\n\r\nhttps://t.co/Zr4hWfAs1H https://t.co/tVJBRMnhxl,"https://gofundme.com/ydvmve-surgery-for-jax,https://twitter.com/dog_rates/status/890971913173991426/photo/1",13,10,Jax,
1559,679527802031484928,,,2015-12-23 05:03:47 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This little pupper just arrived. 11/10 would snug https://t.co/DA5aqnSGfB,https://twitter.com/dog_rates/status/679527802031484928/photo/1,11,10,,pupper
1893,671109016219725825,,,2015-11-29 23:30:32 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Toby. He asked for chocolate cake for his birthday but was given vanilla instead. 8/10 it'll be ok Toby https://t.co/sYi2G0he4H,https://twitter.com/dog_rates/status/671109016219725825/photo/1,8,10,Toby,
1647,676440007570247681,,,2015-12-14 16:34:00 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",Hope your Monday isn't too awful. Here's two baseball puppers. 11/10 for each https://t.co/dB0H9hdZai,"https://twitter.com/dog_rates/status/676440007570247681/photo/1,https://twitter.com/dog_rates/status/676440007570247681/photo/1",11,10,,


#### time-stamp is string, should be datetime type

##### Definition
* use to_datetime to change type

##### Code

In [765]:
twitter_WERATEDOGS_clean.timestamp[1]

'2017-08-01 00:17:27 +0000'

In [766]:
twitter_WERATEDOGS_clean.timestamp=pd.to_datetime(twitter_WERATEDOGS_clean.timestamp,format="%Y-%m-%d %H:%M:%S +0000")

##### Test

In [767]:
twitter_WERATEDOGS_clean.timestamp.sample(10)

1332   2016-01-25 00:26:41
960    2016-05-05 01:35:26
613    2016-09-21 01:39:11
1628   2015-12-16 00:09:23
1602   2015-12-18 03:54:25
1068   2016-03-20 20:36:28
1044   2016-03-28 01:10:13
143    2017-05-04 17:30:24
1441   2016-01-07 03:30:07
1426   2016-01-09 04:34:45
Name: timestamp, dtype: datetime64[ns]

#### content source has <ahref=xxxx, > is redendant, and column should be divided into two parts, webSource and appSource

##### Definition
* use extract method to get correct str from source and create two columns

##### Code

In [768]:
twitter_WERATEDOGS_clean.source.value_counts()

<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>     2042
<a href="http://vine.co" rel="nofollow">Vine - Make a Scene</a>                          91
<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>                       31
<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>      11
Name: source, dtype: int64

In [769]:
twitter_WERATEDOGS_clean['webSource']=twitter_WERATEDOGS_clean.source.str.extract('(http.+)\" rel',expand=True)

twitter_WERATEDOGS_clean['appSource']=twitter_WERATEDOGS_clean.source.str.extract('>(.+)<',expand=True)

twitter_WERATEDOGS_clean.drop('source',axis=1)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,text,expanded_urls,rating_numerator,rating_denominator,name,dog_stage,webSource,appSource
0,892420643555336193,,,2017-08-01 16:23:56,This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU,https://twitter.com/dog_rates/status/892420643555336193/photo/1,13,10,Phineas,,http://twitter.com/download/iphone,Twitter for iPhone
1,892177421306343426,,,2017-08-01 00:17:27,"This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boops, the whole bit. 13/10 https://t.co/0Xxu71qeIV",https://twitter.com/dog_rates/status/892177421306343426/photo/1,13,10,Tilly,,http://twitter.com/download/iphone,Twitter for iPhone
2,891815181378084864,,,2017-07-31 00:18:03,This is Archie. He is a rare Norwegian Pouncing Corgo. Lives in the tall grass. You never know when one may strike. 12/10 https://t.co/wUnZnhtVJB,https://twitter.com/dog_rates/status/891815181378084864/photo/1,12,10,Archie,,http://twitter.com/download/iphone,Twitter for iPhone
3,891689557279858688,,,2017-07-30 15:58:51,This is Darla. She commenced a snooze mid meal. 13/10 happens to the best of us https://t.co/tD36da7qLQ,https://twitter.com/dog_rates/status/891689557279858688/photo/1,13,10,Darla,,http://twitter.com/download/iphone,Twitter for iPhone
4,891327558926688256,,,2017-07-29 16:00:24,"This is Franklin. He would like you to stop calling him ""cute."" He is a very fierce shark and should be respected as such. 12/10 #BarkWeek https://t.co/AtUZn91f7f","https://twitter.com/dog_rates/status/891327558926688256/photo/1,https://twitter.com/dog_rates/status/891327558926688256/photo/1",12,10,Franklin,,http://twitter.com/download/iphone,Twitter for iPhone
5,891087950875897856,,,2017-07-29 00:08:17,Here we have a majestic great white breaching off South Africa's coast. Absolutely h*ckin breathtaking. 13/10 (IG: tucker_marlo) #BarkWeek https://t.co/kQ04fDDRmh,https://twitter.com/dog_rates/status/891087950875897856/photo/1,13,10,,,http://twitter.com/download/iphone,Twitter for iPhone
6,890971913173991426,,,2017-07-28 16:27:12,Meet Jax. He enjoys ice cream so much he gets nervous around it. 13/10 help Jax enjoy more things by clicking below\r\n\r\nhttps://t.co/Zr4hWfAs1H https://t.co/tVJBRMnhxl,"https://gofundme.com/ydvmve-surgery-for-jax,https://twitter.com/dog_rates/status/890971913173991426/photo/1",13,10,Jax,,http://twitter.com/download/iphone,Twitter for iPhone
7,890729181411237888,,,2017-07-28 00:22:40,When you watch your owner call another dog a good boy but then they turn back to you and say you're a great boy. 13/10 https://t.co/v0nONBcwxq,"https://twitter.com/dog_rates/status/890729181411237888/photo/1,https://twitter.com/dog_rates/status/890729181411237888/photo/1",13,10,,,http://twitter.com/download/iphone,Twitter for iPhone
8,890609185150312448,,,2017-07-27 16:25:51,This is Zoey. She doesn't want to be one of the scary sharks. Just wants to be a snuggly pettable boatpet. 13/10 #BarkWeek https://t.co/9TwLuAGH0b,https://twitter.com/dog_rates/status/890609185150312448/photo/1,13,10,Zoey,,http://twitter.com/download/iphone,Twitter for iPhone
9,890240255349198849,,,2017-07-26 15:59:51,This is Cassie. She is a college pup. Studying international doggo communication and stick theory. 14/10 so elegant much sophisticate https://t.co/t1bfwz5S2A,https://twitter.com/dog_rates/status/890240255349198849/photo/1,14,10,Cassie,doggo,http://twitter.com/download/iphone,Twitter for iPhone


##### Test

In [770]:
twitter_WERATEDOGS_clean.webSource.value_counts()

http://twitter.com/download/iphone              2042
http://vine.co                                    91
http://twitter.com                                31
https://about.twitter.com/products/tweetdeck      11
Name: webSource, dtype: int64

In [771]:
twitter_WERATEDOGS_clean.appSource.value_counts()

Twitter for iPhone     2042
Vine - Make a Scene      91
Twitter Web Client       31
TweetDeck                11
Name: appSource, dtype: int64

#### incorrectly choose 'a','an','such' as dog's name

##### Definition
* replace 'a', 'an', 'such' as NaN

##### Code

In [772]:
twitter_WERATEDOGS_clean.name=twitter_WERATEDOGS_clean.name.replace(['a','an','such','None','the'],[np.nan,np.nan,np.nan,np.nan,np.nan])

##### Test

In [773]:
twitter_WERATEDOGS_clean.name.value_counts()

Charlie         11
Lucy            11
Oliver          10
Cooper          10
Penny            9
Tucker           9
Winston          8
Sadie            8
Lola             8
Daisy            7
Toby             7
Bailey           6
Stanley          6
Bella            6
Koda             6
Oscar            6
Jax              6
Bo               6
Bentley          5
Buddy            5
Chester          5
Dave             5
Scout            5
Louis            5
Leo              5
Milo             5
Rusty            5
Dexter           4
Winnie           4
Gus              4
                ..
Jazz             1
Willow           1
light            1
Yoda             1
Pip              1
Derby            1
Ed               1
Shakespeare      1
Cal              1
Butters          1
Ashleigh         1
Wesley           1
Wishes           1
Bookstore        1
Suki             1
Murphy           1
Karma            1
Tove             1
Birf             1
Astrid           1
Goliath          1
unacceptable

#### miss lot's of dog's names which are after "named xxx"

##### Definition
* use to_datetime to change type

##### Code

In [774]:
twitter_WERATEDOGS_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2175 entries, 0 to 2174
Data columns (total 13 columns):
tweet_id                 2175 non-null object
in_reply_to_status_id    78 non-null object
in_reply_to_user_id      78 non-null object
timestamp                2175 non-null datetime64[ns]
source                   2175 non-null object
text                     2175 non-null object
expanded_urls            2117 non-null object
rating_numerator         2175 non-null int64
rating_denominator       2175 non-null int64
name                     1425 non-null object
dog_stage                344 non-null object
webSource                2175 non-null object
appSource                2175 non-null object
dtypes: datetime64[ns](1), int64(2), object(10)
memory usage: 221.0+ KB


In [775]:
#extract name after named, and assign this series to sereise called 'newName'
newName=twitter_WERATEDOGS_clean.text.str.extract('[nN]amed ([a-zA-Z]+)\.',expand=True)[0]

#replace name in twitter_WERATEDOGS_clean if newName exist
for x in range(len(newName)):
    if pd.notnull(newName[x]):
        twitter_WERATEDOGS_clean.loc[x,'name']=newName.loc[x]

##### Test

In [776]:
newName[newName.notnull()]

1674        Wylie
1776          Kip
1887        Rufus
1937        Spork
1946     Cherokee
1949        Hemry
1967      Alphred
1982      Alfredo
1987         Zeus
2012        Leroi
2025        Berta
2039         Chuk
2048         Guss
2056      Alfonso
2070       Cheryl
2076      Jessiga
2083        Klint
2088      Tickles
2092         Kohl
2123         Pepe
2130    Octaviath
2133         Johm
Name: 0, dtype: object

In [779]:
twitter_WERATEDOGS_clean.name[1674]

'Wylie'

In [780]:
twitter_WERATEDOGS_clean.name[2133]

'Johm'

#### Rating numerator value forget to take float into consider, e.g 9.75 is shown 75 in table

##### Definition
* use to_datetime to change type

##### Code

In [None]:
twitter_WERATEDOGS_clean.rating_numerator=twitter_WERATEDOGS_clean.text.str.extract('(\d+\.?\d*)/\d',expand=True)[0].astype(float)

##### Test

In [None]:
twitter_WERATEDOGS_clean.rating_numerator.value_counts()

#### Rating Denominator 2 and 0 are two wrong ratings, revise these two rows

##### Definition
* Based on text of these two rows, change rating numerator and denminator to correct value

##### Code

In [None]:
list(twitter_WERATEDOGS_clean[twitter_WERATEDOGS_clean.rating_denominator==0].text)

In [None]:
twitter_WERATEDOGS_clean.rating_denominator=twitter_WERATEDOGS_clean.rating_denominator.replace(0,10)
twitter_WERATEDOGS_clean.rating_numerator=twitter_WERATEDOGS_clean.rating_numerator.replace(960,13)

In [None]:
list(twitter_WERATEDOGS_clean[twitter_WERATEDOGS_clean.rating_denominator==2].text)

In [None]:
twitter_WERATEDOGS_clean[twitter_WERATEDOGS_clean.rating_denominator==2]

In [None]:
twitter_WERATEDOGS_clean.loc[twitter_WERATEDOGS_clean.rating_denominator==2,'rating_numerator']=9
twitter_WERATEDOGS_clean.loc[twitter_WERATEDOGS_clean.rating_denominator==2,'rating_denominator']=10

##### Test

In [None]:
twitter_WERATEDOGS_clean.rating_denominator.value_counts()

#### "\r\n"exists in many text

##### Definition
* replace \r\n with " "

##### Code

In [None]:
twitter_WERATEDOGS_clean.text=twitter_WERATEDOGS_clean.text.str.replace('\r\n',' ')

##### Test

In [None]:
twitter_WERATEDOGS_clean.text.str.contains('\r\n').value_counts()

In [None]:
tweet_data.head()

In [None]:
twitter_WERATEDOGS_clean.info()

In [None]:
image_predictions_clean.info()

In [None]:
tweet_data_clean_new.info()

## Gathering2

In [None]:
# create a new table called image_predictions_clean2, contains tweet_id, prediction_type, 
# result, conf, isDog 5 columns and predictiontype is p1
image_predictions_clean2=image_predictions_clean[image_predictions_clean.prediction_type=='p1']

In [None]:
image_predictions_clean2=image_predictions_clean2.drop(['jpg_url','img_num'],axis=1)

## Accessing2

In [None]:
tweet_data_clean_new[tweet_data_clean_new.tweet_id.duplicated(keep=False)].sort_values(by=['tweet_id'])

In [None]:
tweet_data_clean_new[tweet_data_clean_new.tweet_id.duplicated(keep=False)].favorite_count.value_counts()

In [None]:
tweet_data_clean_new.describe()

In [None]:
tweet_data_clean_new.favorite_count.value_counts().sort_index()

In [None]:
tweet_data_clean_new.retweet_count.value_counts().sort_index()

#### Quality
##### Table `tweet_data_clean_new` 
- 128 Duplicated tweet_id, and favourite_count of half rows are 0 
- favourite_count in 79 rows are 0 


#### Tidiness
-  These three table should be combiend to one new table by tweet_id

## Cleaning 2

### Quality

##### 128 Duplicated tweet_id, and favourite_count of half rows are 0
##### favourite_count in 79 rows are 0

#### Definition
drop all rows with favorite_count == 0
#### Code

In [None]:
tweet_data_clean_new=tweet_data_clean_new[tweet_data_clean_new.favorite_count != 0]

#### Test

In [None]:
tweet_data_clean_new.tweet_id.value_counts()

## Tidyness

##### These three table should be combiend to one new table by tweet_id

#### Definition
merge tweet_data_clean_new and image_predictions_clean2 to twitter_WERATEDOGS_clean

#### Code

In [None]:
twitter_archive_master=pd.merge(twitter_WERATEDOGS_clean,tweet_data_clean_new,on='tweet_id',how='left')
twitter_archive_master=pd.merge(twitter_archive_master,image_predictions_clean2,on='tweet_id',how='left')

In [None]:
twitter_archive_master.to_csv('twitter_archive_master.csv',index=False)

#### Test

In [None]:
twitter_archive_master.head()

In [None]:
twitter_archive_master.info()

# Data Analysis

In [None]:
twitter_archive_master['rating']=twitter_archive_master.rating_numerator/twitter_archive_master.rating_denominator

#### Compare ratings of dogs and non-dogs

In [None]:
twitter_archive_master[twitter_archive_master['isDog?']==True].rating.describe()

In [None]:
twitter_archive_master[twitter_archive_master['isDog?']==False].rating.describe()

- H0: ratings are inrelated to dogs or not, $\mu_1 = \mu_2$</p>
- H1: ratings are related to dogs or not, $\mu_1 \neq \mu_2$</p>

In [None]:
rating_dog=list(twitter_archive_master[twitter_archive_master['isDog?']==True].rating)
rating_nodog=list(twitter_archive_master[twitter_archive_master['isDog?']==False].rating)

In [None]:
# Calculate the T-test for means of two independant samples
[t_Ind,ptwo_Ind]=stats.ttest_ind(rating_dog,rating_nodog, equal_var = False)
# Show t-statistic value
t_Ind

In [None]:
ptwo_Ind

if we assume significance level, $\alpha=0.05$

ptwo_Ind > 0.05, so we <strong>cannot reject H0 assumption</strong>

#### Compare ratings of different type of dogs

In [None]:
# Only consider data with confidenceLevel higher than 0.9
twitter_dog=twitter_archive_master[(twitter_archive_master['isDog?']==True) & (twitter_archive_master.confidenceLevel>0.9)]
twitter_dog.result.value_counts()

#####  consider top 5 types

In [None]:
rating_dogType={}
rating_golden_retriever=twitter_dog[twitter_dog.result=='golden_retriever'].rating
rating_Pembroke =twitter_dog[twitter_dog.result=='Pembroke'].rating
rating_pug=twitter_dog[twitter_dog.result=='pug'].rating
rating_Samoyed=twitter_dog[twitter_dog.result=='Samoyed'].rating
rating_Labrador_retriever=twitter_dog[twitter_dog.result=='Labrador_retriever'].rating

In [None]:
rating_golden_retriever.describe()

In [None]:
rating_Pembroke.describe()

In [None]:
rating_pug.describe()

In [None]:
rating_Samoyed.describe()

In [None]:
rating_Labrador_retriever.describe()

In [None]:
plt.figure(figsize=(16,10))
plt.subplot(3,2,1)
plt.hist(rating_golden_retriever, 6, facecolor='green', alpha=0.75)
plt.title('golden_retriever')
plt.subplot(3,2,2)
plt.hist(rating_Pembroke, 6, facecolor='green', alpha=0.75)
plt.title('Pembroke')
plt.subplot(3,2,3)
plt.hist(rating_pug, 6, facecolor='green', alpha=0.75)
plt.title('pug')
plt.subplot(3,2,4)
plt.hist(rating_Samoyed, 6, facecolor='green', alpha=0.75)
plt.title('Samoyed')
plt.subplot(3,2,5)
plt.hist(rating_Labrador_retriever, 6, facecolor='green', alpha=0.75)
plt.title('Labrador_retriever')

- H0: ratings of five dogs are same. $\mu_1 = \mu_2 = \mu_3 = \mu_4 = \mu_5$</p>
- H1: not all ratings are same</p>

In [None]:
# f-test for all five group
stats.f_oneway(rating_golden_retriever,rating_Pembroke,rating_pug,rating_Samoyed,rating_Labrador_retriever)

- H0: ratings of five dogs are same. $\mu_1 = \mu_2 = \mu_4 = \mu_5$</p>
- H1: not all ratings are same</p>

In [None]:
# f-test for 4 groups, except pug
stats.f_oneway(rating_golden_retriever,rating_Pembroke,rating_Samoyed,rating_Labrador_retriever)

##### We can find that ratings of golden_retiever, Pembroke, Samoyed, Labrador_retriever  has no statistics difference, but Pug's rating is lower than other 4 popular dog types

In [None]:
data_ratings=list(twitter_archive_master.rating[twitter_archive_master.favorite_count.notnull()])
data_favorite_count=list(twitter_archive_master.favorite_count[twitter_archive_master.favorite_count.notnull()])
data_retweet_count=list(twitter_archive_master.retweet_count[twitter_archive_master.favorite_count.notnull()])
data=pd.DataFrame({'ratings':data_ratings,'favorite_count':data_favorite_count,'retweet_count':data_retweet_count})

In [None]:
# The parameter 'c' is passed to plt.scatter and will control the color
plotting.scatter_matrix(data, marker='o',figsize=(16, 8))

fig = plt.gcf()
#fig.suptitle("blue: setosa, green: versicolor, red: virginica", size=13)

In [None]:
data.sort_values(by=['ratings'],ascending=False)

#### three weird rating value, 177.6, 42 and 3.43, thsethree rows should be removed

In [None]:
data=data[data.ratings<3]

In [None]:
data.describe()

In [None]:
# The parameter 'c' is passed to plt.scatter and will control the color)
plotting.scatter_matrix(data, marker='o',figsize=(16, 8))
fig = plt.gcf()


In [None]:
## Without a constant

import statsmodels.api as sm

X =data["favorite_count"]
y = data["retweet_count"]
X = sm.add_constant(X) ## let's add an intercept (beta_0) to our model
# Note the difference in argument order

model = sm.OLS(y, X).fit() ## sm.OLS(output, input)
predictions = model.predict(X)

# Print out the statistics
model.summary()