# Project: Wrangling and Analyze Data

## Data Gathering



In [1]:
# import all Necessary  packages for this project
import requests
import tweepy
import pandas as pd
import numpy as np
import json
from tweepy import OAuthHandler
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

1. Directly download the WeRateDogs Twitter archive data (twitter_archive_enhanced.csv)

In [2]:
# read the data WeRateDogs into a pandas DataFrame.
df_1=pd.read_csv("twitter-archive-enhanced.csv")


2. Use the Requests library to download the tweet image prediction (image_predictions.tsv)

In [3]:
# Download the tweet image prediction via resquests library
r=requests.get("https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv")

# create file image prediction.tsv and write 
with open("image_predictions.tsv", mode="wb") as image:
    image.write(r.content)

# read the data image prediction into a pandas DataFrame.    
df_2=pd.read_csv("image_predictions.tsv",sep='\t')


3. Use the Tweepy library to query additional data via the Twitter API (tweet_json.txt)

In [4]:
# Query Twitter API for each tweet in the Twitter archive and save JSON in a text file
# These are hidden(---) to comply with Twitter's API terms and conditions
consumer_key = '---'
consumer_secret = '---'
access_token = '---'
access_token_secret = '---'


auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth,wait_on_rate_limit=True)

# List of dictionaries to build and convert to a DataFrame later
df_list=[]

# list of tweet_id in the WeRateDogs
tweet_id=list(df_1.tweet_id.values)

# fails connections
fails_dict = {}

with open("tweet_json.txt", mode="w") as f:
    for i in range(len(tweet_id)):
        try:
            tweet=api.get_status(tweet_id[i], tweet_mode='extended')
            f.write(json.dumps(tweet._json))
            retweet_count=tweet._json["retweet_count"]
            favorite_count=tweet._json['favorite_count']
            #storing all data in list
            df_list.append({'tweet_id': tweet_id[i],'retweet_count': retweet_count,'favorite_count': favorite_count})
        except tweepy.TweepyException as e:
            fails_dict[tweet_id[i]]=e
            pass
       
    
    
df_reteweet_and_favorite= pd.DataFrame(df_list, columns = ['tweet_id', 'retweet_count', 'favorite_count']) 
   

Rate limit reached. Sleeping for: 461
Rate limit reached. Sleeping for: 366


In [5]:
# number of sucess and fails row Respectively 
df_reteweet_and_favorite.shape[0],len(fails_dict)

(2328, 28)

## Assessing Data

#### I. tweet image prediction data

In [44]:
# display first few lines of tweet image prediction data
df_2.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


In [45]:
# display last few lines of tweet image prediction data
df_2.tail()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
2070,891327558926688256,https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg,2,basset,0.555712,True,English_springer,0.22577,True,German_short-haired_pointer,0.175219,True
2071,891689557279858688,https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg,1,paper_towel,0.170278,False,Labrador_retriever,0.168086,True,spatula,0.040836,False
2072,891815181378084864,https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg,1,Chihuahua,0.716012,True,malamute,0.078253,True,kelpie,0.031379,True
2073,892177421306343426,https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,1,Chihuahua,0.323581,True,Pekinese,0.090647,True,papillon,0.068957,True
2074,892420643555336193,https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg,1,orange,0.097049,False,bagel,0.085851,False,banana,0.07611,False


In [249]:
# display info of tweet image prediction 
df_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   int64  
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [265]:
# Incorrect image predictions : return nnumber of each columns
df_2.query('(p1_dog == False) & (p2_dog == False) & (p3_dog == False)').count()

tweet_id    324
jpg_url     324
img_num     324
p1          324
p1_conf     324
p1_dog      324
p2          324
p2_conf     324
p2_dog      324
p3          324
p3_conf     324
p3_dog      324
dtype: int64

In [24]:
# return number of duplicated rows
df_2.duplicated().sum()

0

In [170]:
# return number of missing values rows for each columns
df_2.isna().sum()

tweet_id    0
jpg_url     0
img_num     0
p1          0
p1_conf     0
p1_dog      0
p2          0
p2_conf     0
p2_dog      0
p3          0
p3_conf     0
p3_dog      0
dtype: int64

In [172]:
# generate statistics
df_2.describe()

Unnamed: 0,tweet_id,img_num,p1_conf,p2_conf,p3_conf
count,2075.0,2075.0,2075.0,2075.0,2075.0
mean,7.384514e+17,1.203855,0.594548,0.1345886,0.06032417
std,6.785203e+16,0.561875,0.271174,0.1006657,0.05090593
min,6.660209e+17,1.0,0.044333,1.0113e-08,1.74017e-10
25%,6.764835e+17,1.0,0.364412,0.05388625,0.0162224
50%,7.119988e+17,1.0,0.58823,0.118181,0.0494438
75%,7.932034e+17,1.0,0.843855,0.1955655,0.09180755
max,8.924206e+17,4.0,1.0,0.488014,0.273419


### II. Reteweet and favorite table

In [46]:
# Display first few lines of retweet_and_favorite table
df_reteweet_and_favorite.head()

Unnamed: 0,tweet_id,retweet_count,favorite_count
0,892420643555336193,7106,34224
1,892177421306343426,5352,29678
2,891815181378084864,3523,22354
3,891689557279858688,7306,37403
4,891327558926688256,7851,35748


In [47]:
# display last few lines of retweet_and_favorite table
df_reteweet_and_favorite.tail()

Unnamed: 0,tweet_id,retweet_count,favorite_count
2323,666049248165822465,37,91
2324,666044226329800704,117,251
2325,666033412701032449,36,101
2326,666029285002620928,39,115
2327,666020888022790149,428,2308


In [27]:
# display info of retweet_and_favorite table
df_reteweet_and_favorite.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2328 entries, 0 to 2327
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   tweet_id        2328 non-null   int64
 1   retweet_count   2328 non-null   int64
 2   favorite_count  2328 non-null   int64
dtypes: int64(3)
memory usage: 54.7 KB


In [29]:
# return number of missing values rows 
df_reteweet_and_favorite.duplicated().sum()

0

In [173]:
# return number of duplicated rows 
df_reteweet_and_favorite.duplicated().sum()

0

In [174]:
# generate statistics.
df_reteweet_and_favorite.describe()

Unnamed: 0,tweet_id,retweet_count,favorite_count
count,2328.0,2328.0,2328.0
mean,7.418403e+17,2504.904639,7141.517182
std,6.823143e+16,4242.311425,11095.857572
min,6.660209e+17,1.0,0.0
25%,6.781974e+17,503.0,1237.75
50%,7.180382e+17,1165.0,3086.5
75%,7.986673e+17,2898.0,8715.5
max,8.924206e+17,71778.0,146591.0


### III. Twitter archive table

In [1065]:
# display first few lines of data WeRateDogs
df_1.head(3)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,


In [1066]:
# display last few lines of data WeRateDogs
df_1.tail(3)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
2353,666033412701032449,,,2015-11-15 23:21:54 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here is a very happy pup. Big fan of well-main...,,,,https://twitter.com/dog_rates/status/666033412...,9,10,a,,,,
2354,666029285002620928,,,2015-11-15 23:05:30 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is a western brown Mitsubishi terrier. Up...,,,,https://twitter.com/dog_rates/status/666029285...,7,10,a,,,,
2355,666020888022790149,,,2015-11-15 22:32:08 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here we have a Japanese Irish Setter. Lost eye...,,,,https://twitter.com/dog_rates/status/666020888...,8,10,,,,,


In [330]:
# display info of data WeRateDogs
df_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

In [219]:
#return number of duplicated rows
df_1.duplicated().sum()

0

In [257]:
df_1[df_1["retweeted_status_user_id"].isnull()==False].duplicated(subset="retweeted_status_user_id").sum()

156

In [261]:
df_1[df_1["in_reply_to_status_id"].isnull()==False].duplicated(subset="in_reply_to_status_id").sum()

1

In [337]:
#return number of missing values for each columns
df_1.isna().sum()

tweet_id                         0
in_reply_to_status_id         2278
in_reply_to_user_id           2278
timestamp                        0
source                           0
text                             0
retweeted_status_id           2175
retweeted_status_user_id      2175
retweeted_status_timestamp    2175
expanded_urls                   59
rating_numerator                 0
rating_denominator               0
name                             0
doggo                            0
floofer                          0
pupper                           0
puppo                            0
dtype: int64

In [339]:
# source display url
df_1.source.value_counts()

<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>     2221
<a href="http://vine.co" rel="nofollow">Vine - Make a Scene</a>                          91
<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>                       33
<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>      11
Name: source, dtype: int64

In [697]:
# show first few dataframe with rating_denominator are not 10
df_1.query("rating_denominator!=10").head(3)


Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
313,835246439529840640,8.35246e+17,26259576.0,2017-02-24 21:54:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@jonnysun @Lin_Manuel ok jomny I know you're e...,,,,,960,0,,,,,
342,832088576586297345,8.320875e+17,30582082.0,2017-02-16 04:45:50 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@docmisterio account started on 11/15/15,,,,,11,15,,,,,
433,820690176645140481,,,2017-01-15 17:52:40 +0000,"<a href=""http://twitter.com/download/iphone"" r...",The floofs have been released I repeat the flo...,,,,https://twitter.com/dog_rates/status/820690176...,84,70,,,,,


### Quality issues

<strong> A. Image prediction data table : </strong>

1. Many predictions contains three "False" in image prediction data, this is useless prediction data 

<strong> B. Retweet count and favorite count table : </strong> No quality issues

<strong> C. Twitter archive data table :</strong>

2. redundant retweets rows

3. redundant in_reply_to_users rows

3. Source displays url
   
4. Missing values (expanded_urls,in_reply_to_status_id, retweeted_status_id, etc..)

5. Wrong data types (timestamp)

6. Some rating_denominator are not 10

7. Incorrect dog names ("a" or "an") instead of None






### Tidiness issues
1. Multiples columns Dogs stages (doggo, floofer, pupper, and puppo). We only need one dog stage columns

2. retweet_count and favorite_count columns are not in the twitter archive dataframe

## Cleaning Data

In [1067]:
#  Make copies of original pieces of data
df_img_predict=df_2.copy() 
df_retweet_favorite= df_reteweet_and_favorite.copy()
df_twitter_archive=df_1.copy()

### Issue #1: There are not valid images predictions

### define 
Drop all rows with not valid images predictions who contains three "False" in image prediction data

### code

In [1069]:

index_img=list(df_img_predict.query('(p1_dog == False) & (p2_dog == False) & (p3_dog == False)').index)
df_img_predict.drop(axis=0, index=index_img, inplace=True)

### Test

In [1070]:
# confirm change
df_img_predict.query('(p1_dog == False) & (p2_dog == False) & (p3_dog == False)')

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog


### Issue #2:  redundant retweets rows,  in_reply_to_users and missing values 

### definie  
Find index and Remove the retweets and in reply rows. 
Drop all unnecessary columns with missing values

### code

In [1071]:
rep_index=df_1[df_1["in_reply_to_status_id"].isnull()==False].index
ret_index=df_twitter_archive[df_twitter_archive["retweeted_status_id"].isnull()==False].index

df_twitter_archive.drop(axis=0,index=ret_index, inplace=True)
df_twitter_archive.drop(axis=0,index=rep_index, inplace=True)

In [1072]:
columns=['in_reply_to_status_id', 'in_reply_to_user_id', 'retweeted_status_id', 'retweeted_status_user_id',
       'retweeted_status_timestamp', 'expanded_urls']
df_twitter_archive.dropna(axis="columns",inplace=True)

### Test

In [1073]:
# confirm change
df_twitter_archive.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2097 entries, 0 to 2355
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   tweet_id            2097 non-null   int64 
 1   timestamp           2097 non-null   object
 2   source              2097 non-null   object
 3   text                2097 non-null   object
 4   rating_numerator    2097 non-null   int64 
 5   rating_denominator  2097 non-null   int64 
 6   name                2097 non-null   object
 7   doggo               2097 non-null   object
 8   floofer             2097 non-null   object
 9   pupper              2097 non-null   object
 10  puppo               2097 non-null   object
dtypes: int64(3), object(8)
memory usage: 196.6+ KB


### Issue #3:  Wrong data type

### define
Convert Timestamp column to datetime.

### code

In [1074]:
df_twitter_archive['timestamp']=pd.to_datetime(df_twitter_archive.timestamp)

### Test

In [1075]:
df_twitter_archive.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2097 entries, 0 to 2355
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   tweet_id            2097 non-null   int64              
 1   timestamp           2097 non-null   datetime64[ns, UTC]
 2   source              2097 non-null   object             
 3   text                2097 non-null   object             
 4   rating_numerator    2097 non-null   int64              
 5   rating_denominator  2097 non-null   int64              
 6   name                2097 non-null   object             
 7   doggo               2097 non-null   object             
 8   floofer             2097 non-null   object             
 9   pupper              2097 non-null   object             
 10  puppo               2097 non-null   object             
dtypes: datetime64[ns, UTC](1), int64(3), object(7)
memory usage: 196.6+ KB


### Issue #4: Dogs stages have multiples columns [doggo, floofer, pupper, and puppo]

#### Define:  Melt colums (doggo, floofer, pupper, and puppo) into one dog stage

### code and test

In [1076]:
df_twitter_archive['Stage'] = df_twitter_archive.doggo + df_twitter_archive.floofer + df_twitter_archive.pupper + df_twitter_archive.puppo
col=["doggo", "floofer", "pupper", "puppo"]
df_twitter_archive.drop(col,axis=1, inplace=True)


In [1077]:
df_twitter_archive.Stage.value_counts()

NoneNoneNoneNone        1761
NoneNonepupperNone       221
doggoNoneNoneNone         72
NoneNoneNonepuppo         23
NoneflooferNoneNone        9
doggoNonepupperNone        9
doggoNoneNonepuppo         1
doggoflooferNoneNone       1
Name: Stage, dtype: int64

In [1078]:
df_twitter_archive['Stage'] = df_twitter_archive['Stage'].apply(lambda x: x.replace("None",""))
df_twitter_archive.Stage.value_counts()

                1761
pupper           221
doggo             72
puppo             23
floofer            9
doggopupper        9
doggopuppo         1
doggofloofer       1
Name: Stage, dtype: int64

In [1079]:
for index in df_twitter_archive[df_twitter_archive.Stage.isin(["doggopupper"])].index:
    df_twitter_archive.Stage[index]="doggo,pupper"
    
for index in df_twitter_archive[df_twitter_archive.Stage.isin(["doggopuppo"])].index:
    df_twitter_archive.Stage[index]="doggo,puppo"   

for index in df_twitter_archive[df_twitter_archive.Stage.isin(["doggofloofer"])].index:
    df_twitter_archive.Stage[index]="doggo,floofer"
        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_twitter_archive.Stage[index]="doggo,pupper"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_twitter_archive.Stage[index]="doggo,puppo"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_twitter_archive.Stage[index]="doggo,floofer"


In [1080]:
df_twitter_archive.Stage.value_counts()

                 1761
pupper            221
doggo              72
puppo              23
floofer             9
doggo,pupper        9
doggo,puppo         1
doggo,floofer       1
Name: Stage, dtype: int64

### Issue #5: Source display url

#### Define: 
Return source without url 

### code 

In [1081]:
df_twitter_archive["source"]=df_twitter_archive.source.apply(lambda x:x.split(">")[-2].split("<")[0])

### Test

In [1082]:
df_twitter_archive.head()

Unnamed: 0,tweet_id,timestamp,source,text,rating_numerator,rating_denominator,name,Stage
0,892420643555336193,2017-08-01 16:23:56+00:00,Twitter for iPhone,This is Phineas. He's a mystical boy. Only eve...,13,10,Phineas,
1,892177421306343426,2017-08-01 00:17:27+00:00,Twitter for iPhone,This is Tilly. She's just checking pup on you....,13,10,Tilly,
2,891815181378084864,2017-07-31 00:18:03+00:00,Twitter for iPhone,This is Archie. He is a rare Norwegian Pouncin...,12,10,Archie,
3,891689557279858688,2017-07-30 15:58:51+00:00,Twitter for iPhone,This is Darla. She commenced a snooze mid meal...,13,10,Darla,
4,891327558926688256,2017-07-29 16:00:24+00:00,Twitter for iPhone,This is Franklin. He would like you to stop ca...,12,10,Franklin,


### Issue #6:  Some rating_denominator are not 10

#### Define: 
Drop all rows with rating_denominator different by 10. Because the rating denominator is almost always have a denominator of 10  .

### code 

In [1083]:
deno_index=df_twitter_archive.query("rating_denominator!=10").index
df_twitter_archive.drop(axis=0,index=deno_index,inplace=True)


### Test 

In [1084]:
# confirm change
df_twitter_archive[df_twitter_archive["rating_denominator"]!=10]

Unnamed: 0,tweet_id,timestamp,source,text,rating_numerator,rating_denominator,name,Stage


### Issue #7: Incorrect dog names ("a" or "an") instead of None

### define

### code

In [1085]:
for index in df_twitter_archive[df_twitter_archive.name.isin(["an","a"])].index :
    df_twitter_archive.name[index]="None"
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_twitter_archive.name[index]="None"


### Test

In [1086]:
df_twitter_archive.query('name == "a" or name == "an"')

Unnamed: 0,tweet_id,timestamp,source,text,rating_numerator,rating_denominator,name,Stage


### Issue #8: retweet_count and favorite_count columns are not in the twitter archive dataframe

### define 
Merge retweet_favorite_count with twitter archive dataframe

### code

In [1087]:
df_tweet =df_retweet_favorite.merge(df_twitter_archive, left_on='tweet_id', right_on='tweet_id')

### test

In [1088]:
df_tweet.head()

Unnamed: 0,tweet_id,retweet_count,favorite_count,timestamp,source,text,rating_numerator,rating_denominator,name,Stage
0,892420643555336193,7106,34224,2017-08-01 16:23:56+00:00,Twitter for iPhone,This is Phineas. He's a mystical boy. Only eve...,13,10,Phineas,
1,892177421306343426,5352,29678,2017-08-01 00:17:27+00:00,Twitter for iPhone,This is Tilly. She's just checking pup on you....,13,10,Tilly,
2,891815181378084864,3523,22354,2017-07-31 00:18:03+00:00,Twitter for iPhone,This is Archie. He is a rare Norwegian Pouncin...,12,10,Archie,
3,891689557279858688,7306,37403,2017-07-30 15:58:51+00:00,Twitter for iPhone,This is Darla. She commenced a snooze mid meal...,13,10,Darla,
4,891327558926688256,7851,35748,2017-07-29 16:00:24+00:00,Twitter for iPhone,This is Franklin. He would like you to stop ca...,12,10,Franklin,


In [1089]:
df_tweet.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2072 entries, 0 to 2071
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   tweet_id            2072 non-null   int64              
 1   retweet_count       2072 non-null   int64              
 2   favorite_count      2072 non-null   int64              
 3   timestamp           2072 non-null   datetime64[ns, UTC]
 4   source              2072 non-null   object             
 5   text                2072 non-null   object             
 6   rating_numerator    2072 non-null   int64              
 7   rating_denominator  2072 non-null   int64              
 8   name                2072 non-null   object             
 9   Stage               2072 non-null   object             
dtypes: datetime64[ns, UTC](1), int64(5), object(4)
memory usage: 178.1+ KB


## Storing Data


In [1090]:
# save dataframe to csv file
df_tweet.to_csv("twitter_archive_master.csv", index=False)
df_img_predict.to_csv("image_prediction_wrangled.csv", index=False)



In [1091]:
#Print a concise summary of a DataFrame.
df_tweet.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2072 entries, 0 to 2071
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   tweet_id            2072 non-null   int64              
 1   retweet_count       2072 non-null   int64              
 2   favorite_count      2072 non-null   int64              
 3   timestamp           2072 non-null   datetime64[ns, UTC]
 4   source              2072 non-null   object             
 5   text                2072 non-null   object             
 6   rating_numerator    2072 non-null   int64              
 7   rating_denominator  2072 non-null   int64              
 8   name                2072 non-null   object             
 9   Stage               2072 non-null   object             
dtypes: datetime64[ns, UTC](1), int64(5), object(4)
memory usage: 178.1+ KB


In [1092]:
#Print a concise summary of a DataFrame.
df_img_predict.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1751 entries, 0 to 2073
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  1751 non-null   int64  
 1   jpg_url   1751 non-null   object 
 2   img_num   1751 non-null   int64  
 3   p1        1751 non-null   object 
 4   p1_conf   1751 non-null   float64
 5   p1_dog    1751 non-null   bool   
 6   p2        1751 non-null   object 
 7   p2_conf   1751 non-null   float64
 8   p2_dog    1751 non-null   bool   
 9   p3        1751 non-null   object 
 10  p3_conf   1751 non-null   float64
 11  p3_dog    1751 non-null   bool   
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 141.9+ KB


In [1103]:
df_twitter=pd.read_csv("twitter_archive_master.csv")

df_twitter.head()

Unnamed: 0,tweet_id,retweet_count,favorite_count,timestamp,source,text,rating_numerator,rating_denominator,name,Stage
0,892420643555336193,7106,34224,2017-08-01 16:23:56+00:00,Twitter for iPhone,This is Phineas. He's a mystical boy. Only eve...,13,10,Phineas,
1,892177421306343426,5352,29678,2017-08-01 00:17:27+00:00,Twitter for iPhone,This is Tilly. She's just checking pup on you....,13,10,Tilly,
2,891815181378084864,3523,22354,2017-07-31 00:18:03+00:00,Twitter for iPhone,This is Archie. He is a rare Norwegian Pouncin...,12,10,Archie,
3,891689557279858688,7306,37403,2017-07-30 15:58:51+00:00,Twitter for iPhone,This is Darla. She commenced a snooze mid meal...,13,10,Darla,
4,891327558926688256,7851,35748,2017-07-29 16:00:24+00:00,Twitter for iPhone,This is Franklin. He would like you to stop ca...,12,10,Franklin,


In [1105]:
df_img_pred=pd.read_csv("image_prediction_wrangled.csv")

df_img_pred.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True
