In [1]:
import numpy as np
import pandas as pd
import tweepy
%matplotlib inline
from matplotlib import pyplot as plt
import requests
import os 
import timeit
import time
import json

# Gathering Data

- [x] Download twitter_archive_enhanced.csv
- [x] Download image_predictions.tsv
- [x] Download tweet_json.txt

## tasks
> Download file from [link ](https://d17h27t6h515a5.cloudfront.net/topher/2017/August/59a4e958_twitter-archive-enhanced/twitter-archive-enhanced.csv) using requests
> save the file into the root derectory using os.
>
>> check whether the file exists.
>
> Download file from [link](https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv)
>> check whether the file exists.

> Query Twitter API for each tweet's JSON data.

>> 1. store each tweet's entire set of JSON data in a file called tweet_json.txt.

>> 2. each tweet's JSON data should be written to its own line.

>> 3. Then read this .txt file line by line into a pandas DataFrame with tweet ID, retweet count and favourite count.

In [2]:
def gather_data(url, file_name):
    if os.path.exists(file_name):
        return "Already exist"
    else:
        r = requests.get(url)
        with open(file_name, 'wb') as file:
            file.write(r.content)
        return 'downloaded'
twitter_archive_enhanced_url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/59a4e958_twitter-archive-enhanced/twitter-archive-enhanced.csv'
gather_data(twitter_archive_enhanced_url, 'twitter-archive-enhanced.csv')

'Already exist'

In [3]:
image_predictions_url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
gather_data(image_predictions_url, 'image_predictions.tsv')

'Already exist'

In [4]:
twitter_archive_enhanced = pd.read_csv('twitter-archive-enhanced.csv')

In [5]:
image_predictions = pd.read_csv('image_predictions.tsv', delimiter='\t')

In [6]:
tweet_id = '851953902622658560'

### Try the API extract json

### Count the retweet and like of every tweet id.

### store every tweet id's json as one entity in json.txt


>> 1. Extract ids from tweet data file and prediction file.
        df.id, tweepy.API.get_status
>> 2. save the into tweet_json.txt 
        json.dump

### test 

In [7]:
# read tweet_archive_file and image_prediction file
tweet_archive_f = pd.read_csv('twitter-archive-enhanced.csv')
prediction_f = pd.read_csv('image_predictions.tsv', delimiter='\t')

In [8]:
# check if the tsv file read succefully
prediction_f.sample()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
1010,709409458133323776,https://pbs.twimg.com/media/CdhUIMSUIAA4wYK.jpg,1,Shetland_sheepdog,0.79745,True,collie,0.054055,True,keeshond,0.031673,True


In [9]:
# collect ids from above two tables.
tweet_ids = pd.Series(list(tweet_archive_f.tweet_id) + list(prediction_f.tweet_id))

In [10]:
tweet_ids.sample(2)

2870    676237365392908289
351     831322785565769729
dtype: int64

In [11]:
len(tweet_ids)

4431

There is 4431 ids.

In [12]:
tweet_ids.count()

4431

None of them are null

In [78]:
consumer_key = 'secret'
consumer_secret = 'secret'
access_token = 'secret'
access_secret = 'secret'

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True) # Set rate limit to avoid errors.

def download_tweets_json(tweetid):
    '''Download tweet using tweet's ids json file.
    
    key arguments
    tweetid -- the tweetid which you wanna download.
    '''
    # start_time = time.process_time()
    #if os.path.exists('json.txt'):
    #    return "json.txt exists"
    try:
        status = api.get_status(tweetid, tweet_mode='extended') # 获取id的所有状态信息.
        return json.dumps(status._json) # 把status自带的json转换为json第三方包可读的json.类型是str.
    except tweepy.TweepError as e:
        print(tweet_id+'disappeared')
        pass
    # print('%d used'%(time.process_time() - start_time))
    print('tweet ID: %s'%(tweetid)) # Print ID after required

# Assess Data

Assess there data sets and note at least 8 quality problems and 2 tideness problem.

1. you only want ratings(no retweets) that have images. Not all the tweets are about dogs and some are retweets.
2. 8 quality issues and 2 tideness issues in the dataset.
3. Merging individual pieces of data according to the rules of tidy data.
4. Numertators are greater than the denominators.(this is normal)
5. Do not gather the tweets beyond August 1st, 2017.

# 数据整洁性问题
1. 数据完整性.
2. 数据可用性.
3. 数据合理性.
4. 数据一致性.

## 表1: twitter_archive_enhanced

In [16]:
twitter_archive_enhanced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

### 完整性问题(变量空值问题)
1. in_reply_to_status_id
2. in_reply_to_user_id
3. retweeted_stated_id
4. retweeted_status_user_id
5. expanded_urls

### 数据可用性问题
1. id类,name,text应该是字符串
2. source需要把网址提取出来,iPhone,Twitter,vine,tweetdeck.
3. 小狗种类应该是布林值.

## 数据整洁性问题
1. 每一列是一个变量.
2. 每一行是一个样本.
3. 每一个表是一个观察角度.

### 每一列是一个变量.
1. 转发的三列包含了两个意思, 第一个是是否转发, 第二个才是转发的具体信息, 所以应该有一列是布林值的retweet.
2. 同理,回复列的in_reply_to的两个变量也是, id是value,而列的名字应该是变量名, 所以有一列应该是布林值的replay.
3. 狗狗的成长阶段是value,应该作为stage的值,所以应该新建一个stage的列,把值填进去.

### 每一个表是一个观察角度.
1. 回复类的和转发类的应该是两个单独的表.

## 表2: image_predictions

In [17]:
image_predictions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


### 质量问题
1. tweet_id 应该是string.
2. jpg_url 应该是string.

# 表3 json.txt

 read this .txt file line by line into a pandas DataFrame with (at minimum) tweet ID, retweet count, and favorite count. Note: do not include your Twitter API keys, secrets, and tokens in your project submission.

### BUG
1. 我把每一个id读出来的json放在一行, 但是现在的问题是, json已经无法识别这个json文件.现在变成了一个string.

思路: 重新写Twitter API的获取json文件的代码.每一行保存成json object而不是str, 或者利用json的库来实现将json转换为str.

In [18]:
keys = ['id', 'retweet_count', 'favorite_count']

### Gather data from tweeter API
需求: 我现在需要做的是, 写出一个函数, 输入json.txt, 输出是一个dataframe, 变量为 id, retweet_count, favourite_couint.

define
> 使用df完成所有的操作.

1. 读取json.txt文件为Series.
2. 对于每一条读取为json object.
3. 对于每个json object提取需要的key放入字典

In [23]:
# 读取json.txt
json_original = pd.read_csv('json.txt', header=None)

In [28]:
json_original.sample(5)

Unnamed: 0,0
369,"{""created_at"": ""Mon Feb 06 17:02:17 +0000 2017..."
523,"{""created_at"": ""Thu Dec 15 17:23:04 +0000 2016..."
1079,"{""created_at"": ""Sat Jun 04 23:31:25 +0000 2016..."
2391,"{""created_at"": ""Tue Nov 17 02:06:42 +0000 2015..."
614,"{""created_at"": ""Thu Nov 10 17:02:03 +0000 2016..."


In [34]:
json.loads(json_original[0][0])['id']

892420643555336193

In [48]:
dict(zip(keys, [1, 2, 3]))

{'id': 1, 'retweet_count': 2, 'favorite_count': 3}

In [57]:
one_row = pd.DataFrame(dict(zip(keys, [1, 2, 3])), index=[0])

In [58]:
one_row.append(one_row)

Unnamed: 0,favorite_count,id,retweet_count
0,3,1,2
0,3,1,2


In [61]:
[json.loads(json_original[0][0])[key] for key in keys]

[892420643555336193, 8643, 38966]

In [62]:
pd.DataFrame(columns=keys)

Unnamed: 0,id,retweet_count,favorite_count


In [72]:
%pdb

Automatic pdb calling has been turned OFF


In [77]:
def create_df_from_json(file_name, keys):
    """
    返回一个拥有特定keys的dataframe
    -----
    参数: 
    -----
    file_name: 字符串, 文件地址
    keys: 需要提取的键值
    """
    json_original = pd.read_csv(file_name, header=None) # 读取json文件
    df = pd.DataFrame(columns=keys)
    # 提取keys并作为一行dataframe
    index = 0
    for json_object in json_original[0]:
        try:
            json_dict = json.loads(json_object) # 读取成json的dictionary.
        except TypeError:
            print(index, "failes") # 有些空值的文件读取不了就算了.
        make_dict = dict(zip(keys, [json_dict[key] for key in keys]))
        df = df.append(pd.DataFrame(make_dict, index=[index]))
        index += 1
    return df

In [76]:
create_df_from_json('json.txt', keys)

19 failes
95 failes
118 failes
132 failes
155 failes
247 failes
260 failes
298 failes
382 failes
566 failes
784 failes
3971 failes
4191 failes
4220 failes
4300 failes
4348 failes
4411 failes


Unnamed: 0,favorite_count,id,retweet_count
0,38966,892420643555336193,8643
1,33357,892177421306343426,6351
2,25142,891815181378084864,4213
3,42309,891689557279858688,8761
4,40474,891327558926688256,9522
5,20294,891087950875897856,3155
6,11901,890971913173991426,2105
7,65829,890729181411237888,19159
8,27879,890609185150312448,4321
9,32073,890240255349198849,7525
