## Download Images from tweet ids

This notebook downloads images through the Twitter API, based on the tweet ids that were collected with DMI-TCAT.

In [1]:
import requests
import pandas as pd
import json
import os
import tqdm
import time
import hashlib
import concurrent

In [2]:
# Fields requested from the tweet lookup endpoint, grouped by query parameter.
# The API expects each parameter as a single comma-separated string.
_REQUESTED_FIELDS = {
    "tweet.fields": [
        "public_metrics",
        "referenced_tweets",
        "possibly_sensitive",
        "created_at",
        "source",
        "reply_settings",
        "withheld",
    ],
    "expansions": ["author_id", "in_reply_to_user_id", "attachments.media_keys"],
    "media.fields": ["url", "public_metrics", "type", "alt_text"],
    "user.fields": ["id", "verified", "name"],
}

params_search = {
    parameter: ",".join(field_names)
    for parameter, field_names in _REQUESTED_FIELDS.items()
}

In [56]:
# Locations used throughout the notebook. Each one can be overridden with an
# environment variable so the notebook can run on another machine without
# editing this cell; the defaults are the original hard-coded paths.
# Folder read for the list of tweet ids (pma.json).
ROOT_FOLDER = os.environ.get("CERES_ROOT_FOLDER", "/home/tyra/Documents/CERES/resources")
# Folder receiving the tweet JSON files, the media folder, sha1.json and fetched.json.
OUTPUT_FOLDER = os.environ.get("CERES_OUTPUT_FOLDER", "/home/tyra/Documents/CERES/PMA/MPT")
# JSON file holding the API bearer token under the "token" key.
CREDENTIALS_FILES = os.environ.get("CERES_CREDENTIALS_FILE", "/home/tyra/Documents/CERES/credentials_pro.json")


In [57]:
def generate_token():
    """Build the ``Authorization`` header value from the credentials file.

    Reads the JSON document at ``CREDENTIALS_FILES`` and returns its ``token``
    entry prefixed with ``"Bearer "``.
    """
    with open(CREDENTIALS_FILES, 'r') as credentials_file:
        credentials = json.load(credentials_file)
    return "Bearer {}".format(credentials['token'])

In [58]:
# One shared HTTP session: every request made through `s` carries the bearer token.
s = requests.Session()
s.headers["Authorization"] = generate_token()

In [89]:
# Load every tweet id we want to fetch; ids are normalised to strings so they
# can be joined into the request URL later on.
ids_path = os.path.join(ROOT_FOLDER, 'pma.json')
with open(ids_path, 'r') as ids_file:
    ids = [str(tweet_id) for tweet_id in json.load(ids_file)]
len(ids)

131071

In [85]:
def download_media(media_key=None, url=None, timeout=30, **kwargs):
    """Download one media file and register it in ``sha1.json``.

    The file is stored in ``OUTPUT_FOLDER/media`` under the SHA-1 hex digest of
    its content followed by the extension found in the URL, so identical
    content always maps to the same file name. ``OUTPUT_FOLDER/sha1.json`` maps
    each ``media_key`` to that file name; both the folder and the JSON file are
    created on first use.

    Parameters
    ----------
    media_key : str
        Key under which the file name is recorded in ``sha1.json``.
    url : str
        Location of the media to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
    **kwargs
        Extra keyword arguments are accepted and ignored.

    Raises
    ------
    ValueError
        If ``media_key`` or ``url`` is missing, or if the download fails
        (connection problem, timeout or HTTP error status).
    """
    if not media_key or not url:
        raise ValueError("Missing field when trying to save media")

    # download the file; an HTTP error status is treated like a failed download
    # so that an error page is never hashed and stored as if it were the media
    try:
        res = requests.get(url, timeout=timeout)
        res.raise_for_status()
    except requests.RequestException as exc:
        raise ValueError(f"There was an error when downloading the media with following url: {url}, please check your connection or url") from exc

    # name the file after the signature of its content
    buffer = res.content
    signature = hashlib.sha1(buffer).hexdigest()
    # take the extension from the URL without its query string / fragment;
    # splitext only looks at the last path component, so a URL without an
    # extension yields an empty suffix instead of a name containing slashes
    url_path = url.split('?', 1)[0].split('#', 1)[0]
    extension = os.path.splitext(url_path)[1]
    file_name = f"{signature}{extension}"

    media_folder = os.path.join(OUTPUT_FOLDER, 'media')
    os.makedirs(media_folder, exist_ok=True)
    with open(os.path.join(media_folder, file_name), 'wb') as f:
        f.write(buffer)

    # update the media_key -> file name index, starting empty on the first run
    index_path = os.path.join(OUTPUT_FOLDER, 'sha1.json')
    try:
        with open(index_path, 'r') as f:
            sha1 = json.load(f)
    except FileNotFoundError:
        sha1 = {}

    sha1[media_key] = file_name

    with open(index_path, 'w') as f:
        json.dump(sha1, f)
    

Keep track of the tweet ids that were already fetched so that, in case of error, we can continue from where we stopped.

In [86]:
# Resume support: fetched.json lists the ids handled by previous runs, in the
# same order as `ids`, so its length is the offset at which to restart.
fetched_path = os.path.join(OUTPUT_FOLDER, 'fetched.json')
try:
    with open(fetched_path, 'r') as f:
        fetched = json.load(f)
except FileNotFoundError:
    # first run: nothing was fetched yet
    fetched = []
except json.JSONDecodeError as e:
    # unreadable progress file: start over, as the notebook always did
    print(f"Could not read {fetched_path} ({e}), starting from scratch")
    fetched = []

already_parsed = len(fetched)

# Number of requests of up to 100 ids still needed. Ceiling division is
# required here: rounding to nearest would drop a final batch of fewer than
# 50 ids.
remaining = max(len(ids) - already_parsed, 0)
nb_calls = (remaining + 99) // 100

already_parsed

70200

In [90]:
def _lookup_tweets(tweet_ids):
    """Request one batch of tweets (with the fields in `params_search`) by id."""
    return s.get('https://api.twitter.com/2/tweets?ids=' + ','.join(tweet_ids), params=params_search)


for occ in tqdm.tqdm(range(nb_calls)):

    # fetch ids 100 by 100
    start = already_parsed + occ * 100
    ids_to_fetch = ids[start: start + 100]
    if not ids_to_fetch:
        # nothing left to request
        break
    res = _lookup_tweets(ids_to_fetch)

    # we made too many calls, lets wait 15min (and keep waiting for as long as
    # the API answers "Too Many Requests")
    while res.status_code == 429:
        print(res.json())
        print('making a break')
        time.sleep(900)
        res = _lookup_tweets(ids_to_fetch)
    # any other HTTP error will not be solved by waiting: stop here, the batch
    # is not recorded in fetched.json and will be retried on the next run
    res.raise_for_status()

    # get results; a batch in which no tweet could be returned has no 'data',
    # and 'includes' / 'media' are only present when there is something to include
    payload = res.json()
    tweets = payload.get('data', [])
    media = payload.get('includes', {}).get('media', [])
    media_by_key = {medium['media_key']: medium for medium in media}

    for tweet in tweets:
        # save all medium info in the tweet json: each media key is replaced
        # by the full media object when the response included it
        media_keys = tweet.get('attachments', {}).get('media_keys', [])
        for index, key in enumerate(media_keys):
            if key in media_by_key:
                media_keys[index] = media_by_key[key]
        try:
            with open(os.path.join(OUTPUT_FOLDER, f"{tweet['id']}.json"), 'w') as f:
                json.dump(tweet, f)
        except OSError as e:
            print(e)

    # NOTE: fetching the original tweet of each retweet was tried at this point
    # and dropped (too many requests); the bookkeeping of media absent from
    # sha1.json (`missing_media`) was disabled as well.
    for medium in media:
        if medium['type'] == 'photo':
            download_media(medium['media_key'], medium['url'])

    # write fetch only if everything was written
    fetched.extend(ids_to_fetch)
    with open(os.path.join(OUTPUT_FOLDER, 'fetched.json'), 'w') as f:
        json.dump(fetched, f)

 81%|█████████████████████████████████▏       | 493/609 [14:55<02:50,  1.47s/it]

{'title': 'Too Many Requests', 'detail': 'Too Many Requests', 'type': 'about:blank', 'status': 429}
making a break


100%|█████████████████████████████████████████| 609/609 [34:06<00:00,  3.36s/it]


In [60]:
# Split the media that could not be downloaded by type and show how many of
# each there are.
# NOTE(review): `missing_media` is not defined by any active code in this
# notebook, so this cell fails on a fresh kernel - confirm where it should be
# loaded from.
missing_by_type = {'video': [], 'animated_gif': [], 'photo': []}
for medium in missing_media:
    if medium['type'] in missing_by_type:
        missing_by_type[medium['type']].append(medium)

missing_videos = missing_by_type['video']
missing_gif = missing_by_type['animated_gif']
missing_photos = missing_by_type['photo']
print(len(missing_videos), len(missing_gif), len(missing_photos))

1652 443 90
