# Find Tweets that DON'T have images

In [13]:
import requests
import pandas as pd
import json
import time

bearer_token = 'AAAAAAAAAAAAAAAAAAAAAGnaTwEAAAAAhRdM6yLmei6skyaWcjbx8IDFnlw%3DLPQHO2CTw1nVjjHLx3htgP9qmeCOgPpt96EdDujokNcWljI5iP'
headers = {'Authorization':('Bearer '+ bearer_token)}

n = 10                            # The total number of tweets we want
max_results = 10                  # The number of tweets to pull per request; must be between 10 and 100
total_retrieved = 0               # To keep track of when to stop
next_token = ""                   # Must be empty on first iteration
search_term = "manchester%20united"             # To form an advanced query, see here: https://twitter.com/search-advanced?lang=en
since_id = "1371600000000000000"  # The id of the oldest tweet you want to retrieve

# Create the empty DataFrame with the columns you want
df = pd.DataFrame(columns=['id', 'retweets', 'likes', 'text'])
df.set_index('id', inplace=True)

# stop when we have n results
while total_retrieved < n:

  # the first time through the loop, we do not need the next_token parameter
  if next_token == "":
    # url = f'https://api.twitter.com/2/tweets/search/recent?query={search_term}&max_results={max_results}&since_id={since_id}'
    url = f'https://api.twitter.com/2/tweets/search/recent?query={search_term}&max_results={max_results}'
  else:
    # url = f'https://api.twitter.com/2/tweets/search/recent?query={search_term}&max_results={max_results}&since_id={since_id}&next_token={next_token}'
    url = f'https://api.twitter.com/2/tweets/search/recent?query={search_term}&max_results={max_results}&next_token={next_token}'

  # These are the extra parameters we will add to the querystring; we won't store them all though; just want you to see what's possible
  # url += f'&tweet.fields=attachments,author_id,context_annotations,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,lang,possibly_sensitive,public_metrics,referenced_tweets,reply_settings,source,text,withheld'
  # url += f'&expansions=attachments.poll_ids,attachments.media_keys,author_id,geo.place_id,in_reply_to_user_id,referenced_tweets.id,entities.mentions.username,referenced_tweets.id.author_id'
  # url += f'&media.fields=duration_ms,height,media_key,preview_image_url,public_metrics,type,url,width'
  # url += f'&poll.fields=duration_minutes,end_datetime,id,options,voting_status'
  # url += f'&place.fields=contained_within,country,country_code,full_name,geo,id,name,place_type'

  url += f'&user.fields=created_at,description,entities,id,location,name,profile_image_url,protected,public_metrics,url,username,verified,withheld'
  url += f'&tweet.fields=attachments,public_metrics,text,created_at'
  url += f'&expansions=attachments.media_keys'
  url += f'&media.fields=media_key,type,url'

  # make the request to the Twitter API Recent Search endpoint
  response = requests.request("GET", url, headers=headers)
  try:  # Just in case we get an error
    json_data = json.loads(response.text)
    print(json_data)
  except:
    print(response.text)

  for tweet in json_data['data']:
    media_key = ""  # Reset to empty each time through the loop so that we can use it for a condition later

    # Store the data into variables
    tweet_id = tweet['id']
    retweet_count = tweet['public_metrics']['retweet_count']
    like_count = tweet['public_metrics']['like_count']
    image_url = ""
    text = tweet['text']


    # Find out if there is media
    if 'attachments' in tweet:
      if 'media_keys' in tweet['attachments']:
        media_key = tweet['attachments']['media_keys'][0]

    # If there is a media key in this tweet, iterate through tweet['includes']['media'] until we find it
    if media_key != "":
      for media in json_data['includes']['media']:
        if media['media_key'] == media_key: # Only if the media_key matches the one we stored
          if media['type'] == 'photo':      # Only if it is a photo; ignore videos
            image_url = media['url']        # Store the url in a variable

  # keep track of how many results have been obtained so far:
  total_retrieved += 1
  
  # Add the new data to a new record in the DataFrame
  df.loc[tweet_id] = [retweet_count, like_count, text]

  # keep track of where to start next time, but quit if there are no more results
  try:
    next_token = json_data['meta']['next_token']
  except:
    break  

  #sleep to avoid hitting the rate limit
  print('sleeping')
  time.sleep(1)

print(f'Number of records:\t{len(df)}')
df.to_csv('twitter.csv')
df.head()  

{'data': [{'text': 'RT @GuiaFutebolPlus: 🏴\U000e0067\U000e0062\U000e0065\U000e006e\U000e0067\U000e007f Lingard e Cristiano Ronaldo (Manchester United) https://t.co/iNioKQpgko', 'public_metrics': {'retweet_count': 1, 'reply_count': 0, 'like_count': 0, 'quote_count': 0}, 'attachments': {'media_keys': ['3_1439701381049430016']}, 'created_at': '2021-09-23T23:22:07.000Z', 'id': '1441181109665116160'}, {'text': 'RT @ConexionWIN: “No creo que sea feliz un hombre que, a sus 30 años, hoy podría jugar en el Real Madrid, Manchester United o City, porque…', 'public_metrics': {'retweet_count': 2, 'reply_count': 0, 'like_count': 0, 'quote_count': 0}, 'created_at': '2021-09-23T23:22:04.000Z', 'id': '1441181100341084162'}, {'text': "RT @brfootball: Manchester United's run-in during October and November 😳 https://t.co/4qBtx2BBIH", 'public_metrics': {'retweet_count': 1749, 'reply_count': 0, 'like_count': 0, 'quote_count': 0}, 'attachments': {'media_keys': ['3_1441128872116768771']}, 'created_at': '2021-

Unnamed: 0_level_0,retweets,likes,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1441181040710615054,1749,0,RT @brfootball: Manchester United's run-in dur...
1441180867859206144,1750,0,RT @brfootball: Manchester United's run-in dur...
1441180689945092103,2138,0,RT @FootballlForAll: 🚨⚽️ | How many Days it’s ...
1441180374663671808,44,0,"RT @Jimrose001: Manchester United, aka ""Trust ..."
1441180236075331592,2457,0,RT @footballdaily: 🕓 Days since PL clubs last...


# Find tweets that DO have images

In [24]:
# In this example, only those tweets with photos/images are stored

n = 20                           # The total number of tweets we want
max_results = 10                 # The number of tweets to pull per request; must be between 10 and 100
total_retrieved = 0               # To keep track of when to stop
next_token = ""                   # Must be empty on first iteration
search_term = "manchester%20united"             # To form an advanced query, see here: https://twitter.com/search-advanced?lang=en
since_id = "1371590000000000000"  # The id of the oldest tweet you want to retrieve

# Create the empty DataFrame with the columns you want
df_img = pd.DataFrame(columns=['id', 'retweets', 'likes', 'url', 'text'])
df_img.set_index('id', inplace=True)

# stop when we have n results
while total_retrieved < n:

  # the first time through the loop, we do not need the next_token parameter
  if next_token == "":
    # url = f'https://api.twitter.com/2/tweets/search/recent?query={search_term}&max_results={max_results}&since_id={since_id}'
    url = f'https://api.twitter.com/2/tweets/search/recent?query={search_term}&max_results={max_results}'
  else:
    # url = f'https://api.twitter.com/2/tweets/search/recent?query={search_term}&max_results={max_results}&since_id={since_id}&next_token={next_token}'
    url = f'https://api.twitter.com/2/tweets/search/recent?query={search_term}&max_results={max_results}&next_token={next_token}'

  # These are the extra parameters we will add to the querystring; we won't store them all though; just want you to see what's possible
  url += f'&tweet.fields=attachments,public_metrics,text'
  url += f'&expansions=attachments.media_keys'
  url += f'&media.fields=media_key,type,url'

  # make the request to the Twitter API Recent Search endpoint
  response = requests.request("GET", url, headers=headers)
  try:  # Just in case we get an error
    json_data = json.loads(response.text)
  except:
    print(response.text)
  

  for tweet in json_data['data']:
    media_key = ""  # Reset to empty each time through the loop so that we can use it for a condition later

    # Store the data into variables
    tweet_id = tweet['id']
    retweet_count = tweet['public_metrics']['retweet_count']
    like_count = tweet['public_metrics']['like_count']
    image_url = ""
    text = tweet['text']

    # Find out if there is media
    if 'attachments' in tweet:
      if 'media_keys' in tweet['attachments']:
        media_key = tweet['attachments']['media_keys'][0]

    # If there is a media key in this tweet, iterate through tweet['includes']['media'] until we find it
    if media_key != "":
      for media in json_data['includes']['media']:
        if media['media_key'] == media_key: # Only if the media_key matches the one we stored
          if media['type'] == 'photo':      # Only if it is a photo; ignore videos
            image_url = media['url']        # Store the url in a variable
            
            # Only iterate if a photo is found
            total_retrieved += 1
            
            # Only add the record in the DataFrame if a photo is found
            df_img.loc[tweet_id] = [retweet_count, like_count, image_url, text]
            break

  # keep track of where to start next time, but quit if there are no more results
  try:
    next_token = json_data['meta']['next_token']
  except:
    break  

print(f'Number of records:\t{len(df_img)}')
# df_img.to_csv('twitter.csv')
df_img.head()  

Number of records:	20


Unnamed: 0_level_0,retweets,likes,url,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1441173167326654474,3,0,https://pbs.twimg.com/media/FAAOzkfWUAA5z6J.jpg,RT @BlitzVideos: 'A Football Life: James Harri...
1441173151258275841,2,0,https://pbs.twimg.com/media/FAATI9hXEAcpOm0.jpg,RT @TLH_Updates: In sub varsity football actio...
1441173139111616524,0,0,https://pbs.twimg.com/media/FAATVSfVEAIyljk.jpg,PL predictions: Back a Man Utd penalty vs Vill...
1441173136808943621,0,0,https://pbs.twimg.com/media/FAATVDMVEAkIMYk.jpg,@InThe700 @kefferwill1 @Tweets_By_Zo @TyreseMa...
1441173128046997505,0,0,https://pbs.twimg.com/media/FAATUoRVUAMLUk6.jpg,Jesse Lingard dives and screams more at Manche...
