# Get Twitter Data with Tweepy and Help from Pandas

## Libraries

In [217]:
import requests
import tweepy
import pandas as pd
import json

## Authentication input

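Store your own bearer token in a local `twitter_keys.json` file under the key `"BEARER_TOKEN"`; the cell below reads it from there so the secret never appears in the notebook.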

In [218]:
# Load the bearer token from a local JSON file so the secret stays out of the
# notebook. The context manager closes the file handle as soon as the JSON has
# been parsed (the bare open() used previously left it open).
with open("twitter_keys.json", 'r') as file:
    BEARER_TOKEN = json.load(file)["BEARER_TOKEN"]

## Client

In [220]:
# Build the API client from the bearer token. Asking tweepy to hand back raw
# requests.Response objects lets later cells call .json() on the result,
# which makes life much easier.
client = tweepy.Client(bearer_token=BEARER_TOKEN, return_type=requests.Response)

## Get data with search_recent_tweets()
Available for essential (most basic and free version) access. By default, returns:
* only ten tweets (use `max_results` for more)
* only the tweet `id` and `text` (the author id and other fields must be requested explicitly)
* and max 7 days back

In [221]:
# Search string sent to the recent-search endpoint.
query = "data science -is:retweet"

# Keyword options for the request, collected in one place.
search_options = {
    "max_results": 30,
    # Extra per-tweet attributes to request.
    "tweet_fields": ["created_at", "lang", "public_metrics", "geo", "source", "withheld"],
    # Expansions add an additional "includes" data compartment to the response.
    "expansions": ["author_id", "geo.place_id"],
}

response = client.search_recent_tweets(query, **search_options)
response

<Response [200]>

## json

In [222]:
# Parse the HTTP body into a plain dict. The name `response` is kept because
# later cells index into it. The isinstance guard makes this cell safe to
# re-run: after the first run `response` is already the parsed dict, which has
# no .json() method, so an unguarded second call would raise AttributeError.
if isinstance(response, requests.Response):
    response = response.json()
response

{'data': [{'id': '1531227544099635200',
   'created_at': '2022-05-30T10:54:30.000Z',
   'author_id': '1455547476941975554',
   'lang': 'en',
   'text': '📊#Datavisualization is one of the most powerful tools of #DataScience.\n👉Checkout this article to read more on all the #Data visualizations in data science with their implementation using #Python.\nhttps://t.co/rFSmCiD2QT',
   'source': 'Twitter Web App',
   'public_metrics': {'retweet_count': 0,
    'reply_count': 0,
    'like_count': 0,
    'quote_count': 0}},
  {'id': '1531227432409346048',
   'created_at': '2022-05-30T10:54:03.000Z',
   'author_id': '448002010',
   'lang': 'en',
   'text': 'Made to measure: why we can’t stop quantifying our lives | Science | The Guardian https://t.co/oAjTPoLCB0 https://t.co/5TSO9afLRV',
   'source': 'Buffer',
   'public_metrics': {'retweet_count': 0,
    'reply_count': 0,
    'like_count': 0,
    'quote_count': 0}},
  {'id': '1531227347227377664',
   'created_at': '2022-05-30T10:53:43.000Z',
   'au

## The split data issue

The API call returns the tweet data under two keys, "data" and "includes" (as well as metadata under the "meta" key). "data" holds the tweets themselves (text, language, creation date), while "includes" holds data on the users who wrote them (name, username).

This means that to build one single dataframe we first have to create separate `tweets` and `tweet_authors` dataframes, and then merge them on the author id (called `author_id` in the tweet data and `id` in the user data).

### Tweet data

In [223]:
# The tweet-level records are stored under the "data" key of the parsed response.
response["data"]

[{'id': '1531227544099635200',
  'created_at': '2022-05-30T10:54:30.000Z',
  'author_id': '1455547476941975554',
  'lang': 'en',
  'text': '📊#Datavisualization is one of the most powerful tools of #DataScience.\n👉Checkout this article to read more on all the #Data visualizations in data science with their implementation using #Python.\nhttps://t.co/rFSmCiD2QT',
  'source': 'Twitter Web App',
  'public_metrics': {'retweet_count': 0,
   'reply_count': 0,
   'like_count': 0,
   'quote_count': 0}},
 {'id': '1531227432409346048',
  'created_at': '2022-05-30T10:54:03.000Z',
  'author_id': '448002010',
  'lang': 'en',
  'text': 'Made to measure: why we can’t stop quantifying our lives | Science | The Guardian https://t.co/oAjTPoLCB0 https://t.co/5TSO9afLRV',
  'source': 'Buffer',
  'public_metrics': {'retweet_count': 0,
   'reply_count': 0,
   'like_count': 0,
   'quote_count': 0}},
 {'id': '1531227347227377664',
  'created_at': '2022-05-30T10:53:43.000Z',
  'author_id': '1249991751994990592'

In [224]:
# Flatten the tweet records into a table; nested dicts such as public_metrics
# become dotted column names (e.g. public_metrics.like_count).
tweet_records = response["data"]
tweets = pd.json_normalize(tweet_records)
tweets

Unnamed: 0,id,created_at,author_id,lang,text,source,public_metrics.retweet_count,public_metrics.reply_count,public_metrics.like_count,public_metrics.quote_count
0,1531227544099635200,2022-05-30T10:54:30.000Z,1455547476941975554,en,📊#Datavisualization is one of the most powerfu...,Twitter Web App,0,0,0,0
1,1531227432409346048,2022-05-30T10:54:03.000Z,448002010,en,Made to measure: why we can’t stop quantifying...,Buffer,0,0,0,0
2,1531227347227377664,2022-05-30T10:53:43.000Z,1249991751994990592,en,Companies in Data Science domains require Rele...,Twitter Web App,1,0,0,0
3,1531227142666829824,2022-05-30T10:52:54.000Z,1517642926427451395,en,"print (""HELLO, WORLD!"")\nNOTAR Research &amp; ...",Twitter Web App,0,0,0,0
4,1531226962773032960,2022-05-30T10:52:11.000Z,704627962314891264,en,3 Reasons Why Teamwork is an Essential Skill i...,TwinyBots,1,0,0,0
5,1531226768627183617,2022-05-30T10:51:25.000Z,2374479512,en,Track A: Digital Display Spaces\nTrack B: Data...,Twitter Web App,0,1,0,0
6,1531226673932279808,2022-05-30T10:51:02.000Z,14847675,en,Design Thinking Humanizes #datascience\nThe #i...,Twitter Web App,1,0,0,0
7,1531226615723839488,2022-05-30T10:50:48.000Z,1478462793435205632,en,Monkeypox horror as outbreak 'differs by about...,Twitter for iPhone,0,0,0,0
8,1531226606768902144,2022-05-30T10:50:46.000Z,411753065,en,#PhDpositions AIMS-Rwanda .@NextEinsteinFor\n ...,Twitter Web App,1,0,1,0
9,1531226462959079424,2022-05-30T10:50:12.000Z,1526449163126607872,en,Data science is an emerging discipline. It aro...,Twitter Web App,0,0,0,0


### Author data

In [225]:
# The user records live under "includes" -> "users"; turn them into a table.
user_records = response["includes"]["users"]
tweet_authors = pd.json_normalize(user_records)
tweet_authors

Unnamed: 0,id,name,username
0,1455547476941975554,Cosmos Thrace,CosmosThrace
1,448002010,Continuum Health,ContinuumPharma
2,1249991751994990592,Skillslash,skillslash
3,1517642926427451395,NOTAR,NotarAnalytics
4,704627962314891264,Necio,Necio_news
5,2374479512,iArtHis_Lab,iArtHislab
6,14847675,Yves Mulkers,YvesMulkers
7,1478462793435205632,KENNETH,KENNETH36674712
8,411753065,Alessandro Crimi 🧠🧬🔬🩺🧪,Dr_Alex_Crimi
9,1526449163126607872,darsh bhatt,darshbhatt11


## Merge tweet and author data

In [226]:
# Attach each author's name/username to their tweets. The key columns are named
# differently in the two frames, hence the separate left_on / right_on.
merged = pd.merge(
    tweets,
    tweet_authors,
    left_on="author_id",      # author id column in the tweets frame
    right_on="id",            # the same id is called "id" in the tweet_authors frame
    suffixes=("", "_drop"),   # tag clashing columns that come from tweet_authors
    how="inner",
)

# Remove the duplicated columns brought in from tweet_authors. Match the exact
# "_drop" suffix: a plain substring test would also delete any genuine column
# whose name merely contains "drop". A non-mutating drop() is used instead of
# inplace=True so the step yields a fresh frame.
tweets_data = merged.drop(
    columns=[col for col in merged.columns if col.endswith("_drop")]
)

And here is the finished dataframe with all the data on tweets we have requested.

## Finished dataframe

In [242]:
# Preview the first rows of the merged tweet + author frame.
tweets_data.head()

Unnamed: 0,id,created_at,author_id,lang,text,source,public_metrics.retweet_count,public_metrics.reply_count,public_metrics.like_count,public_metrics.quote_count,name,username
0,1531227544099635200,2022-05-30T10:54:30.000Z,1455547476941975554,en,📊#Datavisualization is one of the most powerfu...,Twitter Web App,0,0,0,0,Cosmos Thrace,CosmosThrace
1,1531227432409346048,2022-05-30T10:54:03.000Z,448002010,en,Made to measure: why we can’t stop quantifying...,Buffer,0,0,0,0,Continuum Health,ContinuumPharma
2,1531227347227377664,2022-05-30T10:53:43.000Z,1249991751994990592,en,Companies in Data Science domains require Rele...,Twitter Web App,1,0,0,0,Skillslash,skillslash
3,1531227142666829824,2022-05-30T10:52:54.000Z,1517642926427451395,en,"print (""HELLO, WORLD!"")\nNOTAR Research &amp; ...",Twitter Web App,0,0,0,0,NOTAR,NotarAnalytics
4,1531226962773032960,2022-05-30T10:52:11.000Z,704627962314891264,en,3 Reasons Why Teamwork is an Essential Skill i...,TwinyBots,1,0,0,0,Necio,Necio_news


In [244]:
# Summarise the merged frame: column names, non-null counts and dtypes.
tweets_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30 entries, 0 to 29
Data columns (total 12 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   id                            30 non-null     object
 1   created_at                    30 non-null     object
 2   author_id                     30 non-null     object
 3   lang                          30 non-null     object
 4   text                          30 non-null     object
 5   source                        30 non-null     object
 6   public_metrics.retweet_count  30 non-null     int64 
 7   public_metrics.reply_count    30 non-null     int64 
 8   public_metrics.like_count     30 non-null     int64 
 9   public_metrics.quote_count    30 non-null     int64 
 10  name                          30 non-null     object
 11  username                      30 non-null     object
dtypes: int64(4), object(8)
memory usage: 3.0+ KB


In [254]:
# Show every field of a single merged record (the row labelled 0).
tweets_data.loc[0]

id                                                            1531227544099635200
created_at                                               2022-05-30T10:54:30.000Z
author_id                                                     1455547476941975554
lang                                                                           en
text                            📊#Datavisualization is one of the most powerfu...
source                                                            Twitter Web App
public_metrics.retweet_count                                                    0
public_metrics.reply_count                                                      0
public_metrics.like_count                                                       0
public_metrics.quote_count                                                      0
name                                                                Cosmos Thrace
username                                                             CosmosThrace
Name: 0, dtype: 

In [228]:
# Save the merged frame as a CSV file in the current working directory.
# NOTE(review): with pandas' default index=True the row index is also written
# as an unnamed first column; pass index=False if that is unwanted (confirm
# with whoever reads this file before changing it).
tweets_data.to_csv("recent_tweets_data.csv")