# Get Twitter Data with Tweepy and Help from Pandas

## Libraries

In [10]:
import json
import os

import pandas as pd
import requests
import tweepy

## Authentication input

In [11]:
# Read the bearer token from the TWITTER_BEARER_TOKEN environment variable so
# the secret never has to be saved inside the notebook. The original
# placeholder is kept as a fallback for anyone who prefers to paste it here.
BEARER_TOKEN = os.environ.get("TWITTER_BEARER_TOKEN", "ENTER BEARER TOKEN")

## Client

In [13]:
# Build the API client. Requesting requests.Response objects as the return
# type makes the .json() helper available on every result.
client = tweepy.Client(bearer_token=BEARER_TOKEN, return_type=requests.Response)

## Get data with search_recent_tweets()
Available for essential (most basic and free version) access. By default, returns:
* only ten tweets (use `max_results` for more)
* only the tweet `id` and `text` fields (request more with `tweet_fields` and `expansions`)
* tweets from the last 7 days at most

In [14]:
# Search term; the "-is:retweet" operator is meant to filter out retweets.
query = "data science -is:retweet"

response = client.search_recent_tweets(
    query,
    max_results=30,
    # Extra per-tweet fields on top of what the endpoint returns by default.
    tweet_fields=["created_at", "lang", "public_metrics", "geo", "source", "withheld"],
    # Expansions add an "includes" section to the response payload.
    expansions=["author_id", "geo.place_id"],
)
response

<Response [200]>

## json

In [15]:
# Decode the HTTP response body into a plain dict. The isinstance guard makes
# the cell safe to re-run: once `response` is already a dict, calling .json()
# on it again would raise AttributeError.
if isinstance(response, requests.Response):
    response = response.json()

## The split data issue

The API call returns the tweet data under two keys, "data" and "includes" (plus metadata under the "meta" key). "data" holds the tweets themselves (text, language, creation date), while "includes" holds the expanded objects we requested, here the users who wrote the tweets (name, username).

This means that if we want one single dataframe, we first have to build one dataframe for the tweets and one for the authors, and then merge them by matching the tweets' "author_id" column to the authors' "id" column.

### Tweet data

In [16]:
# Flatten the tweet records; nested objects such as public_metrics become
# dotted column names (e.g. "public_metrics.like_count").
tweets = pd.json_normalize(data=response["data"])
tweets.head()

Unnamed: 0,id,created_at,author_id,lang,text,source,public_metrics.retweet_count,public_metrics.reply_count,public_metrics.like_count,public_metrics.quote_count
0,1531238119659515906,2022-05-30T11:36:31.000Z,4807561,en,"As data science goes mainstream, so does its l...",Twitter Web App,0,0,0,0
1,1531237891946467328,2022-05-30T11:35:37.000Z,1529750009222205446,en,#mufc feel the appointment of Dominic Jordan a...,Twitter for iPhone,0,0,0,0
2,1531237804935618563,2022-05-30T11:35:16.000Z,1471439196816658435,en,.\nTransforming Environmental Data in R @UK_C...,Twitter Web App,0,0,0,0
3,1531237784484290561,2022-05-30T11:35:11.000Z,69196843,en,@ring_sec @Buffyinnyc Well ignore all the scie...,Twitter for iPhone,0,0,0,0
4,1531237724065513472,2022-05-30T11:34:57.000Z,1222239337548435456,en,[🌕] #mufc feel the appointment of Dominic Jord...,Twitter for iPhone,0,0,15,0


### Author data

In [17]:
# Flatten the user records that came back under "includes" -> "users".
tweet_authors = pd.json_normalize(data=response["includes"]["users"])
tweet_authors.head()

Unnamed: 0,id,name,username
0,4807561,William Toll,utollwi
1,1529750009222205446,MUFC! 🔴👀,1NewsMufc
2,1471439196816658435,UKCEH training courses,UKCEH_training
3,69196843,"PT - get vaccinated! Covid killed 1,000,000 so...",pt35mm
4,1222239337548435456,centredevils.,centredevils


## Merge tweet and author data

In [18]:
# Attach author details to every tweet. The user id is called "author_id" in
# `tweets` but "id" in `tweet_authors`, so both key columns are named explicitly.
tweets_data = pd.merge(
    tweets,
    tweet_authors,
    left_on="author_id",
    right_on="id",
    suffixes=("", "_drop"),  # clashing columns from tweet_authors get a "_drop" suffix
    how="inner",  # keep only tweets whose author appears in tweet_authors
)
# Remove the suffixed duplicates (the authors' "id" copy). Matching on the
# "_drop" suffix rather than the substring "drop" avoids deleting a genuine
# column whose name merely contains "drop".
tweets_data = tweets_data.drop(
    columns=[col for col in tweets_data.columns if col.endswith("_drop")]
)

And here is the finished dataframe with all the data on tweets we have requested.

## Finished dataframe

In [19]:
# Preview the first five rows of the merged dataframe.
tweets_data.head(n=5)

Unnamed: 0,id,created_at,author_id,lang,text,source,public_metrics.retweet_count,public_metrics.reply_count,public_metrics.like_count,public_metrics.quote_count,name,username
0,1531238119659515906,2022-05-30T11:36:31.000Z,4807561,en,"As data science goes mainstream, so does its l...",Twitter Web App,0,0,0,0,William Toll,utollwi
1,1531237891946467328,2022-05-30T11:35:37.000Z,1529750009222205446,en,#mufc feel the appointment of Dominic Jordan a...,Twitter for iPhone,0,0,0,0,MUFC! 🔴👀,1NewsMufc
2,1531237804935618563,2022-05-30T11:35:16.000Z,1471439196816658435,en,.\nTransforming Environmental Data in R @UK_C...,Twitter Web App,0,0,0,0,UKCEH training courses,UKCEH_training
3,1531237784484290561,2022-05-30T11:35:11.000Z,69196843,en,@ring_sec @Buffyinnyc Well ignore all the scie...,Twitter for iPhone,0,0,0,0,"PT - get vaccinated! Covid killed 1,000,000 so...",pt35mm
4,1531237724065513472,2022-05-30T11:34:57.000Z,1222239337548435456,en,[🌕] #mufc feel the appointment of Dominic Jord...,Twitter for iPhone,0,0,15,0,centredevils.,centredevils


In [20]:
# Print the column names, non-null counts and dtypes of the merged dataframe.
tweets_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30 entries, 0 to 29
Data columns (total 12 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   id                            30 non-null     object
 1   created_at                    30 non-null     object
 2   author_id                     30 non-null     object
 3   lang                          30 non-null     object
 4   text                          30 non-null     object
 5   source                        30 non-null     object
 6   public_metrics.retweet_count  30 non-null     int64 
 7   public_metrics.reply_count    30 non-null     int64 
 8   public_metrics.like_count     30 non-null     int64 
 9   public_metrics.quote_count    30 non-null     int64 
 10  name                          30 non-null     object
 11  username                      30 non-null     object
dtypes: int64(4), object(8)
memory usage: 3.0+ KB


In [21]:
# Inspect every column of the row labelled 0 as a single Series.
tweets_data.loc[0, :]

id                                                            1531238119659515906
created_at                                               2022-05-30T11:36:31.000Z
author_id                                                                 4807561
lang                                                                           en
text                            As data science goes mainstream, so does its l...
source                                                            Twitter Web App
public_metrics.retweet_count                                                    0
public_metrics.reply_count                                                      0
public_metrics.like_count                                                       0
public_metrics.quote_count                                                      0
name                                                                 William Toll
username                                                                  utollwi
Name: 0, dtype: object

In [22]:
# Save the merged data. index=False leaves out the row index, which is just a
# positional counter produced by the merge and would otherwise reappear as an
# "Unnamed: 0" column when the file is read back.
tweets_data.to_csv("recent_tweets_data.csv", index=False)