# Get Twitter Data with Tweepy and Help from Pandas

## Libraries

In [5]:
import json
import os

import pandas as pd
import requests
import tweepy

## Authentication input

In [6]:
# Read the bearer token from the TWITTER_BEARER_TOKEN environment variable so the
# secret never has to be saved inside the notebook. The placeholder string is
# kept as the fallback for anyone who still wants to paste a token in by hand.
BEARER_TOKEN = os.environ.get("TWITTER_BEARER_TOKEN", "ENTER BEARER TOKEN")

## Client

In [7]:
# Client authenticated with the bearer token. Asking tweepy to hand back the raw
# requests.Response lets the payload be decoded later through its .json() method.
client = tweepy.Client(bearer_token=BEARER_TOKEN, return_type=requests.Response)

## Get data with search_recent_tweets()
Available for essential (most basic and free version) access. By default, returns:
* only ten tweets (use `max_results` for more)
* only the tweet `id` and `text` fields (use `tweet_fields` and `expansions` for more)
* and max 7 days back

In [8]:
# Search expression; the "-is:retweet" operator excludes retweets.
query = "data science -is:retweet"

response = client.search_recent_tweets(
    query=query,
    max_results=30,
    # Additional per-tweet attributes to include for each result.
    tweet_fields=["created_at", "lang", "public_metrics", "geo", "source", "withheld"],
    # Expansions add an extra "includes" section to the response.
    expansions=["author_id", "geo.place_id"],
)
response

<Response [200]>

## json

In [9]:
# Decode the JSON payload into a plain dict. The isinstance guard keeps this cell
# safe to re-run: once `response` has already been converted to a dict, calling
# .json() on it again would raise AttributeError.
if isinstance(response, requests.Response):
    response = response.json()

## The split data issue

The API call returns the tweet data under two keys, "data" and "includes" (as well as metadata under the "meta" key). "data" holds the tweets themselves (text, language, date created), while "includes" holds the expanded objects, such as the authors under "users" (name, username).

This means that, to build one single dataframe, we first have to create separate dataframes for the tweets (`tweets`) and for the authors (`tweet_authors`), and then merge them on the author id, which is called `author_id` in `tweets` and `id` in `tweet_authors`.

### Tweet data

In [10]:
# Flatten the tweet records under "data" into a dataframe; nested objects such
# as public_metrics become dotted column names (e.g. public_metrics.like_count).
tweets = pd.json_normalize(data=response["data"])
tweets.head(n=5)

Unnamed: 0,author_id,id,text,lang,created_at,source,public_metrics.retweet_count,public_metrics.reply_count,public_metrics.like_count,public_metrics.quote_count,geo.place_id
0,1441342299628138503,1531702405288665089,Cognitive technologies are bringing tremendous...,en,2022-05-31T18:21:25.000Z,Twitter Web App,0,0,1,0,
1,1059142262477725696,1531702047136956418,مع اني Data Science لكن مازلت اندهش من اللي يص...,ar,2022-05-31T18:20:00.000Z,Twitter for Android,0,0,0,0,000799c66e428a87
2,1343235115451887616,1531701964823830528,Never heard a concern from the Ontario Science...,en,2022-05-31T18:19:40.000Z,Twitter for Android,0,0,0,0,
3,727127737844928512,1531701835270172673,@ColtsGuy505 much appreciated! i am a computer...,en,2022-05-31T18:19:09.000Z,Twitter for iPhone,0,0,0,0,
4,14847675,1531701753388929032,10 Best Practices For #datascience\nFor quite ...,en,2022-05-31T18:18:50.000Z,Twitter Web App,2,0,0,0,


### Author data

In [11]:
# Flatten the user records found under "includes" -> "users" into a dataframe.
included_objects = response["includes"]
tweet_authors = pd.json_normalize(data=included_objects["users"])
del included_objects  # temporary helper only; keep the notebook namespace unchanged
tweet_authors.head(n=5)

Unnamed: 0,id,name,username
0,1441342299628138503,StrategyOps Institute,StrategyOpsInst
1,1059142262477725696,Abdullah A. Alsharif,Abdullah_Wex
2,1343235115451887616,Riccardo,Jack71178929
3,727127737844928512,pete,kxngjames30
4,14847675,Yves Mulkers,YvesMulkers


## Merge tweet and author data

In [12]:
# Attach each author's name and username to their tweets. The author id is
# called "author_id" in `tweets` but "id" in `tweet_authors`, hence the separate
# left_on / right_on keys.
tweets_data = pd.merge(
    tweets,
    tweet_authors,
    left_on="author_id",
    right_on="id",
    how="inner",
    # Columns of tweet_authors whose names clash with tweets get a "_drop" suffix.
    suffixes=("", "_drop"),
)
# Remove only the columns carrying the "_drop" suffix (the duplicate author id).
# Matching the suffix rather than the substring "drop" avoids deleting unrelated
# columns, and reassigning instead of inplace=True keeps the cell free of hidden
# mutation.
tweets_data = tweets_data.drop(
    columns=[col for col in tweets_data.columns if col.endswith("_drop")]
)

And here is the finished dataframe with all the data on tweets we have requested.

## Finished dataframe

In [13]:
# Preview the first five rows of the merged tweet + author dataframe.
tweets_data.head(n=5)

Unnamed: 0,author_id,id,text,lang,created_at,source,public_metrics.retweet_count,public_metrics.reply_count,public_metrics.like_count,public_metrics.quote_count,geo.place_id,name,username
0,1441342299628138503,1531702405288665089,Cognitive technologies are bringing tremendous...,en,2022-05-31T18:21:25.000Z,Twitter Web App,0,0,1,0,,StrategyOps Institute,StrategyOpsInst
1,1059142262477725696,1531702047136956418,مع اني Data Science لكن مازلت اندهش من اللي يص...,ar,2022-05-31T18:20:00.000Z,Twitter for Android,0,0,0,0,000799c66e428a87,Abdullah A. Alsharif,Abdullah_Wex
2,1343235115451887616,1531701964823830528,Never heard a concern from the Ontario Science...,en,2022-05-31T18:19:40.000Z,Twitter for Android,0,0,0,0,,Riccardo,Jack71178929
3,727127737844928512,1531701835270172673,@ColtsGuy505 much appreciated! i am a computer...,en,2022-05-31T18:19:09.000Z,Twitter for iPhone,0,0,0,0,,pete,kxngjames30
4,14847675,1531701753388929032,10 Best Practices For #datascience\nFor quite ...,en,2022-05-31T18:18:50.000Z,Twitter Web App,2,0,0,0,,Yves Mulkers,YvesMulkers


In [14]:
# Summarise the merged dataframe: column names, non-null counts and dtypes.
tweets_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30 entries, 0 to 29
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   author_id                     30 non-null     object
 1   id                            30 non-null     object
 2   text                          30 non-null     object
 3   lang                          30 non-null     object
 4   created_at                    30 non-null     object
 5   source                        30 non-null     object
 6   public_metrics.retweet_count  30 non-null     int64 
 7   public_metrics.reply_count    30 non-null     int64 
 8   public_metrics.like_count     30 non-null     int64 
 9   public_metrics.quote_count    30 non-null     int64 
 10  geo.place_id                  1 non-null      object
 11  name                          30 non-null     object
 12  username                      30 non-null     object
dtypes: int64(4), object(9)

In [15]:
# Show the row labelled 0 in full, one field per line.
tweets_data.loc[0]

author_id                                                     1441342299628138503
id                                                            1531702405288665089
text                            Cognitive technologies are bringing tremendous...
lang                                                                           en
created_at                                               2022-05-31T18:21:25.000Z
source                                                            Twitter Web App
public_metrics.retweet_count                                                    0
public_metrics.reply_count                                                      0
public_metrics.like_count                                                       1
public_metrics.quote_count                                                      0
geo.place_id                                                                  NaN
name                                                        StrategyOps Institute
username                                                          StrategyOpsInst

In [16]:
# Persist the merged dataframe as CSV in the current working directory. With
# pandas' default settings the row index is written as the first, unnamed column.
tweets_data.to_csv(path_or_buf="recent_tweets_data.csv")