### Imports

In [2]:
from twarc.client2 import Twarc2
from twarc.expansions import ensure_flattened
from tqdm import tqdm
import configparser
import datetime
import time
import pandas as pd

### Get API tokens

In [3]:
# Create a parser object and disable interpolation
parser = configparser.ConfigParser(interpolation=None)

# Read data from 'config.ini' file
parser.read("../config.ini")

# Access sections from the configuration file
parser.sections()

# Get 'bearer_token' from twitter section
bearer_token = parser.get('twitter', 'bearer_token')

# Create a Twarc2 instance with twitter credentials
client = Twarc2(bearer_token = bearer_token)

### Search with `twarc2`

In [4]:
"""
Source code from: https://twarc-project.readthedocs.io/en/latest/api/library/
"""
# List of usernames
usernames = ['@danielapastrana', '@penileyramirez']

keywords = []

# Create an empty list to store data
tweets_list = []

# Iterate over our target users
for username in usernames:

    # Search for any tweets matching the query, Twitter API uses a boolean AND by default
    query = f"{username} lang:pt -is:retweet"

    # Specify the start time in UTC for the time period you want tweets fro
    start_time = datetime.datetime(2022, 1, 1, 0, 0, 0, 0, datetime.timezone.utc)
    # Specify the end time in UTC for the time period you want tweets from
    end_time = datetime.datetime(2022, 10, 31, 23, 59, 0, 0, datetime.timezone.utc)

    # 'search_results' is a generator, 'max_results' is max tweets per page, 100 max for full archive search with all expansions
    search_results = client.search_all(query=query, start_time=start_time, end_time=end_time, max_results=100)

    # Get all results page by page:
    for page in search_results:
        # Flatten results returning 1 tweet at a time, with expansions inline:
        for tweet in tqdm(ensure_flattened(page)):
            # Append tweets to empty list
            tweets_list.append(tweet)
            # time.sleep(5)
        # Get only one page for testing purposes
        break
    print(f'Collected tweets mentioning {username}')

100%|██████████| 73/73 [00:00<00:00, 699050.67it/s]


Collected tweets mentioning @danielapastrana


100%|██████████| 100/100 [00:00<00:00, 1067252.93it/s]

Collected tweets mentioning @penileyramirez





### Process data

In [6]:
# TODO: Avoid repeated column names 

# Convert json data to pandas dataframe
df = pd.DataFrame(tweets_list)
df

Unnamed: 0,in_reply_to_user_id,author_id,referenced_tweets,edit_controls,created_at,edit_history_tweet_ids,id,conversation_id,entities,reply_settings,text,public_metrics,lang,possibly_sensitive,author,in_reply_to_user,__twarc,attachments,geo,context_annotations
0,83683902,256545042,"[{'type': 'replied_to', 'id': '158458001480825...","{'edits_remaining': 5, 'is_edit_eligible': Fal...",2022-10-26T12:17:08.000Z,[1585244134344122370],1585244134344122370,1584580014808256519,"{'mentions': [{'start': 0, 'end': 13, 'usernam...",everyone,@felyxmarquez @AlJazeera @KaviChek @johnholman...,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pt,False,"{'verified': False, 'profile_image_url': 'http...","{'url': 'https://t.co/YG16pMV2mO', 'verified':...",{'url': 'https://api.twitter.com/2/tweets/sear...,,,
1,3087410276,381644638,"[{'type': 'replied_to', 'id': '158450019773583...","{'edits_remaining': 5, 'is_edit_eligible': Fal...",2022-10-24T20:40:55.000Z,[1584646137910206464],1584646137910206464,1584500197735833602,"{'mentions': [{'start': 0, 'end': 9, 'username...",everyone,@PdPagina @Duiliorodriguez @danielapastrana Br...,"{'retweet_count': 1, 'reply_count': 0, 'like_c...",pt,False,"{'url': 'https://t.co/rLAsTCxqHe', 'verified':...","{'url': 'https://t.co/QaPDGxxwyk', 'verified':...",{'url': 'https://api.twitter.com/2/tweets/sear...,,,
2,,123151858,"[{'type': 'quoted', 'id': '1583765682214252544...","{'edits_remaining': 5, 'is_edit_eligible': Tru...",2022-10-23T03:08:03.000Z,[1584018787979825153],1584018787979825153,1584018787979825153,"{'urls': [{'start': 31, 'end': 54, 'url': 'htt...",everyone,"Mira, @danielapastrana !!! 😍😍😍 https://t.co/bH...","{'retweet_count': 1, 'reply_count': 1, 'like_c...",es,False,"{'verified': False, 'profile_image_url': 'http...",,{'url': 'https://api.twitter.com/2/tweets/sear...,,,
3,90663539,1674341816,"[{'type': 'replied_to', 'id': '158370926257584...","{'edits_remaining': 5, 'is_edit_eligible': Fal...",2022-10-22T17:30:26.000Z,[1583873428309700610],1583873428309700610,1583709262575849473,"{'mentions': [{'start': 0, 'end': 16, 'usernam...",everyone,@danielapastrana Cínico,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pt,False,"{'verified': False, 'profile_image_url': 'http...","{'url': 'https://t.co/mNb5ma3q2v', 'verified':...",{'url': 'https://api.twitter.com/2/tweets/sear...,,,
4,90663539,111374936,"[{'type': 'replied_to', 'id': '158275177131083...","{'edits_remaining': 5, 'is_edit_eligible': Fal...",2022-10-19T15:18:38.000Z,[1582753096425668608],1582753096425668608,1582751771310837766,"{'mentions': [{'start': 0, 'end': 16, 'usernam...",everyone,@danielapastrana Felicidades!!,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pt,False,"{'verified': False, 'profile_image_url': 'http...","{'url': 'https://t.co/mNb5ma3q2v', 'verified':...",{'url': 'https://api.twitter.com/2/tweets/sear...,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168,386258984,56713141,"[{'type': 'replied_to', 'id': '157569102544769...","{'edits_remaining': 5, 'is_edit_eligible': Fal...",2022-09-30T13:12:53.000Z,[1575836077075877889],1575836077075877889,1575691024059359232,"{'mentions': [{'start': 0, 'end': 9, 'username...",everyone,@menyvazq @penileyramirez ¿O sin testar?,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",es,False,"{'protected': False, 'profile_image_url': 'htt...","{'protected': False, 'profile_image_url': 'htt...",{'url': 'https://api.twitter.com/2/tweets/sear...,,,
169,,588749776,"[{'type': 'quoted', 'id': '1575720745996496901...","{'edits_remaining': 5, 'is_edit_eligible': Tru...",2022-09-30T12:58:57.000Z,[1575832573103992833],1575832573103992833,1575832573103992833,"{'mentions': [{'start': 29, 'end': 41, 'userna...",everyone,Los nominados de esta semana @CarlosLoret @pen...,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",es,False,"{'protected': False, 'profile_image_url': 'htt...",,{'url': 'https://api.twitter.com/2/tweets/sear...,,,"[{'domain': {'id': '10', 'name': 'Person', 'de..."
170,386258984,172488257,"[{'type': 'replied_to', 'id': '157567552761604...","{'edits_remaining': 5, 'is_edit_eligible': Fal...",2022-09-30T07:54:42.000Z,[1575756006474526720],1575756006474526720,1575675527616045059,"{'annotations': [{'start': 68, 'end': 89, 'pro...",everyone,@menyvazq @penileyramirez @SEDENAmx @CarlosLor...,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",es,False,"{'protected': False, 'profile_image_url': 'htt...","{'protected': False, 'profile_image_url': 'htt...",{'url': 'https://api.twitter.com/2/tweets/sear...,,,"[{'domain': {'id': '10', 'name': 'Person', 'de..."
171,119618685,1363534801563619328,"[{'type': 'replied_to', 'id': '157571781547413...","{'edits_remaining': 5, 'is_edit_eligible': Fal...",2022-09-30T05:24:06.000Z,[1575718103693938688],1575718103693938688,1575555262710448128,"{'mentions': [{'start': 0, 'end': 12, 'usernam...",everyone,@javierglz21 @LAURABARRANCO @penileyramirez Se...,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",es,False,"{'protected': False, 'profile_image_url': 'htt...","{'protected': False, 'profile_image_url': 'htt...",{'url': 'https://api.twitter.com/2/tweets/sear...,,,


In [5]:
# TODO: Avoid repeated column names 

# Convert json data to pandas dataframe
df = pd.DataFrame(tweets_list)

# Normalize semi-structured JSON data into a flat table
public_metrics = pd.json_normalize(df['public_metrics'])
author = pd.json_normalize(df['author'])

# Concatenate daaframes
df_tweets = pd.concat([df, public_metrics, author], axis=1)

# Filter data
df_filtered = df_tweets[['lang', 'source', 'created_at', 'text', 'retweet_count',
                        'reply_count', 'like_count', 'quote_count', 'description', 'username',
                        'protected', 'id', 'verified', 'name', 'created_at', 'profile_image_url', 
                        'public_metrics.followers_count', 'public_metrics.following_count', 
                        'public_metrics.tweet_count', 'public_metrics.listed_count',
                        'location', 'pinned_tweet_id']]

# Show dataframe
df_filtered
# Save dataframe 
df_filtered.to_csv('../data/raw/twitter_mentions.csv', index=False)

KeyError: "['source'] not in index"