## Preprocessing of Congress members' Twitter accounts
This notebook preprocesses the Twitter accounts of members of the 117th Congress: it extracts the account names, converts them to user IDs, and then obtains the timelines associated with those users.

In [21]:
def string_to_txt(string, file_name, encoding=None):
    """Write ``string`` to the text file ``file_name``, replacing any existing content.

    Parameters
    ----------
    string : str
        The text to write.
    file_name : str or path-like
        Destination path; the file is created or truncated.
    encoding : str, optional
        Text encoding passed to ``open``. The default (``None``) keeps the
        platform's default encoding, exactly as before this parameter existed.

    Raises
    ------
    OSError
        If the file cannot be opened for writing.
    TypeError
        If ``string`` is not a ``str``.
    """
    # The context manager closes the handle even if write() raises, which the
    # previous manual open()/close() pair did not guarantee.
    with open(file_name, "w", encoding=encoding) as txt_file:
        txt_file.write(string)

#### Create text file of congress accounts

In [20]:
import pandas as pd

# Read the CSV, take the Link column, and keep the last '/'-separated segment
# of each link (the account name).
links = pd.read_csv('congress_twitter_117th_combined.csv')['Link']

usernames = (
    links
    .dropna()            # a missing link would otherwise be stringified to the bogus account "nan"
    .astype('str')
    .str.strip()         # ignore stray whitespace around the link
    .str.rstrip('/')     # a trailing slash would otherwise leave an empty last segment
    .str.split('/')
    .str[-1]
)
# Drop rows that still have no account name after cleaning.
usernames = usernames[usernames != '']

# Serialise as one account name per line (no header, no index column).
accounts = usernames.to_csv(header=False, index=False)

string_to_txt(accounts, "twitter_accounts_117th_congress.txt")

#### Using twarc to get user IDs from accounts, then get the timelines of those users

In [13]:
# Look up the accounts listed in twitter_accounts_117th_congress.txt by username
# with twarc2 and save the result to user_ids_117th_congress.jsonl
# (the next cell reads the user IDs from that file).
!twarc2 users --usernames twitter_accounts_117th_congress.txt user_ids_117th_congress.jsonl  

100%|███████████| Processed 537/537 lines of input file [00:06<00:00, 83.63it/s]


In [22]:
# Now that we have the Twitter IDs of Congress, write them to a text file.
import pandas as pd

# One row per line of the JSONL file; only the 'data' field of each row is used.
# Presumably each 'data' value is a list of user records with an 'id' field —
# verify against the twarc2 output.
raw_json = pd.read_json('user_ids_117th_congress.jsonl', lines=True)['data']

# Combine every row rather than a hard-coded first six, so the cell neither
# raises when there are fewer rows nor silently drops users when there are more.
# Rows without a 'data' payload (NaN) are skipped.
congress_ids = pd.concat(
    [pd.DataFrame(batch) for batch in raw_json.dropna()],
    axis=0,
)

# Serialise the id column as one ID per line (no header, no index column).
ids = congress_ids['id'].to_csv(header=False, index=False)

string_to_txt(ids, "twitter_ids_117th_congress.txt")

In [24]:
# Fetch the timelines of the users listed in twitter_ids_117th_congress.txt,
# bounded by the --start-time / --end-time values below (not the full history),
# and redirect the output to a JSONL file.
# NOTE(review): the shell redirect presumably requires the timelines/ directory
# to exist before this cell runs — confirm.
!twarc2 timelines twitter_ids_117th_congress.txt --start-time "2022-07-01" --end-time "2022-10-01" > timelines/july_sep_2022_congress_twitter_activity.jsonl