In [2]:
import tweepy
import pandas as pd
# Shared Twitter API v2 client used by the data-collection cells below.
# '<BearerToken>' is a placeholder; substitute a real token before running those cells.
client = tweepy.Client(bearer_token='<BearerToken>')
import pandas as pd
import numpy as np
from pathlib import Path
from collections import Counter
import os


The code in this notebook was used to mine and process the data that we used to train our BERT classifier.  The starting point of our data collection is a dataset we found on Kaggle: https://www.kaggle.com/datasets/mrmorj/us-politicians-twitter-dataset?resource=download. This dataset is the source of the Twitter IDs used for text data collection, as well as our ground truth labels. I have done my best to make this project as replicable as possible. For my own privacy, however, I have not included the Twitter API Bearer Token that would be required to interact with the Twitter API.  As such, the code here can't be used to fetch more tweets or filter by account activity unless another Bearer Token is provided. I will do my best to describe what happened during the gaps where the project is not replicable with the code presented here.  With the exception of these gaps, the code can be run from top to bottom to reproduce the data collection and processing aspects of our project.

In [21]:
def drop_third_party(pol_data, out_path='Data\\Dem_Rep_only.csv'):
    """Keep only Democratic/Republican accounts, normalise party labels, and save to CSV.

    A row is kept when its 'Political_party' value contains "Republican" or
    "Democrat"; rows with a missing party are dropped.  Duplicate 'Account_ID'
    rows are removed (first occurrence wins).  The party of every remaining row
    is rewritten to exactly 'Republican Party' or 'Democratic Party'.

    Parameters
    ----------
    pol_data : pandas.DataFrame or None
        Table with at least the columns 'Account_ID', 'Name',
        'Twitter_username' and 'Political_party'.  It is not modified.
        If None, the original Kaggle CSV is read from
        'Data\\Original_kaggle_dataset.csv'.
    out_path : str, optional
        Where the filtered table is written (without the index).

    Returns
    -------
    None
        The result is only written to ``out_path``.
    """
    if pol_data is None:
        pol_data = pd.read_csv('Data\\Original_kaggle_dataset.csv')

    df = pol_data[['Account_ID', 'Name', 'Twitter_username', 'Political_party']]
    party = df['Political_party']
    # na=False: a missing party is treated as "neither", instead of leaking NaN into the mask.
    keep = party.str.contains('Republican', na=False) | party.str.contains('Democrat', na=False)
    # .copy() so the label assignments below act on an independent frame, not a slice of pol_data.
    df = df.loc[keep].drop_duplicates(subset=['Account_ID']).copy()

    # Order matters: a name containing both words is relabelled Republican first and
    # therefore no longer matches the Democrat test.
    df.loc[df['Political_party'].str.contains('Republican'), 'Political_party'] = 'Republican Party'
    df.loc[df['Political_party'].str.contains('Democrat'), 'Political_party'] = 'Democratic Party'
    df.to_csv(out_path, index=False)

# Read the original Kaggle export and run the party filter on it;
# drop_third_party writes its result to Data\Dem_Rep_only.csv.
data = pd.read_csv('Data\\Original_kaggle_dataset.csv')
drop_third_party(data)

In [7]:
def check_if_active_and_valid(user_id: str, api_client=None,
                              start_time: str = '2022-10-01T00:00:00Z') -> bool:
    """Return True if the account has at least one tweet on or after ``start_time``.

    Returns False when the lookup returns no tweets, and also when the API call
    raises an ordinary exception (for example for a bad, non-numeric ID).

    Parameters
    ----------
    user_id : str
        Twitter account ID to look up.
    api_client : optional
        Object exposing ``get_users_tweets``; defaults to the notebook-level ``client``.
    start_time : str, optional
        Timestamp passed to the API as the earliest tweet time
        (default: October 1, 2022).

    Notes
    -----
    Any API failure is reported as "not active", so a transient error
    (e.g. rate limiting) is indistinguishable from an inactive account here.
    """
    if api_client is None:
        api_client = client
    try:
        check = api_client.get_users_tweets(id=user_id, max_results=10, start_time=start_time)
    except Exception:
        # Deliberately broad, but no longer a bare `except:` so that
        # KeyboardInterrupt / SystemExit still stop a long-running scan.
        return False
    return check.data is not None

The next cell won't be runnable without a Bearer Token.  Its purpose is to filter our dataset down so as to only include accounts that have been active since Oct 1, 2022.  As it turns out, it is quite common for a politician to have abandoned one or more Twitter accounts.  Maybe they forgot their password.  When we ran this function in mid-November of 2022 it filtered out nearly half of our rows, producing the dataset we called "cleanest_politicians.csv".  Note that this function is in no way certain to return the same thing from one day to the next.

In [None]:
def pol_data_cleaner_by_last_active(df):
    """Return the rows of ``df`` whose account passes ``check_if_active_and_valid``.

    Rows containing any missing value are discarded first.  Each remaining
    'Account_ID' is converted to a string and checked once; only rows whose
    check returned True are kept.  The temporary 'Active' column is removed
    before returning.
    """
    complete_rows = df.dropna(axis=0, how='any')

    activity_flags = []
    for account_id in complete_rows['Account_ID']:
        activity_flags.append(check_if_active_and_valid(str(account_id)))

    flagged = complete_rows.assign(Active=activity_flags)
    still_active = flagged[flagged['Active'] == True]
    return still_active.drop(columns=['Active'])

# Reduce the Democrat/Republican account list to accounts that pass the
# activity check (one Twitter API lookup per account) and save the result.
data = pd.read_csv('Data\\Dem_Rep_only.csv')
clean = pol_data_cleaner_by_last_active(data)
clean.to_csv('Data\\cleanest_politicians.csv', index=False)

This next function is the one we used to get 1000 tweets per Account ID.  Initially, we stored our data in a collection of CSV files (one per politician).  It made it easier to manually inspect the data we were getting.  The data for each politician is stored in this project under Data\politician_csvs\{ID}-{Name}, but we combined the data into one CSV called "test_party_IDs.csv" for training/testing the model itself.  Note: sometimes the Twitter API failed to get all 1000 tweets from a particular politician.  Those politicians were removed from the analysis.

In [None]:
def get_paged_tweets(user_id, limit=1000):
    """Fetch up to ``limit`` non-retweet tweets for one account as a DataFrame.

    Parameters
    ----------
    user_id
        Twitter account ID whose timeline is paged through.
    limit : int, optional
        Maximum number of tweets to collect (default 1000).

    Returns
    -------
    pandas.DataFrame
        Columns 'TweetID', 'UserID', 'Text', one row per tweet, in the order the
        paginator yields them.  Newlines in the tweet text are replaced by
        spaces so each tweet stays on one line.  Nothing is written to disk
        here; the caller saves the frame.
    """
    pages = tweepy.Paginator(client.get_users_tweets, user_id, exclude='retweets',
                             max_results=100)
    rows = []
    for tweet in pages.flatten(limit=limit):
        # Flatten multi-line tweets onto a single line.
        text = tweet.text.replace('\n', ' ')
        rows.append([tweet.id, user_id, text])
    # Build the frame once instead of appending row by row, which re-allocates on every insert.
    return pd.DataFrame(rows, columns=['TweetID', 'UserID', 'Text'])

# Download one CSV of tweets per active politician, skipping accounts whose
# file already exists so the cell can be re-run without repeating API calls.
df = pd.read_csv('Data\\cleanest_politicians.csv')
ID_list = list(df['Account_ID'])
name_list = list(df['Name'])


for account_id, name in zip(ID_list, name_list):
    # File name pattern: "<Account_ID>-<Name>.csv"
    file = Path('Data\\politician_csvs\\{}-{}.csv'.format(account_id, name))
    if file.exists():
        print("passed")
    else:
        print(file)
        # Use a separate name so the politicians table in `df` is not overwritten.
        tweets_df = get_paged_tweets(account_id)
        # The index is written on purpose: the combine step reads these files with index_col=0.
        tweets_df.to_csv(file)

The next cell filters out politicians that the Twitter API couldn't fetch all 1000 tweets for.  All politicians that we could get 1000 tweets from are moved to another folder: "1000_only".

In [None]:
#Filter csvs such that only pol with X or more tweets included

def make_pol_csvs_min_only(csv_path: str, min_tweets: int,
                           src_dir: str = "Data\\politician_csvs",
                           dst_dir: str = "Data\\1000_only") -> bool:
    """Move one politician CSV to ``dst_dir`` if it holds at least ``min_tweets`` tweets.

    Parameters
    ----------
    csv_path : str
        File name (not a full path) of the CSV inside ``src_dir``.
    min_tweets : int
        Minimum number of rows (counted on the 'TweetID' column) required.
    src_dir : str, optional
        Folder containing the per-politician CSVs.
    dst_dir : str, optional
        Folder that receives qualifying files; created if it does not exist.

    Returns
    -------
    bool
        True if the file was moved.  False if it has too few tweets or a file
        with the same name already exists in ``dst_dir`` (the source is then
        left where it is).

    Notes
    -----
    The file is moved with ``os.rename``, not copied.  The row count is
    printed for every file that meets the threshold.
    """
    src = os.path.join(src_dir, csv_path)
    dst = os.path.join(dst_dir, csv_path)

    df = pd.read_csv(src, lineterminator='\n')
    n_tweets = len(df['TweetID'])
    if n_tweets < min_tweets:
        return False

    print(n_tweets)
    if os.path.exists(dst):
        return False

    # os.rename fails if the destination folder is missing, so create it first.
    os.makedirs(dst_dir, exist_ok=True)
    os.rename(src, dst)
    return True

# Apply the 1000-tweet threshold to every file currently in the per-politician folder.
# The directory listing is taken once, before any file is moved.
pol_csvs = os.listdir("Data\\politician_csvs")
for i in range(0, len(pol_csvs)):
    make_pol_csvs_min_only(pol_csvs[i], 1000)

Next we combine all Tweets into one csv.

In [None]:
def combine_tweet_csvs(csv_dir, out_path='Data\\full_csv_test.csv'):
    """Concatenate every per-politician CSV in ``csv_dir`` into one CSV file.

    Each input file is read with its first column as the index.  Its 'UserID'
    column is overwritten with the part of the file name before the first "-".
    Entries of ``csv_dir`` that do not end in ".csv" are skipped.

    Parameters
    ----------
    csv_dir : str
        Folder containing files named "<UserID>-<Name>.csv".
    out_path : str, optional
        Destination of the combined CSV (written without the index).

    Returns
    -------
    None
        The combined table is only written to ``out_path``.  An empty folder
        produces an empty table.
    """
    frames = []
    for pol in os.listdir(csv_dir):
        if not pol.lower().endswith('.csv'):
            continue
        temp_df = pd.read_csv(os.path.join(csv_dir, pol), lineterminator='\n', index_col=0)
        temp_df['UserID'] = pol.split("-")[0]
        frames.append(temp_df)

    # Concatenate once, not once per file.  The list is reversed so the row order
    # matches a combine that prepends each newly read file.
    full_df = pd.concat(frames[::-1]) if frames else pd.DataFrame()
    full_df.to_csv(out_path, index=False)


# Merge every per-politician CSV in the 1000_only folder into a single file.
combine_tweet_csvs("Data\\1000_only")

Now we just have to add ground truth labels and the data will be ready for BERT.  The BERT tokenizer takes care of a lot of text processing jobs for us.  

In [44]:
def add_labels(pre_data_path, label_path='Data\\Dem_Rep_only.csv',
               out_path='Data\\test_full_w_party.csv'):
    """Add a 'Party' column to the combined tweet table and save it.

    The party for each row is looked up by matching the row's 'UserID' against
    the integer value of 'Account_ID' in the label file.  Rows with no matching
    account get a missing 'Party'.

    Parameters
    ----------
    pre_data_path : str
        CSV of tweets containing a 'UserID' column.
    label_path : str, optional
        CSV with 'Account_ID' and 'Political_party' columns.
    out_path : str, optional
        Destination of the labelled CSV (written without the index).

    Returns
    -------
    None
        The shape of the labelled table is printed and the table is written
        to ``out_path``.
    """
    pre_df = pd.read_csv(pre_data_path)
    label_source = pd.read_csv(label_path)

    party_by_id = {}
    for raw_id, party in zip(label_source['Account_ID'], label_source['Political_party']):
        # The label file has not been filtered through the Twitter API, so it can
        # still contain IDs that are not numeric; those are skipped.  Only
        # conversion errors are ignored, so real problems are no longer hidden.
        try:
            party_by_id[int(raw_id)] = party
        except (TypeError, ValueError, OverflowError):
            continue

    # A single vectorised lookup replaces one full-column comparison per account.
    # It also creates 'Party' directly with the right dtype instead of writing
    # strings into a float NaN column.
    pre_df['Party'] = pre_df['UserID'].map(party_by_id)
    print(pre_df.shape)
    pre_df.to_csv(out_path, index=False)



# Attach party labels to the combined tweet file; the shape of the result is printed.
add_labels('Data\\full_csv_test.csv')



(745000, 4)
