# Senator and Candidates Tweet NLP Analysis

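After pulling 100 recent tweets for all the senators and Senate candidates we had on file and saving that data to a compressed JSON file, this notebook conducts a named-entity recognition (NER) analysis and then relates tweets to one another based on the NER results.  So if a pair of tweets mentions the same person, then we'll have a "same_person"-style edge between them (in the edge list built below, this is recorded with the relation `PERSON` and the person's name as the edge label).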

In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
from spacy import tokenizer
import compress_json
import re
import warnings
warnings.filterwarnings("ignore")

2022-11-29 18:43:47.596251: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Open the compressed json senator/candidate tweets

In [2]:
# Load the two gzip-compressed JSON tweet dumps from the working directory.
# As the lookups in the following cells show, each object is indexed first by
# Twitter handle and then by position, and each tweet is a dict with at least
# a 'text' entry.
direct_tweets = compress_json.load('politician_direct_tweets.json.gz')
mention_tweets = compress_json.load('politician_mention_tweets.json.gz')

In [3]:
direct_tweets.keys()

dict_keys(['realDonaldTrump', 'JoeBiden', 'TTuberville', 'SenShelby', 'lisamurkowski', 'SenDanSullivan', 'SenatorSinema', 'CaptMarkKelly', 'Boozman4AR', 'sentomcotton', 'KamalaHarris', 'alexpadilla4ca', 'SenFeinstein', 'MichaelBennet', 'Hickenlooper', 'ChrisMurphyCT', 'SenBlumenthal', 'SenatorCarper', 'ChrisCoons', 'marcorubio', 'SenatorTimScott', 'kloeffler', 'ossoff', 'maziehirono', 'SenBrianSchatz', 'MikeCrapo', 'SenatorRisch', 'SenatorDurbin', 'senduckworth', 'ToddYoungIN', 'braun4indiana', 'ChuckGrassley', 'SenJoniErnst', 'RogerMarshallMD', 'JerryMoran', 'RandPaul', 'BillCassidy', 'JohnKennedyLA', 'SenAngusKing', 'senatorcollins', 'SenatorCardin', 'SenMarkey', 'ewarren', 'SenGaryPeters', 'SenStabenow', 'TinaSmithMN', 'SenatorWicker', 'SenHydeSmith', 'RoyBlunt', 'HawleyMO', 'SteveDaines', 'SenatorTester', 'SenatorFischer', 'SenSasse', 'RosenforNevada', 'SenatorShaheen', 'maggie_hassan', 'SenatorMenendez', 'CoryBooker', 'MartinHeinrich', 'SenSchumer', 'SenGillibrand', 'SenatorBurr',

In [4]:
direct_tweets['JoeBiden'][0]['text']

"We've taken steps to lock in vital infrastructure investments—delivering lower costs for clean energy, spurring good-paying union jobs for American workers, and advancing environmental justice in our communities. \n\nWe are proving that good climate policy is good economic policy."

In [5]:
direct_tweets['JoeBiden'][0]['public_metrics']

{'retweet_count': 2168,
 'reply_count': 3848,
 'like_count': 10301,
 'quote_count': 135}

In [6]:
mention_tweets['JoeBiden'][0]['text']

'@BarackObama @JoeBiden https://t.co/abBm6MCwwK'

## Conduct an NER analysis on all the tweets so that we can then use that information to build relationships among tweets

This will use spaCy's transformer pipeline (`en_core_web_trf`) to tag the tweets.

In [7]:
# Load the spaCy pipeline once at notebook level; build_tweet_info_tables
# below reads this module-level `nlp` object when it tags each tweet.
nlp = spacy.load('en_core_web_trf')

### NER of the Direct Tweet Data

This function runs NER on every tweet and also collects each tweet's public metrics (retweets, replies, likes and quotes), which are later saved as node attributes.

In [8]:
def build_tweet_info_tables(twitter_data, nlp_model = None):
    """Build a tweet-attribute table and an NER table from raw tweet data.

    Each tweet's text is cleaned (URLs removed, newlines removed, outer
    whitespace stripped). A tweet is skipped entirely -- no attribute row and
    no NER rows -- when its text is missing, when the cleaned text has fewer
    than five space-separated tokens, or when any public metric is missing.

    Parameters
    ----------
    twitter_data : dict
        Maps a Twitter handle to a list of tweet dicts. Each tweet dict is
        expected to hold a 'text' string and a 'public_metrics' dict with the
        keys 'retweet_count', 'reply_count', 'like_count' and 'quote_count'.
    nlp_model : callable, optional
        Called with the cleaned tweet text; must return an object whose
        ``ents`` items expose ``label_`` and ``text``. When omitted, the
        notebook-level ``nlp`` object is used.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``tweet_attribs_table`` with columns TwitterID, Tweet, Retweets,
        Replies, Likes, Quotes (one row per kept tweet), and ``ner_table``
        with columns TwitterID, Tweet, NER_Lable, Text (one row per entity).
        Both frames are empty, but carry their columns, when nothing is kept.
    """
    if nlp_model is None:
        nlp_model = nlp

    attrib_columns = ['TwitterID', 'Tweet', 'Retweets', 'Replies', 'Likes', 'Quotes']
    ner_columns = ['TwitterID', 'Tweet', 'NER_Lable', 'Text']

    # Plain row tuples are collected and turned into DataFrames once at the
    # end, instead of creating and concatenating one DataFrame per tweet.
    attrib_rows = []
    ner_rows = []

    for cand, tweets in twitter_data.items():
        for data in tweets:
            # Only "field is absent / wrong type" is treated as a skippable
            # record; anything else (including KeyboardInterrupt) propagates.
            try:
                tweet = re.sub(r'http\S+', '', data['text'])
            except (KeyError, TypeError):
                continue
            tweet = tweet.replace('\n', '').rstrip().lstrip()
            if len(tweet.split(" ")) < 5:
                continue

            try:
                metrics = data['public_metrics']
                retweets = metrics['retweet_count']
                replies = metrics['reply_count']
                likes = metrics['like_count']
                quotes = metrics['quote_count']
            except (KeyError, TypeError):
                continue
            attrib_rows.append((cand, tweet, retweets, replies, likes, quotes))

            # Conduct NER
            doc = nlp_model(tweet)
            ner_rows.extend((cand, tweet, ent.label_, ent.text) for ent in doc.ents)

    tweet_attribs_table = pd.DataFrame(attrib_rows, columns = attrib_columns)
    ner_table = pd.DataFrame(ner_rows, columns = ner_columns)

    return(tweet_attribs_table, ner_table)
    

In [40]:
direct_tweet_attribs, direct_tweet_ners = build_tweet_info_tables(direct_tweets)

In [None]:
import pickle

with open('direct_tweet_attribs.pkl', 'wb') as handle:
    pickle.dump(direct_tweet_attribs, handle)

In [42]:
with open('direct_tweet_ners.pkl', 'wb') as handle:
    pickle.dump(direct_tweet_ners, handle)

In [43]:
direct_tweet_attribs.head()

Unnamed: 0,TwitterID,Tweet,Retweets,Replies,Likes,Quotes
0,realDonaldTrump,"To all of those who have asked, I will not be ...",72710,302322,551172,107559
1,realDonaldTrump,"The 75,000,000 great American Patriots who vot...",81205,134047,459591,31449
2,realDonaldTrump,I am asking for everyone at the U.S. Capitol t...,127336,376014,657602,70873
3,realDonaldTrump,Please support our Capitol Police and Law Enfo...,82977,187917,506808,41599
4,realDonaldTrump,These scoundrels are only toying with the @sen...,32160,45897,183194,3889


In [44]:
direct_tweet_ners.head()

Unnamed: 0,TwitterID,Tweet,NER_Lable,Text
0,realDonaldTrump,"To all of those who have asked, I will not be ...",EVENT,Inauguration
1,realDonaldTrump,"To all of those who have asked, I will not be ...",DATE,January 20th
2,realDonaldTrump,"The 75,000,000 great American Patriots who vot...",CARDINAL,75000000
3,realDonaldTrump,"The 75,000,000 great American Patriots who vot...",NORP,American
4,realDonaldTrump,"The 75,000,000 great American Patriots who vot...",NORP,Patriots


In [45]:
len(direct_tweet_attribs)

9947

In [46]:
len(direct_tweet_ners)

29148

In [47]:
direct_tweet_attribs.to_csv("direct_tweet_attribs.csv", index = False)
direct_tweet_ners.to_csv("direct_tweet_ners.csv", index = False)

### When resuming later, load the saved data back in from CSV

In [3]:
direct_tweet_attribs = pd.read_csv("direct_tweet_attribs.csv")
direct_tweet_ners = pd.read_csv("direct_tweet_ners.csv")

In [4]:
direct_tweet_ners.groupby('NER_Lable').size()

NER_Lable
CARDINAL       1418
DATE           4967
EVENT           700
FAC             414
GPE            5271
LANGUAGE          6
LAW             970
LOC             378
MONEY           574
NORP           3198
ORDINAL         451
ORG            5103
PERCENT         249
PERSON         4365
PRODUCT          94
QUANTITY         79
TIME            691
WORK_OF_ART     220
dtype: int64

In [5]:
direct_tweet_ners[(direct_tweet_ners['NER_Lable'] == 'LAW') & (direct_tweet_ners['TwitterID'] == 'SenFeinstein')]

Unnamed: 0,TwitterID,Tweet,NER_Lable,Text
3236,SenFeinstein,I was proud to introduce the Respect for Marri...,LAW,the Respect for Marriage Act
3260,SenFeinstein,We’re on the cusp of removing the discriminato...,LAW,Defense of Marriage Act
3261,SenFeinstein,We can’t let the rights of married couples and...,LAW,the Respect for Marriage Act
3264,SenFeinstein,We can’t let the rights of married couples and...,LAW,Obergefell
3272,SenFeinstein,In the year since President Biden signed the B...,LAW,the Bipartisan Infrastructure Law
3276,SenFeinstein,RT @RepJerryNadler: This week the Senate will ...,LAW,my Respect for Marriage Act-
3283,SenFeinstein,It’s time to remove the shameful Defense of Ma...,LAW,Defense of Marriage Act
3286,SenFeinstein,I want to thank Majority Leader Schumer for an...,LAW,the Respect for Marriage Act
3342,SenFeinstein,It’s been 28 years since President Clinton sig...,LAW,the California Desert Protection Act
3389,SenFeinstein,From raising the age to purchase assault weapo...,LAW,the Assault Weapons Ban


Drop the `realDonaldTrump` tweets, since that account's Twitter data is older than the rest.

In [6]:
# Keep every row whose handle is not realDonaldTrump, in both tables.
direct_tweet_ners = direct_tweet_ners.loc[direct_tweet_ners['TwitterID'].ne('realDonaldTrump')]
direct_tweet_attribs = direct_tweet_attribs.loc[direct_tweet_attribs['TwitterID'].ne('realDonaldTrump')]

In [7]:
len(direct_tweet_ners)

28995

### Select the NER labels we want to keep for the graph building

In [8]:
ner_lables = ['EVENT', 'FAC', 'GPE', 'LAW', 'LOC', 'NORP', 'ORG', 'PERSON']

Filter to only include NER labels we want

In [9]:
select_ner_tweets = direct_tweet_ners[(direct_tweet_ners['NER_Lable'].isin(ner_lables))]

Remove special characters from text

In [10]:
# Normalise the entity text so spelling variants of one entity compare equal:
#   1. every run of characters other than ASCII letters and spaces becomes a
#      single space (this also removes digits, punctuation and accented letters),
#   2. runs of spaces collapse to one space,
#   3. leading and trailing whitespace is stripped.
# regex = True is passed explicitly: without it, newer pandas versions treat the
# pattern as a literal string and nothing would be replaced.
# .assign() builds a new frame rather than writing into the filtered slice, and
# the .str accessor leaves missing values as NaN instead of raising.
select_ner_tweets = select_ner_tweets.assign(
    Text = select_ner_tweets["Text"]
    .str.replace(r"[^a-zA-Z ]+", " ", regex = True)
    .str.replace(r" +", " ", regex = True)
    .str.strip()
)

In [11]:
law_ners = select_ner_tweets[(select_ner_tweets['NER_Lable'] == 'LAW')]

In [12]:
law_ners.groupby("Text").size().reset_index(name = 'Count').sort_values(by = 'Count', ascending = False)

Unnamed: 0,Text,Count
351,the Inflation Reduction Act,79
401,the Respect for Marriage Act,44
383,the PACT Act,38
283,the Bipartisan Infrastructure Law,34
167,Roe v Wade,26
...,...,...
163,RespectforMarriageAct I,1
162,RespectforMarriage Act,1
160,Respe,1
158,RI,1


In [13]:
gpe_ners = select_ner_tweets[(select_ner_tweets['NER_Lable'] == 'GPE')]

In [14]:
gpe_ners.groupby("Text").size().reset_index(name = 'Count').sort_values(by = 'Count', ascending = False)

Unnamed: 0,Text,Count
20,America,355
842,U S,245
907,Washington,173
860,Ukraine,143
142,China,112
...,...,...
413,Keene,1
414,Kennesaw,1
415,Kennett,1
416,Kennywood,1


In [15]:
event_ners = select_ner_tweets[(select_ner_tweets['NER_Lable'] == 'EVENT')]

In [16]:
event_ners.groupby("Text").size().reset_index(name = 'Count').sort_values(by = 'Count', ascending = False)

Unnamed: 0,Text,Count
36,COP,19
71,Diwali,18
369,WWII,15
138,Hurricane Ian,14
381,World War II,12
...,...,...
167,KeepKidsSafe Gun BuyBack,1
166,Kansas Conservation Tour,1
165,JPSS Launch,1
164,Iraq War,1


In [17]:
# One filtered frame per remaining label, unpacked in the same order as the
# label tuple below.
fac_ners, loc_ners, norp_ners, org_ners, person_ners = (
    select_ner_tweets[(select_ner_tweets['NER_Lable'] == label)]
    for label in ('FAC', 'LOC', 'NORP', 'ORG', 'PERSON')
)

In [18]:
# Print, for each label frame in turn, how often every entity text occurs,
# most frequent first.
for label_frame in (fac_ners, loc_ners, norp_ners, org_ners, person_ners):
    text_counts = label_frame.groupby("Text").size().reset_index(name = 'Count')
    print(text_counts.sort_values(by = 'Count', ascending = False))

                                 Text  Count
26                            Capitol     14
314                   the White House      8
112                       Main Street      6
292                   the Silver Line      5
303        the Tree of Life Synagogue      5
..                                ...    ...
111             MacroTechnology Works      1
110              LuLu Ross Elementary      1
109                        Long Wharf      1
108                   Lizards Thicket      1
321  theGrandFarm Innovation Facility      1

[322 rows x 2 columns]
                   Text  Count
48               Europe     13
230     the Middle East      9
38                Earth      6
107            Normandy      6
10               Arctic      6
..                  ...    ...
105  Newburyport Harbor      1
108            North AL      1
109        North Africa      1
110         North Idaho      1
263    upstate New York      1

[264 rows x 2 columns]
                     Text  Count
15     

### Filter to only include NER label texts that appear in at least three tweets

In [19]:
def filter_ner_results(twitter_ners, ner_lables, min_count = 3):
    """Keep only the NER rows whose entity text is mentioned often enough.

    Fully duplicated rows are dropped first, so every distinct row counts
    once. Then, separately for each label in ``ner_lables``, an entity text
    is kept only if at least ``min_count`` rows with that label carry it.

    Parameters
    ----------
    twitter_ners : pandas.DataFrame
        NER table with at least the columns 'NER_Lable' and 'Text'.
    ner_lables : iterable of str
        Labels to process; rows with any other label are discarded. The
        output is ordered label by label in this order.
    min_count : int, optional
        Minimum number of rows an entity text needs within its label to be
        kept. Defaults to 3.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with the same columns as ``twitter_ners``. The
        frame is empty (columns preserved) when ``ner_lables`` is empty.
    """
    deduped = twitter_ners.drop_duplicates()

    kept_frames = []
    for ner in ner_lables:
        label_rows = deduped[(deduped['NER_Lable'] == ner)]
        text_counts = label_rows.groupby("Text").size()
        frequent_texts = text_counts.index[text_counts >= min_count]
        kept_frames.append(label_rows[(label_rows['Text'].isin(frequent_texts))])

    # pd.concat raises on an empty list, so handle "no labels requested" here.
    if not kept_frames:
        return deduped.iloc[0:0].reset_index(drop = True)

    kept_tweets = pd.concat(kept_frames, ignore_index = True)
    return(kept_tweets.drop_duplicates())

In [20]:
select_tweets_filtered = filter_ner_results(select_ner_tweets, ner_lables)

In [21]:
select_tweets_filtered = select_tweets_filtered.drop_duplicates()

In [22]:
select_tweets_filtered.head()

Unnamed: 0,TwitterID,Tweet,NER_Lable,Text
0,JoeBiden,"Election Day is tomorrow, folks. Confirm your ...",EVENT,Election Day
1,TTuberville,RT @SenTuberville: Honored to join Mr. Frank C...,EVENT,WWII
2,SenShelby,Happy Independence Day! Today we celebrate 246...,EVENT,Independence Day
3,SenShelby,Today I reviewed the FY23 request for the @USN...,EVENT,COVID
4,lisamurkowski,Native American Heritage Month is an opportuni...,EVENT,Native American Heritage Month


In [23]:
len(select_tweets_filtered)

12607

In [24]:
total_tweets = select_tweets_filtered.drop_duplicates(subset = ['TwitterID', 'Tweet'])
len(total_tweets)

6973

In [25]:
fac_ners = select_tweets_filtered[(select_tweets_filtered['NER_Lable'] == 'FAC')]
fac_ners.groupby("Text").size().reset_index(name = 'Count').sort_values(by = 'Count', ascending = False)

Unnamed: 0,Text,Count
0,Capitol,14
14,the White House,8
3,Main Street,6
10,the Silver Line,5
12,the Tree of Life Synagogue,5
4,Metro,4
6,Route,4
8,Tree of Life Synagogue,4
9,the National Mall,4
1,Club Q,3


## With the chosen data, create the graph and save it to a CSV file

### (1) Generate Node File

In [26]:
direct_tweet_attribs.head()

Unnamed: 0,TwitterID,Tweet,Retweets,Replies,Likes,Quotes
67,JoeBiden,We've taken steps to lock in vital infrastruct...,2168,3848,10301,135
68,JoeBiden,"RT @POTUS: When I think of Nancy Pelosi, I thi...",23121,0,0,0
69,JoeBiden,Today is the one-year anniversary of the Bipar...,4482,3271,23506,210
70,JoeBiden,One year after the Bipartisan Infrastructure L...,2303,1430,11499,94
71,JoeBiden,The difference between talking and delivering.,18605,15004,92301,2077


In [27]:
direct_tweet_attribs = direct_tweet_attribs.drop_duplicates(subset = ['TwitterID', 'Tweet'])

In [28]:
direct_tweet_attribs['Identifier'] = direct_tweet_attribs['TwitterID'] + "/" + direct_tweet_attribs['Tweet']

In [29]:
direct_tweet_attribs.head()

Unnamed: 0,TwitterID,Tweet,Retweets,Replies,Likes,Quotes,Identifier
67,JoeBiden,We've taken steps to lock in vital infrastruct...,2168,3848,10301,135,JoeBiden/We've taken steps to lock in vital in...
68,JoeBiden,"RT @POTUS: When I think of Nancy Pelosi, I thi...",23121,0,0,0,JoeBiden/RT @POTUS: When I think of Nancy Pelo...
69,JoeBiden,Today is the one-year anniversary of the Bipar...,4482,3271,23506,210,JoeBiden/Today is the one-year anniversary of ...
70,JoeBiden,One year after the Bipartisan Infrastructure L...,2303,1430,11499,94,JoeBiden/One year after the Bipartisan Infrast...
71,JoeBiden,The difference between talking and delivering.,18605,15004,92301,2077,JoeBiden/The difference between talking and de...


In [30]:
select_tweets_filtered['Identifier'] = select_tweets_filtered['TwitterID'] + "/" + select_tweets_filtered['Tweet']

In [31]:
select_tweets_filtered.head()

Unnamed: 0,TwitterID,Tweet,NER_Lable,Text,Identifier
0,JoeBiden,"Election Day is tomorrow, folks. Confirm your ...",EVENT,Election Day,"JoeBiden/Election Day is tomorrow, folks. Conf..."
1,TTuberville,RT @SenTuberville: Honored to join Mr. Frank C...,EVENT,WWII,TTuberville/RT @SenTuberville: Honored to join...
2,SenShelby,Happy Independence Day! Today we celebrate 246...,EVENT,Independence Day,SenShelby/Happy Independence Day! Today we cel...
3,SenShelby,Today I reviewed the FY23 request for the @USN...,EVENT,COVID,SenShelby/Today I reviewed the FY23 request fo...
4,lisamurkowski,Native American Heritage Month is an opportuni...,EVENT,Native American Heritage Month,lisamurkowski/Native American Heritage Month i...


In [32]:
# Distinct tweet identifiers that survived the NER filtering, in order of
# first appearance; the node list keeps only attribute rows for those tweets.
kept_tweets = list(select_tweets_filtered['Identifier'].drop_duplicates())
tweet_node_list = direct_tweet_attribs.loc[direct_tweet_attribs['Identifier'].isin(kept_tweets)]
len(tweet_node_list)

6973

In [33]:
tweet_node_list.head()

Unnamed: 0,TwitterID,Tweet,Retweets,Replies,Likes,Quotes,Identifier
67,JoeBiden,We've taken steps to lock in vital infrastruct...,2168,3848,10301,135,JoeBiden/We've taken steps to lock in vital in...
68,JoeBiden,"RT @POTUS: When I think of Nancy Pelosi, I thi...",23121,0,0,0,JoeBiden/RT @POTUS: When I think of Nancy Pelo...
69,JoeBiden,Today is the one-year anniversary of the Bipar...,4482,3271,23506,210,JoeBiden/Today is the one-year anniversary of ...
70,JoeBiden,One year after the Bipartisan Infrastructure L...,2303,1430,11499,94,JoeBiden/One year after the Bipartisan Infrast...
72,JoeBiden,"In this year’s midterm elections, we saw the s...",4822,6200,25236,306,"JoeBiden/In this year’s midterm elections, we ..."


In [34]:
# Number the kept tweets 0..N-1 by their position in kept_tweets.
kept_tweets_id = {tweet_key: position for position, tweet_key in enumerate(kept_tweets)}

# Swap the string identifier for its integer id in the node table. The keyword
# argument is evaluated against the frame that still holds the string column.
tweet_node_list = tweet_node_list.drop('Identifier', axis = 1).assign(
    Identifier = [kept_tweets_id[tweet_key] for tweet_key in tweet_node_list['Identifier']]
)

# Same replacement for the NER table, so both tables share one id space.
select_tweets_filtered = select_tweets_filtered.drop('Identifier', axis = 1).assign(
    Identifier = [kept_tweets_id[tweet_key] for tweet_key in select_tweets_filtered['Identifier']]
)

In [35]:
tweet_node_list.head()

Unnamed: 0,TwitterID,Tweet,Retweets,Replies,Likes,Quotes,Identifier
67,JoeBiden,We've taken steps to lock in vital infrastruct...,2168,3848,10301,135,3800
68,JoeBiden,"RT @POTUS: When I think of Nancy Pelosi, I thi...",23121,0,0,0,6267
69,JoeBiden,Today is the one-year anniversary of the Bipar...,4482,3271,23506,210,3404
70,JoeBiden,One year after the Bipartisan Infrastructure L...,2303,1430,11499,94,3405
72,JoeBiden,"In this year’s midterm elections, we saw the s...",4822,6200,25236,306,3801


In [36]:
select_tweets_filtered.head()

Unnamed: 0,TwitterID,Tweet,NER_Lable,Text,Identifier
0,JoeBiden,"Election Day is tomorrow, folks. Confirm your ...",EVENT,Election Day,0
1,TTuberville,RT @SenTuberville: Honored to join Mr. Frank C...,EVENT,WWII,1
2,SenShelby,Happy Independence Day! Today we celebrate 246...,EVENT,Independence Day,2
3,SenShelby,Today I reviewed the FY23 request for the @USN...,EVENT,COVID,3
4,lisamurkowski,Native American Heritage Month is an opportuni...,EVENT,Native American Heritage Month,4


In [37]:
select_tweets_filtered[(select_tweets_filtered['Identifier'] == 3404)]

Unnamed: 0,TwitterID,Tweet,NER_Lable,Text,Identifier
4278,JoeBiden,Today is the one-year anniversary of the Bipar...,LAW,the Bipartisan Infrastructure Law,3404
10300,JoeBiden,Today is the one-year anniversary of the Bipar...,PERSON,Eisenhower,3404


### (2) Generate the Tweet Relationship File

Tweets will be related ontologically through one of the eight NER labels.  If two tweets share the same NER label and text, such as both mentioning the EVENT entity "Native American Heritage Month", they will be linked with the ontology "EVENT" and the edge will be labeled with the NER text.

In [37]:
select_tweets_filtered[(select_tweets_filtered['NER_Lable'] == "FAC") & (select_tweets_filtered['Text'] == "the White House")]

Unnamed: 0,TwitterID,Tweet,NER_Lable,Text,Identifier
202,sentomcotton,"Joe Biden is hosting a ""unity"" summit today at...",FAC,the White House,191
205,MichaelBennet,Donald Trump is running for president again in...,FAC,the White House,193
208,ChrisCoons,Time to spruce🎄things up at the White House! A...,FAC,the White House,196
215,SenBrianSchatz,We enacted the biggest climate action in U.S. ...,FAC,the White House,203
224,ewarren,Ivory-tower economists and out-of-touch pundit...,FAC,the White House,212
233,kevincramer,Heading to the White House for ⁦@realDonaldTru...,FAC,the White House,220
243,LindseyGrahamSC,▶️ Biden Administration will not change course...,FAC,the White House,230
264,johnfetterman,PA proud. A beautiful 🎄 from Schuylkill County...,FAC,the White House,250


In [55]:
def convert_to_edgelist(ner_tweets):
    """Link every pair of tweets that mention the same entity.

    Two tweets are connected when they share the same (NER_Lable, Text)
    combination. Edges are directed and emitted in both directions: for
    tweets A and B sharing an entity, both A -> B and B -> A appear. A tweet
    is never linked to itself.

    The input frame is not modified; the helper key column is built on a
    private copy.

    Parameters
    ----------
    ner_tweets : pandas.DataFrame
        NER table with the columns 'Identifier' (tweet id), 'NER_Lable' and
        'Text'.

    Returns
    -------
    pandas.DataFrame
        Columns TweetID1, TweetID2, Relation (the NER label) and Lable (the
        entity text). Rows are ordered by the first appearance of TweetID1 in
        ``ner_tweets`` and, within one TweetID1, by the row order of the
        matching mention. The frame is empty when there are no edges.
    """
    edge_columns = ["TweetID1", "TweetID2", "Relation", "Lable"]

    # Private copy with a fresh positional index: the caller's frame stays
    # untouched and a non-unique input index cannot cause trouble.
    mentions = ner_tweets[['Identifier', 'NER_Lable', 'Text']].reset_index(drop = True)
    if mentions.empty:
        return pd.DataFrame(columns = edge_columns)

    mentions['NER/Text'] = mentions['NER_Lable'] + "/" + mentions['Text']
    # Bookkeeping for the output order: position of each mention row, and the
    # order in which each tweet id first appears (factorize numbers the
    # distinct values by first occurrence).
    mentions['_row'] = range(len(mentions))
    mentions['_rank'] = pd.factorize(mentions['Identifier'])[0]

    # Source side: each (tweet, entity) combination once.
    sources = (
        mentions[['Identifier', 'NER/Text', '_rank']]
        .drop_duplicates(subset = ['Identifier', 'NER/Text'])
        .rename(columns = {'Identifier': 'TweetID1'})
    )
    # Target side: every mention row.
    targets = (
        mentions[['Identifier', 'NER_Lable', 'Text', 'NER/Text', '_row']]
        .rename(columns = {'Identifier': 'TweetID2', 'NER_Lable': 'Relation', 'Text': 'Lable'})
    )

    # A single self-join on the entity key replaces the per-tweet scan of the
    # whole table and the per-edge single-row DataFrames.
    pairs = sources.merge(targets, on = 'NER/Text')
    pairs = pairs[pairs['TweetID1'] != pairs['TweetID2']]
    pairs = pairs.sort_values(['_rank', '_row'])

    edgelist = pairs[edge_columns].reset_index(drop = True)
    return(edgelist)

In [56]:
tweet_graph = convert_to_edgelist(select_tweets_filtered)

In [57]:
len(tweet_graph)

1664474

In [58]:
tweet_graph.head()

Unnamed: 0,TweetID1,TweetID2,Relation,Lable
0,0,101,EVENT,Election Day
1,0,143,EVENT,Election Day
2,1,7,EVENT,WWII
3,1,17,EVENT,WWII
4,1,59,EVENT,WWII


In [59]:
import pickle

with open('tweet_graph.pkl', 'wb') as handle:
    pickle.dump(tweet_graph, handle)

## Load in pickled graph file, save that and tweet node file to CSV

In [2]:
import pickle

with open('tweet_graph.pkl', 'rb') as handle:
    tweet_graph = pickle.load(handle)

In [38]:
tweet_graph.to_csv("political_tweet_graph.csv", index = False)
tweet_node_list.to_csv("political_tweets.txt", sep = '\t', index = False)