In [163]:
import pandas as pd
import re

## Read in and process datafile with themes/codes

In [164]:
# Load the coded tweets. keep_default_na=False keeps blank cells as empty
# strings (and literal "NA" as text) instead of converting them to NaN; the
# string joins and the 'NA' comparisons further down rely on that.
df = pd.read_csv("Final Coding.csv", keep_default_na=False)


def _normalize_codes(raw):
    """Return the ';'-separated codes in ``raw`` stripped, sorted and re-joined.

    Tokens are stripped *before* empties are discarded, so a whitespace-only
    token (e.g. the middle of "A; ;B") does not survive as an empty code.
    Sorting gives one canonical spelling for the same set of codes.
    """
    tokens = (token.strip() for token in raw.split(';'))
    return ';'.join(sorted(token for token in tokens if token))


# The codes live in five separate columns; combine them into one canonical
# ';'-separated string per tweet.
df["all codes"] = (
    df[["SO code", "BC code", "BtC code", "CCM code", "SJEDI code"]]
    .agg(';'.join, axis=1)
    .apply(_normalize_codes)
)

# Attach the JMM attendance information to every tweet. A left join keeps
# tweets whose author has no row in Names.csv (their attendance columns are NaN).
df_names = pd.read_csv("Names.csv")
df = pd.merge(df, df_names, how='left', left_on='user_username', right_on='User Name')

# Shorter, uniform names for the attendance columns
jmm_column_names = {
    'In JMM 2020 Program?': 'At JMM 2020',
    'In JMM 2021 Program': 'At JMM 2021',
    'Too anonymous to determine': 'Anonymous',
}
df = df.rename(columns=jmm_column_names)

# Sanity check on the merge: number of distinct display names.
# NOTE(review): unique() counts NaN as a value, so unmatched authors (if any)
# add one to this number -- confirm that is intended.
n_display_names = len(df['Display Name'].unique())
print('There were ' + str(n_display_names) + ' users in the dataset')

# Extract every @mention from the tweet text as a list (empty list if none).
# The pattern is a raw string so that \B and \w reach the regex engine
# unchanged; in a normal string literal they are invalid escape sequences.
# \B requires that the '@' is not preceded by a word character, which skips
# things like e-mail addresses.
df['Mentions'] = df['text'].apply(lambda s: re.findall(r"\B@\w+", s))

# Split the canonical 'all codes' string back into one list of codes per tweet
code_lists = df['all codes'].str.split(';')

# Every distinct code that occurs anywhere in the dataset
codes = code_lists.explode().unique()

# One boolean column per code: True when the tweet carries exactly that code.
# Membership is tested against the split list, not the joined string, so a
# code that happens to be a substring of another code is not a false match.
# NOTE(review): a tweet with no codes yields the single token '', so '' shows
# up in `codes` and gets its own column, True only for uncoded tweets.
for code in codes:
    df[code] = code_lists.apply(lambda tokens, code=code: code in tokens)


# Lookup tables used to resolve ids to usernames:
#   user_dict : author_id -> user_username
#   tweet_dict: tweet_id  -> user_username of the tweet's author

# Collapse to one row per (username, author id) pair before building the dict
users = df.loc[:, ['user_username', 'author_id']].drop_duplicates()
user_dict = {
    author_id: username
    for username, author_id in zip(users['user_username'], users['author_id'])
}

tweet_dict = {
    tweet_id: username
    for tweet_id, username in zip(df['tweet_id'], df['user_username'])
}


#Lookup functions for tweetids and userids
def tweeter_lookup(tweetid, lookup=None):
    """Return the username of the author of tweet ``tweetid``.

    Parameters
    ----------
    tweetid : str or int
        Tweet id; an empty string means "no source tweet".
    lookup : dict, optional
        Mapping of integer tweet id -> username. Defaults to the
        notebook-level ``tweet_dict``.

    Returns
    -------
    str
        The author's username, or "" when the id is empty, is not a valid
        integer (e.g. a missing-value marker), or is not in the mapping.
    """
    if tweetid == "":
        return ""
    table = tweet_dict if lookup is None else lookup
    try:
        key = int(tweetid)
    except (TypeError, ValueError, OverflowError):
        # Non-numeric id: treat it like an unknown tweet instead of crashing
        return ""
    return table.get(key, "")
    
def lookup_username(s, lookup=None):
    """Return the username belonging to user id ``s``.

    Parameters
    ----------
    s : str or int
        User id; an empty string means "no user".
    lookup : dict, optional
        Mapping of integer user id -> username. Defaults to the
        notebook-level ``user_dict``.

    Returns
    -------
    str
        "" for an empty id, the username when the id is known, and the
        marker string "Error with id <s>" when the id is unknown or is not a
        valid integer.
    """
    if s == '':
        return ''
    table = user_dict if lookup is None else lookup
    try:
        key = int(s)
    except (TypeError, ValueError, OverflowError):
        # Non-numeric id: report it with the same marker as an unknown id
        return "Error with id " + str(s)
    if key in table:
        return table[key]
    return "Error with id " + str(s)

# Receivers for replies: take the replied-to user id ('NA' marks "not a
# reply" and becomes ''), then translate the id into a username.
# The column is assigned directly; df[col].mask(..., inplace=True) is a
# chained in-place update that pandas deprecates and that does not modify df
# once Copy-on-Write is enabled.
reply_ids = df['in_reply_to_user_id']
df['Receiver'] = reply_ids.where(reply_ids != 'NA', '').apply(lookup_username)

# Receivers for quote tweets and retweets: the receiver is the author of the
# source tweet. Columns are assigned directly rather than through
# df[col].mask(..., inplace=True), which is a chained in-place update that
# does not modify df under pandas Copy-on-Write.
is_shared = df['sourcetweet_type'].isin(['quoted', 'retweeted'])

# Source tweet id for quotes/retweets ('' otherwise), resolved to its author
df['sourcetweeter'] = df['sourcetweet_id'].where(is_shared, '').apply(tweeter_lookup)

# A resolved source-tweet author takes precedence over the reply receiver
df['Receiver'] = df['Receiver'].mask(df['sourcetweeter'] != "", df['sourcetweeter'])


# Time binning
# First ten characters of the timestamp.
# NOTE(review): the string comparisons below assume created_at starts with an
# ISO date (YYYY-MM-DD), which orders correctly as text -- confirm.
df['date'] = df['created_at'].astype(str).str[:10]

time_bins = ['Pre JMM 2020', 'JMM 2020', 'Inter JMM', 'JMM 2021', 'Post JMMs']
# JMM 2020 time interval: Jan 12 -- Jan 25
# JMM 2021 time interval: Jan 3 -- Jan 16
# One boolean column per bin. The comparison result is assigned directly;
# df[col].mask(..., inplace=True) is a chained in-place update that does not
# modify df under pandas Copy-on-Write.
df[time_bins[0]] = df['date'] < '2020-01-12'
df[time_bins[1]] = df['date'].between('2020-01-12', '2020-01-25')   # inclusive
df[time_bins[2]] = (df['date'] > '2020-01-25') & (df['date'] < '2021-01-03')
df[time_bins[3]] = df['date'].between('2021-01-03', '2021-01-16')   # inclusive
df[time_bins[4]] = df['date'] > '2021-01-16'

# The helper column was only needed to fill in 'Receiver'; remove it
df = df.drop('sourcetweeter', axis='columns')

# Save the fully processed frame (row index included)
df.to_csv("all-data.csv")


There were 327 users in the dataset


## Codes by time bucket

In [165]:
# Count how many tweets carry each code within each time bin.
# Only the code columns of the rows in the bin are summed. Summing the whole
# frame first would also try to add up text and list columns, and selecting
# the `True` group would fail for a bin that contains no tweets; here an
# empty bin simply gives zeros.
codes_by_time = [
    df.loc[df[time_bin], codes].sum().rename(time_bin)
    for time_bin in time_bins
]

# One row per time bin, one column per code
cbtdf = pd.DataFrame(codes_by_time)
cbtdf.index.name = 'times'

# Transpose for easier reading: codes as rows, time bins as columns
cbtdf.transpose().to_csv("codes-by-time-bucket.csv")

## Generate detailed edge list for people network

In [166]:
# The sender of every edge is the author of the tweet
df['Sender'] = df['user_username']

themes = ['Self-organization','Building community','Broadening the counterpublic','Creating change in math','SJEDI']
# Convert themes 1/0 into True/False
# NOTE(review): this is a truthiness conversion; if a theme column was read
# as text, the string '0' would become True -- confirm the columns are numeric.
for column in themes:
    df[column] = df[column].astype(bool)

# Keep only tweets that have a receiver ...
has_receiver = df['Receiver'] != ''
# ... and whose receiver id could be resolved. Failed lookups are marked with
# the exact prefix "Error with id "; matching that prefix (instead of any
# occurrence of "Error") keeps real usernames that merely contain "Error".
is_lookup_error = df['Receiver'].str.startswith('Error with id ', na=False)
edges_verbose = df[has_receiver & ~is_lookup_error].copy()

# Only the columns we need to make it a bit more manageable
cols = ['Sender', 'Receiver', 'text', 'created_at'] + themes + codes.tolist() + time_bins

edges_verbose[cols].to_csv("people-edge-list.csv")



## People network with mention edge list

In [167]:
# 'Mentions' holds a list per tweet; explode to one row per mentioned account
mention_df = df.explode('Mentions')

# Tweets that mention nobody explode to NaN; drop those rows. The explicit
# copy makes the result an independent frame, so the column assignments below
# do not raise SettingWithCopyWarning.
mention_df = mention_df[mention_df['Mentions'].notna()].copy()

# Mentions carry a leading '@'; strip it to match the format of 'Receiver'
mention_df['Mentions'] = mention_df['Mentions'].str[1:]

# A mention of the tweet's receiver is already represented as an edge in the
# people edge list, so drop it from this set
mention_df = mention_df[mention_df['Mentions'] != mention_df['Receiver']].copy()

# In this edge list the receiver of each edge is the mentioned account
mention_df['Receiver'] = mention_df['Mentions']

mention_df[cols].to_csv("people-nodes-mention-edge-list.csv")
