# Building the retweet network (part 2)

Here the list of tweets is converted into a list of edges (A -> B iff B retweeted A).

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import csv

In [2]:
# Peek at the first lines of the raw CSV (IPython shell escape; needs the `head` command).
!head network_tweets.csv

id,created_at,user_id,in_reply_to_status_id,retweeted_status_id,quoted_status_id
1340465356964585474,2020-12-20 01:13:40+00:00,398032197,,1340235113280512000,
1340465632177995776,2020-12-20 01:14:46+00:00,1038231816,,1340372051840950272,
1340465924932063236,2020-12-20 01:15:55+00:00,457762156,,1340309122730893312,
1340465989436268547,2020-12-20 01:16:11+00:00,1312802310830125057,,1340391589135454208,
1340466102380457984,2020-12-20 01:16:38+00:00,1030696837748060160,,1340408286730989568,
1340466213240135682,2020-12-20 01:17:04+00:00,1043948268932214786,1340457454975524864,,
1340466248782655488,2020-12-20 01:17:13+00:00,922839841,,1340311332906790912,
1340466401111388162,2020-12-20 01:17:49+00:00,1267671279718535174,,,1202520353836347392
1340466489401503749,2020-12-20 01:18:10+00:00,1249678210314502144,,,


In [3]:
# Load the tweets table.
# All id columns are read as strings; presumably this keeps the long Twitter
# ids intact even in columns that contain empty cells (TODO confirm intent).
col_dtypes = dict.fromkeys(
    ('id', 'user_id', 'retweeted_status_id', 'in_reply_to_status_id', 'quoted_status_id'),
    str,
)

tweets_df = pd.read_csv(
    "network_tweets.csv",
    parse_dates=['created_at'],
    dtype=col_dtypes,
    # nrows=1000000,  # For Debug: uncomment to load only a subset
)

In [4]:
# Summarize the loaded frame: row count, column dtypes and memory usage.
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12687029 entries, 0 to 12687028
Data columns (total 6 columns):
 #   Column                 Dtype              
---  ------                 -----              
 0   id                     object             
 1   created_at             datetime64[ns, UTC]
 2   user_id                object             
 3   in_reply_to_status_id  object             
 4   retweeted_status_id    object             
 5   quoted_status_id       object             
dtypes: datetime64[ns, UTC](1), object(5)
memory usage: 580.8+ MB


In [5]:
# Preview the loaded tweets.
tweets_df

Unnamed: 0,id,created_at,user_id,in_reply_to_status_id,retweeted_status_id,quoted_status_id
0,1340465356964585474,2020-12-20 01:13:40+00:00,398032197,,1340235113280512000,
1,1340465632177995776,2020-12-20 01:14:46+00:00,1038231816,,1340372051840950272,
2,1340465924932063236,2020-12-20 01:15:55+00:00,457762156,,1340309122730893312,
3,1340465989436268547,2020-12-20 01:16:11+00:00,1312802310830125057,,1340391589135454208,
4,1340466102380457984,2020-12-20 01:16:38+00:00,1030696837748060160,,1340408286730989568,
...,...,...,...,...,...,...
12687024,1436117027668860930,2021-09-09 23:59:15+00:00,2154794697,1436112907436236800,,
12687025,1436117030978166787,2021-09-09 23:59:16+00:00,245313164,,1436059926762344448,
12687026,1436117059033878530,2021-09-09 23:59:23+00:00,117172862,,1436059926762344448,
12687027,1436117138260074501,2021-09-09 23:59:42+00:00,4788106457,,1435954004031324160,


`retweeted_status_id` is the id of the original tweet, but we need the id of the user who posted it!

In [6]:
# 1. Build a lookup table (tweet_id -> user_id) over all non-retweets, used to
#    resolve the author of each retweeted tweet.
#    Only rows whose retweeted_status_id is missing are kept: retweets
#    themselves are left out of the lookup table.
#    Note: non-retweets are original tweets, replies and quotations.
non_retweets_index = (
    tweets_df
    .loc[tweets_df["retweeted_status_id"].isna(), ["id", "user_id"]]
    .rename(columns={"id": "tweet_id"})
    .set_index("tweet_id")
)

# If you do not agree, you can index all tweets instead; per the original
# author's note this ends up with the same results:
# retweets_index = tweets_df[["id", "user_id"]]
# retweets_index = retweets_index.rename(columns={"id": "tweet_id"}).set_index("tweet_id")
# resolved_df = tweets_df.merge(retweets_index, left_on='retweeted_status_id', right_index=True)

In [7]:
# Preview the lookup table: indexed by tweet_id, with a single user_id column.
non_retweets_index

Unnamed: 0_level_0,user_id
tweet_id,Unnamed: 1_level_1
1340466213240135682,1043948268932214786
1340466401111388162,1267671279718535174
1340466489401503749,1249678210314502144
1340466830633267205,3437054038
1340466917975461893,1134486713697472512
...,...
1436117022614736898,2842808602
1436117023159988224,4702381603
1436117025466765316,1052272086364774400
1436117027668860930,2154794697


In [8]:
# Resolve every retweet to the author of the tweet it retweeted.
# Inner join: rows of tweets_df whose retweeted_status_id has no match in the
# lookup table (including non-retweets, where it is missing) are dropped.
# Both frames carry a user_id column, so pandas suffixes them:
#   user_id_x -> user who posted the row in tweets_df (the retweeter)
#   user_id_y -> user who posted the retweeted tweet (the original author)
resolved_df = pd.merge(
    tweets_df,
    non_retweets_index,
    how="inner",
    left_on="retweeted_status_id",
    right_index=True,
)

In [9]:
# Inspect the merge result: user_id_x comes from tweets_df (left side),
# user_id_y from non_retweets_index (right side).
resolved_df

Unnamed: 0,id,created_at,user_id_x,in_reply_to_status_id,retweeted_status_id,quoted_status_id,user_id_y
30,1340468342524694530,2020-12-20 01:25:32+00:00,4363161567,,1340467922536378368,,2877999736
15142,1340741209774546951,2020-12-20 19:29:48+00:00,488762764,,1340467922536378368,,2877999736
248,1340503740433002496,2020-12-20 03:46:11+00:00,1277804951842979845,,1340503265373466624,,242880993
319,1340515040370278402,2020-12-20 04:31:05+00:00,1102714987787837445,,1340503265373466624,,242880993
374,1340521296531218438,2020-12-20 04:55:57+00:00,69920939,,1340503265373466624,,242880993
...,...,...,...,...,...,...,...
12685761,1436097813503516675,2021-09-09 22:42:54+00:00,1212190158134448128,,1436089909090164736,,481375605
12685376,1436094038332092418,2021-09-09 22:27:54+00:00,1237938136958844929,,1436093706311020544,,734074574275514368
12685694,1436097090199949312,2021-09-09 22:40:02+00:00,1197983970417348609,,1436096621226430464,,447874305
12685777,1436097947935154179,2021-09-09 22:43:26+00:00,372937945,,1436077788256251904,,734074574275514368


Reminder: a retweet network edge is A -> B if B retweeted A (think of it as information propagating from A to B when B retweets A).

In [10]:
# One (original author, retweeter) tuple per resolved retweet, i.e. the edge
# points from the retweeted user (user_id_y) to the retweeting user (user_id_x).
resolved_df["edge"] = list(
    zip(
        resolved_df["user_id_y"].tolist(),
        resolved_df["user_id_x"].tolist(),
    )
)

In [11]:
# Check the new "edge" column: each value is the tuple (user_id_y, user_id_x).
resolved_df

Unnamed: 0,id,created_at,user_id_x,in_reply_to_status_id,retweeted_status_id,quoted_status_id,user_id_y,edge
30,1340468342524694530,2020-12-20 01:25:32+00:00,4363161567,,1340467922536378368,,2877999736,"(2877999736, 4363161567)"
15142,1340741209774546951,2020-12-20 19:29:48+00:00,488762764,,1340467922536378368,,2877999736,"(2877999736, 488762764)"
248,1340503740433002496,2020-12-20 03:46:11+00:00,1277804951842979845,,1340503265373466624,,242880993,"(242880993, 1277804951842979845)"
319,1340515040370278402,2020-12-20 04:31:05+00:00,1102714987787837445,,1340503265373466624,,242880993,"(242880993, 1102714987787837445)"
374,1340521296531218438,2020-12-20 04:55:57+00:00,69920939,,1340503265373466624,,242880993,"(242880993, 69920939)"
...,...,...,...,...,...,...,...,...
12685761,1436097813503516675,2021-09-09 22:42:54+00:00,1212190158134448128,,1436089909090164736,,481375605,"(481375605, 1212190158134448128)"
12685376,1436094038332092418,2021-09-09 22:27:54+00:00,1237938136958844929,,1436093706311020544,,734074574275514368,"(734074574275514368, 1237938136958844929)"
12685694,1436097090199949312,2021-09-09 22:40:02+00:00,1197983970417348609,,1436096621226430464,,447874305,"(447874305, 1197983970417348609)"
12685777,1436097947935154179,2021-09-09 22:43:26+00:00,372937945,,1436077788256251904,,734074574275514368,"(734074574275514368, 372937945)"


In [12]:
# Number of distinct (source, retweeter) pairs, i.e. unique edges of the retweet network.
resolved_df.edge.nunique()

1006459

In [13]:
# Collapse the resolved retweets into a weighted edge list: one row per
# (source, retweeter) pair, where weight is the number of retweets of that
# source by that retweeter, sorted with the heaviest edges first.
#
# Grouping directly on the two user-id columns replaces the previous
# group-by on a column of Python tuples followed by rebuilding the columns
# from the tuples: the groups are the same pairs in the same lexicographic
# order, but no tuple objects have to be hashed or unpacked, and no in-place
# rename is needed, so the cell can be re-run safely.
retweets_edgelist_df = (
    resolved_df[["user_id_y", "user_id_x", "id"]]
    .groupby(["user_id_y", "user_id_x"])
    .count()                                  # "id" now holds the retweet count per pair
    .sort_values(by="id", ascending=False)
    .reset_index()
    .rename(columns={
        "user_id_y": "source",                # author of the retweeted tweet
        "user_id_x": "retweeter",             # user who retweeted it
        "id": "weight",
    })
)

In [14]:
# Preview the weighted edge list (sorted by weight, heaviest edges first).
retweets_edgelist_df

Unnamed: 0,source,retweeter,weight
0,1248341835770200064,1248341835770200064,1308
1,150725695,3613396888,379
2,150725695,841701064118284288,326
3,52424550,841701064118284288,320
4,331617619,1084450777748459520,308
...,...,...,...
1006454,1683455144,751020639587332102,1
1006455,1683455144,752255117232054276,1
1006456,1683455144,752845778,1
1006457,1683455144,756947491787010048,1


In [15]:
# Persist the weighted edge list. The row index is written as the first
# (unnamed) column, so readers of edgelist.csv should expect four columns.
retweets_edgelist_df.to_csv("edgelist.csv", index=True)