In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession

In [3]:
from shared.paths import DatasetPath

DS = DatasetPath('social-distancing-student')

In [4]:
spark = (SparkSession.builder
         .appName(str(DS))
         .config('spark.sql.legacy.timeParserPolicy', 'LEGACY')
         .config("spark.executor.memory", "8g")
         .config("spark.driver.memory", "8g")
         .config("spark.memory.offHeap.enabled", True)
         .config("spark.memory.offHeap.size", "16g")
         .getOrCreate())

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/06 00:50:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
DATE_FORMAT = "EEE MMM dd HH:mm:ss '+0000' yyyy"
probs_schema = T.StructType([
    T.StructField('SUPPORTS', T.FloatType(), False),
    T.StructField('IRRELEVANT', T.FloatType(), False),
    T.StructField('REJECTS', T.FloatType(), False),
])

df = (
    spark.read.format('org.apache.spark.sql.json')
        .load(DS.raw_str('social-distancing-student.json'))
        .withColumn('created_at', F.unix_timestamp(F.to_timestamp(F.col('created_at'), 'EEE MMM dd HH:mm:ss \'+0000\' yyyy')))
        .withColumn('probabilities_social_distancing', F.from_json('probabilities_social_distancing', probs_schema))
).cache()
df.head(1)

                                                                                

[Row(created_at=1601496062, entities=Row(hashtags=[], media=None, symbols=[], urls=[Row(display_url='twitter.com/i/web/status/1…', expanded_url='https://twitter.com/i/web/status/1311425852169744385', indices=[117, 140], url='https://t.co/EA6OO7cWR5')], user_mentions=[Row(id=2493701, id_str='2493701', indices=[0, 5], name='NU.nl', screen_name='NUnl')]), full_text='@NUnl Waarom niet de dove mens het mondkapje op en degene die spreekt geen monkapje. Als één van twee het maar op heeft en wel afstand houden. ik ben SH en heb erg last van mensen die te snel praten en te zacht. Wat doe ik hier aan. Gewoon maar vragen denk ik?', id_str='1311425852169744385', in_reply_to_status_id_str='1311352289928581120', in_reply_to_user_id_str='2493701', label_social_distancing='SUPPORTS', probabilities_social_distancing=Row(SUPPORTS=0.9821312427520752, IRRELEVANT=0.014853371307253838, REJECTS=0.003045313758775592), quoted_status_id_str='', sentiment=-0.6, text='@NUnl Waarom niet de dove mens het mondkapje 

In [6]:
users_tweet_df = df.select(
    'user.name', 'user.screen_name', 'user.id_str'
)
user_mentions_df = (
    df.select(F.explode('entities.user_mentions').alias('user'))
        .select(
        'user.name',
        'user.screen_name',
        'user.id_str'
    )
)
df_nodes_users_tmp = (
    users_tweet_df
        .union(user_mentions_df)
        .select(
            'name',
            'screen_name',
            F.col('id_str').alias('id'),
        )
        .dropDuplicates(['id'])
)
df_nodes_users_tmp.head(5)

                                                                                

[Row(name='DLRP-Magic.com', screen_name='DLRP_Magic', id='100002919'),
 Row(name='Russel⁷ ✜ +×', screen_name='HoldOnSoobin', id='1000331230565617666'),
 Row(name='abkuijer❌❌❌', screen_name='abkuijer', id='10003862'),
 Row(name='Koen', screen_name='koen0612', id='1000414316238172163'),
 Row(name='Wouter van Embden', screen_name='EmbdenWouter', id='1000464536082747393')]

In [7]:
df_nodes_hashtags_tmp = (
    df
        .select(F.explode('entities.hashtags').alias('hashtag'))
        .select(
        F.col('hashtag.text').alias('id'),
        F.col('hashtag.text').alias('name')
    )
        .dropDuplicates(['id'])
)
df_nodes_hashtags_tmp.head(5)

[Row(id='040fungi', name='040fungi'),
 Row(id='11stedenMcDrive', name='11stedenMcDrive'),
 Row(id='140ktober', name='140ktober'),
 Row(id='150km', name='150km'),
 Row(id='1dagniet', name='1dagniet')]

In [8]:
df_nodes_tweets = df.select(
    F.col('id_str').alias('id'),
    F.col('text').alias('name'),
    'label_social_distancing',
    F.col('probabilities_social_distancing.SUPPORTS').alias('feat_supports'),
    F.col('probabilities_social_distancing.IRRELEVANT').alias('feat_irrelevant'),
    F.col('probabilities_social_distancing.REJECTS').alias('feat_rejects'),
    F.col('sentiment').alias('feat_sentiment'),
    F.col('created_at').alias('timestamp_from')
).cache()
df_nodes_tweets.head(5)

[Row(id='1311425852169744385', name='@NUnl Waarom niet de dove mens het mondkapje op en degene die spreekt geen monkapje. Als één van twee het maar op h… https://t.co/EA6OO7cWR5', label_social_distancing='SUPPORTS', feat_supports=0.9821312427520752, feat_irrelevant=0.014853371307253838, feat_rejects=0.003045313758775592, feat_sentiment=-0.6, timestamp_from=1601496062),
 Row(id='1311426330488180736', name='hoe meer werklozen er komen. Hoe meer bedrijven kapot gaan, en zo kan ik nog uren doorgaan. \n\nBuiten dit om, ik sna… https://t.co/8LGwaSPock', label_social_distancing='SUPPORTS', feat_supports=0.9024366736412048, feat_irrelevant=8.047425944823772e-05, feat_rejects=0.09751284122467041, feat_sentiment=-0.6, timestamp_from=1601496176),
 Row(id='1311426383906844672', name='RT @Rijksoverheid: Dringend advies: draag vanaf vandaag een niet-medisch mondkapje in publieke binnenruimtes zoals winkels, musea en benzin…', label_social_distancing='SUPPORTS', feat_supports=1.0000100135803223, feat

# Edge Extraction

In [9]:
df_node_ids = (
    df_nodes_users_tmp.select('id')
        .union(df_nodes_hashtags_tmp.select('id'))
        .union(df_nodes_tweets.select('id'))
        .dropDuplicates(['id'])
).cache()


def filter_node_ids(df):
    return df.join(
        df_node_ids,
        F.col('src') == F.col('id'),
        'inner'
    ).drop(
        'id'
    ).join(
        df_node_ids,
        F.col('dst') == F.col('id'),
        'inner'
    ).drop('id')

In [10]:
df_edges_tweeted = filter_node_ids(df.select(
    F.col('user.id_str').alias('src'),
    F.col('id_str').alias('dst'),
    F.col('created_at').alias('timestamp')
).filter(
    F.col('src').isNotNull() &
    F.col('dst').isNotNull()
).filter(
    "dst != '' AND src != ''"
).distinct())

df_edges_tweeted.head(5)
# write_relations(tweeted_ref_df, "TWEETED", ":Twitter:User.id_str:user_id", ":Twitter:Tweet.id_str:tweet_id")

                                                                                

[Row(src='3826781607', dst='1311442489140826113', timestamp=1601500029),
 Row(src='1241825347357872129', dst='1311471885868244992', timestamp=1601507038),
 Row(src='1276939958255849472', dst='1311520686066860034', timestamp=1601518673),
 Row(src='1189609040348889089', dst='1311532547059388416', timestamp=1601521500),
 Row(src='979305110', dst='1311565260529307648', timestamp=1601529300)]

In [11]:
df_edges_reply_to_user = filter_node_ids(df.select(
    F.col('id_str').alias('src'),
    F.col('in_reply_to_user_id_str').alias('dst'),
    F.col('created_at').alias('timestamp'),
).filter(
    F.col('src').isNotNull() &
    F.col('dst').isNotNull()
).filter("dst != '' AND src != ''").distinct())

df_edges_reply_to_user.head(5)
# write_relations(reply_ref_df, "REPLY_TO", ":Twitter:Tweet.id_str:tweet_id", ":Twitter:User.id_str:user_id")

[Row(src='1311562753329573888', dst='18373140', timestamp=1601528702),
 Row(src='1311607963124150274', dst='3103641', timestamp=1601539481),
 Row(src='1311622804429852672', dst='50771270', timestamp=1601543020),
 Row(src='1311652285987074057', dst='169491437', timestamp=1601550048),
 Row(src='1311886507079327744', dst='466779187', timestamp=1601605891)]

In [12]:
df_edges_quote_tweet = filter_node_ids(df.select(
    F.col('id_str').alias('src'),
    F.col('quoted_status_id_str').alias('dst'),
    F.col('created_at').alias('timestamp'),
).filter(
    F.col('src').isNotNull() &
    F.col('dst').isNotNull()
).filter("dst != '' AND src != ''").distinct())

df_edges_quote_tweet.head(5)
# write_relations(quote_tweet_df, "QUOTED", ":Twitter:Tweet.id_str:tweet_id", ":Twitter:Tweet.id_str:quoted_tweet_id")

[Row(src='1311672846712811522', dst='1311650850654355462', timestamp=1601554951),
 Row(src='1311644863012769792', dst='1311635271650340870', timestamp=1601548279),
 Row(src='1311956349698351105', dst='1311951954172575744', timestamp=1601622543),
 Row(src='1311826032094334976', dst='1311726735638114306', timestamp=1601591473),
 Row(src='1311763002002153472', dst='1311755749975089167', timestamp=1601576445)]

In [13]:
df_edges_mentioned_user = filter_node_ids(df.select(
    F.col('id_str').alias('src'),
    F.explode('entities.user_mentions.id_str').alias('dst'),
    F.col('created_at').alias('timestamp'),
).filter(
    F.col('src').isNotNull() &
    F.col('dst').isNotNull()
).filter("dst != '' AND src != ''").distinct())

df_edges_mentioned_user.head(5)
# write_relations(mention_user_ref_df, "MENTIONED", ":Twitter:Tweet.id_str:tweet_id", ":Twitter:User.id_str:user_id")

[Row(src='1311473035531169792', dst='187437575', timestamp=1601507312),
 Row(src='1311562753329573888', dst='18373140', timestamp=1601528702),
 Row(src='1311574363926335491', dst='101723649', timestamp=1601531470),
 Row(src='1311601203093090304', dst='171084541', timestamp=1601537869),
 Row(src='1311607568800927744', dst='15581273', timestamp=1601539387)]

In [14]:
df_edges_reply_to_tweet = filter_node_ids(df.select(
    F.col('id_str').alias('src'),
    F.col('in_reply_to_status_id_str').alias('dst'),
    F.col('created_at').alias('timestamp'),
).filter(
    F.col('src').isNotNull() &
    F.col('dst').isNotNull()
).filter("dst != '' AND src != ''").distinct())

df_edges_reply_to_tweet.head(5)
# write_relations(reply_tweet_df, "REPLY_TO", ":Twitter:Tweet.id_str:tweet_id", ":Twitter:Tweet.id_str:original_tweet_id")

[Row(src='1311672126659756033', dst='1311589634108002304', timestamp=1601554779),
 Row(src='1311676116520837120', dst='1311662390761586688', timestamp=1601555730),
 Row(src='1311771982661328896', dst='1311627253466501120', timestamp=1601578586),
 Row(src='1311565846666514432', dst='1311515283455180800', timestamp=1601529440),
 Row(src='1311762698007465986', dst='1311733426979049474', timestamp=1601576373)]

In [15]:
df_edges_mentioned_hashtag = filter_node_ids(df.select(
    F.col('id_str').alias('src'),
    F.explode('entities.hashtags.text').alias('dst'),
    F.col('created_at').alias('timestamp'),
).filter(
    F.col('src').isNotNull() &
    F.col('dst').isNotNull()
).filter("dst != '' AND src != ''").distinct())

df_edges_mentioned_hashtag.head(5)
# write_relations(hashtag_mention_df, "MENTIONED", ":Twitter:Tweet.id_str:tweet_id", ":Twitter:Hashtag.id_str:hashtag_id")

[Row(src='1311434772665847809', dst='schimmel', timestamp=1601498189),
 Row(src='1311579492163506177', dst='mondkapjes', timestamp=1601532693),
 Row(src='1311586415541735425', dst='winkels', timestamp=1601534344),
 Row(src='1311939874392616960', dst='corona', timestamp=1601618615),
 Row(src='1311434956934176768', dst='anderhalvemeter', timestamp=1601498233)]

In [16]:
df_user_ids = df_nodes_users_tmp.select('id')

df_followers = (
    spark.read.text(DS.raw_str('followers'), wholetext=False, pathGlobFilter='*.txt')
        .select(
        F.regexp_extract(F.input_file_name(), r'([0-9]+)%20([A-z0-9%]+).txt$', 1).alias('user_id'),
        F.col('value').alias('follower_id'),
    )
).cache()

df_followers.head(5)

                                                                                

[Row(user_id='56377143', follower_id='2687493770'),
 Row(user_id='56377143', follower_id='1443138398928183298'),
 Row(user_id='56377143', follower_id='480680728'),
 Row(user_id='56377143', follower_id='801927938304315392'),
 Row(user_id='56377143', follower_id='279492619')]

In [17]:
df_edges_follows = filter_node_ids(df_followers.join(
    df_user_ids.alias('a'), F.col('follower_id') == F.col('a.id'), 'inner'
).join(
    df_user_ids.alias('b'), F.col('user_id') == F.col('b.id'), 'inner'
).select(
    F.col('follower_id').alias('src'),
    F.col('user_id').alias('dst')
).filter(
    F.col('src').isNotNull() &
    F.col('dst').isNotNull()
).filter("dst != '' AND src != ''").distinct())

df_edges_follows.head(5)
# write_relations(df_followers, "FOLLOWS", ":Twitter:User.id_str:follower_id", ":Twitter:User.id_str:user_id")

                                                                                

[Row(src='1034004099950362625', dst='56377143'),
 Row(src='714542887040630784', dst='56377143'),
 Row(src='400296650', dst='56377143'),
 Row(src='2576050703', dst='56377143'),
 Row(src='138394987', dst='56377143')]

# Feature Engineering

In [18]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from nltk.stem.snowball import SnowballStemmer
import nltk

nltk.download('stopwords')

df_kw_clean = df_nodes_tweets.select('id', (F.lower(F.regexp_replace('name', "[^a-zA-Z\\s]", "")).alias('text')))

tokenizer = Tokenizer(inputCol='text', outputCol='words_token')
df_kw_tokens = tokenizer.transform(df_kw_clean).select('id', 'words_token')

stopwordList = nltk.corpus.stopwords.words('dutch')
remover = StopWordsRemover(inputCol='words_token', outputCol='words_clean', stopWords=stopwordList)
df_kw_nostopw = remover.transform(df_kw_tokens).select('id', 'words_clean')

# Stem text
stemmer = SnowballStemmer(language='dutch')
stemmer_udf = F.udf(lambda tokens: [stemmer.stem(token) for token in tokens], T.ArrayType(T.StringType()))
df_kw_stemmed = df_kw_nostopw.withColumn("words_stemmed", stemmer_udf("words_clean")).select('id', 'words_stemmed')

filter_length_udf = F.udf(lambda row: [x for x in row if len(x) >= 3], T.ArrayType(T.StringType()))
df_kw_doc = df_kw_stemmed.select('id', F.array_distinct(filter_length_udf(F.col('words_stemmed'))).alias('keywords'))

df_kw_doc.head(10)

[nltk_data] Downloading package stopwords to /home/egordm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
                                                                                

[Row(id='1311425852169744385', keywords=['nunl', 'waarom', 'dov', 'men', 'mondkapj', 'deg', 'spreekt', 'monkapj', 'twee', 'httpstcoeaoocwr']),
 Row(id='1311426330488180736', keywords=['werkloz', 'kom', 'bedrijv', 'kapot', 'gan', 'uren', 'doorgan', 'buit', 'sna', 'httpstcolgwaspock']),
 Row(id='1311426383906844672', keywords=['rijksover', 'dringend', 'advies', 'drag', 'vanaf', 'vandag', 'nietmedisch', 'mondkapj', 'publiek', 'binnenruimtes', 'zoal', 'winkel', 'musea', 'benzin']),
 Row(id='1311426584893624322', keywords=['natassavas', 'georg', 'soros', 'bocht', 'pompt', 'even', 'ker', 'rac', 'nieuw', 'ovj', 'los', 'angeles', 'uiteraard']),
 Row(id='1311426630020202498', keywords=['zuidplas', 'dringend', 'advies', 'drag', 'vanaf', 'vandag', 'nietmedisch', 'mondkapj', 'publiek', 'binnenruimtes', 'zoal', 'winkel', 'musea', 'benzinestat']),
 Row(id='1311426978390716417', keywords=['vrgroning', 'dringend', 'advies', 'drag', 'vanaf', 'vandag', 'nietmedisch', 'mondkapj', 'publiek', 'binnenruimte

In [19]:
df_kw = df_kw_doc\
    .select(F.explode('keywords').alias('keyword'))\
    .groupBy('keyword')\
    .count()\
    .orderBy('count', ascending=False)

df_kw.show(10)



+----------+-----+
|   keyword|count|
+----------+-----+
|   afstand|17514|
|     meter|12283|
|      houd|12134|
|  mondkapj| 5150|
|      mens| 4960|
|      drag| 3710|
| anderhalv| 3465|
|       wel| 3335|
|      hand| 3198|
|mondkapjes| 2970|
+----------+-----+
only showing top 10 rows



                                                                                

In [20]:
k = 80
top_keywords = {x.keyword for x in df_kw.limit(k).select('keyword').collect()}
print(top_keywords)



{'minpres', 'denk', 'allen', 'horeca', 'publiek', 'iederen', 'tuss', 'houdt', 'onz', 'wij', 'echt', 'winkel', 'waarom', 'afstand', 'zoal', 'mens', 'hou', 'schol', 'gewon', 'wer', 'vanaf', 'mag', 'drag', 'mondmasker', 'vandag', 'moet', 'advies', 'stuk', 'buit', 'gan', 'lat', 'houd', 'all', 'blijf', 'mondkapjes', 'binn', 'volgen', 'verspreid', 'wel', 'covid', 'nos', 'kom', 'zorg', 'masker', 'meter', 'maatregel', 'hand', 'mondkapj', 'goed', 'mogelijk', 'even', 'elkar', 'thuis', 'wet', 'mee', 'mak', 'zie', 'ander', 'zit', 'wass', 'blijv', 'dringend', 'rivm', 'grot', 'sted', 'regel', 'wek', 'werk', 'binnenruimtes', 'lockdown', 'musea', 'test', 'jij', 'besmet', 'net', 'gat', 'corona', 'hel', 'anderhalv', 'war'}


                                                                                

In [21]:
filter_freq_udf = F.udf(lambda row: [kw for kw in row if kw in top_keywords], T.ArrayType(T.StringType()))
df_freq_kw_doc = df_kw_doc.select('id', F.explode(filter_freq_udf(F.col('keywords'))).alias('keyword')).cache()
df_freq_kw_doc.head(5)

[Row(id='1311425852169744385', keyword='waarom'),
 Row(id='1311425852169744385', keyword='mondkapj'),
 Row(id='1311426330488180736', keyword='kom'),
 Row(id='1311426330488180736', keyword='gan'),
 Row(id='1311426330488180736', keyword='buit')]

In [22]:
df_hashtag_feats = df_edges_mentioned_hashtag.select('src', 'dst')\
    .join(df_freq_kw_doc.alias('a'), F.col('src') == F.col('a.id'), 'inner')\
    .groupby('dst')\
    .agg(F.collect_list('keyword').alias('keywords'))\
    .select([F.col('dst').alias('hashtag_id')] + [
        F.array_contains('keywords', F.lit(k)).alias('feat_' + str(k))
        for k in top_keywords
    ])
df_hashtag_feats.head(5)

22/04/06 00:52:16 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

[Row(hashtag_id='040fungi', feat_minpres=False, feat_denk=False, feat_allen=False, feat_horeca=False, feat_publiek=False, feat_iederen=False, feat_tuss=False, feat_houdt=False, feat_onz=False, feat_wij=False, feat_echt=False, feat_winkel=False, feat_waarom=False, feat_afstand=False, feat_zoal=False, feat_mens=False, feat_hou=False, feat_schol=False, feat_gewon=False, feat_wer=False, feat_vanaf=False, feat_mag=False, feat_drag=False, feat_mondmasker=False, feat_vandag=False, feat_moet=True, feat_advies=False, feat_stuk=False, feat_buit=False, feat_gan=False, feat_lat=False, feat_houd=False, feat_all=False, feat_blijf=False, feat_mondkapjes=False, feat_binn=False, feat_volgen=False, feat_verspreid=False, feat_wel=False, feat_covid=False, feat_nos=False, feat_kom=False, feat_zorg=False, feat_masker=False, feat_meter=True, feat_maatregel=False, feat_hand=False, feat_mondkapj=False, feat_goed=False, feat_mogelijk=False, feat_even=False, feat_elkar=False, feat_thuis=False, feat_wet=False, fe

In [23]:
df_nodes_hashtags = df_nodes_hashtags_tmp\
    .join(df_hashtag_feats, F.col('id') == F.col('hashtag_id'), 'left').cache()
df_nodes_hashtags.head(5)

                                                                                

[Row(id='Triviant', name='Triviant', hashtag_id='Triviant', feat_minpres=False, feat_denk=False, feat_allen=False, feat_horeca=False, feat_publiek=False, feat_iederen=False, feat_tuss=False, feat_houdt=False, feat_onz=False, feat_wij=False, feat_echt=False, feat_winkel=False, feat_waarom=False, feat_afstand=False, feat_zoal=False, feat_mens=False, feat_hou=False, feat_schol=False, feat_gewon=False, feat_wer=False, feat_vanaf=False, feat_mag=False, feat_drag=False, feat_mondmasker=False, feat_vandag=False, feat_moet=False, feat_advies=False, feat_stuk=False, feat_buit=False, feat_gan=False, feat_lat=False, feat_houd=False, feat_all=False, feat_blijf=False, feat_mondkapjes=False, feat_binn=False, feat_volgen=False, feat_verspreid=False, feat_wel=False, feat_covid=False, feat_nos=False, feat_kom=False, feat_zorg=False, feat_masker=False, feat_meter=True, feat_maatregel=False, feat_hand=False, feat_mondkapj=False, feat_goed=False, feat_mogelijk=False, feat_even=False, feat_elkar=False, fea

In [24]:
df_user_feats = df_edges_tweeted.select('src', 'dst')\
    .join(df_freq_kw_doc.alias('a'), F.col('dst') == F.col('a.id'), 'inner')\
    .groupby('src')\
    .agg(F.collect_list('keyword').alias('keywords'))\
    .select([F.col('src').alias('user_id')] + [
        F.array_contains('keywords', F.lit(k)).alias('feat_' + str(k))
        for k in top_keywords
    ])
df_user_feats.head(5)

[Row(user_id='100002919', feat_minpres=False, feat_denk=False, feat_allen=True, feat_horeca=True, feat_publiek=False, feat_iederen=False, feat_tuss=True, feat_houdt=True, feat_onz=False, feat_wij=False, feat_echt=False, feat_winkel=False, feat_waarom=False, feat_afstand=False, feat_zoal=False, feat_mens=True, feat_hou=False, feat_schol=False, feat_gewon=False, feat_wer=False, feat_vanaf=False, feat_mag=False, feat_drag=False, feat_mondmasker=False, feat_vandag=False, feat_moet=False, feat_advies=False, feat_stuk=False, feat_buit=False, feat_gan=False, feat_lat=False, feat_houd=False, feat_all=False, feat_blijf=False, feat_mondkapjes=False, feat_binn=False, feat_volgen=False, feat_verspreid=False, feat_wel=False, feat_covid=False, feat_nos=False, feat_kom=False, feat_zorg=False, feat_masker=False, feat_meter=True, feat_maatregel=False, feat_hand=False, feat_mondkapj=False, feat_goed=False, feat_mogelijk=False, feat_even=False, feat_elkar=False, feat_thuis=False, feat_wet=False, feat_mee

In [25]:
df_nodes_users = df_nodes_users_tmp\
    .join(df_user_feats, F.col('id') == F.col('user_id'), 'left').cache()
df_nodes_users.head(5)

                                                                                

[Row(name='☾fey⁷ JIMIN DAAY', screen_name='cherrykissjoon', id='1001361715907514368', user_id=None, feat_minpres=None, feat_denk=None, feat_allen=None, feat_horeca=None, feat_publiek=None, feat_iederen=None, feat_tuss=None, feat_houdt=None, feat_onz=None, feat_wij=None, feat_echt=None, feat_winkel=None, feat_waarom=None, feat_afstand=None, feat_zoal=None, feat_mens=None, feat_hou=None, feat_schol=None, feat_gewon=None, feat_wer=None, feat_vanaf=None, feat_mag=None, feat_drag=None, feat_mondmasker=None, feat_vandag=None, feat_moet=None, feat_advies=None, feat_stuk=None, feat_buit=None, feat_gan=None, feat_lat=None, feat_houd=None, feat_all=None, feat_blijf=None, feat_mondkapjes=None, feat_binn=None, feat_volgen=None, feat_verspreid=None, feat_wel=None, feat_covid=None, feat_nos=None, feat_kom=None, feat_zorg=None, feat_masker=None, feat_meter=None, feat_maatregel=None, feat_hand=None, feat_mondkapj=None, feat_goed=None, feat_mogelijk=None, feat_even=None, feat_elkar=None, feat_thuis=Non

# Renumbering the nodes

In [26]:
df_nodes_users_new = (
    df_nodes_users
        .withColumnRenamed('id', 'tid')
        .coalesce(1)
        .withColumn('id', F.monotonically_increasing_id())
).cache()
df_nodes_users_new.show(5)

df_nodes_hashtags_new = (
    df_nodes_hashtags
        .withColumnRenamed('id', 'tid')
        .coalesce(1)
        .withColumn('id', F.monotonically_increasing_id())
).cache()
df_nodes_hashtags_new.show(5)

df_nodes_tweets_new = (
    df_nodes_tweets
        .withColumnRenamed('id', 'tid')
        .coalesce(1)
        .withColumn('id', F.monotonically_increasing_id())
).cache()
df_nodes_tweets_new.show(5)

                                                                                

+--------------------+--------------+-------------------+-------------------+------------+---------+----------+-----------+------------+------------+---------+----------+--------+--------+---------+-----------+-----------+------------+---------+---------+--------+----------+----------+--------+----------+--------+---------+---------------+-----------+---------+-----------+---------+---------+--------+--------+---------+--------+----------+---------------+---------+-----------+--------------+--------+----------+--------+--------+---------+-----------+----------+--------------+---------+-------------+---------+-------------+---------+----------+----------+--------+--------+--------+--------+----------+--------+---------+----------+-------------+---------+---------+---------+----------+--------+---------+------------------+-------------+----------+---------+--------+-----------+--------+--------+-----------+--------+--------------+--------+---+
|                name|   screen_name|       

                                                                                

+--------------+--------------+--------------+------------+---------+----------+-----------+------------+------------+---------+----------+--------+--------+---------+-----------+-----------+------------+---------+---------+--------+----------+----------+--------+----------+--------+---------+---------------+-----------+---------+-----------+---------+---------+--------+--------+---------+--------+----------+---------------+---------+-----------+--------------+--------+----------+--------+--------+---------+-----------+----------+--------------+---------+-------------+---------+-------------+---------+----------+----------+--------+--------+--------+--------+----------+--------+---------+----------+-------------+---------+---------+---------+----------+--------+---------+------------------+-------------+----------+---------+--------+-----------+--------+--------+-----------+--------+--------------+--------+---+
|           tid|          name|    hashtag_id|feat_minpres|feat_denk|feat_a

In [27]:
def renumber_edge(df, a, b):
    return (
        df.alias('main')
            .join(a.alias('a'), F.col('src') == F.col('a.tid'), 'inner')
            .join(b.alias('b'), F.col('dst') == F.col('b.tid'), 'inner')
            .select('main.*', 'a.id', 'b.id')
            .withColumn('src', F.col('a.id'))
            .withColumn('dst', F.col('b.id'))
            .drop('id')
    )

In [35]:
df_edges_tweeted_new = renumber_edge(df_edges_tweeted, df_nodes_users_new, df_nodes_tweets_new)
df_edges_tweeted_new.show(5)
df_edges_reply_to_user_new = renumber_edge(df_edges_reply_to_user, df_nodes_tweets_new, df_nodes_users_new)
df_edges_reply_to_user_new.show(5)
df_edges_reply_to_tweet_new = renumber_edge(df_edges_reply_to_tweet, df_nodes_tweets_new, df_nodes_tweets_new)
df_edges_reply_to_tweet_new.show(5)
df_edges_quote_tweet_new = renumber_edge(df_edges_quote_tweet, df_nodes_tweets_new, df_nodes_tweets_new)
df_edges_quote_tweet_new.show(5)
df_edges_mentioned_user_new = renumber_edge(df_edges_mentioned_user, df_nodes_tweets_new, df_nodes_users_new)
df_edges_mentioned_user_new.show(5)
df_edges_mentioned_hashtag_new = renumber_edge(df_edges_mentioned_hashtag, df_nodes_tweets_new, df_nodes_hashtags_new)
df_edges_mentioned_hashtag_new.show(5)
df_edges_follows_new = renumber_edge(df_edges_follows, df_nodes_users_new, df_nodes_users_new)
df_edges_follows_new.show(5)

+-----+---+----------+
|  src|dst| timestamp|
+-----+---+----------+
|18200| 64|1601500029|
| 2409| 84|1601507038|
| 5329|119|1601518673|
|11245|152|1601521500|
|17186|418|1601529300|
+-----+---+----------+
only showing top 5 rows

+----+-----+----------+
| src|  dst| timestamp|
+----+-----+----------+
| 386|25510|1601528702|
| 886|26814|1601539481|
|1027|15879|1601543020|
|1309|25140|1601550048|
|2845|30732|1601605891|
+----+-----+----------+
only showing top 5 rows

+----+----+----------+
| src| dst| timestamp|
+----+----+----------+
|1506| 687|1601554779|
|1557|1421|1601555730|
|2547|1073|1601578586|
| 423| 112|1601529440|
|2474|2245|1601576373|
+----+----+----------+
only showing top 5 rows

+----+----+----------+
| src| dst| timestamp|
+----+----+----------+
|1519|1295|1601554951|
|1237|1146|1601548279|
|3420|3386|1601622543|
|2792|2176|1601591473|
|2475|2422|1601576445|
+----+----+----------+
only showing top 5 rows

+---+-----+----------+
|src|  dst| timestamp|
+---+-----+------



+-----+-----+
|  src|  dst|
+-----+-----+
|18084|25046|
| 7517|25046|
| 3696|25046|
| 3838|25046|
|21308|25046|
+-----+-----+
only showing top 5 rows



                                                                                

# Saving The Data

In [36]:
df_nodes_users_new.write.parquet(DS.processed_str('node__User'), mode='overwrite')
df_nodes_hashtags_new.write.parquet(DS.processed_str('node__Hashtag'), mode='overwrite')
df_nodes_tweets_new.write.parquet(DS.processed_str('node__Tweet'), mode='overwrite')

In [37]:
df_edges_tweeted_new.write.parquet(DS.processed_str('edge__User_TWEETED_Tweet'), mode='overwrite')
df_edges_reply_to_user_new.write.parquet(DS.processed_str('edge__Tweet_REPLIESTO_User'), mode='overwrite')
df_edges_reply_to_tweet_new.write.parquet(DS.processed_str('edge__Tweet_REPLIESTO_Tweet'), mode='overwrite')
df_edges_quote_tweet_new.write.parquet(DS.processed_str('edge__Tweet_QUOTES_Tweet'), mode='overwrite')
df_edges_mentioned_user_new.write.parquet(DS.processed_str('edge__Tweet_MENTIONS_User'), mode='overwrite')
df_edges_mentioned_hashtag_new.write.parquet(DS.processed_str('edge__Tweet_MENTIONS_Hashtag'), mode='overwrite')
df_edges_follows_new.write.parquet(DS.processed_str('edge__User_FOLLOWS_User'), mode='overwrite')

                                                                                