In [10]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

In [11]:
spark = (SparkSession.builder
         .appName("social-distancing-student")
         .config('spark.jars.packages', 'org.neo4j:neo4j-connector-apache-spark_2.12:4.1.0_for_spark_3')
         .config('spark.sql.legacy.timeParserPolicy', 'LEGACY')
         .config("neo4j.url", "bolt://localhost:7687")
         .config("neo4j.authentication.type", "basic")
         .config("neo4j.authentication.basic.username", "neo4j")
         .config("neo4j.authentication.basic.password", "test")
         .getOrCreate())

In [22]:
def parse_entity(entity):
    entity, keys = entity.split('.')
    key, df_key = keys.split(':')
    return entity, key, df_key

def write_relations(df, relation, from_entity, to_entity):
    from_entity, from_key, from_df_key = parse_entity(from_entity)
    to_entity, to_key, to_df_key = parse_entity(to_entity)

    return (
        df.write.format("org.neo4j.spark.DataSource")
            .mode('overwrite')
            .option("batch.size", "20")
            .option("transaction.retries", "16")
            .option("transaction.retry.timeout", "100")
            .option("relationship", relation)
            .option("relationship.save.strategy", "keys")
            .option("relationship.source.labels", from_entity)
            .option("relationship.source.save.mode", "match")
            .option("relationship.source.node.keys", f"{from_df_key}:{from_key}")
            .option("relationship.target.labels", to_entity)
            .option("relationship.target.save.mode", "match")
            .option("relationship.target.node.keys", f"{to_df_key}:{to_key}")
            .save()
    )

def write_nodes(df, label, key):
    return (
        df.write.format("org.neo4j.spark.DataSource")
            .mode('overwrite')
            .option("labels", label)
            .option("node.keys", key)
            .option("schema.optimization.type", "NODE_CONSTRAINTS")
            .save()
    )

In [13]:
DATE_FORMAT = "EEE MMM dd HH:mm:ss '+0000' yyyy"

df = (
    spark.read.format('org.apache.spark.sql.json')
        .load("data/social-distancing-student/social-distancing-student.json")
        .withColumn('created_at', F.to_timestamp(F.col('created_at'), 'EEE MMM dd HH:mm:ss \'+0000\' yyyy'))
)
df.head(1)

                                                                                

[Row(created_at=datetime.datetime(2020, 9, 30, 22, 1, 2), entities=Row(hashtags=[], media=None, symbols=[], urls=[Row(display_url='twitter.com/i/web/status/1…', expanded_url='https://twitter.com/i/web/status/1311425852169744385', indices=[117, 140], url='https://t.co/EA6OO7cWR5')], user_mentions=[Row(id=2493701, id_str='2493701', indices=[0, 5], name='NU.nl', screen_name='NUnl')]), full_text='@NUnl Waarom niet de dove mens het mondkapje op en degene die spreekt geen monkapje. Als één van twee het maar op heeft en wel afstand houden. ik ben SH en heb erg last van mensen die te snel praten en te zacht. Wat doe ik hier aan. Gewoon maar vragen denk ik?', id_str='1311425852169744385', in_reply_to_status_id_str='1311352289928581120', in_reply_to_user_id_str='2493701', label_social_distancing='SUPPORTS', probabilities_social_distancing="{'SUPPORTS': 0.98213124, 'IRRELEVANT': 0.014853371, 'REJECTS': 0.0030453138}", quoted_status_id_str='', sentiment=-0.6, text='@NUnl Waarom niet de dove mens h

In [14]:
users_tweet_df = df.select('user.name', 'user.screen_name', 'user.id_str')
user_mentions_df = (df
                    .select(F.explode('entities.user_mentions').alias('user'))
                    .select('user.name', 'user.screen_name', 'user.id_str'))

users_df = users_tweet_df.union(user_mentions_df).dropDuplicates(['id_str'])

In [15]:
write_nodes(users_df, ":Twitter:User", "id_str")
users_df.head(5)

                                                                                

[Row(name='☾fey⁷ JIMIN DAAY', screen_name='cherrykissjoon', id_str='1001361715907514368'),
 Row(name='Jarno ▪︎ Merom', screen_name='RommeJarno', id_str='1006144373984251904'),
 Row(name='Jeroen Pepers 🇪🇺', screen_name='JeroenPepers', id_str='101008330'),
 Row(name='Test1', screen_name='Test137352311', id_str='1012078301299531777'),
 Row(name='A complicated mess 🙃', screen_name='TheJakNene', id_str='1030617542')]

In [16]:
hashtags_df = (df
    .select(F.explode('entities.hashtags').alias('hashtag'))
    .select(
        F.col('hashtag.text').alias('id_str'),
        F.col('hashtag.text').alias('name')
    )
   .dropDuplicates(['id_str'])
)

In [17]:
write_nodes(hashtags_df, ":Twitter:Hashtag", "id_str")
hashtags_df.head(5)

                                                                                

[Row(id_str='Triviant', name='Triviant'),
 Row(id_str='Twente', name='Twente'),
 Row(id_str='creativity', name='creativity'),
 Row(id_str='herfstvakantie', name='herfstvakantie'),
 Row(id_str='input', name='input')]

In [18]:
tweets_df = df.select(
    'id_str',
    'text',
    'label_social_distancing',
    'created_at'
)

In [19]:
write_nodes(tweets_df, ":Twitter:Tweet", "id_str")
tweets_df.head(5)

                                                                                

[Row(id_str='1311425852169744385', text='@NUnl Waarom niet de dove mens het mondkapje op en degene die spreekt geen monkapje. Als één van twee het maar op h… https://t.co/EA6OO7cWR5', label_social_distancing='SUPPORTS', created_at=datetime.datetime(2020, 9, 30, 22, 1, 2)),
 Row(id_str='1311426330488180736', text='hoe meer werklozen er komen. Hoe meer bedrijven kapot gaan, en zo kan ik nog uren doorgaan. \n\nBuiten dit om, ik sna… https://t.co/8LGwaSPock', label_social_distancing='SUPPORTS', created_at=datetime.datetime(2020, 9, 30, 22, 2, 56)),
 Row(id_str='1311426383906844672', text='RT @Rijksoverheid: Dringend advies: draag vanaf vandaag een niet-medisch mondkapje in publieke binnenruimtes zoals winkels, musea en benzin…', label_social_distancing='SUPPORTS', created_at=datetime.datetime(2020, 9, 30, 22, 3, 9)),
 Row(id_str='1311426584893624322', text='RT @natassavass: George Soros in de bocht! Pompt even nog een keer 1,5M in de race voor een nieuwe OvJ in Los Angeles. Uiteraard voor 

In [20]:
tweeted_ref_df = df.select(
    F.col('user.id_str').alias('user_id'),
    F.col('id_str').alias('tweet_id'),
    F.col('created_at').alias('created_at'),
).filter(
    F.col('user_id').isNotNull() &
    F.col('tweet_id').isNotNull()
)

In [23]:
write_relations(tweeted_ref_df, "TWEETED", ":Twitter:User.id_str:user_id", ":Twitter:Tweet.id_str:tweet_id")
tweeted_ref_df.head(5)

                                                                                

[Row(user_id='206319409', tweet_id='1311425852169744385', created_at=datetime.datetime(2020, 9, 30, 22, 1, 2)),
 Row(user_id='2898332572', tweet_id='1311426330488180736', created_at=datetime.datetime(2020, 9, 30, 22, 2, 56)),
 Row(user_id='1353842360', tweet_id='1311426383906844672', created_at=datetime.datetime(2020, 9, 30, 22, 3, 9)),
 Row(user_id='1092080469599023105', tweet_id='1311426584893624322', created_at=datetime.datetime(2020, 9, 30, 22, 3, 57)),
 Row(user_id='369453053', tweet_id='1311426630020202498', created_at=datetime.datetime(2020, 9, 30, 22, 4, 8))]

In [24]:
reply_ref_df = df.select(
    F.col('in_reply_to_user_id_str').alias('user_id'),
    F.col('id_str').alias('tweet_id'),
    F.col('created_at').alias('created_at'),
).filter(
    F.col('user_id').isNotNull() &
    F.col('tweet_id').isNotNull()
)

In [26]:
write_relations(reply_ref_df, "REPLY_TO", ":Twitter:Tweet.id_str:tweet_id", ":Twitter:User.id_str:user_id")
reply_ref_df.head(5)

                                                                                

[Row(user_id='2493701', tweet_id='1311425852169744385', created_at=datetime.datetime(2020, 9, 30, 22, 1, 2)),
 Row(user_id='2898332572', tweet_id='1311426330488180736', created_at=datetime.datetime(2020, 9, 30, 22, 2, 56)),
 Row(user_id='15200788', tweet_id='1311427312328609797', created_at=datetime.datetime(2020, 9, 30, 22, 6, 51)),
 Row(user_id='1180949770598322176', tweet_id='1311428477023903744', created_at=datetime.datetime(2020, 9, 30, 22, 11, 28)),
 Row(user_id='236157669', tweet_id='1311428764589584384', created_at=datetime.datetime(2020, 9, 30, 22, 12, 37))]

In [27]:
reply_tweet_df = df.select(
    F.col('in_reply_to_status_id_str').alias('original_tweet_id'),
    F.col('id_str').alias('tweet_id'),
    F.col('created_at').alias('created_at'),
).filter(
    F.col('original_tweet_id').isNotNull() &
    F.col('tweet_id').isNotNull()
)

In [28]:
write_relations(reply_tweet_df, "REPLY_TO", ":Twitter:Tweet.id_str:tweet_id", ":Twitter:Tweet.id_str:original_tweet_id")
reply_ref_df.head(5)

                                                                                

[Row(user_id='2493701', tweet_id='1311425852169744385', created_at=datetime.datetime(2020, 9, 30, 22, 1, 2)),
 Row(user_id='2898332572', tweet_id='1311426330488180736', created_at=datetime.datetime(2020, 9, 30, 22, 2, 56)),
 Row(user_id='15200788', tweet_id='1311427312328609797', created_at=datetime.datetime(2020, 9, 30, 22, 6, 51)),
 Row(user_id='1180949770598322176', tweet_id='1311428477023903744', created_at=datetime.datetime(2020, 9, 30, 22, 11, 28)),
 Row(user_id='236157669', tweet_id='1311428764589584384', created_at=datetime.datetime(2020, 9, 30, 22, 12, 37))]

In [29]:
quote_tweet_df = df.select(
    F.col('quoted_status_id_str').alias('quoted_tweet_id'),
    F.col('id_str').alias('tweet_id'),
    F.col('created_at').alias('created_at'),
).filter(
    F.col('quoted_tweet_id').isNotNull() &
    F.col('tweet_id').isNotNull()
)

In [30]:
write_relations(quote_tweet_df, "QUOTED", ":Twitter:Tweet.id_str:tweet_id", ":Twitter:Tweet.id_str:quoted_tweet_id")
quote_tweet_df.head(5)

                                                                                

[Row(quoted_tweet_id='', tweet_id='1311425852169744385', created_at=datetime.datetime(2020, 9, 30, 22, 1, 2)),
 Row(quoted_tweet_id='', tweet_id='1311426330488180736', created_at=datetime.datetime(2020, 9, 30, 22, 2, 56)),
 Row(quoted_tweet_id='', tweet_id='1311426383906844672', created_at=datetime.datetime(2020, 9, 30, 22, 3, 9)),
 Row(quoted_tweet_id='', tweet_id='1311426584893624322', created_at=datetime.datetime(2020, 9, 30, 22, 3, 57)),
 Row(quoted_tweet_id='', tweet_id='1311426630020202498', created_at=datetime.datetime(2020, 9, 30, 22, 4, 8))]

In [31]:
mention_user_ref_df = (df.select(
    F.col('id_str').alias('tweet_id'),
    F.explode('entities.user_mentions.id_str').alias('user_id'),
    F.col('created_at').alias('created_at'),
))

In [32]:
write_relations(mention_user_ref_df, "MENTIONED", ":Twitter:Tweet.id_str:tweet_id", ":Twitter:User.id_str:user_id")
mention_user_ref_df.head(5)

                                                                                

[Row(tweet_id='1311425852169744385', user_id='2493701', created_at=datetime.datetime(2020, 9, 30, 22, 1, 2)),
 Row(tweet_id='1311426383906844672', user_id='15595333', created_at=datetime.datetime(2020, 9, 30, 22, 3, 9)),
 Row(tweet_id='1311426584893624322', user_id='1160220112789676032', created_at=datetime.datetime(2020, 9, 30, 22, 3, 57)),
 Row(tweet_id='1311426630020202498', user_id='87552781', created_at=datetime.datetime(2020, 9, 30, 22, 4, 8)),
 Row(tweet_id='1311426978390716417', user_id='223792086', created_at=datetime.datetime(2020, 9, 30, 22, 5, 31))]

In [33]:
hashtag_mention_df = (df.select(
    F.col('id_str').alias('tweet_id'),
    F.explode('entities.hashtags.text').alias('hashtag_id'),
    F.col('created_at').alias('created_at'),
))

In [34]:
write_relations(hashtag_mention_df, "MENTIONED", ":Twitter:Tweet.id_str:tweet_id", ":Twitter:Hashtag.id_str:hashtag_id")
hashtag_mention_df.head(5)

                                                                                

[Row(tweet_id='1311433312867622913', hashtag_id='inclusiefsporten', created_at=datetime.datetime(2020, 9, 30, 22, 30, 41)),
 Row(tweet_id='1311433495038889984', hashtag_id='inclusiefsporten', created_at=datetime.datetime(2020, 9, 30, 22, 31, 25)),
 Row(tweet_id='1311434772665847809', hashtag_id='mondkapjes', created_at=datetime.datetime(2020, 9, 30, 22, 36, 29)),
 Row(tweet_id='1311434772665847809', hashtag_id='schimmel', created_at=datetime.datetime(2020, 9, 30, 22, 36, 29)),
 Row(tweet_id='1311434772665847809', hashtag_id='infectie', created_at=datetime.datetime(2020, 9, 30, 22, 36, 29))]

In [35]:
df_user_ids = users_df.select('id_str')

df_followers = (
    spark.read.text("data/social-distancing-student/followers", wholetext=False, pathGlobFilter='*.txt')
        .select(
            F.regexp_extract(F.input_file_name(), r'([0-9]+)%20([A-z0-9%]+).txt$', 1).alias('user_id'),
            F.col('value').alias('follower_id'),
        )
)

df_followers = df_followers\
    .join(df_user_ids.alias('a'), F.col('follower_id') == F.col('a.id_str'), 'inner') \
    .join(df_user_ids.alias('b'), F.col('user_id') == F.col('b.id_str'), 'inner') \
    .select('user_id', 'follower_id')

In [36]:
write_relations(df_followers, "FOLLOWS", ":Twitter:User.id_str:follower_id", ":Twitter:User.id_str:user_id")
df_followers.head(5)

                                                                                

[Row(user_id='56377143', follower_id='3055340368'),
 Row(user_id='56377143', follower_id='62783041'),
 Row(user_id='56377143', follower_id='409104261'),
 Row(user_id='56377143', follower_id='374513273'),
 Row(user_id='56377143', follower_id='146872603')]