In [3]:
from py2neo import Graph
import os

# Shared database handle used by the functions and cells below; the password
# is read from the NEO4J_PW environment variable (None when it is unset).
graph = Graph(password=os.getenv('NEO4J_PW'))


def create_rel_to_parent(child_id, parent_id, timestamp, graph, via):
    """Record that ``child_id`` adopted from ``parent_id``.

    Runs a single Cypher statement that matches both ``User`` nodes by their
    ``user_id`` and issues ``CREATE UNIQUE`` for an ``ADOPTED`` relationship
    pointing from the child to the parent.

    Args:
        child_id: ``user_id`` of the adopting user (start of the relationship).
        parent_id: ``user_id`` of the user adopted from (end of the relationship).
        timestamp: stored as the relationship's ``timestamp`` property.
        graph: object whose ``run(query, **params)`` executes the statement.
        via: stored as the relationship's ``via`` property (how the parent
            was determined).

    Returns:
        None.
    """
    cypher = " ".join((
        "MATCH (child:User {user_id: {child_id}}), (parent:User {user_id: {parent_id}})",
        "CREATE UNIQUE (child)-[:ADOPTED {timestamp: {timestamp}, via: {via}}]->(parent)",
    ))
    params = {
        "child_id": child_id,
        "parent_id": parent_id,
        "timestamp": timestamp,
        "via": via,
    }
    graph.run(cypher, **params)


def _find_middleman(author_id, original_tweet_id, timestamp, friend_ids, rel_pattern):
    """Return the user_id of the friend who most recently relayed a tweet, or None.

    Searches for tweets ``t`` that were created before ``timestamp`` by a user
    whose ``user_id`` is in ``friend_ids`` and that are connected through
    ``rel_pattern`` to the tweet ``original_tweet_id`` authored by
    ``author_id``. Of all candidates the one with the latest ``created_at``
    wins.

    Args:
        author_id: ``user_id`` of the author of the original tweet.
        original_tweet_id: ``tweet_id`` of the original tweet.
        timestamp: only tweets created strictly before this value count.
        friend_ids: list of ``user_id`` values eligible as middleman.
        rel_pattern: Cypher relationship fragment placed between the original
            tweet and the relaying tweet, e.g. ``"<-[:NATIVELY_RETWEETS]-"``
            or ``"<--"`` for any relationship type.
    """
    return graph.evaluate(
        "OPTIONAL MATCH (:User {user_id: {author_id}})-[:TWEETED]->(ot:Tweet)" +
        rel_pattern + "(t:Tweet)<-[:TWEETED]-(middleman:User) " +
        "WHERE t.created_at < {timestamp} AND ot.tweet_id = {original_tweet_id} " +
        "AND middleman.user_id IN {friend_ids} " +
        # order by the same property that is filtered on so that "most recent" is well defined
        "RETURN middleman.user_id ORDER BY t.created_at DESC LIMIT 1",
        author_id=author_id, timestamp=timestamp,
        friend_ids=friend_ids, original_tweet_id=original_tweet_id
    )


def assign_parent(user_id):
    """Determine the parent of ``user_id`` and store it as an ADOPTED relationship.

    The decision is based on the user's earliest tweet (smallest
    ``created_at``) and on the users they followed before that tweet
    (``FOLLOWS.followed_after`` smaller than the tweet's ``created_at``):

    * native retweet: the retweeted user if followed; otherwise the friend who
      most recently retweeted the same tweet; otherwise the retweeted user.
    * quote: same procedure with the quoted user.
    * reply: only if the replied-to user has an earlier tweet in the dataset;
      then the replied-to user if followed, otherwise the friend who most
      recently interacted with the replied-to tweet (no parent if none).
    * anything else: the friend with the most recent earlier tweet, if any.

    Args:
        user_id: ``user_id`` of the focal user.

    Returns:
        ``user_id`` unchanged, whether or not a parent was assigned.
    """
    # Each OPTIONAL MATCH binds its own target-tweet variable. Sharing a single
    # variable would leave it bound to null after the first non-matching
    # OPTIONAL MATCH, so the QUOTES and REPLIES patterns could never match.
    first_tweet_props = graph.run(
        "MATCH (:User {user_id: {user_id}})-[:TWEETED]->(tweet) " +
        "OPTIONAL MATCH (tweet)-[:NATIVELY_RETWEETS]->(retweeted_tweet)<-[:TWEETED]-(retweeted_user:User) " +
        "OPTIONAL MATCH (tweet)-[:QUOTES]->(quoted_tweet)<-[:TWEETED]-(quoted_user:User) " +
        "OPTIONAL MATCH (tweet)-[:REPLIES]->(replied_to_tweet)<-[:TWEETED]-(replied_to_user:User) " +
        "RETURN tweet.created_at, " +
        "retweeted_tweet.tweet_id, quoted_tweet.tweet_id, replied_to_tweet.tweet_id, " +
        "retweeted_user.user_id, quoted_user.user_id, replied_to_user.user_id " +
        "ORDER BY tweet.created_at ASC LIMIT 1",
        user_id=user_id
    ).data()

    if not first_tweet_props:
        # user has no tweets at all: nothing to derive a parent from
        return user_id

    first_tweet = first_tweet_props[0]
    ts = first_tweet['tweet.created_at']
    retweeted_tweet_id = first_tweet['retweeted_tweet.tweet_id']
    quoted_tweet_id = first_tweet['quoted_tweet.tweet_id']
    replied_to_tweet_id = first_tweet['replied_to_tweet.tweet_id']
    retweeted_user_id = first_tweet['retweeted_user.user_id']
    quoted_user_id = first_tweet['quoted_user.user_id']
    replied_to_user_id = first_tweet['replied_to_user.user_id']

    friends = graph.run(
        "MATCH (:User {user_id: {user_id}})-[f:FOLLOWS]->(u) " +
        "WHERE f.followed_after < {timestamp} " +
        "RETURN u.user_id",
        user_id=user_id, timestamp=ts
    )
    # kept as a list because it is also passed to Cypher as the IN-list parameter
    friend_ids = [friend['u.user_id'] for friend in friends]

    parent_id = None
    via = None

    if retweeted_user_id is not None:
        if retweeted_user_id in friend_ids:
            parent_id, via = retweeted_user_id, 'retweeted tweet of friend'
        else:
            # retweeted user is not followed: look for a friend who retweeted the same tweet earlier
            middleman = _find_middleman(
                retweeted_user_id, retweeted_tweet_id, ts, friend_ids, "<-[:NATIVELY_RETWEETS]-"
            )
            if middleman is not None:
                parent_id, via = middleman, 'retweeted retweet by friend'
            else:  # if no middleman can be found, the parent is the retweeted user
                parent_id, via = retweeted_user_id, 'retweeted tweet not by friend'
    elif quoted_user_id is not None:
        # almost same procedure as for retweets
        if quoted_user_id in friend_ids:
            parent_id, via = quoted_user_id, 'quoted tweet by friend'
        else:
            middleman = _find_middleman(
                quoted_user_id, quoted_tweet_id, ts, friend_ids, "<-[:NATIVELY_RETWEETS]-"
            )
            if middleman is not None:
                parent_id, via = middleman, 'quoted retweet by friend'
            else:  # if no middleman can be found, the parent is the quoted user
                parent_id, via = quoted_user_id, 'quoted tweet not by friend'
    elif replied_to_user_id is not None:
        source_tweet = graph.evaluate(
            "OPTIONAL MATCH (:User {user_id: {replied_to_user_id}})-[:TWEETED]->(t) " +
            # explicitly exclude tweets not in the dataset which will have NULL for timestamp
            "WHERE t.created_at IS NOT NULL AND t.created_at < {timestamp} " +
            "RETURN t.created_at ORDER BY t.created_at DESC LIMIT 1",
            replied_to_user_id=replied_to_user_id, timestamp=ts
        )
        if source_tweet is not None:  # replied-to user has an earlier tweet in the dataset
            if replied_to_user_id in friend_ids:
                parent_id, via = replied_to_user_id, 'replied to tweet by friend'
            else:
                # any earlier interaction (retweet/quote/reply) of a friend with the replied-to tweet
                middleman = _find_middleman(
                    replied_to_user_id, replied_to_tweet_id, ts, friend_ids, "<--"
                )
                if middleman is not None:
                    parent_id, via = middleman, 'replied to tweet via friend'
    else:
        # No interaction on the first tweet: fall back to the friend who tweeted
        # most recently before it. friend_ids holds user_id values and the
        # result is matched on user_id later, so compare and return user_id.
        parent_id = graph.evaluate(
            "OPTIONAL MATCH (t:Tweet)<-[:TWEETED]-(u) " +
            # the comparison also excludes tweets without created_at, i.e. tweets
            # that have not been retrieved by the search, i.e. most likely w/o the link
            "WHERE t.created_at < {timestamp} AND u.user_id IN {friend_ids} " +
            "RETURN u.user_id ORDER BY t.created_at DESC LIMIT 1",
            timestamp=ts, friend_ids=friend_ids
        )
        via = 'last tweet seen with link'

    if parent_id is not None:
        create_rel_to_parent(user_id, parent_id, ts, graph, via=via)

    return user_id

In [None]:
%%time

# Stream the distinct user_ids of all users who authored a tweet with a
# positive created_at; tweets are sorted by created_at before the authors
# are matched.
userstream = graph.run(" ".join((
    "MATCH (t:Tweet)",
    "WHERE t.created_at > 0",
    "WITH t, t.created_at as ts ORDER BY ts ASC",
    "MATCH (t:Tweet)<-[:TWEETED]-(u:User)",
    "RETURN DISTINCT u.user_id",
)))

# `i` counts the users processed so far and stays available at module level
# after the loop; progress is printed every 50 users.
i = 0

for user in userstream:
    assign_parent(user['u.user_id'])
    i += 1
    if not i % 50:
        print(i)

50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3050
3100
3150
3200
3250
3300
3350
3400
3450
3500
3550
3600
3650
3700
3750
3800
3850
3900
3950
4000
4050
4100
4150
4200
4250
4300
4350
4400
4450
4500
4550
4600
4650
4700
4750
4800
4850
4900
4950
5000
5050
5100
5150
5200
5250
5300
5350
5400
5450
5500
5550
5600
5650
5700
5750
5800
5850
5900
5950
6000
6050
6100
6150
6200
6250
6300
6350
6400
6450
6500
6550
6600
6650
6700
6750
6800
6850
6900
6950
7000
7050
7100
7150
7200
7250
7300
7350
7400
7450
7500
7550
7600
7650
7700
7750
7800
7850
7900
7950
8000
8050
8100
8150
8200
8250
8300
8350
8400
8450
8500
8550
8600
8650
8700
8750
8800
8850
8900
8950
9000
9050
9100
9150
9200
9250
9300
9350
9400
9450
9500
9550
9600
9650
9700
9750
9800
9850
9900
9950
10000
10050
10100
10150
1

A speedup via multiprocessing should be possible, but py2neo's new `run` method apparently does not work well with it. For larger datasets, refactor this — perhaps using the official Neo4j database driver.

In [6]:
# `i` is the counter incremented once per user in the assignment loop,
# so this shows how many users were processed in total.
print(i)

51047
