# Notebook to curate data for twitter flower

## Curate flower using data in ES database
To curate a flower:
- 1 Use the cells at the bottom of this section to generate data for the flower with a given Twitter handle.
- 2 Paste the following lines of code at the start of the `processdata` function in core/flower/high_level_get_flower.py:
> import networkx as nx
> egoG = nx.read_gpickle("scripts/twitter_flower.gpickle")
- 3 Create any flower using a website hosted from the repo.

In [1]:
import json, re, csv
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
import numpy as np
import networkx as nx
from datetime import datetime
import pandas as pd

client = Elasticsearch("130.56.248.35:9200")

In [2]:
topic_index = "es.mmkg-doc-cst04-*"
user_name = "AndrewYNg"
user_id = "user-twitter-216939636"


# AI: es.mmkg-doc-cst04-*
# AusPol: es.mmkg-doc-aus01-*
# User: es.mmkg-user-*

In [2]:
# get twitter id by name
def get_id(name):
    """Look up the Elasticsearch user id for a Twitter screen name.

    Runs a ``match`` query on ``screen_name`` against the ``es.mmkg-user-*``
    indices through the module-level ``client``.

    Args:
        name: Twitter screen name (handle), without the leading "@".

    Returns:
        The ``_id`` of the hit whose ``screen_name`` equals ``name``
        (case-insensitively) when there is one, otherwise the ``_id`` of the
        first hit in the response, or ``None`` when nothing matches.
    """
    query = {
        "_source": ["screen_name"],
        "query": {"match": {"screen_name": name}}
    }

    s = Search(using=client, index="es.mmkg-user-*")  # user indices
    s.update_from_dict(query)
    hits = s.execute().to_dict()["hits"]["hits"]

    if not hits:
        return None

    # A match query may return several users. Choosing from an unordered set
    # would be arbitrary, so prefer the exact handle and otherwise fall back
    # to the first hit, which keeps the result deterministic.
    wanted = name.lower() if isinstance(name, str) else None
    for hit in hits:
        screen_name = (hit.get("_source") or {}).get("screen_name")
        if isinstance(screen_name, str) and screen_name.lower() == wanted:
            return hit["_id"]

    return hits[0]["_id"]

In [4]:
get_id(user_name)

'user-twitter-216939636'

In [3]:
# get name by id
def get_name(user_id):
    """Look up the Twitter screen name stored for an Elasticsearch user id.

    Runs a ``match`` query on ``_id`` against the ``es.mmkg-user-*`` indices
    through the module-level ``client``.

    Args:
        user_id: Document id of the user, e.g. ``"user-twitter-216939636"``.

    Returns:
        The ``screen_name`` of the first hit that carries one, or ``None``
        when no such hit exists (mirrors ``get_id``, which also returns
        ``None`` for "not found" instead of raising).
    """
    query = {
        "_source": ["screen_name"],
        "query": {"match": {"_id": user_id}}
    }

    s = Search(using=client, index="es.mmkg-user-*")  # user indices
    s.update_from_dict(query)
    hits = s.execute().to_dict()["hits"]["hits"]

    for hit in hits:
        screen_name = (hit.get("_source") or {}).get("screen_name")
        if screen_name is not None:
            return screen_name

    return None

In [6]:
get_name(user_id)

'AndrewYNg'

In [4]:
# get the retweets posted by the user (i.e. tweets of others that the user retweeted)
def query_users_retweets(user_name):
    """Fetch the retweets that ``user_name`` posted in ``topic_index``.

    The user is resolved with ``get_id``; documents are then selected where
    ``user`` matches that id and ``retweeted_status`` exists. The handle of
    the original author is parsed from the ``"RT @handle:"`` prefix of the
    tweet title and resolved to an id with ``get_id``.

    Args:
        user_name: Twitter screen name of the retweeting user.

    Returns:
        A list of dicts with keys ``_id``, ``timestamp``, ``retweet_count``
        and ``user`` (id of the original author). Tweets whose title has no
        ``"RT @handle:"`` marker, or whose author cannot be resolved, are
        skipped. Returns ``[]`` when ``user_name`` itself cannot be resolved.
        At most 10000 hits are considered (the query ``size``).
    """
    user_id = get_id(user_name)
    if user_id is None:
        return []

    query = {
        "size": 10000,
        "query": {
            "bool":{
                "must":[
                    {"match":{"user": user_id}},
                    {"exists":{"field": "retweeted_status"}}
                ]
            }
        }
    }

    s = Search(using=client, index=topic_index)  # topic (tweet) indices
    s.update_from_dict(query)
    data = s.execute().to_dict()["hits"]["hits"]

    # Handles consist of letters, digits and underscores in any order; the
    # class is spelled out because "A-z" also spans punctuation characters.
    handle_pattern = re.compile(r'RT @([A-Za-z0-9_]+):')
    author_ids = {}  # handle -> resolved id, so each handle is looked up once

    output = []
    for tweet in data:
        source = tweet["_source"]
        found = handle_pattern.search(source["title"])
        if found is None:
            continue

        user_handle = found.group(1)
        if user_handle not in author_ids:
            author_ids[user_handle] = get_id(user_handle)
        author_id = author_ids[user_handle]
        if author_id is None:
            # Unknown author: storing None would merge all unknown authors
            # into a single bogus node downstream.
            continue

        output.append({
            "_id": tweet["_id"],
            "timestamp": source["timestamp"],
            "retweet_count": source["retweet_count"],
            "user": author_id,
        })

    return output

In [8]:
query_users_retweets(user_name)

[{'_id': 'tweet-1097618472366829568',
  'timestamp': '2019-02-18T22:06:51+00:00',
  'retweet_count': 0,
  'user': 'user-twitter-9356982'},
 {'_id': 'tweet-1085736916559908864',
  'timestamp': '2019-01-17T03:13:47+00:00',
  'retweet_count': 0,
  'user': 'user-twitter-854124657695367168'}]

In [9]:
# get the retweets (by other users) of tweets written by the user
def query_retweets_of_user(user_name):
    """Fetch retweets of ``user_name``'s tweets from ``topic_index``.

    Selects documents that have a ``retweeted_status`` field and whose title
    contains the phrase ``"RT @<user_name>:"``.

    Returns:
        A list of dicts with keys ``_id``, ``user`` (the retweeting user),
        ``timestamp`` and ``retweet_count``; at most 10000 entries (the
        query ``size``).
    """
    retweet_marker = "RT @{}:".format(user_name)
    must_clauses = [
        {"match_phrase": {"title": retweet_marker}},
        {"exists": {"field": "retweeted_status"}},
    ]
    query = {"size": 10000, "query": {"bool": {"must": must_clauses}}}

    search = Search(using=client, index=topic_index)
    search.update_from_dict(query)
    hits = search.execute().to_dict()["hits"]["hits"]

    return [
        {
            "_id": hit["_id"],
            "user": hit["_source"]["user"],
            "timestamp": hit["_source"]["timestamp"],
            "retweet_count": hit["_source"]["retweet_count"],
        }
        for hit in hits
    ]

In [10]:
query_retweets_of_user(user_name)

[{'_id': 'tweet-1086367727818686464',
  'user': 'user-twitter-811923527683149824',
  'timestamp': '2019-01-18T21:00:24+00:00',
  'retweet_count': 0},
 {'_id': 'tweet-1087109942497198080',
  'user': 'user-twitter-4346169252',
  'timestamp': '2019-01-20T22:09:42+00:00',
  'retweet_count': 0},
 {'_id': 'tweet-1084056928978759685',
  'user': 'user-twitter-789476240180842497',
  'timestamp': '2019-01-12T11:58:07+00:00',
  'retweet_count': 0},
 {'_id': 'tweet-1083956171583774720',
  'user': 'user-twitter-756519902001958912',
  'timestamp': '2019-01-12T05:17:44+00:00',
  'retweet_count': 0},
 {'_id': 'tweet-1090694577776984074',
  'user': 'user-twitter-612552494',
  'timestamp': '2019-01-30T19:33:46+00:00',
  'retweet_count': 0},
 {'_id': 'tweet-1086166867490807809',
  'user': 'user-twitter-2181815293',
  'timestamp': '2019-01-18T07:42:15+00:00',
  'retweet_count': 0},
 {'_id': 'tweet-1087059708651933698',
  'user': 'user-twitter-259725229',
  'timestamp': '2019-01-20T18:50:05+00:00',
  'retw

In [14]:
def get_tweet_links(handle):
    """Count retweet links in both directions between ``handle`` and others.

    Uses ``query_users_retweets`` (tweets of others that ``handle``
    retweeted) and ``query_retweets_of_user`` (retweets of ``handle``'s
    tweets), and prints the sizes of both lists plus the total link count.

    Args:
        handle: Twitter screen name of the ego user.

    Returns:
        A list with one dict per other user: ``name`` (the user id),
        ``ego_influence_on_outer`` (how often that user retweeted the ego)
        and ``outer_influence_on_ego`` (how often the ego retweeted them).
    """
    users_retweets = query_users_retweets(handle)
    retweets_of_user = query_retweets_of_user(handle)

    score_dict = {}

    def _entry(user):
        # One counter record per user, created on first sight.
        return score_dict.setdefault(
            user, {"ego_influence_on_outer": 0, "outer_influence_on_ego": 0}
        )

    for tweet in users_retweets:
        _entry(tweet["user"])["outer_influence_on_ego"] += 1

    for tweet in retweets_of_user:
        _entry(tweet["user"])["ego_influence_on_outer"] += 1

    print(len(users_retweets), len(retweets_of_user))
    print(sum([x["ego_influence_on_outer"] + x["outer_influence_on_ego"] for x in score_dict.values()]))

    tweet_links = [{"name": key,**value} for key,value in score_dict.items()]

    return tweet_links

In [15]:
get_tweet_links(user_name)

2 62
64


[{'name': 'user-twitter-9356982',
  'ego_influence_on_outer': 0,
  'outer_influence_on_ego': 1},
 {'name': 'user-twitter-854124657695367168',
  'ego_influence_on_outer': 0,
  'outer_influence_on_ego': 1},
 {'name': 'user-twitter-811923527683149824',
  'ego_influence_on_outer': 1,
  'outer_influence_on_ego': 0},
 {'name': 'user-twitter-4346169252',
  'ego_influence_on_outer': 1,
  'outer_influence_on_ego': 0},
 {'name': 'user-twitter-789476240180842497',
  'ego_influence_on_outer': 1,
  'outer_influence_on_ego': 0},
 {'name': 'user-twitter-756519902001958912',
  'ego_influence_on_outer': 1,
  'outer_influence_on_ego': 0},
 {'name': 'user-twitter-612552494',
  'ego_influence_on_outer': 1,
  'outer_influence_on_ego': 0},
 {'name': 'user-twitter-2181815293',
  'ego_influence_on_outer': 1,
  'outer_influence_on_ego': 0},
 {'name': 'user-twitter-259725229',
  'ego_influence_on_outer': 3,
  'outer_influence_on_ego': 0},
 {'name': 'user-twitter-848324624',
  'ego_influence_on_outer': 1,
  'out

In [14]:
def score_tweet_links(handle, tweet_links, petals=15):
    """Build the ego "flower" graph from link scores and pickle it.

    Args:
        handle: Label stored as the ``name`` attribute of the ``"ego"`` node.
        tweet_links: Non-empty list of dicts, each with ``name``,
            ``ego_influence_on_outer`` and ``outer_influence_on_ego``.
        petals: Maximum number of outer nodes kept; links are ranked by the
            sum of their two influence values, largest first.

    Returns:
        A ``networkx.DiGraph`` with an ``"ego"`` node and, for every kept
        link, one edge in each direction between ``"ego"`` and the link's
        ``name``. Both edges carry ``weight``, ``nweight``, ``direction``,
        ``dif``, ``ratiow``, ``sumw``, ``inf_in`` and ``inf_out``.

    Raises:
        ValueError: If ``tweet_links`` is empty.

    Side effects:
        Writes the graph to ``twitter_flower.gpickle`` in the working
        directory.
    """
    if not tweet_links:
        raise ValueError("tweet_links is empty: there are no links to build a flower from")

    max_ego_to_outer_edge = max(x["ego_influence_on_outer"] for x in tweet_links)
    max_outer_to_ego_edge = max(x["outer_influence_on_ego"] for x in tweet_links)
    max_edge = max(max_ego_to_outer_edge, max_outer_to_ego_edge)
    max_both_edge_influence = max(
        x["ego_influence_on_outer"] + x["outer_influence_on_ego"] for x in tweet_links
    )

    top = sorted(
        tweet_links,
        key=lambda x: x["ego_influence_on_outer"] + x["outer_influence_on_ego"],
        reverse=True,
    )[:petals]

    G = nx.DiGraph()
    for row in top:
        inf_out = row["ego_influence_on_outer"]
        inf_in = row["outer_influence_on_ego"]
        total = inf_in + inf_out
        dif = inf_in - inf_out
        # A link can have a zero total (e.g. weighting by a follower count of
        # 0); report neutral ratios instead of dividing by zero.
        ratiow = dif / total if total else 0.0
        sumw = total / max_both_edge_influence if max_both_edge_influence else 0.0

        ego_to_outer_attrs = {
            "weight": inf_out,
            "nweight": 0 if max_ego_to_outer_edge == 0 else inf_out / max_edge,
            "direction": "in",
            "dif": dif,
            "ratiow": ratiow,
            # Same key as on the reverse edge and on the node ("sumw").
            "sumw": sumw,
            "inf_in": inf_in,
            "inf_out": inf_out,
        }
        G.add_edge("ego", row["name"], **ego_to_outer_attrs)

        outer_to_ego = {
            "weight": inf_in,
            "nweight": 0 if max_outer_to_ego_edge == 0 else inf_in / max_edge,
            "direction": "out",
            "dif": dif,
            "ratiow": ratiow,
            "sumw": sumw,
            "inf_in": inf_in,
            "inf_out": inf_out,
        }

        node = G.nodes[row["name"]]
        node["dif"] = dif
        node["sumw"] = sumw
        node["inf_in"] = inf_in
        node["inf_out"] = inf_out
        node["ratiow"] = ratiow
        node["nratiow"] = (1 + ratiow) / 2  # ratiow rescaled from [-1, 1] to [0, 1]
        node["coauthor"] = False
        node["name"] = row["name"]
        node["sum"] = total

        G.add_edge(row["name"], "ego", **outer_to_ego)

    G.add_node("ego", name=handle)
    # Example edge attribute dicts kept for reference:
    # ego university of michigan {'weight': 0.0, 'nweight': 0.0, 'direction': 'in', 'ratiow': 1.0, 'dif': 5.0, 'sumw': 0.2857142857142857, 'inf_in': 0.0, 'inf_out': 5.0}
    # university of new south wales ego {'weight': 7.416666666666667, 'nweight': 0.6720481174848676, 'direction': 'out', 'ratiow': 0.424, 'dif': 4.416666666666667, 'sumw': 0.8015873015873017, 'inf_in': 3.0, 'inf_out': 7.416666666666667}

    # ego national bureau of economic research {'weight': 0.0, 'nweight': 0.0, 'direction': 'in', 'ratiow': 1.0, 'dif': 12.5, 'sumw': 1.0, 'inf_in': 0.0, 'inf_out': 12.5}
    # national bureau of economic research ego {'weight': 12.5, 'nweight': 1.0, 'direction': 'out', 'ratiow': 1.0, 'dif': 12.5, 'sumw': 1.0, 'inf_in': 0.0, 'inf_out': 12.5}

    G.graph = {"ego":"ego", 'max_influenced': max_ego_to_outer_edge, 'max_influencing': max_outer_to_ego_edge}
    # NOTE: nx.write_gpickle was removed in NetworkX 3.0, so this needs NetworkX < 3.
    nx.write_gpickle(G, "twitter_flower.gpickle")

    return G

# 

#### Use this cell to create a flower for the given twitter handle

In [17]:
# Twitter screen name of the ego (centre) of the flower.
handle = "ylecun"
# Rebinds the module-level index pattern that query_users_retweets and
# query_retweets_of_user read when they search.
topic_index = "es.mmkg-doc-cst04-*" # AI
# Count retweet links in both directions between the ego and other users.
tweet_links = get_tweet_links(handle)
# Build the flower graph; score_tweet_links also writes it to "twitter_flower.gpickle".
tweet_links_score = score_tweet_links(handle, tweet_links)

# AI: es.mmkg-doc-cst04-*
# techreview (MIT Tech Review)

# AusPol: es.mmkg-doc-aus01-*
# abcnews
# politicsabc

7 28
35


## Analysis of tweet frequency
This section is used to try to find interesting accounts to use.

In [None]:
# Tally one page of retweet hits into the per-user and per-handle counters.
def _tally_retweet_hits(hits, users, retweeted_users, handle_pattern):
    """Update ``users`` and ``retweeted_users`` in place from one page of hits.

    For every hit the retweeting user's ``retweets`` counter is incremented;
    when the title carries an ``"RT @handle:"`` marker, the count for that
    handle in ``retweeted_users`` is incremented as well.
    """
    for hit in hits:
        source = hit["_source"]
        user = source["user"]

        if user in users:
            users[user]["retweets"] += 1
        else:
            users[user] = {"retweets": 1, "retweeted": 0}

        found = handle_pattern.search(source["title"])
        if found is not None:
            user_handle = found.group(1)
            retweeted_users[user_handle] = retweeted_users.get(user_handle, 0) + 1


# This function gets a list of users that have retweeted and counts the number of times they
#   have retweeted and been retweeted
def all_retweets(topic_index):
    """Count, per user, retweets made and retweets received in ``topic_index``.

    Scrolls through every document that has a ``retweeted_status`` field
    (pages of up to 10000 hits), printing the page number as progress and the
    elapsed time at the end.

    Args:
        topic_index: Elasticsearch index pattern holding the tweets.

    Returns:
        Dict mapping user id to ``{"retweets": int, "retweeted": int}``, plus
        a ``"name"`` key for users found through an ``"RT @handle:"`` title.
        Retweeted users whose two counters sum to less than 10 are removed;
        users who only retweet are never filtered. Handles that ``get_id``
        cannot resolve are skipped.
    """
    # https://discuss.elastic.co/t/get-all-documents-from-an-index/86977/9
    start = datetime.now()

    users = {}
    retweeted_users = {}

    # Handles consist of letters, digits and underscores in any order; the
    # class is spelled out because "A-z" also spans punctuation characters.
    handle_pattern = re.compile(r'RT @([A-Za-z0-9_]+):')

    query = {
        "_source": ["user", "title"],
        "size": 10000,
        "query": {
            "bool":{
                "must":[
                    {"exists":{"field": "retweeted_status"}}
                ]
            }
        }
    }

    res = client.search(index=topic_index, body=query, scroll="1m")  # topic (tweet) indices
    scroll_id = res["_scroll_id"]
    num_scrolls = 0

    try:
        # scroll elastic search until all results pulled down (because there is a return limit)
        while len(res["hits"]["hits"]) > 0:
            num_scrolls += 1
            print(num_scrolls)

            _tally_retweet_hits(res["hits"]["hits"], users, retweeted_users, handle_pattern)

            res = client.scroll(scroll_id=scroll_id, scroll="1m")
            scroll_id = res["_scroll_id"]
    finally:
        # Release the server-side scroll context instead of waiting for it to expire.
        client.clear_scroll(scroll_id=scroll_id)

    # Merge the per-handle counts into the per-id table. Filtering happens
    # only after all handles are merged, so an id reached through more than
    # one handle cannot be deleted and then recreated with partial counts.
    resolved_ids = set()
    for handle, times_retweeted in retweeted_users.items():
        twitter_id = get_id(handle)
        if twitter_id is None:
            # Unknown handle: a None key would merge all unknown users together.
            continue

        entry = users.setdefault(twitter_id, {"retweets": 0, "retweeted": 0})
        entry.setdefault("name", handle)
        entry["retweeted"] += times_retweeted
        resolved_ids.add(twitter_id)

    for twitter_id in resolved_ids:
        if users[twitter_id]["retweets"] + users[twitter_id]["retweeted"] < 10:
            del users[twitter_id]

    print("took {} seconds".format((datetime.now()-start).total_seconds()))

    return users

In [None]:
# This calls all_retweets and saves the result to a CSV file. It is commented out because it is a
#    long-running process and should not be run by accident.


# topic_index = "es.mmkg-doc-cst04-*"
# users = all_retweets(topic_index)

# # save data to csv file
# start = datetime.now()
# print("saving file")
# with open("AI_retweet_count.csv","w") as fh:
#     writer = csv.writer(fh)
#     writer.writerow(["id","name","hasretweeted", "hasbeenretweeted"])
#     writer.writerows(
#         [
#             (user, 
#              users[user]["name"] if "name" in users[user] else get_name(user),
#              users[user]["retweets"],
#              users[user]["retweeted"])
#             for user in users
#             if users[user]["retweets"] + users[user]["retweeted"] > 10 and 
#             users[user]["retweets"] > 0 and
#             users[user]["retweeted"] > 0
#         ]
#     )
    
# print("finished")
# print("took {} seconds".format((datetime.now()-start).total_seconds()))

In [None]:
import matplotlib.pyplot as plt


def plot_integer_distribution(ls, title, log=True, xlim=(1, 1500), ylim=(1, 1500)):
    """Scatter-plot how often each non-negative integer occurs in ``ls``.

    Args:
        ls: Iterable of non-negative integers; negative values are ignored.
        title: Title of the figure.
        log: When true, both axes use a logarithmic scale.
        xlim: ``(low, high)`` limits of the x axis.
        ylim: ``(low, high)`` limits of the y axis.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    from collections import Counter

    # Single counting pass instead of rescanning the list for every value.
    counts = Counter(ls)
    xs = sorted(value for value in counts if value >= 0)
    ys = [counts[value] for value in xs]

    if log:
        plt.xscale("log")
        plt.yscale("log")
    plt.scatter(xs, ys)
    plt.xlim(*xlim)
    plt.ylim(*ylim)
    plt.title(title)
    plt.ylabel("Number of occurences")

    plt.show()
    
# NOTE: `users` must already exist in the kernel as a dict of per-user counts; in the
# cells shown here it is only produced by the commented-out
# `users = all_retweets(topic_index)` line.
# Per-user number of retweets made / retweets received.
num_retweets = [x["retweets"] for x in users.values()]
num_retweeted = [x["retweeted"] for x in users.values()]

# plt.show()
# One frequency plot per counter.
plot_integer_distribution(num_retweets, "Retweets")
plot_integer_distribution(num_retweeted, "Retweeted")

In [None]:
# Split the per-user counts into two parallel tuples (requires `users` to be a non-empty
# dict of {"retweets": ..., "retweeted": ...} records).
retweets, retweeted = zip(*[(x["retweets"], x["retweeted"]) for x in users.values()])

# One point per user: retweets made (x) against retweets received (y).
plt.scatter(retweets, retweeted)
# plt.xscale("log")
# plt.yscale("log")
plt.show()

In [None]:
# Users who retweeted more than 10 times and were retweeted more than 100 times,
# shown as (counts, screen name). get_name is called once per listed user.
[(users[user], get_name(user)) 
 for user in users 
 if 
 (users[user]["retweets"] > 10 
  and 
  users[user]["retweeted"] > 100)]


In [None]:
# Number of users with combined activity above 10 who retweeted more than 100 times
# and were retweeted more than 20 times. The combined activity is retweets made plus
# retweets received.
sum(
    1
    for stats in users.values()
    if stats["retweets"] + stats["retweeted"] > 10
    and stats["retweets"] > 100
    and stats["retweeted"] > 20
)

In [None]:
df = pd.read_csv("AI_retweet_count.csv")

In [None]:
df.sort_values(by="hasbeenretweeted", ascending=False).head(200)

## Use tweet dump to create flowers

In [5]:
# load data
# The dump is JSON and contains non-ASCII text, so decode it as UTF-8 explicitly
# instead of relying on the platform's default encoding.
file = "../../ylecun_100K.dump"
with open(file, "r", encoding="utf-8") as fh:
    data = json.load(fh)

In [6]:
data[0].keys()

dict_keys(['created_at', 'id', 'id_str', 'text', 'truncated', 'entities', 'metadata', 'source', 'in_reply_to_status_id', 'in_reply_to_status_id_str', 'in_reply_to_user_id', 'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'user', 'geo', 'coordinates', 'place', 'contributors', 'retweeted_status', 'is_quote_status', 'retweet_count', 'favorite_count', 'favorited', 'retweeted', 'lang'])

In [7]:
data[0]

{'created_at': 'Thu Mar 28 21:36:26 +0000 2019',
 'id': 1111381559662141440,
 'id_str': '1111381559662141440',
 'text': "RT @mounir: Félicitations @ylecun, qui reçoit l'#ACMTuringAward pour ses travaux en matière d'intelligence artificielle, notamment sur les…",
 'truncated': False,
 'entities': {'hashtags': [{'text': 'ACMTuringAward', 'indices': [48, 63]}],
  'symbols': [],
  'user_mentions': [{'screen_name': 'mounir',
    'name': 'Mounir Mahjoubi',
    'id': 80528373,
    'id_str': '80528373',
    'indices': [3, 10]},
   {'screen_name': 'ylecun',
    'name': 'Yann LeCun',
    'id': 48008938,
    'id_str': '48008938',
    'indices': [26, 33]}],
  'urls': []},
 'metadata': {'iso_language_code': 'fr', 'result_type': 'recent'},
 'source': '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>',
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'in_reply_to_screen_name': None,

In [8]:
# number of tweets in the dump that were authored by @ylecun
sum(1 for tweet in data if tweet["user"]["screen_name"] == "ylecun")

11

In [9]:
# Number of tweets in the dump that mention @ylecun at least once
sum(
    1
    for tweet in data
    if any(
        mention["screen_name"] == "ylecun"
        for mention in tweet["entities"]["user_mentions"]
    )
)

7643

In [10]:
# Follower count per screen name, taken from the authors of the tweets in `data`.
# When a screen name authored several tweets, the value from the last one wins.
user_followers = {tweet["user"]["screen_name"]: tweet["user"]["followers_count"] for tweet in data}

def get_links(handle="ylecun", tweets=None):
    """Collect link records between ``handle`` and other users from a tweet dump.

    Args:
        handle: Screen name of the ego user.
        tweets: List of tweet dicts (each with ``user``, ``entities`` and
            ``retweet_count``). Defaults to the module-level ``data``.

    Returns:
        ``(users_rts, rts_of_user)``. ``users_rts`` has one record per user
        mentioned in a tweet authored by ``handle``; ``rts_of_user`` has one
        record per tweet by someone else that mentions ``handle``. Every
        record has ``user`` (display name), ``handle``, ``rtcount`` and
        ``followers``. A mentioned user who authored no tweet in ``tweets``
        has no known follower count and gets ``followers == 0``.
    """
    if tweets is None:
        tweets = data

    # Follower counts are only known for users who authored a tweet in the dump.
    followers = {
        tweet["user"]["screen_name"]: tweet["user"]["followers_count"]
        for tweet in tweets
    }

    users_rts = []
    rts_of_user = []
    for tweet in tweets:
        author = tweet["user"]
        mentions = tweet["entities"]["user_mentions"]
        if author["screen_name"] == handle:
            for mentioned in mentions:
                users_rts.append({
                    "user": mentioned["name"],
                    "handle": mentioned["screen_name"],
                    "rtcount": tweet["retweet_count"],
                    "followers": followers.get(mentioned["screen_name"], 0),
                })
        elif any(mentioned["screen_name"] == handle for mentioned in mentions):
            rts_of_user.append({
                "user": author["name"],
                "handle": author["screen_name"],
                "rtcount": tweet["retweet_count"],
                "followers": author["followers_count"],
            })
    return users_rts, rts_of_user


In [11]:
def structure_tweet_links(users_rts, rts_of_users, weight=None):
    """Aggregate link records into one influence score pair per user.

    Args:
        users_rts: Records for users the ego linked to; each adds to that
            user's ``outer_influence_on_ego``.
        rts_of_users: Records for users who linked to the ego; each adds to
            that user's ``ego_influence_on_outer``.
        weight: Record key whose value is added per record (e.g.
            ``"followers"``); ``None`` counts every record as 1.

    Returns:
        A list with one dict per distinct ``record["user"]``: ``name``,
        ``handle``, ``ego_influence_on_outer``, ``outer_influence_on_ego``.
        Also prints the two input sizes and the total score.
    """
    score_dict = {}

    def _entry(tweet):
        # One record per user, created on first sight with that record's handle.
        return score_dict.setdefault(
            tweet["user"],
            {"handle": tweet["handle"], "ego_influence_on_outer": 0, "outer_influence_on_ego": 0},
        )

    for tweet in users_rts:
        _entry(tweet)["outer_influence_on_ego"] += 1 if weight is None else tweet[weight]

    # Use the argument itself, not a same-named variable from the enclosing notebook.
    for tweet in rts_of_users:
        _entry(tweet)["ego_influence_on_outer"] += 1 if weight is None else tweet[weight]

    print(len(users_rts), len(rts_of_users))
    print(sum([x["ego_influence_on_outer"] + x["outer_influence_on_ego"] for x in score_dict.values()]))

    tweet_links = [{"name": key,**value} for key,value in score_dict.items()]

    return tweet_links

In [15]:
# handle = "Yann LeCun"
# handle = "@Yann LeCun"
# handle = "Yann LeCun @ylecun"
# Label stored on the "ego" node of the flower.
handle = "@ylecun"

# Link records from the tweet dump, then per-user scores weighted by follower count.
users_rts, rts_of_user = get_links() 
tweet_links = structure_tweet_links(users_rts, rts_of_user, weight="followers")


# Alternative node labels that were tried:
# for link in tweet_links:
#     link["name"] = link["name"] + " @" + link["handle"]
    
# for link in tweet_links:
#     link["name"] = "@" + link["name"]

# Label every outer node with "@<screen name>" (overwrites the display name in place).
for link in tweet_links:
    link["name"] = "@" + link["handle"]
    

# Build the flower with up to 25 petals; this also writes "twitter_flower.gpickle".
tweet_links_score = score_tweet_links(handle, tweet_links, petals = 25)

32 7633
66151695


In [28]:
def unique(list1):
    """Drop items whose second element was already seen.

    Args:
        list1: Iterable of indexable items; ``x[1]`` is the de-duplication
            key and must be hashable.

    Returns:
        A list holding the first item for each distinct ``x[1]``, in input
        order.
    """
    seen_keys = set()  # set membership keeps the pass linear
    output = []

    for x in list1:
        if x[1] not in seen_keys:
            seen_keys.add(x[1])
            output.append(x)

    return output

# Distinct (display name, screen name, followers_count) tuples of the tweet authors in `data`.
# NOTE: this rebinds `users`, which the tweet-frequency analysis cells read as a dict of
# per-user counts.
users = list(set((x["user"]["name"], x["user"]["screen_name"], x["user"]["followers_count"]) for x in data))
# Sort by follower count (descending) and keep the first tuple for each screen name.
users = unique(sorted(users, key=lambda x: x[2], reverse=True))
users[:25]

[('Facebook', 'facebook', 13516839),
 ('Emmanuel Macron', 'EmmanuelMacron', 3781373),
 ('Nature News & Comment', 'NatureNews', 1900056),
 ('Satya Nadella', 'satyanadella', 1861411),
 ('NVIDIA', 'nvidia', 1434465),
 ('Europe 1 📻', 'Europe1', 1418525),
 ('Google AI', 'GoogleAI', 1329758),
 ('الخارجية الفرنسية', 'francediplo_AR', 398730),
 ('Andrew Ng', 'AndrewYNg', 389995),
 ('Coursera', 'coursera', 388778),
 ('Microsoft Research', 'MSFTResearch', 387981),
 ('La French Tech', 'LaFrenchTech', 353586),
 ('NBC Bay Area', 'nbcbayarea', 319528),
 ('Yicai Global 第一财经', 'yicaichina', 265441),
 ('Facebook Engineering', 'fb_engineering', 239633),
 ('Kirk Borne', 'KirkDBorne', 226423),
 ('DeepMind', 'DeepMindAI', 223067),
 ('Montreal.AI', 'Montreal_AI', 185563),
 ('Vincent Boucher 🕊', 'ceobillionaire', 178065),
 ('Ronald van Loon #HM19', 'Ronald_vanLoon', 177772),
 ('Erik Brynjolfsson', 'erikbryn', 176538),
 ('Mounir Mahjoubi', 'mounir', 172927),
 ('Montréal.IA', 'Montreal_IA', 169017),
 ('Québec.

In [17]:
# this second function for structuring tweet links is to allow edges to be counted individually and weighted
#    differently from the node size
def structure_tweet_links2(users_rts, rts_of_users, weight=None):
    """Aggregate link records into weighted scores plus raw counts per user.

    Args:
        users_rts: Records for users the ego linked to; each adds to that
            user's ``outer_influence_on_ego`` and ``num_outer_influence_on_ego``.
        rts_of_users: Records for users who linked to the ego; each adds to
            that user's ``ego_influence_on_outer`` and ``num_ego_influence_on_outer``.
        weight: Record key whose value is added to the influence score per
            record (e.g. ``"followers"``); ``None`` adds 1. The ``num_*``
            counters always grow by 1.

    Returns:
        A list with one dict per distinct ``record["user"]``: ``name``,
        ``handle``, the two influence scores and the two ``num_*`` counters.
        Also prints the two input sizes and the total score.
    """
    score_dict = {}

    def _entry(tweet):
        # One record per user, created on first sight with that record's handle.
        return score_dict.setdefault(
            tweet["user"],
            {
                "handle": tweet["handle"],
                "ego_influence_on_outer": 0,
                "outer_influence_on_ego": 0,
                "num_outer_influence_on_ego": 0,
                "num_ego_influence_on_outer": 0,
            },
        )

    for tweet in users_rts:
        entry = _entry(tweet)
        entry["outer_influence_on_ego"] += 1 if weight is None else tweet[weight]
        entry["num_outer_influence_on_ego"] += 1

    # Use the argument itself, not a same-named variable from the enclosing notebook.
    for tweet in rts_of_users:
        entry = _entry(tweet)
        entry["ego_influence_on_outer"] += 1 if weight is None else tweet[weight]
        entry["num_ego_influence_on_outer"] += 1

    print(len(users_rts), len(rts_of_users))
    print(sum([x["ego_influence_on_outer"] + x["outer_influence_on_ego"] for x in score_dict.values()]))

    tweet_links = [{"name": key,**value} for key,value in score_dict.items()]

    return tweet_links

In [29]:
def score_tweet_links2(handle, tweet_links, petals=15):
    """Build the ego "flower" graph with count-based edge widths and pickle it.

    Unlike ``score_tweet_links``, the edge ``nweight`` comes from the raw
    ``num_*`` link counts while node attributes use the weighted influence
    scores; each outer node also records its rank as ``bloom_order``.

    Args:
        handle: Label stored as the ``name`` attribute of the ``"ego"`` node.
        tweet_links: Non-empty list of dicts, each with ``name``,
            ``ego_influence_on_outer``, ``outer_influence_on_ego``,
            ``num_ego_influence_on_outer`` and ``num_outer_influence_on_ego``.
        petals: Maximum number of outer nodes kept; links are ranked by the
            sum of their two influence scores, largest first.

    Returns:
        A ``networkx.DiGraph`` with an ``"ego"`` node and, for every kept
        link, one edge in each direction between ``"ego"`` and the link's
        ``name``.

    Raises:
        ValueError: If ``tweet_links`` is empty.

    Side effects:
        Writes the graph to ``twitter_flower.gpickle`` in the working
        directory.
    """
    if not tweet_links:
        raise ValueError("tweet_links is empty: there are no links to build a flower from")

    max_ego_to_outer_edge = max(x["ego_influence_on_outer"] for x in tweet_links)
    max_outer_to_ego_edge = max(x["outer_influence_on_ego"] for x in tweet_links)
    max_both_edge_influence = max(
        x["ego_influence_on_outer"] + x["outer_influence_on_ego"] for x in tweet_links
    )
    max_both_edge_influence_count = max(
        x["num_ego_influence_on_outer"] + x["num_outer_influence_on_ego"] for x in tweet_links
    )

    top = sorted(
        tweet_links,
        key=lambda x: x["ego_influence_on_outer"] + x["outer_influence_on_ego"],
        reverse=True,
    )[:petals]

    G = nx.DiGraph()
    for order, row in enumerate(top):
        inf_out = row["ego_influence_on_outer"]
        inf_in = row["outer_influence_on_ego"]
        total = inf_in + inf_out
        dif = inf_in - inf_out
        # A link can have a zero total (e.g. weighting by a follower count of
        # 0); report neutral ratios instead of dividing by zero.
        ratiow = dif / total if total else 0.0
        sumw = total / max_both_edge_influence if max_both_edge_influence else 0.0

        if max_ego_to_outer_edge == 0 or max_both_edge_influence_count == 0:
            ego_nweight = 0
        else:
            ego_nweight = row["num_ego_influence_on_outer"] / max_both_edge_influence_count

        if max_outer_to_ego_edge == 0 or max_both_edge_influence_count == 0:
            outer_nweight = 0
        else:
            outer_nweight = row["num_outer_influence_on_ego"] / max_both_edge_influence_count

        ego_to_outer_attrs = {
            "weight": inf_out,
            "nweight": ego_nweight,
            "direction": "in",
            "dif": dif,
            "ratiow": ratiow,
            "sumw": sumw,
            "inf_in": inf_in,
            "inf_out": inf_out,
        }
        G.add_edge("ego", row["name"], **ego_to_outer_attrs)

        outer_to_ego = {
            "weight": inf_in,
            "nweight": outer_nweight,
            "direction": "out",
            "dif": dif,
            "ratiow": ratiow,
            "sumw": sumw,
            "inf_in": inf_in,
            "inf_out": inf_out,
        }

        node = G.nodes[row["name"]]
        node["dif"] = dif
        node["sumw"] = sumw
        node["inf_in"] = inf_in
        node["inf_out"] = inf_out
        node["ratiow"] = ratiow
        node["nratiow"] = (1 + ratiow) / 2  # ratiow rescaled from [-1, 1] to [0, 1]
        node["coauthor"] = False
        node["name"] = row["name"]
        node["sum"] = total
        node["bloom_order"] = order  # rank of the petal, 0 = highest combined score
        G.add_edge(row["name"], "ego", **outer_to_ego)

    G.add_node("ego", name=handle)
    # Example edge attribute dicts kept for reference:
    # ego university of michigan {'weight': 0.0, 'nweight': 0.0, 'direction': 'in', 'ratiow': 1.0, 'dif': 5.0, 'sumw': 0.2857142857142857, 'inf_in': 0.0, 'inf_out': 5.0}
    # university of new south wales ego {'weight': 7.416666666666667, 'nweight': 0.6720481174848676, 'direction': 'out', 'ratiow': 0.424, 'dif': 4.416666666666667, 'sumw': 0.8015873015873017, 'inf_in': 3.0, 'inf_out': 7.416666666666667}

    # ego national bureau of economic research {'weight': 0.0, 'nweight': 0.0, 'direction': 'in', 'ratiow': 1.0, 'dif': 12.5, 'sumw': 1.0, 'inf_in': 0.0, 'inf_out': 12.5}
    # national bureau of economic research ego {'weight': 12.5, 'nweight': 1.0, 'direction': 'out', 'ratiow': 1.0, 'dif': 12.5, 'sumw': 1.0, 'inf_in': 0.0, 'inf_out': 12.5}

    G.graph = {"ego":"ego", 'max_influenced': max_ego_to_outer_edge, 'max_influencing': max_outer_to_ego_edge}
    # NOTE: nx.write_gpickle was removed in NetworkX 3.0, so this needs NetworkX < 3.
    nx.write_gpickle(G, "twitter_flower.gpickle")

    return G

# 

In [30]:
# handle = "Yann LeCun"
# handle = "@Yann LeCun"
# handle = "Yann LeCun @ylecun"
# Label stored on the "ego" node of the flower.
handle = "@ylecun"

# Link records from the tweet dump, then per-user scores (weighted by follower count)
# together with raw link counts.
users_rts, rts_of_user = get_links() 
tweet_links = structure_tweet_links2(users_rts, rts_of_user, weight="followers")


# Alternative node labels that were tried:
# for link in tweet_links:
#     link["name"] = link["name"] + " @" + link["handle"]
    
# for link in tweet_links:
#     link["name"] = "@" + link["name"]

# Label every outer node with "@<screen name>" (overwrites the display name in place).
for link in tweet_links:
    link["name"] = "@" + link["handle"]
    

# Build the flower with up to 25 petals; this also writes "twitter_flower.gpickle".
tweet_links_score = score_tweet_links2(handle, tweet_links, petals = 25)

32 7633
66151695


In [31]:
links = list(tweet_links_score.edges(data=True))

In [32]:
links[0][2]

{'weight': 7562746,
 'nweight': 0.058823529411764705,
 'direction': 'in',
 'dif': 0,
 'ratiow': 0.0,
 'sumw': 1.0,
 'inf_in': 7562746,
 'inf_out': 7562746}