In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import pickle

In [2]:
%%capture

# check if this is a colab notebook and clone the repo if it is
if 'COLAB_GPU' in os.environ:
    %cd /content/
    !git clone https://github.com/Enver-group/twitch-web-analytics
    %cd twitch-web-analytics
    !gdown --id 11IfXaA66-D7vjA2R46uAilBgOPxYmKmn
else:
    os.chdir('..')

!pip install -r requirements.txt

!pip install -e .

In [3]:
#autoreload modules
%load_ext autoreload
%autoreload 2

from src.user import User

In [4]:
df = pd.read_feather("data/streamers.feather")

#TODO: put this cell in the code that generates the dataframe

# removing users that have no data about which users they follow
num_users = df.shape[0]
df.dropna(subset=['num_followers'],inplace=True)
print(num_users - df.shape[0], "users have been removed for having null data in num_followers")

# remove from each array with users followed (the arrays that are in 'user_follows') the users that are not in the dataset
df_streamers_exploded = df.explode("user_follows")
num_users_followed = len(df_streamers_exploded["user_follows"].unique())
df_streamers_exploded = df_streamers_exploded[df_streamers_exploded["user_follows"].isin(df.id)]
print(num_users_followed - len(df_streamers_exploded["user_follows"].unique()), "users have been removed from the arrays in 'user_follows'")
user_follows_arrays = df_streamers_exploded.groupby("id").user_follows.apply(np.array).reset_index()
df = df.set_index("id")
df.loc[user_follows_arrays.id,"user_follows"] = user_follows_arrays.user_follows.values
# assign empty arrays to the users removed for not following any user of the dataset
not_in_set_or_null = ~df.index.isin(user_follows_arrays.id)
df.loc[not_in_set_or_null,"user_follows"] =  pd.Series([[]]*not_in_set_or_null.sum()).values
df.reset_index(level=0, inplace=True)
print(not_in_set_or_null.sum(), "users don't follow to any user after applying the previous filters")


70599 users have been removed for having null data in num_followers
51735 users have been removed from the arrays in 'user_follows'
68 users don't follow to any user after applying the previous filters


In [5]:
# converting the df into a graph
G = nx.DiGraph()
for i,user in df.iterrows():
  G.add_node(user["id"], **user.drop(["id","user_follows"]))
  if isinstance(user["user_follows"], np.ndarray):
    G.add_edges_from([(user["id"], id_followed) for id_followed in user["user_follows"]]) 

In [6]:
# checking the correctness of the graph
print("Number of nodes:", G.number_of_nodes())
print("Number of edges:", G.number_of_edges(), "\n")
for i, user in enumerate(G.degree()):
  if i >10:
    break
  print(G.nodes[user[0]]["name"], "is following to", user[1], "users")

Number of nodes: 5031
Number of edges: 474107 

auronplay is following to 2575 users
josecristo_ is following to 219 users
xXxTheFocuSxXx is following to 245 users
Ampeterby7 is following to 539 users
javiDMr10 is following to 143 users
gtv_genesis is following to 284 users
Tanizen is following to 531 users
Carola is following to 682 users
CooLifeGame is following to 940 users
Luzu is following to 669 users
ZormanWorld is following to 292 users


In [7]:
# # another approach creating the graph from the exploded df (TODO: understand why the result is not equivalent)

# df_streamers = pd.read_feather("data/streamers.feather")
# df_streamers.dropna(subset=['num_followers'],inplace=True)
# edges = df_streamers.explode("user_follows")[["id","user_follows"]]
# edges = edges[edges["user_follows"].isin(df_streamers.id)]

# G = nx.from_pandas_edgelist(edges, 'id', 'user_follows')
# for col_name in df_streamers.drop(['id', 'user_follows'], axis=1):
#   nx.set_node_attributes(G, pd.Series(df_streamers[col_name], index=df_streamers["id"]).to_dict(), col_name)

# print("Number of nodes:", G.number_of_nodes())
# print("Number of edges:", G.number_of_edges(), "\n")
# for i, user in enumerate(G.degree()):
#   if i >10:
#     break
#   print(G.nodes[user[0]]["name"], "is following to", user[1], "users")


## Graph analysis

In [8]:
# compute fundamental metrics(or load if they'be been already computed)

metrics = {"indegree":nx.in_degree_centrality, "outdegree":nx.out_degree_centrality, "closeness":nx.closeness_centrality, 
            "betweenness":nx.betweenness_centrality, "pagerank":nx.pagerank}

# create folder to save the metrics if needed
folder_name = "data/fundamental_metrics"
if not os.path.exists(folder_name):
  os.mkdir(folder_name)

for m in metrics:
  file_name = f"{folder_name}/{m}.pkl"
  if os.path.exists(file_name):
    # load metric
    with open(file_name, 'rb') as f:
      metrics[m] = pickle.load(f)
  else:
    # compute metric
    computed_metric = metrics[m](G)
    # sort metric from highest to lowest
    metrics[m] = dict(sorted(computed_metric.items(), key=lambda x: x[1],reverse=True))
    # store metric
    with open(file_name, 'wb') as f:
        pickle.dump(metrics[m], f)
  # display 10 first items of metric
  print(f"\n{m}:", dict(list(metrics[m].items())[:10]))



indegree: {'83232866': 0.6838966202783301, '39276140': 0.5777335984095427, '459331509': 0.5033797216699801, '48878319': 0.45268389662027836, '36473331': 0.3453280318091451, '31919607': 0.34393638170974156, '512977322': 0.28151093439363817, '57793021': 0.27137176938369784, '123922797': 0.2697813121272366, '70357283': 0.2481113320079523}

outdegree: {'42219929': 0.1433399602385686, '38617117': 0.11809145129224652, '161339757': 0.10834990059642148, '38587710': 0.10616302186878727, '60978790': 0.10437375745526839, '253842308': 0.10357852882703777, '129224135': 0.10357852882703777, '276673966': 0.10318091451292247, '55347167': 0.1021868787276342, '46233206': 0.1021868787276342}

closeness: {'83232866': 0.7492422459110368, '39276140': 0.6906630062689043, '459331509': 0.6541322232645903, '48878319': 0.6308186264643544, '31919607': 0.5873583555777275, '36473331': 0.5848269416185754, '123922797': 0.5615656332153075, '512977322': 0.5614365525043052, '57793021': 0.5604060391579626, '121510236': 

In [9]:
#obtain per each metric 10 nodes with the highest values
df_ranking_metrics = pd.DataFrame()
for m in metrics:
  ranking10_ids = list(metrics[m].keys())[:10]
  ranking10_names = [G.nodes[node_id]["name"] for node_id in ranking10_ids]
  df_ranking_metrics[m] = ranking10_names
print("Graph fundamental metrics:")

df_ranking_metrics

Graph fundamental metrics:


Unnamed: 0,indegree,outdegree,closeness,betweenness,pagerank
0,ibai,malaso,ibai,coscu,ibai
1,Rubius,DarkozTV,Rubius,TheGrefg,Rubius
2,auronplay,Hydr4G,auronplay,IamCristinini,auronplay
3,TheGrefg,JaazminG,TheGrefg,DuendePablo,TheGrefg
4,coscu,iEluney,elxokas,Sibrel,elxokas
5,elxokas,Ardillaloca24,coscu,ibai,IlloJuan
6,SLAKUN10,BestorVD,IamCristinini,Rubius,LVPes
7,LOLITOFDEZ,LeandroRiccio,SLAKUN10,juansguarnizo,ESL_csgo_es
8,IamCristinini,Sibrel,LOLITOFDEZ,DarkozTV,alexelcapo
9,AriGameplays,iDamFox,juansguarnizo,TELLIER50,IamCristinini


In [10]:
# compute k-core descomposition
nx_cores = nx.algorithms.core.core_number(G)
print("\nFirst 10 users with the cores given by NetworkX:\n", dict(list(nx_cores.items())[:10]))

# reformat NetworkX solution
nx_cores_format = {c:set() for c in set(nx_cores.values())}
for node in nx_cores:
  nx_cores_format[nx_cores[node]].add(node)
print("\nFirst 10 cores with their users associated:\n", dict(list(nx_cores_format.items())[:10]))

highest_core = list(nx_cores.values())[0]
print("\nThe highest core number is", highest_core)
print("\nThe users that have the highest core are:")
users_highset_core = []
for node_id in nx_cores_format[highest_core]:
  user_name = G.nodes[node_id]["name"]
  users_highset_core.append(user_name)
  print(user_name, end=", ")


First 10 users with the cores given by NetworkX:
 {'459331509': 179, '210708721': 133, '431460701': 133, '77649106': 138, '130065491': 114, '427412996': 137, '40299581': 137, '110405254': 137, '42814514': 138, '66370849': 138}

First 10 cores with their users associated:
 {0: {'620773811'}, 1: {'439373923', '400033906', '442505580', '236190404'}, 2: {'20619890', '455809385', '99286454'}, 3: {'524567698', '138218530', '162503630', '632828507', '489135756', '24813581'}, 4: {'529700302', '177296909', '105915313', '611282309', '507151739', '605649930', '552105043', '728541573', '116205426'}, 5: {'531366537', '543519001', '45597067', '58350953', '83025891', '41930562', '87455204'}, 6: {'92098894', '675517900', '71135659', '583178716', '179096261', '466365220', '692907111', '112100325', '94175451'}, 7: {'37660554', '675798357', '534211692', '42386491', '136625428', '37660816'}, 8: {'659742503', '485395238', '129543937', '111807900', '109931391', '25303197', '688077535', '136501897', '648149

In [11]:
# order the imporant users (according to fundamental metrics) by core number
important_users_names = set()
for m in df_ranking_metrics.drop("outdegree", axis=1):
  for user_name in df_ranking_metrics[m]:
    important_users_names.add(user_name)
print("The users that appear one or more times in the rankings of the previous metrics are:")
print(", ".join(important_users_names))
# extract ids from important users
important_users_ids = list(df[df["name"].isin(important_users_names)]["id"])

important_users_cores = { user_id: nx_cores[user_id] for user_id in important_users_ids}
# reformat NetworkX solution
important_users_cores_format = {c:set() for c in set(important_users_cores.values())}
for node in important_users_cores:
  important_users_cores_format[important_users_cores[node]].add(node)
important_users_cores_format = dict(sorted(important_users_cores_format.items(), key=lambda x: x[0],reverse=True))

print("\n\nPrevious users ordered by core number:")
for core in important_users_cores_format:
  print(f"\nWith core number {core}:")
  print(", ".join([G.nodes[node_id]["name"] for node_id in important_users_cores_format[core]]))

The users that appear one or more times in the rankings of the previous metrics are:
DarkozTV, LOLITOFDEZ, IlloJuan, TheGrefg, DuendePablo, alexelcapo, LVPes, AriGameplays, juansguarnizo, elxokas, coscu, Rubius, auronplay, SLAKUN10, IamCristinini, ibai, Sibrel, ESL_csgo_es, TELLIER50


Previous users ordered by core number:

With core number 179:
SLAKUN10, TELLIER50, auronplay, ibai, coscu, Rubius, DuendePablo, TheGrefg

With core number 177:
IamCristinini

With core number 168:
DarkozTV, LOLITOFDEZ

With core number 161:
AriGameplays

With core number 154:
elxokas, juansguarnizo

With core number 138:
IlloJuan, LVPes, alexelcapo

With core number 135:
Sibrel

With core number 133:
ESL_csgo_es
