In [1]:
import networkx as nx
import community  
import json
from langdetect import detect, LangDetectException
import pandas as pd

import json
import networkx as nx
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import community
import random
import csv

import nltk
import string
import re
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
# https://medium.com/@monigrancharov/text-language-detection-with-python-beb49d9667b3

In [2]:
# jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10

In [3]:
# --- Load the scraped YouTube data and keep only English posts/comments ---
fJsonName_youtube = 'NEW_youtube_olympics_data_limited_250_per_hashtag.json'
# JSON is UTF-8 by specification and the video titles contain emoji, so do not
# rely on the platform default encoding when reading the file.
with open(fJsonName_youtube, 'r', encoding='utf-8') as f:
    ySubmissions = json.load(f)

# English-only posts, each carrying its English-only comments
olympics_post2 = []

# NOTE(review): langdetect results can vary between runs on short or ambiguous
# text unless its detector seed is fixed -- confirm whether that matters here.
for sub in ySubmissions:
    try:
        # detect() raises LangDetectException when the text has no usable features
        title_language = detect(sub['Video Title'])
        description_language = detect(sub['Description'])

        # Include the post only if both title and description are in English
        if title_language == 'en' and description_language == 'en':
            post_data = {
                'title': sub['Video Title'],
                'text': sub['Description'],
                'olympics': sub['Hashtag'],
                # NOTE(review): key is spelled 'Video Tuthor' -- presumably this
                # matches the scraper's output; verify against the JSON file.
                'author': sub['Video Tuthor'],
                'comments': []
            }

            # Keep only the comments that are themselves detected as English
            for com in sub['Comments']:
                comment_text = com['text']
                try:
                    comment_language = detect(comment_text)
                    if comment_language == 'en':
                        comment_data = {
                            'author': com['author'],
                            'text': comment_text,
                            'created': com['published_at']
                        }
                        post_data['comments'].append(comment_data)
                except LangDetectException:
                    # Undetectable comment: skip it but keep the post
                    continue

            # Append the post with its associated English comments to olympics_post2
            olympics_post2.append(post_data)
    except LangDetectException:
        # Undetectable title or description: skip the whole post
        continue


# Split the English posts by the hashtag they were collected under.
# The comparison is exact (case-sensitive); posts with any other hashtag value
# stay in olympics_post2 only.
paris2024 = []
tokyo2020 = []
rio2016 = []


for post in olympics_post2:
    if post['olympics'] == "#paris2024":
        paris2024.append(post)
    elif post['olympics'] == "#tokyo2020":
        tokyo2020.append(post)
    elif post['olympics'] == "#rio2016":
        rio2016.append(post)

In [4]:
def write_graphml(graph, graphfilename):
    """Write ``graph`` to ``graphfilename`` in GraphML format.

    @param graph: networkx graph to export.
    @param graphfilename: path of the GraphML file to create or overwrite.
    @return: the same ``graph`` object, so that callers which bind the result
             of this call receive the exported graph instead of ``None``.
    """
    nx.write_graphml(graph, graphfilename)
    print(f"Subgraph saved as {graphfilename}")
    return graph

In [5]:
def build_youtube_graph_video(subset, random_state=None):
    """Build an undirected commenter--video graph and tag nodes with Louvain communities.

    Every post with a non-empty ``'title'`` becomes a node (``type='video'``,
    ``hashtag`` = the post's ``'olympics'`` value as a string, or ``""``).
    Every comment with a non-empty author becomes a node (``type='author'``)
    linked to the video it was posted on.

    @param subset: iterable of post dicts with keys ``'title'``, ``'olympics'``
                   and ``'comments'`` (a list of dicts with an ``'author'`` key).
    @param random_state: optional seed forwarded to ``community.best_partition``
                   to make the community assignment reproducible; ``None``
                   keeps the library default (unseeded).
    @return: ``nx.Graph`` whose nodes carry a ``'community'`` attribute.
    """
    youtubeG = nx.Graph()

    for video in subset:
        title = video['title']
        if not title:
            continue

        # Titles are used as node ids, so videos sharing a title share one node
        youtubeG.add_node(title, type='video', hashtag=str(video['olympics']) if video['olympics'] else "")

        for comment in video['comments']:
            # A JSON null author arrives as None; treat it like a missing author
            author = (comment.get('author') or '').strip()
            if not author:
                continue

            # Do not overwrite the attributes of a node that already exists
            if not youtubeG.has_node(author):
                youtubeG.add_node(author, type='author')

            # Undirected edge between the commenter and the video commented on
            youtubeG.add_edge(author, title)

    # Louvain community detection; only pass the seed when one was requested
    louvain_kwargs = {} if random_state is None else {'random_state': random_state}
    partition = community.best_partition(youtubeG, **louvain_kwargs)

    # Store the community id on every node
    nx.set_node_attributes(youtubeG, partition, 'community')

    return youtubeG

In [6]:
def build_youtube_graph_author(subset, random_state=None):
    """Build an undirected commenter--uploader graph and tag nodes with Louvain communities.

    Every post with a non-empty ``'author'`` contributes a node for the video
    author (``type='author'``, ``hashtag`` = the post's ``'olympics'`` value as
    a string, or ``""``). Every comment with a non-empty author becomes a node
    (``type='author'``) linked to the author of the video it was posted on.

    @param subset: iterable of post dicts with keys ``'author'``, ``'olympics'``
                   and ``'comments'`` (a list of dicts with an ``'author'`` key).
    @param random_state: optional seed forwarded to ``community.best_partition``
                   to make the community assignment reproducible; ``None``
                   keeps the library default (unseeded).
    @return: ``nx.Graph`` whose nodes carry a ``'community'`` attribute.
    """
    youtubeG = nx.Graph()

    for video in subset:
        video_author = video['author']
        if not video_author:
            continue

        # Re-adding an existing node updates its attributes, so an uploader with
        # posts under several hashtags keeps the hashtag of the last post seen
        youtubeG.add_node(video_author, type='author', hashtag=str(video['olympics']) if video['olympics'] else "")

        for comment in video['comments']:
            # A JSON null author arrives as None; treat it like a missing author
            author = (comment.get('author') or '').strip()
            if not author:
                continue

            # Do not overwrite the attributes of a node that already exists
            if not youtubeG.has_node(author):
                youtubeG.add_node(author, type='author')

            # Undirected edge between the commenter and the video's author
            youtubeG.add_edge(author, video_author)

    # Louvain community detection; only pass the seed when one was requested
    louvain_kwargs = {} if random_state is None else {'random_state': random_state}
    partition = community.best_partition(youtubeG, **louvain_kwargs)

    # Store the community id on every node
    nx.set_node_attributes(youtubeG, partition, 'community')

    return youtubeG

In [7]:
def compute_centrality_measures(graph, degree_threshold=1, betweenness_k=None, seed=None):
    """Compute centrality measures on the largest weakly connected component.

    The input graph is never modified. An undirected input is converted with
    ``to_directed()``, which replaces every undirected edge by two opposite
    directed edges; the degree used for filtering is therefore twice the
    undirected degree, and in-degree centrality is half of degree centrality.

    @param graph: networkx graph (directed or undirected).
    @param degree_threshold: nodes whose (directed) degree is below this value
                   are removed before the largest component is extracted.
    @param betweenness_k: number of pivot nodes sampled for approximate
                   betweenness; falsy (``None``/0) skips betweenness and leaves
                   that column as NaN. Values above the number of remaining
                   nodes are clamped to it.
    @param seed: optional seed for the betweenness pivot sampling.
    @return: ``pd.DataFrame`` indexed by node with one column per measure, or an
             empty DataFrame when no node survives the filter or networkx
             reports an error (including eigenvector non-convergence).
    """
    # to_directed() already returns an independent copy; copy explicitly only
    # when the input is directed and would otherwise be mutated below.
    graph = graph.copy() if graph.is_directed() else graph.to_directed()

    # Filter out nodes with a degree below the threshold
    low_degree_nodes = [n for n, d in graph.degree() if d < degree_threshold]
    graph.remove_nodes_from(low_degree_nodes)

    # max() over zero components would raise ValueError
    if graph.number_of_nodes() == 0:
        print("Centrality Calculation Error: no nodes left after degree filtering")
        return pd.DataFrame()

    largest_component = max(nx.weakly_connected_components(graph), key=len)
    graph = graph.subgraph(largest_component).copy()

    try:
        in_degree_centrality = nx.in_degree_centrality(graph)
        degree_centrality = nx.degree_centrality(graph)

        if betweenness_k:
            # Sampling more pivots than there are nodes raises ValueError
            sample_size = min(betweenness_k, graph.number_of_nodes())
            betweenness_centrality = nx.betweenness_centrality(graph, k=sample_size, seed=seed)
        else:
            betweenness_centrality = {}

        closeness_centrality = nx.closeness_centrality(graph)
        eigenvector_centrality = nx.eigenvector_centrality(graph, max_iter=5000)

        # Combine the centrality measures into a DataFrame
        centrality_df = pd.DataFrame({
            'in_degree_centrality': in_degree_centrality,
            'degree_centrality': degree_centrality,
            'betweenness_centrality': betweenness_centrality,
            'closeness_centrality': closeness_centrality,
            'eigenvector_centrality': eigenvector_centrality
        })

    # NetworkXException is the common base of NetworkXError and of
    # PowerIterationFailedConvergence (raised when eigenvector centrality
    # does not converge within max_iter).
    except nx.NetworkXException as e:
        print(f"Centrality Calculation Error: {e}")
        centrality_df = pd.DataFrame()

    return centrality_df

In [8]:
# Commenter -> video-author graphs: one per Olympics edition, plus one over all English posts
author_youtubeG_olympic2024 = build_youtube_graph_author(subset=paris2024)
author_youtubeG_olympic2020 = build_youtube_graph_author(subset=tokyo2020)
author_youtubeG_olympic2016 = build_youtube_graph_author(subset=rio2016)
author_youtubeG_olympic = build_youtube_graph_author(subset=olympics_post2)

# Centrality tables for the three per-edition author graphs
author_centrality_2024 = compute_centrality_measures(
    graph=author_youtubeG_olympic2024,
    degree_threshold=5,
    betweenness_k=10,
)
author_centrality_2020 = compute_centrality_measures(
    graph=author_youtubeG_olympic2020,
    degree_threshold=5,
    betweenness_k=10,
)
author_centrality_2016 = compute_centrality_measures(
    graph=author_youtubeG_olympic2016,
    degree_threshold=5,
    betweenness_k=10,
)

In [9]:
# Centrality table for the author graph built from all English posts
author_centrality_all = compute_centrality_measures(
    graph=author_youtubeG_olympic,
    degree_threshold=5,
    betweenness_k=10,
)

In [10]:
# Commenter -> video graphs: one per Olympics edition, plus one over all English posts
video_youtubeG_olympic2024 = build_youtube_graph_video(subset=paris2024)
video_youtubeG_olympic2020 = build_youtube_graph_video(subset=tokyo2020)
video_youtubeG_olympic2016 = build_youtube_graph_video(subset=rio2016)
video_youtubeG_olympic = build_youtube_graph_video(subset=olympics_post2)

# Centrality tables for the three per-edition video graphs
video_centrality_2024 = compute_centrality_measures(
    graph=video_youtubeG_olympic2024,
    degree_threshold=5,
    betweenness_k=10,
)
video_centrality_2020 = compute_centrality_measures(
    graph=video_youtubeG_olympic2020,
    degree_threshold=5,
    betweenness_k=10,
)
video_centrality_2016 = compute_centrality_measures(
    graph=video_youtubeG_olympic2016,
    degree_threshold=5,
    betweenness_k=10,
)

In [11]:
# Centrality table for the video graph built from all English posts
video_centrality_all = compute_centrality_measures(
    graph=video_youtubeG_olympic,
    degree_threshold=5,
    betweenness_k=10,
)

In [12]:
def extract_max_betweenness_degree_subgraph(graph, centrality_df, output_filename):
    """Export the ego network of the node with the highest betweenness centrality.

    @param graph: networkx graph that contains the nodes indexed by ``centrality_df``.
    @param centrality_df: DataFrame indexed by node with a
                   ``'betweenness_centrality'`` column.
    @param output_filename: path of the GraphML file to write.
    @return: None. When ``centrality_df`` has no usable betweenness values
             (empty frame, missing column, or all NaN) a message is printed
             and nothing is written.
    """
    column = 'betweenness_centrality'
    # An empty frame or an all-NaN column has no maximum to pick
    if column not in centrality_df.columns or centrality_df[column].dropna().empty:
        print(f"No {column} values available; skipping {output_filename}")
        return

    max_betweenness_degree = centrality_df[column].idxmax()
    print(f"With highest betweenness-degree centrality: {max_betweenness_degree}")

    # Subgraph induced by the selected node and its immediate neighbours
    ego_nodes = [max_betweenness_degree] + list(graph.neighbors(max_betweenness_degree))
    subgraph = graph.subgraph(ego_nodes)

    # Save the subgraph as a GraphML file
    nx.write_graphml(subgraph, output_filename)
    print(f"Subgraph saved as {output_filename}")

In [13]:
def extract_max_in_degree_subgraph(graph, centrality_df, output_filename):
    """Export the ego network of the node with the highest in-degree centrality.

    @param graph: networkx graph that contains the nodes indexed by ``centrality_df``.
    @param centrality_df: DataFrame indexed by node with an
                   ``'in_degree_centrality'`` column.
    @param output_filename: path of the GraphML file to write.
    @return: None. When ``centrality_df`` has no usable in-degree values
             (empty frame, missing column, or all NaN) a message is printed
             and nothing is written.
    """
    column = 'in_degree_centrality'
    # An empty frame or an all-NaN column has no maximum to pick
    if column not in centrality_df.columns or centrality_df[column].dropna().empty:
        print(f"No {column} values available; skipping {output_filename}")
        return

    max_in_degree = centrality_df[column].idxmax()
    print(f"With highest in-degree centrality: {max_in_degree}")

    # Subgraph induced by the selected node and its immediate neighbours
    ego_nodes = [max_in_degree] + list(graph.neighbors(max_in_degree))
    subgraph = graph.subgraph(ego_nodes)

    nx.write_graphml(subgraph, output_filename)
    print(f"Subgraph saved as {output_filename}")

In [14]:
# Author graphs: export the ego network of the highest-betweenness node
extract_max_betweenness_degree_subgraph(
    graph=author_youtubeG_olympic2024,
    centrality_df=author_centrality_2024,
    output_filename="betweenness_author_2024_ytb.graphml",
)
extract_max_betweenness_degree_subgraph(
    graph=author_youtubeG_olympic2020,
    centrality_df=author_centrality_2020,
    output_filename="betweenness_author_2020_ytb.graphml",
)
extract_max_betweenness_degree_subgraph(
    graph=author_youtubeG_olympic2016,
    centrality_df=author_centrality_2016,
    output_filename="betweenness_author_2016_ytb.graphml",
)
extract_max_betweenness_degree_subgraph(
    graph=author_youtubeG_olympic,
    centrality_df=author_centrality_all,
    output_filename="betweenness_author_olympic_ytb.graphml",
)

With highest betweenness-degree centrality: Olympics
Subgraph saved as betweenness_author_2024_ytb.graphml
With highest betweenness-degree centrality: Olympics
Subgraph saved as betweenness_author_2020_ytb.graphml
With highest betweenness-degree centrality: Olympics
Subgraph saved as betweenness_author_2016_ytb.graphml
With highest betweenness-degree centrality: Olympics
Subgraph saved as betweenness_author_olympic_ytb.graphml


In [15]:
# Author graphs: export the ego network of the highest in-degree node
extract_max_in_degree_subgraph(
    graph=author_youtubeG_olympic2024,
    centrality_df=author_centrality_2024,
    output_filename="in_degree_author_2024_ytb.graphml",
)
extract_max_in_degree_subgraph(
    graph=author_youtubeG_olympic2020,
    centrality_df=author_centrality_2020,
    output_filename="in_degree_author_2020_ytb.graphml",
)
extract_max_in_degree_subgraph(
    graph=author_youtubeG_olympic2016,
    centrality_df=author_centrality_2016,
    output_filename="in_degree_author_2016_ytb.graphml",
)
extract_max_in_degree_subgraph(
    graph=author_youtubeG_olympic,
    centrality_df=author_centrality_all,
    output_filename="in_degree_author_olympic_ytb.graphml",
)

With highest in-degree centrality: Olympics
Subgraph saved as in_degree_author_2024_ytb.graphml
With highest in-degree centrality: Olympics
Subgraph saved as in_degree_author_2020_ytb.graphml
With highest in-degree centrality: Olympics
Subgraph saved as in_degree_author_2016_ytb.graphml
With highest in-degree centrality: Olympics
Subgraph saved as in_degree_author_olympic_ytb.graphml


In [16]:
# Video graphs: export the ego network of the highest-betweenness node
extract_max_betweenness_degree_subgraph(
    graph=video_youtubeG_olympic2024,
    centrality_df=video_centrality_2024,
    output_filename="betweenness_video_2024_ytb.graphml",
)
extract_max_betweenness_degree_subgraph(
    graph=video_youtubeG_olympic2020,
    centrality_df=video_centrality_2020,
    output_filename="betweenness_video_2020_ytb.graphml",
)
extract_max_betweenness_degree_subgraph(
    graph=video_youtubeG_olympic2016,
    centrality_df=video_centrality_2016,
    output_filename="betweenness_video_2016_ytb.graphml",
)
extract_max_betweenness_degree_subgraph(
    graph=video_youtubeG_olympic,
    centrality_df=video_centrality_all,
    output_filename="betweenness_video_olympic_ytb.graphml",
)

With highest betweenness-degree centrality: THANK YOU PARIS! Closing Ceremony Highlights | #Paris2024
Subgraph saved as betweenness_video_2024_ytb.graphml
With highest betweenness-degree centrality: @francisnguyen6349
Subgraph saved as betweenness_video_2020_ytb.graphml
With highest betweenness-degree centrality: @Olympics
Subgraph saved as betweenness_video_2016_ytb.graphml
With highest betweenness-degree centrality: Men&#39;s 100m final 🏃‍♂️ | Tokyo Replays
Subgraph saved as betweenness_video_olympic_ytb.graphml


In [17]:
# Video graphs: export the ego network of the highest in-degree node
extract_max_in_degree_subgraph(
    graph=video_youtubeG_olympic2024,
    centrality_df=video_centrality_2024,
    output_filename="in_degree_video_2024_ytb.graphml",
)
extract_max_in_degree_subgraph(
    graph=video_youtubeG_olympic2020,
    centrality_df=video_centrality_2020,
    output_filename="in_degree_video_2020_ytb.graphml",
)
extract_max_in_degree_subgraph(
    graph=video_youtubeG_olympic2016,
    centrality_df=video_centrality_2016,
    output_filename="in_degree_video_2016_ytb.graphml",
)
extract_max_in_degree_subgraph(
    graph=video_youtubeG_olympic,
    centrality_df=video_centrality_all,
    output_filename="in_degree_video_olympic_ytb.graphml",
)

With highest in-degree centrality: Full Opening Ceremony ✨| Full Replay | Paris Replays
Subgraph saved as in_degree_video_2024_ytb.graphml
With highest in-degree centrality: 🇮🇳🥇 Neeraj Chopra wins historic gold for India | #Tokyo2020 Highlights
Subgraph saved as in_degree_video_2020_ytb.graphml
With highest in-degree centrality: Men&#39;s 100m Final | Rio 2016 Replay
Subgraph saved as in_degree_video_2016_ytb.graphml
With highest in-degree centrality: 🇮🇳🥇 Neeraj Chopra wins historic gold for India | #Tokyo2020 Highlights
Subgraph saved as in_degree_video_olympic_ytb.graphml


In [18]:
# Export the all-posts video graph and author graph to GraphML
youtubeG_olympic_video = write_graphml(
    graph=video_youtubeG_olympic,
    graphfilename="youtubeG_olympic_video.graphml",
)
youtubeG_olympic_author = write_graphml(
    graph=author_youtubeG_olympic,
    graphfilename="youtubeG_olympic_author.graphml",
)

Subgraph saved as youtubeG_olympic_video.graphml
Subgraph saved as youtubeG_olympic_author.graphml


In [19]:
# Summary statistics of the centrality tables built from all English posts.
# The colon belongs to the heading line: placed after the newline it prefixed
# the table's header row and pushed the column names out of alignment.
print("Author centrality Olympics:\n" + author_centrality_all.describe().to_string())
print("Video centrality Olympics:\n" + video_centrality_all.describe().to_string())

Author centrality Olympics
:        in_degree_centrality  degree_centrality  betweenness_centrality  closeness_centrality  eigenvector_centrality
count            476.000000         476.000000              476.000000            476.000000              476.000000
mean               0.011835           0.023671                0.004013              0.357763                0.032035
std                0.032613           0.065226                0.034050              0.055285                0.032816
min                0.002105           0.004211                0.000000              0.203513                0.000126
25%                0.006316           0.012632                0.000283              0.303224                0.011712
50%                0.006316           0.012632                0.000531              0.388389                0.038242
75%                0.008421           0.016842                0.001066              0.397989                0.041379
max                0.614737        

In [48]:
def build_youtube_graph_video_com(subset):
    """Build the commenter--video graph with Louvain community labels.

    This function used to be a line-for-line copy of
    ``build_youtube_graph_video``; it now delegates to it so the two cannot
    drift apart. The name is kept for the cells that call it.

    @param subset: iterable of post dicts (``'title'``, ``'olympics'``, ``'comments'``).
    @return: ``nx.Graph`` whose nodes carry a ``'community'`` attribute.
    """
    return build_youtube_graph_video(subset)

In [49]:
# Export every commenter--video edge that touches one hard-coded user handle
author_name = '@francisnguyen6349'
youtubeG_select = build_youtube_graph_video_com(olympics_post2)
partition = community.best_partition(youtubeG_select)

# Community id of the selected user; None when the handle is not a node of the graph
user_community_id = partition.get(author_name)

if user_community_id is None:
    print(f"The user '{author_name}' does not exist in the graph or has no assigned community.")
else:
    # Edges with the selected user at either endpoint
    user_edges = [edge for edge in youtubeG_select.edges() if author_name in edge]

    # Subgraph made of exactly those edges and their endpoints
    subgraph = youtubeG_select.edge_subgraph(user_edges)

    output_file = f'{author_name}_interaction_video_youtube.graphml'
    nx.write_graphml(subgraph, output_file)
    print(f"Interaction graph for {author_name} exported to {output_file}")


Interaction graph for @francisnguyen6349 exported to @francisnguyen6349_interaction_video_youtube.graphml


### Independent Cascade for the Most Active User on YouTube

In [50]:
def build_youtube_graph_author_com(subset):
    """Build the commenter--video-author graph with Louvain community labels.

    This function used to be a line-for-line copy of
    ``build_youtube_graph_author``; it now delegates to it so the two cannot
    drift apart. The name is kept for the cells that call it.

    @param subset: iterable of post dicts (``'author'``, ``'olympics'``, ``'comments'``).
    @return: ``nx.Graph`` whose nodes carry a ``'community'`` attribute.
    """
    return build_youtube_graph_author(subset)

In [51]:
# Export every commenter--uploader edge that touches one hard-coded user handle
author_name = '@francisnguyen6349'
youtubeG_select = build_youtube_graph_author_com(olympics_post2)
partition = community.best_partition(youtubeG_select)

# Community id of the selected user; None when the handle is not a node of the graph
user_community_id = partition.get(author_name)

if user_community_id is None:
    print(f"The user '{author_name}' does not exist in the graph or has no assigned community.")
else:
    # Keep only the edges with the selected user at either endpoint
    user_edges = [edge for edge in youtubeG_select.edges() if author_name in edge]

    # Subgraph made of exactly those edges and their endpoints
    subgraph = youtubeG_select.edge_subgraph(user_edges)

    output_file = f'{author_name}_interaction_author_youtube.graphml'
    nx.write_graphml(subgraph, output_file)
    print(f"Interaction graph for {author_name} exported to {output_file}")

Interaction graph for @francisnguyen6349 exported to @francisnguyen6349_interaction_author_youtube.graphml


In [52]:
def independentCascade(graph, trialNum, lSeed, activationProb):
    """
    Runs the independent cascade model over the input graph.

    In every round each node activated in the previous round gets one chance,
    with probability ``activationProb``, to activate each of its neighbours
    that is not active yet.  A trial ends when a round activates nobody.

    @param graph: Input graph to perform cascade over.
    @param trialNum: The number of runs/trials to run.  The per-node results are
                    averaged over the trials/runs.  Zero or a negative value runs
                    no trial.
    @param lSeed: List of initial nodes to seed.  Entries are node ids of ``graph``.
    @param activationProb: Activation probability on each edge.  All edges have the same
                    activation probability.

    @return: Two lists, lAvgActivationsPerNode and lAvgActivationsPerIteration.
            lAvgActivationsPerNode has one entry per node, in ``graph.nodes()`` order;
            each entry is the fraction of trials in which that node ended up active
            and lies in [0,1] (all zeros when no trial was run).
            lAvgActivationsPerIteration has one entry per trial/run: the total number
            of active nodes at the end of that trial/run.
    @raise ValueError: if a seed is not a node of ``graph``.
    """

    # Map node ids to positions in the per-node output list
    node_to_index = {node: idx for idx, node in enumerate(graph.nodes())}

    # Fail early with a clear message instead of deep inside the cascade loop
    unknownSeeds = [seed for seed in lSeed if seed not in node_to_index]
    if unknownSeeds:
        raise ValueError('Seed nodes not in graph: {}'.format(unknownSeeds))

    activationCounts = [0] * len(node_to_index)
    lAvgActivationsPerIteration = []

    print('starting cascade run')
    # loop through the runs/trials
    for i in range(trialNum):
        print('Trial/run no. {}'.format(i))

        setActive = set(lSeed)
        setLastActive = set(lSeed)
        # we keep looping until no more new activations
        while setLastActive:
            setNewActive = set()
            # each node activated last round tries to influence its inactive neighbours
            for currNode in setLastActive:
                for neighbour in graph.neighbors(currNode):
                    if neighbour not in setActive and neighbour not in setNewActive:
                        if random.random() < activationProb:
                            setNewActive.add(neighbour)

            setActive.update(setNewActive)
            setLastActive = setNewActive

        # record which nodes ended this trial active
        for x in setActive:
            activationCounts[node_to_index[x]] += 1

        # record the total number of activations of this trial
        lAvgActivationsPerIteration.append(len(setActive))

    # no trial was run: there is nothing to average (and no division by zero)
    if trialNum <= 0:
        return [0.0] * len(activationCounts), lAvgActivationsPerIteration

    # average each per-node count over the number of runs/trials
    return [count / trialNum for count in activationCounts], lAvgActivationsPerIteration


In [53]:
# Suffix shared by the output files of both cascade runs
author_name = '@francisnguyen6349'
sFilenameSuffix = f'{author_name}_independent_cascade.graphml'

# NOTE: despite the variable names, both graphs are read from the same
# interaction GraphML file, so the two runs below differ only in their
# random draws.
treeGraph = nx.read_graphml('@francisnguyen6349_interaction_author_youtube.graphml')

smallWorldGraph = nx.read_graphml('@francisnguyen6349_interaction_author_youtube.graphml')

#
# Independent cascade
#
lSeed = list(treeGraph.nodes())[:2]  # Get first two nodes as seeds
trialNum = 10
activationProb = 0.5

# Fix the RNG so the stochastic cascade prints the same numbers on every
# Restart & Run All.
random.seed(42)

#
# independent cascade on the first graph

if treeGraph is not None:
    lAvgActivationsPerNode, lAvgActivationsPerIteration = independentCascade(treeGraph, trialNum, lSeed, activationProb)
    print(lAvgActivationsPerNode)
    print(lAvgActivationsPerIteration)
    print('Average number of nodes activated = {} out of {}'.format(sum(lAvgActivationsPerIteration) / len(lAvgActivationsPerIteration), nx.number_of_nodes(treeGraph)))


    # Store the average activation per node in node attribute 'avgAct'.
    # zip over nodes() because the node ids are strings, not list indices.
    for node, avgActivation in zip(treeGraph.nodes(), lAvgActivationsPerNode):
        treeGraph.nodes[node]['avgAct'] = avgActivation


    # Output modified graph to its file
    nx.readwrite.write_graphml(treeGraph, 'tree_' + sFilenameSuffix, infer_numeric_types=True)

#
# independent cascade on the second graph
#

if smallWorldGraph is not None:
    lAvgActivationsPerNode, lAvgActivationsPerIteration = independentCascade(smallWorldGraph, trialNum, lSeed, activationProb)
    print(lAvgActivationsPerNode)
    print(lAvgActivationsPerIteration)
    print('Average number of nodes activated = {} out of {}'.format(sum(lAvgActivationsPerIteration) / len(lAvgActivationsPerIteration), nx.number_of_nodes(smallWorldGraph)))


    # Store the average activation per node in node attribute 'avgAct'.
    # zip over nodes() because the node ids are strings, not list indices.
    for node, avgActivation in zip(smallWorldGraph.nodes(), lAvgActivationsPerNode):
        smallWorldGraph.nodes[node]['avgAct'] = avgActivation

    # Output modified graph to its file
    nx.readwrite.write_graphml(smallWorldGraph, 'smallWorld_' + sFilenameSuffix, infer_numeric_types=True)


starting cascade run
Trial/run no. 0
Trial/run no. 1
Trial/run no. 2
Trial/run no. 3
Trial/run no. 4
Trial/run no. 5
Trial/run no. 6
Trial/run no. 7
Trial/run no. 8
Trial/run no. 9
[1.0, 1.0, 0.3, 0.6, 0.8, 0.4, 0.4, 0.2, 0.7, 0.6, 0.1, 0.3, 0.5, 0.5, 0.3, 0.3]
[7, 2, 9, 11, 10, 9, 11, 2, 10, 9]
Average number of nodes activated = 8.0 out of 16
starting cascade run
Trial/run no. 0
Trial/run no. 1
Trial/run no. 2
Trial/run no. 3
Trial/run no. 4
Trial/run no. 5
Trial/run no. 6
Trial/run no. 7
Trial/run no. 8
Trial/run no. 9
[1.0, 1.0, 0.5, 0.4, 0.7, 0.6, 0.3, 0.6, 0.4, 0.4, 0.5, 0.3, 0.5, 0.2, 0.5, 0.2]
[13, 11, 2, 12, 2, 10, 10, 2, 11, 8]
Average number of nodes activated = 8.1 out of 16


### Linear Threshold Influence Approach

In [54]:
def linearThreshold(graph, trialNum, lSeed):
    """
    Runs the linear threshold model over the input directed graph.

    Every trial draws a fresh random threshold for each node (stored in node
    attribute 'threshold').  An inactive node becomes active when the summed
    'weight' of its in-edges coming from ACTIVE nodes exceeds its threshold.
    A trial ends when a round activates nobody.

    @param graph: Input directed graph to perform the LT model over.  Edges are
                    expected to carry a 'weight' attribute.
    @param trialNum: The number of runs/trials to run.  The per-node results are
                    averaged over the trials/runs.  Zero or a negative value runs
                    no trial.
    @param lSeed: List of initial nodes to seed.  Entries are node ids of ``graph``.

    @return: A dict and a list, lAvgActivationsPerNode and lAvgActivationsPerIteration.
            lAvgActivationsPerNode maps every node id to the fraction of trials in
            which that node ended up active, in [0,1] (all zeros when no trial was run).
            lAvgActivationsPerIteration has one entry per trial/run: the total number
            of active nodes at the end of that trial/run.
    @raise ValueError: if a seed is not a node of ``graph``.
    """

    activationCounts = {node: 0 for node in graph.nodes()}
    lAvgActivationsPerIteration = []

    # Fail early with a clear message instead of deep inside the trial loop
    unknownSeeds = [seed for seed in lSeed if seed not in activationCounts]
    if unknownSeeds:
        raise ValueError('Seed nodes not in graph: {}'.format(unknownSeeds))

    print('starting linear threshold runs')
    # loop through the runs/trials
    for i in range(trialNum):
        print('Trial/run no. {}'.format(i))

        # for each node, generate the random thresholds
        for currNode, attr in graph.nodes(data=True):
            attr['threshold'] = random.random()

        setActive = set(lSeed)
        setLastActive = set(lSeed)
        # we keep looping until no more new activations
        while setLastActive:
            # inactive successors of the nodes activated in the previous round;
            # only their active in-weight can have changed
            neighbourSet = set()
            for activeNode in setLastActive:
                neighbourSet.update(n for n in graph.successors(activeNode) if n not in setActive)

            setNewActive = set()
            for neighbour in neighbourSet:
                # Only in-edges from already-active nodes exert influence.
                # Summing every in-edge would make activation independent of
                # which neighbours are actually active.
                weightTotal = 0.0
                missingEdge = None
                for u, v, dataDict in graph.in_edges(neighbour, data=True):
                    if u not in setActive:
                        continue
                    if 'weight' not in dataDict:
                        missingEdge = (u, v)
                        break
                    weightTotal += dataDict['weight']

                if missingEdge is not None:
                    # Report the offending edge and leave the node inactive
                    print("Key error: 'weight' is missing for edge {}".format(missingEdge))
                    continue

                # test against the node threshold
                if graph.nodes[neighbour]['threshold'] < weightTotal:
                    setNewActive.add(neighbour)

            setActive.update(setNewActive)
            setLastActive = setNewActive

        # record which nodes ended this trial active
        for x in setActive:
            activationCounts[x] += 1

        # record the total number of activations of this trial
        lAvgActivationsPerIteration.append(len(setActive))

    # no trial was run: there is nothing to average (and no division by zero)
    if trialNum <= 0:
        return {node: 0.0 for node in activationCounts}, lAvgActivationsPerIteration

    # average each per-node count over the number of runs/trials
    return {node: count / trialNum for node, count in activationCounts.items()}, lAvgActivationsPerIteration


In [31]:
def generateWeights(graph):
    """
    Generate random weights for the edges of a directed graph.

    For every node with at least one incoming edge, a total incoming weight is
    drawn uniformly from [0, 1) and split across that node's incoming edges
    with a symmetric Dirichlet draw, so the incoming weights of a node sum to
    that total.  Nodes without incoming edges are left untouched.

    @param graph: directed graph to generate weights on the edges (modified in place).
    @return: the same graph object, with weights on edges under attribute 'weight'
    """

    for currNode in graph.nodes():
        # snapshot the predecessors so the edge updates below cannot disturb the iteration
        lPredecessors = list(graph.predecessors(currNode))
        if not lPredecessors:
            # no incoming edge to weight; also avoids a zero-dimensional Dirichlet draw
            continue

        # generate the number that the weights should sum up to
        totalWeight = random.random()
        # use dirichlet distribution to generate the weights (one per incoming edge)
        aWeights = np.random.dirichlet(np.ones(len(lPredecessors))) * totalWeight
        lWeights = aWeights.tolist()

        for u, weight in zip(lPredecessors, lWeights):
            graph.add_edge(u, currNode, weight=weight)

    return graph

In [55]:
# Linear threshold (LT) diffusion runs on the interaction network of one author.
# Relies on generateWeights() and linearThreshold() defined in earlier cells.

# fileName
author_name = '@francisnguyen6349'
sFilenameSuffix = f'{author_name}_linear_threshold.graphml'

# NOTE(review): both graphs below are read from the same GraphML file, so the
# "tree" / "small world" names are labels only; the two runs differ by their
# randomly generated edge weights and node thresholds.

# tree graph
undirectedTreeGraph = nx.read_graphml('@francisnguyen6349_interaction_author_youtube.graphml')
treeGraph = undirectedTreeGraph.to_directed()
generateWeights(treeGraph)


# small world graph
undirectedSmallWorldGraph = nx.read_graphml('@francisnguyen6349_interaction_author_youtube.graphml')
smallWorldGraph = undirectedSmallWorldGraph.to_directed()
generateWeights(smallWorldGraph)


#
# Linear threshold
#
# NOTE(review): seedNum is not used below and disagrees with the two-node seed
# slice; kept as-is in case other cells read it -- confirm the intended seed count.
seedNum = 3
lSeed = list(treeGraph.nodes())[:2]  # Get first two nodes as seeds
trialNum = 10


#
# Run the linear threshold model on each graph in turn (tree graph first,
# then small world graph); the only per-graph difference is the output prefix.
#
for ltGraph, sOutputPrefix in ((treeGraph, 'treeLT_'), (smallWorldGraph, 'smallWorldLT_')):
    if ltGraph is None:
        continue

    lAvgActivationsPerNode, lAvgActivationsPerIteration = linearThreshold(ltGraph, trialNum, lSeed)
    print(lAvgActivationsPerNode)
    print(lAvgActivationsPerIteration)

    # fall back to 0 when no run was recorded, to avoid dividing by zero
    if len(lAvgActivationsPerIteration) > 0:
        avgActivated = sum(lAvgActivationsPerIteration) / len(lAvgActivationsPerIteration)
    else:
        avgActivated = 0
    print('Average number of nodes activated = {} out of {}'.format(avgActivated, nx.number_of_nodes(ltGraph)))

    # average activation per node for the current graph,
    # stored in node attribute 'avgAct'
    for nodeId, avgActivation in lAvgActivationsPerNode.items():
        ltGraph.nodes[nodeId]['avgAct'] = avgActivation

    # Output the modified graph to its own file
    nx.readwrite.write_graphml(ltGraph, sOutputPrefix + sFilenameSuffix, infer_numeric_types=True)


starting linear threshold runs
Trial/run no. 0
Trial/run no. 1
Trial/run no. 2
Trial/run no. 3
Trial/run no. 4
Trial/run no. 5
Trial/run no. 6
Trial/run no. 7
Trial/run no. 8
Trial/run no. 9
{'Benedict': 1.0, 'StartPlayX': 1.0, 'Lady Gaga': 0.1, 'NeoTechnoman': 0.1, '@francisnguyen6349': 0.4, 'Olympics Aquatics': 0.0, 'KokiriGaming': 0.4, 'ProsafiaGaming': 0.3, 'Paralympic Games': 0.4, 'Olympics': 0.1, 'JinnaGaming': 0.0, 'NBC Sports': 0.2, 'Games Variety': 0.0, 'Nintendo Utopia': 0.4, 'Olympics Gymnastics': 0.1, 'Zacnary': 0.3}
[9, 2, 2, 2, 9, 2, 9, 2, 2, 9]
Average number of nodes activated = 4.8 out of 16
starting linear threshold runs
Trial/run no. 0
Trial/run no. 1
Trial/run no. 2
Trial/run no. 3
Trial/run no. 4
Trial/run no. 5
Trial/run no. 6
Trial/run no. 7
Trial/run no. 8
Trial/run no. 9
{'Benedict': 1.0, 'StartPlayX': 1.0, 'Lady Gaga': 0.5, 'NeoTechnoman': 0.4, '@francisnguyen6349': 0.8, 'Olympics Aquatics': 0.5, 'KokiriGaming': 0.5, 'ProsafiaGaming': 0.4, 'Paralympic Games': 

### Mean of Centrality Measures Across Olympic Events

In [33]:
# author: column-wise mean of each per-year centrality table
(author_centrality_2016_mean,
 author_centrality_2020_mean,
 author_centrality_2024_mean) = (
    yearly.mean()
    for yearly in (author_centrality_2016, author_centrality_2020, author_centrality_2024)
)
# one column per Olympic year, one row per centrality measure
all_author_centrality_means = pd.concat(
    [author_centrality_2016_mean, author_centrality_2020_mean, author_centrality_2024_mean],
    axis=1,
    keys=['Olympics 2016', 'Olympics 2020', 'Olympics 2024'],
)


# video: same summary for the video centrality tables
(video_centrality_2016_mean,
 video_centrality_2020_mean,
 video_centrality_2024_mean) = (
    yearly.mean()
    for yearly in (video_centrality_2016, video_centrality_2020, video_centrality_2024)
)
all_video_centrality_means = pd.concat(
    [video_centrality_2016_mean, video_centrality_2020_mean, video_centrality_2024_mean],
    axis=1,
    keys=['Olympics 2016', 'Olympics 2020', 'Olympics 2024'],
)

print('For author centrality:')
print(f'{all_author_centrality_means} \n')
print('For video centrality:')
print(f'{all_video_centrality_means}')


For author centrality:
                        Olympics 2016  Olympics 2020  Olympics 2024
in_degree_centrality         0.318182       0.040969       0.028292
degree_centrality            0.636364       0.081939       0.056584
betweenness_centrality       0.092727       0.016773       0.010571
closeness_centrality         0.529180       0.342183       0.361191
eigenvector_centrality       0.276653       0.064549       0.056079 

For video centrality:
                        Olympics 2016  Olympics 2020  Olympics 2024
in_degree_centrality         0.040464       0.006229       0.010676
degree_centrality            0.080929       0.012458       0.021353
betweenness_centrality       0.014948       0.002639       0.005123
closeness_centrality         0.323899       0.283770       0.275410
eigenvector_centrality       0.067797       0.017461       0.028301


In [34]:
# Show the per-author centrality measures for the 2016 table
author_centrality_2016

Unnamed: 0,in_degree_centrality,degree_centrality,betweenness_centrality,closeness_centrality,eigenvector_centrality
Olympics,0.636364,1.272727,0.380407,0.733333,0.484901
@tekkenfan01,0.272727,0.545455,0.066121,0.52381,0.273509
@93hothead,0.272727,0.545455,0.029082,0.52381,0.291544
@1990Thunderbolt,0.272727,0.545455,0.029082,0.52381,0.291544
The NZ Team,0.181818,0.363636,0.015195,0.407407,0.125543
@stt5v2002,0.272727,0.545455,0.08619,0.52381,0.230756
@redzoom7857,0.272727,0.545455,0.108,0.52381,0.252133
@digitalhouse6969,0.272727,0.545455,0.066121,0.52381,0.273509
Jomboy Media,0.363636,0.727273,0.105455,0.52381,0.277136
@longlee1100,0.272727,0.545455,0.062675,0.52381,0.252133


In [35]:
# Show the per-author centrality measures for the 2020 table
author_centrality_2020

Unnamed: 0,in_degree_centrality,degree_centrality,betweenness_centrality,closeness_centrality,eigenvector_centrality
Olympics,0.472441,0.944882,0.369497,0.533613,0.529522
@kankeihatsu291,0.023622,0.047244,0.002686,0.362857,0.070549
@cirnosnumberfan6449,0.023622,0.047244,0.010733,0.389571,0.060372
@manojkumar-xl9ez,0.023622,0.047244,0.000166,0.352778,0.098099
@rhapsody8883,0.023622,0.047244,0.000166,0.352778,0.098099
...,...,...,...,...,...
ZerkaaPlays,0.023622,0.047244,0.001090,0.317500,0.016058
Namewee,0.070866,0.141732,0.009700,0.317500,0.033897
@aaliyahrosado7365,0.023622,0.047244,0.004194,0.279736,0.009153
Cody Miller,0.015748,0.031496,0.000000,0.262397,0.016116


In [36]:
# Show the per-author centrality measures for the 2024 table
author_centrality_2024

Unnamed: 0,in_degree_centrality,degree_centrality,betweenness_centrality,closeness_centrality,eigenvector_centrality
Olympics,0.519337,1.038674,0.538946,0.560372,0.527974
@francisnguyen6349,0.016575,0.033149,0.001620,0.410431,0.074895
@tasmanndrive,0.016575,0.033149,0.022497,0.368635,0.046639
@awaleahmed8698,0.016575,0.033149,0.028291,0.385928,0.058350
BBC,0.044199,0.088398,0.001936,0.327306,0.026479
...,...,...,...,...,...
dridri,0.016575,0.033149,0.000104,0.301165,0.010825
Vox,0.033149,0.066298,0.000937,0.324955,0.023642
TFC Stadiums,0.033149,0.066298,0.000462,0.318102,0.032692
Firstpost,0.154696,0.309392,0.130162,0.370143,0.132324


In [39]:
# Rank the authors in author_centrality_all from highest to lowest degree centrality
author_centrality_all.sort_values('degree_centrality', ascending=False)

Unnamed: 0,in_degree_centrality,degree_centrality,betweenness_centrality,closeness_centrality,eigenvector_centrality
Olympics,0.614737,1.229474,0.717285,0.610540,0.621084
NBC Sports,0.183158,0.366316,0.081397,0.388072,0.149667
Olympics Aquatics,0.128421,0.256842,0.034025,0.364823,0.120252
Yahoo Australia,0.120000,0.240000,0.014776,0.350812,0.095717
BBC,0.105263,0.210526,0.018527,0.359304,0.081436
...,...,...,...,...,...
GMA Sports PH,0.002105,0.004211,0.000000,0.222378,0.000522
Tehrian Nona,0.002105,0.004211,0.000000,0.276163,0.001735
NeoTechnoman,0.002105,0.004211,0.000000,0.307245,0.003182
Victory Vibez,0.002105,0.004211,0.000000,0.287879,0.002114


In [40]:
# Rank the authors in author_centrality_all from highest to lowest in-degree centrality
author_centrality_all.sort_values('in_degree_centrality', ascending=False)

Unnamed: 0,in_degree_centrality,degree_centrality,betweenness_centrality,closeness_centrality,eigenvector_centrality
Olympics,0.614737,1.229474,0.717285,0.610540,0.621084
NBC Sports,0.183158,0.366316,0.081397,0.388072,0.149667
Olympics Aquatics,0.128421,0.256842,0.034025,0.364823,0.120252
Yahoo Australia,0.120000,0.240000,0.014776,0.350812,0.095717
BBC,0.105263,0.210526,0.018527,0.359304,0.081436
...,...,...,...,...,...
GMA Sports PH,0.002105,0.004211,0.000000,0.222378,0.000522
Tehrian Nona,0.002105,0.004211,0.000000,0.276163,0.001735
NeoTechnoman,0.002105,0.004211,0.000000,0.307245,0.003182
Victory Vibez,0.002105,0.004211,0.000000,0.287879,0.002114


In [41]:
# Rank the authors in author_centrality_all from highest to lowest betweenness centrality
author_centrality_all.sort_values('betweenness_centrality', ascending=False)

Unnamed: 0,in_degree_centrality,degree_centrality,betweenness_centrality,closeness_centrality,eigenvector_centrality
Olympics,0.614737,1.229474,0.717285,0.610540,0.621084
Zephiel810,0.033684,0.067368,0.111640,0.317089,0.011598
NBC Sports,0.183158,0.366316,0.081397,0.388072,0.149667
Lady Gaga,0.092632,0.185263,0.076591,0.356607,0.070036
@francisnguyen6349,0.031579,0.063158,0.068557,0.443097,0.060837
...,...,...,...,...,...
Sports Today,0.004211,0.008421,0.000000,0.292128,0.004042
GMA Sports PH,0.002105,0.004211,0.000000,0.222378,0.000522
7thave,0.006316,0.012632,0.000000,0.286490,0.005730
NeoTechnoman,0.002105,0.004211,0.000000,0.307245,0.003182
