In [1]:
import json
import networkx as nx
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import community
import random
import csv

import nltk
import string
import re
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
# load json file
fJsonName = 'olympics_posts_reddit.json'

# Load the JSON data
with open(fJsonName, 'r') as f:
    reddit_data = json.load(f)
    
reddit_post = []
# Create a combined data 
for sub in reddit_data[1:]:
    submissions = {
        'submissionID': sub['id'],
        'keyword': sub['keyword'],
        'title': sub['title'],
        'selftext': sub['selftext'],
        'author': sub['author_id'],
        'comments': []
    }
    for comment in sub['comments']:
        comments = {
            'commentID': comment['id'],
            'commentBody': comment['body'],
            'created': pd.to_datetime(comment['created'], unit='s'),
            'author': comment['author_id']
        }
        submissions['comments'].append(comments)
    reddit_post.append(submissions)


In [3]:
# Lists to store the subsets
Olympics2024 = []
Olympics2020 = []
Olympics2016 = []


for keywords in reddit_post:
    
    if keywords['keyword'] == "Olympics 2024":
        Olympics2024.append(keywords)
    elif keywords['keyword'] == "Olympics 2020":
        Olympics2020.append(keywords)
    elif keywords['keyword'] == "Olympics 2016":
        Olympics2016.append(keywords)

# Combine all subsets into a final list
combined_subset = Olympics2024 + Olympics2020 + Olympics2016

In [4]:
def build_reddit_graph_author(subset):
    # Create an empty graph for the Reddit posts and comments
    redditG = nx.Graph()

    # Iterate through each post in the dataset
    for sub in subset:
        # Ensure the post has an author before adding it as a node
        if sub['author']:
            redditG.add_node(sub['author'].strip(), type='author' , keyword = str(sub['keyword']) if sub['keyword'] else "")

            # Iterate through the comments of the post and add comment authors as nodes
            for comment in sub['comments']:
                author = comment.get('author', '').strip()

                # Ensure the comment author is not empty before adding it as a node
                if author:
                    if not redditG.has_node(author):
                        redditG.add_node(author, type='author')

                    # Create an edge between the post author and the comment author
                    redditG.add_edge(author, sub['author'].strip())

    # Perform community detection using the Louvain method
    partition = community.best_partition(redditG)

    # Assign the community information as an attribute to each node
    nx.set_node_attributes(redditG, partition, 'community')

    # Return the graph with community detection information
    return redditG

In [5]:
def build_reddit_graph_post(subset):
    # Create an empty graph for the Reddit posts and comments
    redditG = nx.Graph()

    # Iterate through each post in the dataset
    for sub in subset:
        # Ensure the post has an title before adding it as a node
        if sub['title']:
            redditG.add_node(sub['title'], type='text', keyword = str(sub['keyword']) if sub['keyword'] else "" )
        
            # Iterate through the comments of the post and add comment authors as nodes
            for comment in sub['comments']:
                author = comment.get('author', '').strip()

                # Ensure the comment author is not empty before adding it as a node
                if author:
                    if not redditG.has_node(author):
                        redditG.add_node(author, type='author')

                    # Create an edge between the post author and the comment author
                    redditG.add_edge(author, sub['title'].strip())

    # Perform community detection using the Louvain method
    partition = community.best_partition(redditG)

    # Assign the community information as an attribute to each node
    nx.set_node_attributes(redditG, partition, 'community')

    # Return the graph with community detection information
    return redditG


In [6]:
def write_graphml(graph, graphfilename):
    nx.write_graphml(graph, graphfilename)
    print(f"Subgraph saved as {graphfilename}")

In [7]:
def compute_centrality_measures(graph, degree_threshold=1, betweenness_k=None):
    # Convert the graph to directed if it's not already directed
    if not graph.is_directed():
        graph = graph.to_directed()
    # Filter out nodes with a degree below the threshold
    lowdegree_nodes = [n for n, d in graph.degree() if d < degree_threshold]
    graph.remove_nodes_from(lowdegree_nodes)
    
    # largest connected component
    components = nx.weakly_connected_components(graph)
    largest_component = max(components, key=len)
    graph = graph.subgraph(largest_component)
    
    try:
        # Calculate various centrality measures
        in_degree_centrality = nx.in_degree_centrality(graph)
        degree_centrality = nx.degree_centrality(graph)
        betweenness_centrality = nx.betweenness_centrality(graph, k=betweenness_k) if betweenness_k else {}
        closeness_centrality = nx.closeness_centrality(graph)  # Works on undirected graphs
        eigenvector_centrality = nx.eigenvector_centrality(graph, max_iter=1000)
        
        # Combine the centrality measures into a DataFrame
        centrality_df = pd.DataFrame({
            'in_degree_centrality': in_degree_centrality,
            'degree_centrality':degree_centrality,
            'betweenness_centrality': betweenness_centrality,
            'closeness_centrality': closeness_centrality,
            'eigenvector_centrality': eigenvector_centrality
        })

    
    except nx.NetworkXError as e:
        print(f"Centrality Calculation Error: {e}")
        centrality_df = pd.DataFrame()
        
    return centrality_df

In [8]:
# Example usage for different graphs
author_redditG_olympic2024 = build_reddit_graph_author(Olympics2024)
author_redditG_olympic2020 = build_reddit_graph_author(Olympics2020)
author_redditG_olympic2016 = build_reddit_graph_author(Olympics2016)
author_redditG_olympic = build_reddit_graph_author(combined_subset)

author_centrality_2024 = compute_centrality_measures(author_redditG_olympic2024, degree_threshold=3,betweenness_k=10)
author_centrality_2020 = compute_centrality_measures(author_redditG_olympic2020, degree_threshold=3,betweenness_k=10)
author_centrality_2016 = compute_centrality_measures(author_redditG_olympic2016, degree_threshold=3,betweenness_k=10)
author_centrality_all = compute_centrality_measures(author_redditG_olympic, degree_threshold=3,betweenness_k=10)

In [9]:
# Example usage for different graphs
post_redditG_olympic2024 = build_reddit_graph_post(Olympics2024)
post_redditG_olympic2020 = build_reddit_graph_post(Olympics2020)
post_redditG_olympic2016 = build_reddit_graph_post(Olympics2016)
post_redditG_olympic = build_reddit_graph_post(combined_subset)

post_centrality_2024 = compute_centrality_measures(post_redditG_olympic2024, degree_threshold=3,betweenness_k=10)
post_centrality_2020 = compute_centrality_measures(post_redditG_olympic2020, degree_threshold=3,betweenness_k=10)
post_centrality_2016 = compute_centrality_measures(post_redditG_olympic2016, degree_threshold=3,betweenness_k=10)
post_centrality_all = compute_centrality_measures(post_redditG_olympic, degree_threshold=3,betweenness_k=10)

In [10]:
#write the graphml for post and author
redditG_olympic_post= write_graphml(post_redditG_olympic,"redditG_olympic_post.graphml")
redditG_olympic_author = write_graphml(author_redditG_olympic,"redditG_olympic_author.graphml")

Subgraph saved as redditG_olympic_post.graphml
Subgraph saved as redditG_olympic_author.graphml


In [11]:
# # drop all N/A due to the users probably has delete their Redddit account
# def user_drop(centrality_set):
#     centrality_set = centrality_set.drop('N/A')

In [12]:
def extract_max_betweenness_degree_subgraph(graph, centrality_df, output_filename):
    # Find the author with the maximum betweenness-degree centrality
    # drop all N/A due to the users probably has delete their Redddit account
    centrality_df = centrality_df.drop('N/A')
    max_betweenness_degree_author = centrality_df['betweenness_centrality'].idxmax()
    print(f"Author with highest betweenness-degree centrality: {max_betweenness_degree_author}")
    
    # Get the subgraph of this author and their neighbors (immediate connections)
    subgraph = graph.subgraph([max_betweenness_degree_author] + list(graph.neighbors(max_betweenness_degree_author)))
    
    # Save the subgraph as a GraphML file
    nx.write_graphml(subgraph, output_filename)
    print(f"Subgraph saved as {output_filename}")

In [13]:
def extract_max_in_degree_subgraph(graph, centrality_df, output_filename):
    # Find the author with the maximum in-degree centrality
    # drop all N/A due to the users probably has delete their Redddit account
    centrality_df = centrality_df.drop('N/A')
    centrality_df = centrality_df.drop('AutoModerator')
    max_in_degree_author = centrality_df['in_degree_centrality'].idxmax()
    print(f"Author with highest in-degree centrality: {max_in_degree_author}")
    
    # Get the subgraph of this author and their neighbors (immediate connections)
    subgraph = graph.subgraph([max_in_degree_author] + list(graph.neighbors(max_in_degree_author)))
    
    # Save the subgraph as a GraphML file
    nx.write_graphml(subgraph, output_filename)
    print(f"Subgraph saved as {output_filename}")

In [14]:
#find the largest betweenness degree-for author
extract_max_betweenness_degree_subgraph(author_redditG_olympic2024, author_centrality_2024, "betweenness_author_2024.graphml")
extract_max_betweenness_degree_subgraph(author_redditG_olympic2020, author_centrality_2020, "betweenness_author_2020.graphml")
extract_max_betweenness_degree_subgraph(author_redditG_olympic2016, author_centrality_2016, "betweenness_author_2016.graphml")
extract_max_betweenness_degree_subgraph(author_redditG_olympic, author_centrality_all, "betweenness_author_olympic.graphml")

Author with highest betweenness-degree centrality: puppuli
Subgraph saved as betweenness_author_2024.graphml
Author with highest betweenness-degree centrality: urfaselol
Subgraph saved as betweenness_author_2020.graphml
Author with highest betweenness-degree centrality: snakes_on_a_planet
Subgraph saved as betweenness_author_2016.graphml
Author with highest betweenness-degree centrality: Brady331
Subgraph saved as betweenness_author_olympic.graphml


In [15]:
#find the largest betweenness degree-for post
extract_max_betweenness_degree_subgraph(post_redditG_olympic2024, post_centrality_2024, "betweenness_post_2024.graphml")
extract_max_betweenness_degree_subgraph(post_redditG_olympic2020, post_centrality_2020, "betweenness_post_2020.graphml")
extract_max_betweenness_degree_subgraph(post_redditG_olympic2016, post_centrality_2016, "betweenness_post_2016.graphml")
extract_max_betweenness_degree_subgraph(post_redditG_olympic, post_centrality_all, "betweenness_post_olympic.graphml")

Author with highest betweenness-degree centrality: Stunning venues at the Paris Olympics 2024
Subgraph saved as betweenness_post_2024.graphml
Author with highest betweenness-degree centrality: Quidditch at the 2020 Olympics?
Subgraph saved as betweenness_post_2020.graphml
Author with highest betweenness-degree centrality: India at Olympics 2024: Day 5
Subgraph saved as betweenness_post_2016.graphml
Author with highest betweenness-degree centrality: Rio 2016: Olympic athletes told to ‘keep your mouth closed’ when in contaminated water
Subgraph saved as betweenness_post_olympic.graphml


In [16]:
#find the largest in-degree-for author
extract_max_in_degree_subgraph(author_redditG_olympic2024, author_centrality_2024, "in_degree_author_2024.graphml")
extract_max_in_degree_subgraph(author_redditG_olympic2020, author_centrality_2020, "in_degree_author_2020.graphml")
extract_max_in_degree_subgraph(author_redditG_olympic2016, author_centrality_2016, "in_degree_author_2016.graphml")
extract_max_in_degree_subgraph(author_redditG_olympic, author_centrality_all, "in_degree_author_olympic.graphml")

Author with highest in-degree centrality: padfoony
Subgraph saved as in_degree_author_2024.graphml
Author with highest in-degree centrality: nick168
Subgraph saved as in_degree_author_2020.graphml
Author with highest in-degree centrality: snakes_on_a_planet
Subgraph saved as in_degree_author_2016.graphml
Author with highest in-degree centrality: puppuli
Subgraph saved as in_degree_author_olympic.graphml


In [17]:
#find the largest in-degree-for post
extract_max_in_degree_subgraph(post_redditG_olympic2024, post_centrality_2024, "in_degree_post_2024.graphml")
extract_max_in_degree_subgraph(post_redditG_olympic2020, post_centrality_2020, "in_degree_post_2020.graphml")
extract_max_in_degree_subgraph(post_redditG_olympic2016, post_centrality_2016, "in_degree_post_2016.graphml")
extract_max_in_degree_subgraph(post_redditG_olympic, post_centrality_all, "in_degree_post_olympic.graphml")

Author with highest in-degree centrality: Olympic Discussion Posts | WAG AA | 1 August 2024
Subgraph saved as in_degree_post_2024.graphml
Author with highest in-degree centrality: India at Olympics 2020: Day 15
Subgraph saved as in_degree_post_2020.graphml
Author with highest in-degree centrality: India at Olympics 2024: Day 7
Subgraph saved as in_degree_post_2016.graphml
Author with highest in-degree centrality: Olympic Discussion Posts | WAG AA | 1 August 2024
Subgraph saved as in_degree_post_olympic.graphml


In [56]:
# get the most influential users
author_name = 'puppuli'
redditG_select = author_redditG_olympic
partition = community.best_partition(redditG_select)

# Find the community for the selected user
user_community_id = partition.get(author_name)
# Check if the user exists and has a community
if user_community_id is not None:
    # Filter for user interactions (edges involving author_name)
    user_edges = [
        (u, v) for u, v in redditG_select.edges() if u == author_name or v == author_name
    ]
    
    # Create a subgraph containing only these user-specific interactions
    subgraph = redditG_select.edge_subgraph(user_edges)

    # Export the subgraph to GraphML
    output_file = f'{author_name}_community_reddit.graphml'
    nx.write_graphml(subgraph, output_file)
    print(f"Community graph for {author_name} exported to {output_file}")
else:
    print(f"The user '{author_name}' does not exist in the graph or has no assigned community.")

Community graph for puppuli exported to puppuli_community_reddit.graphml


In [19]:
# find the interacted posts only
redditG_select = post_redditG_olympic
partition = community.best_partition(redditG_select)
# Find the community for the selected user
user_community_id = partition.get(author_name)
# Check if the user exists and has a community
if user_community_id is not None:
    # Filter for user interactions (edges involving `author_name`)
    user_edges = [
        (u, v) for u, v in redditG_select.edges() if u == author_name or v == author_name
    ]
    
    #  Create a subgraph containing only these user-specific interactions
    subgraph = redditG_select.edge_subgraph(user_edges)
    
    #  Export the subgraph to GraphML
    output_file = f'{author_name}_interaction_posts_reddit.graphml'
    nx.write_graphml(subgraph, output_file)
    print(f"Interaction graph for {author_name} exported to {output_file}")
else:
    print(f"The user '{author_name}' does not exist in the graph or has no assigned community.")


Interaction graph for puppuli exported to puppuli_interaction_posts_reddit.graphml


In [20]:
# get the most influential post subgraph
reddit_title = '13 years, same mentality - World cup series 2011 (1st) and Paris olympics 2024 (2nd)'
redditG_select = post_redditG_olympic
partition = community.best_partition(redditG_select)

# Find the community for the selected user
title_community_id = partition.get(reddit_title)

# Check if the post exists in the graph
if reddit_title in redditG_select.nodes:
    # Find all edges related to the selected post (either as a source or target)
    title_edges = [
        (u, v) for u, v in redditG_select.edges() if u == reddit_title or v == reddit_title
    ]

    # Create a subgraph for the selected post and its related interactions
    subgraph = redditG_select.edge_subgraph(title_edges).copy()

    # Export the subgraph to GraphML
    output_file = f'{reddit_title}_interaction_posts_reddit.graphml'
    nx.write_graphml(subgraph, output_file)
    
    print(f"Interaction graph for '{reddit_title}' exported to {output_file}")
else:
    print(f"The post titled '{reddit_title}' does not exist in the graph.")


Interaction graph for '13 years, same mentality - World cup series 2011 (1st) and Paris olympics 2024 (2nd)' exported to 13 years, same mentality - World cup series 2011 (1st) and Paris olympics 2024 (2nd)_interaction_posts_reddit.graphml


In [21]:
#  Classify communities using LDA
reddit_df = pd.read_json('olympics_posts_reddit.json')
print(reddit_df.head())

        id                                              title selftext  score  \
0  1er8uik  Imane Khelif poses with her Gold medal after t...           52991   
1  1ef4lku  USA, a country so obsessed with guns, has so f...           76656   
2  1eeaa5m          Most controversial pic from olympics 2024           37933   
3  1f5ez3o     Paralympic breakdancing at PARIS Olympics 2024           58263   
4  1epqvq1  USA and China tie for most gold medals in the ...           25114   

      created           author_id  \
0  1723558157           cmaia1503   
1  1722272189           krnranger   
2  1722181161  SumneOndHakbekalva   
3  1725078346            RPT4STIC   
4  1723398610         CrispyMiner   

                                            comments        keyword  
0  [{'id': 'lhxbug8', 'body': '![gif](giphy|1KHBP...  Olympics 2024  
1  [{'id': 'lfihb2a', 'body': '**Let's make a dif...  Olympics 2024  
2  [{'id': 'lfcqn9y', 'body': '...Henry Zebrowski...  Olympics 2024  
3  [{'id':

In [22]:
def process_text(text):
        """
        Perform the processing.
        @param text: the text (tweet) to process

        @returns: list of (valid) tokens in text
        """

        text = text.lower()
        tokenizer = nltk.tokenize.TweetTokenizer()
        ltokens = tokenizer.tokenize(text)

        ltokens = [tok.strip() for tok in ltokens]

        # stemmer (we use set to remove duplicates)
        stemmer =  WordNetLemmatizer()
        lStemmedTokens = [stemmer.lemmatize(tok, pos = 'v') for tok in ltokens]
    
        lPunct = list(string.punctuation)

        # use stopwords from nltk and a few other twitter specific terms like 'rt' (retweet)
        lStopwords = nltk.corpus.stopwords.words('english') + lPunct + ['rt', " ","➡",'via','@','#', '...','..', '…', "sure",'"', "'", '`','.',"’",
                                                                '-',"it'","i'm","i've"]

        # pattern for digits
        # the list comprehension in return statement essentially remove all strings of digits or fractions, e.g., 6.15
        regexDigit = re.compile(r"^\d+\s|\s\d+\s|\s\d+$")
        # regex pattern for http
        regexHttp = re.compile("^http")

        return [tok for tok in lStemmedTokens if tok not in lStopwords and regexDigit.match(tok) == None and regexHttp.match(tok) == None]

In [23]:
# Apply preprocessing to the 'title and cooments' column
reddit_df['processed_title'] = reddit_df['title'].apply(process_text)
reddit_df['processed_selftext'] = reddit_df['selftext'].apply(process_text)

# Extract comment bodies into a list
reddit_df['comment_bodies'] = reddit_df['comments'].apply(lambda comments: [comment['body'] for comment in comments if 'body' in comment])

# Process each comment in the comment_bodies
reddit_df['processed_comments'] = reddit_df['comment_bodies'].apply(lambda comments: [process_text(comment) for comment in comments])

In [24]:
# all contents of text
reddit_df['content'] = reddit_df['processed_comments']+ reddit_df['processed_selftext']+ reddit_df['processed_title']

# Flatten the lists in 'content' into a clumn
# reddit_df['processed_content'] = reddit_df['content'].apply(lambda x: [item for sublist in x for item in sublist] if isinstance(x, list) else [])
reddit_df['processed_content'] = reddit_df['content'].apply(
    lambda x: ' '.join([item for sublist in x for item in sublist]) if isinstance(x, list) else '')

In [25]:
print(reddit_df['processed_content'].head(1))

0    gif giphy 1khbpmeokv0b2 pov top right corner r...
Name: processed_content, dtype: object


In [26]:

# Create a mapping of post_id to community_id
post_community = {node: community_id for node, community_id in partition.items() if node in reddit_df['title'].values}

# Add 'community_id' to the DataFrame
reddit_df['community_id'] = reddit_df['title'].map(post_community)


# Verify assignment
print(reddit_df[ 'community_id'].unique())

[ nan   0.  31.  70.  18.  13.   8.  36. 115.  54.  39.  72.   4.  10.
 124. 134.  26.  20. 143. 144.   3. 150. 151. 152.  14. 155. 156.  23.
  38.  12.  15. 163. 164. 125.  55. 158.  28.   7.  57.  52. 166.   9.
 167.  43. 168.  44.  83.  68. 112.  32.   5. 169. 170. 171. 173. 101.
   1. 117.  87. 131. 108. 120.  48. 174.  30.  21. 104. 147.  17. 175.
  46.  42.  34.  61.  62. 176. 177. 178. 111. 107. 179. 103.  40.  74.
 106. 126. 142. 135.  11.  64.  76.   2.  82. 109.  53.  16. 138. 139.
 146. 121.  19.   6. 141. 140.  41.  45.  63.  93.  90. 102.  97.  50.
  80.  29. 116.  77.  49.  75. 129. 165.  56. 110.  22.  86. 130.  71.
 148.  67. 132.  60.  84. 123. 128.  98.  27. 100.  88.  24. 133. 172.
 119. 122.  59.  79.  65.  99.  91.  92.  89.  47. 137.  96.  95.  51.
 113. 153. 162.  33. 160.  35.  66.  78. 136. 161. 114.  94. 159. 118.
  25.  85. 157. 127.  37.  81.  73.  58. 154. 149.  69. 145. 105.]


In [61]:
# Extract Community Topics Using LDA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Function to get all posts in a community and join them into a single string
def get_community_posts(df, community_id):
    # Join lists into a single string
    return df[df['community_id'] == community_id]['processed_content']

# Initialize a dictionary to store topics for each community
community_topics = {}

# Iterate through each community
for community_id in [154.0, 11.0, 95.0]:
    posts = get_community_posts(reddit_df, community_id)
    if len(posts) <= 2:
        # Skip communities with too few posts for meaningful topic modeling
        continue
    
    # Vectorize the text
    Vectorizer = CountVectorizer(max_df=0.95, min_df=1, max_features=1500, stop_words='english')
    tfidf = Vectorizer.fit_transform(posts)

    # Apply LDA
    lda = LatentDirichletAllocation(n_components=1, random_state=42)
    lda.fit(tfidf)
    
    # Get the top words for the topic
    feature_names = Vectorizer.get_feature_names_out()
    topic_words = [feature_names[i] for i in lda.components_[0].argsort()[:-11:-1]]
    
    community_topics[community_id] = topic_words

# Display the top words for each community
for community_id, words in community_topics.items():
    print(f"Community {community_id}: {', '.join(words)}")


Community 154.0: team, esports, golf, lol, chess, physical, muscle, lot, shoot, sc2
Community 95.0: rio, chicago, delete, brazil, think, olympic, city, world, live, sport


In [28]:
# print("Author centrality Olympics\n:", author_centrality_2024.mean().to_string())
# print("Author centrality Olympics\n:", author_centrality_2020.mean().to_string())
# print("Author centrality Olympics\n:", author_centrality_2016.mean().to_string())

all_author_centrality_means = pd.concat([author_centrality_2016.mean(), author_centrality_2020.mean(), author_centrality_2024.mean()], axis=1)
all_author_centrality_means = all_author_centrality_means.rename(columns={0: 'Olympics 2016', 1: 'Olympics 2020', 2: 'Olympics 2024'})

all_post_centrality_means = pd.concat([post_centrality_2016.mean(), post_centrality_2020.mean(), post_centrality_2024.mean()], axis=1)
all_post_centrality_means = all_post_centrality_means.rename(columns={0: 'Olympics 2016', 1: 'Olympics 2020', 2: 'Olympics 2024'})

print('For author centrality:')
print(all_author_centrality_means, '\n')
print('For post centrality:')
print(all_post_centrality_means)

For author centrality:
                        Olympics 2016  Olympics 2020  Olympics 2024
in_degree_centrality         0.002804       0.002468       0.001379
degree_centrality            0.005608       0.004936       0.002757
betweenness_centrality       0.001720       0.001613       0.000862
closeness_centrality         0.276423       0.273999       0.266310
eigenvector_centrality       0.014811       0.013661       0.007656 

For post centrality:
                        Olympics 2016  Olympics 2020  Olympics 2024
in_degree_centrality         0.002462       0.002084       0.001306
degree_centrality            0.004924       0.004168       0.002612
betweenness_centrality       0.001182       0.001037       0.000669
closeness_centrality         0.271477       0.269677       0.263719
eigenvector_centrality       0.007183       0.006598       0.004325


In [29]:
author_centrality_2016.sort_values('in_degree_centrality', ascending = False)

Unnamed: 0,in_degree_centrality,degree_centrality,betweenness_centrality,closeness_centrality,eigenvector_centrality
,0.123663,0.247326,0.767496,0.531439,0.566646
snakes_on_a_planet,0.073529,0.147059,0.143529,0.365949,0.142299
IdRatherBeLurking,0.066176,0.132353,0.031848,0.370940,0.188159
BismackMyBiyombo,0.053476,0.106952,0.069371,0.365681,0.153233
PMMeYourDairyPillows,0.052139,0.104278,0.072967,0.370022,0.131788
...,...,...,...,...,...
baccus83,0.001337,0.002674,0.000208,0.269550,0.009399
acatisnotahome,0.001337,0.002674,0.000000,0.265672,0.007628
IrrelevantLeprechaun,0.001337,0.002674,0.000000,0.267429,0.008492
Starkf_,0.000668,0.001337,0.000000,0.347100,0.035484


In [30]:
author_centrality_2020.sort_values('in_degree_centrality', ascending = False)

Unnamed: 0,in_degree_centrality,degree_centrality,betweenness_centrality,closeness_centrality,eigenvector_centrality
,0.113120,0.226239,0.765705,0.529648,0.580527
nick168,0.062974,0.125948,0.097742,0.365984,0.154471
puppuli,0.059475,0.118950,0.027404,0.361738,0.129399
urfaselol,0.055977,0.111953,0.121466,0.364816,0.135975
Stepside79,0.039650,0.079300,0.018825,0.356105,0.086093
...,...,...,...,...,...
KaiserCanton,0.001166,0.002332,0.000000,0.260401,0.005639
anothergaijin,0.001166,0.002332,0.000277,0.261992,0.006078
Clairounettedu69,0.000583,0.001166,0.000000,0.346325,0.035554
Azura7,0.000583,0.001166,0.000000,0.346325,0.035554


In [31]:
author_centrality_2024.sort_values('in_degree_centrality', ascending = False)

Unnamed: 0,in_degree_centrality,degree_centrality,betweenness_centrality,closeness_centrality,eigenvector_centrality
padfoony,0.095983,0.191966,0.105534,0.367976,0.487279
puppuli,0.088316,0.176633,0.156080,0.364113,0.220534
sewsgup,0.056424,0.112849,0.031507,0.356316,0.066345
sunbaybrew,0.056118,0.112236,0.059085,0.352769,0.251174
fijozico,0.055504,0.111009,0.018281,0.360093,0.291818
...,...,...,...,...,...
Ok_Switch_1205,0.000613,0.001227,0.000000,0.263132,0.002396
Retal1ate,0.000613,0.001227,0.000000,0.260214,0.001491
austinaaaaa199955,0.000613,0.001227,0.000000,0.259468,0.001521
GimerStick,0.000613,0.001227,0.000000,0.260630,0.001722


In [32]:
author_centrality_all.sort_values('in_degree_centrality', ascending = False).head(6)

Unnamed: 0,in_degree_centrality,degree_centrality,betweenness_centrality,closeness_centrality,eigenvector_centrality
,0.057489,0.114978,0.765545,0.51448,0.528182
puppuli,0.048216,0.096433,0.026677,0.354719,0.217911
padfoony,0.043744,0.087488,0.057313,0.352374,0.214673
sewsgup,0.028908,0.057816,0.052444,0.348198,0.084362
sunbaybrew,0.023345,0.046689,0.008976,0.345234,0.108247
nick168,0.022254,0.044507,0.010068,0.347854,0.064629


## Independent Cascade for Most Active User in Reddit

In [40]:
def independentCascade(graph, trialNum, lSeed, activationProb):
    """
    Performs independent cascade over the input graph.  Results are stored in two output
    lists.

    @param graph: Input graph to perform cascade over.
    @param trialNum: The number of runs/trials to run.  The results are averaged over the
                    the trials/runs.
    @param lSeed: List of initial nodes to seed.  Range from 0 to number of nodes -1.
    @param activationProb: Activation probability on each edge.  All edges have the same
                    activation probability.

    @return: Two lists, lAvgActivationsPerNode and lAvgActivationsPerIteration.
            lAvgActivationsPerNode is a list with the size same as the number of nodes in
            the graph.  Each index of the list (starting with zero) corresponds directly
            to the associated node, and each entry represents the average number of activations
            over the trials/runs, and should lie in [0,1] range.
            lAvgActivationsPerIteration is a list with the size same as the number of trials/runs.
            Each index of the list corresponds to a trial/run, and each entry is the
            total number of active nodes in that trial/run.
    """

    # generate initial lists/vectors for the two output lists
    lAvgActivationsPerNode = [0 for x in range(nx.number_of_nodes(graph))]
    lAvgActivationsPerIteration = []

    # Map node IDs to list indices
    node_to_index = {node: idx for idx, node in enumerate(graph.nodes())}

    print('starting cascade run')
    # loop through the runs/trials
    for i in range(trialNum):
        print('Trial/run no. {}'.format(i))

        #
        # TODO: complete implemention
        #
        
        # list of active nodes
        setActive = set(lSeed)
        setLastActive = set(lSeed)
        setNewActive = set()
        # we keep looping until no more new activations
        while len(setLastActive) > 0:
            # for each active node, we try to influence its (unactived neighbours)
            for currNode in setLastActive:
                # check each neighbour
                for neighbour in graph.neighbors(currNode):
                    # we only want non-active neighbours
                    if neighbour not in setActive and neighbour not in setNewActive:
                        if random.random() < activationProb:
                            setNewActive.add(neighbour)

            # update last active
            setLastActive = setNewActive
            # extend active set
            setActive.update(setNewActive)
            # reset new active
            setNewActive = set()

        # update the output lists
        for x in setActive:
            lAvgActivationsPerNode[node_to_index[x]] += 1

        # update with total number of activations
        lAvgActivationsPerIteration.append(len(setActive))

    # placeholder, replace with appropriate returns (if necessary)
    # we average each entry in lAvgActivationsPerNode by number of runs/trials
    return [float(count) / trialNum for count in lAvgActivationsPerNode], lAvgActivationsPerIteration


In [47]:
# fileName
sFilenameSuffix = f'{author_name}_independent_cascade.graphml'

# tree graph
treeGraph = nx.read_graphml('puppuli_community_reddit.graphml')

# small world graph
smallWorldGraph = nx.read_graphml('puppuli_community_reddit.graphml')

#
# Independent cascade
#
lSeed = list(treeGraph.nodes())[:2]  # Get first two nodes as seeds
trialNum = 10
activationProb = 0.5

#
# independent cascade on tree graph

if treeGraph != None:
    lAvgActivationsPerNode, lAvgActivationsPerIteration = independentCascade(treeGraph, trialNum, lSeed, activationProb)
    print(lAvgActivationsPerNode)
    print(lAvgActivationsPerIteration)
    print('Average number of nodes activated = {} out of {}'.format(sum(lAvgActivationsPerIteration) / len(lAvgActivationsPerIteration), nx.number_of_nodes(treeGraph)))


    # Save to graph
    # average activation per node for balanced tree,
    # stored in node attribute 'avgAct'
    # Save the average activation per node for balanced tree
    # use zip and nodes since the node ids are not numbers
    for node, avgActivation in zip(treeGraph.nodes(), lAvgActivationsPerNode):
        treeGraph.nodes[node]['avgAct'] = avgActivation


    # Output modified graphs to respective files
    nx.readwrite.write_graphml(treeGraph, 'tree_' + sFilenameSuffix, infer_numeric_types=True)

#
# small world graph
#

if smallWorldGraph != None:
    lAvgActivationsPerNode, lAvgActivationsPerIteration = independentCascade(smallWorldGraph, trialNum, lSeed, activationProb)
    print(lAvgActivationsPerNode)
    print(lAvgActivationsPerIteration)
    print('Average number of nodes activated = {} out of {}'.format(sum(lAvgActivationsPerIteration) / len(lAvgActivationsPerIteration), nx.number_of_nodes(smallWorldGraph)))


    # average activation per node for small world graph,
    # stored in node attribute 'avgAct'
    # use zip and nodes since the node ids are not numbers
    for node, avgActivation in zip(smallWorldGraph.nodes(), lAvgActivationsPerNode):
        smallWorldGraph.nodes[node]['avgAct'] = avgActivation

    # Output modified graphs to respective files
    nx.readwrite.write_graphml(smallWorldGraph, 'smallWorld_' + sFilenameSuffix, infer_numeric_types=True)


starting cascade run
Trial/run no. 0
Trial/run no. 1
Trial/run no. 2
Trial/run no. 3
Trial/run no. 4
Trial/run no. 5
Trial/run no. 6
Trial/run no. 7
Trial/run no. 8
Trial/run no. 9
[1.0, 1.0, 0.2, 0.2, 0.4, 0.4, 0.3, 0.3, 0.2, 0.4, 0.4, 0.3, 0.6, 0.3, 0.3, 0.4, 0.5, 0.1, 0.5, 0.5, 0.1, 0.5, 0.2, 0.2, 0.5, 0.5, 0.3, 0.4, 0.3, 0.4, 0.4, 0.5, 0.3, 0.6, 0.6, 0.4, 0.6, 0.2, 0.3, 0.5, 0.1, 0.6, 0.5, 0.5, 0.3, 0.4, 0.3, 0.5, 0.3, 0.5, 0.7, 0.2, 0.4, 0.3, 0.5, 0.6, 0.3, 0.4, 0.4, 0.3, 0.6, 0.2, 0.5, 0.2, 0.3, 0.5, 0.3, 0.2, 0.5, 0.5, 0.4, 0.2, 0.6, 0.4, 0.5, 0.5, 0.5, 0.4, 0.3, 0.2, 0.5, 0.3, 0.4, 0.4, 0.3, 0.5, 0.3, 0.5, 0.6, 0.4, 0.6, 0.3, 0.4, 0.4, 0.5, 0.4, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.6, 0.4, 0.4, 0.4, 0.4, 0.5, 0.2, 0.5, 0.3, 0.5, 0.5, 0.4, 0.6, 0.1, 0.4, 0.3, 0.3, 0.4, 0.4, 0.3, 0.4, 0.3, 0.6, 0.4, 0.3, 0.6, 0.5, 0.6, 0.2, 0.5, 0.4, 0.4, 0.5, 0.1, 0.4, 0.6, 0.3, 0.7, 0.2, 0.3, 0.3, 0.5, 0.4, 0.3, 0.4, 0.5, 0.4, 0.4, 0.2, 0.7, 0.5, 0.4, 0.5, 0.2, 0.2, 0.6, 0.3, 0.4, 0.6, 0.3, 0.5, 0.4

## Linear Threshold Influence Approach

In [48]:
def linearThreshold(graph, trialNum, lSeed):
    """
    Performs linear threshold model over the input directed graph.  Results are stored in two output
    lists.

    @param graph: Input graph to perform the LT model over.
    @param trialNum: The number of runs/trials to run.  The results are averaged over the
                    the trials/runs.
    @param lSeed: List of initial nodes to seed.  Range from 0 to number of nodes -1.

    @return: Two lists, lAvgActivationsPerNode and lAvgActivationsPerIteration.
            lAvgActivationsPerNode is a list with the size same as the number of nodes in
            the graph.  Each index of the list (starting with zero) corresponds directly
            to the associated node, and each entry represents the average number of activations
            over the trials/runs, and should lie in [0,1] range.
            lAvgActivationsPerIteration is a list with the size same as the number of trials/runs.
            Each index of the list corresponds to a trial/run, and each entry is the
            total number of active nodes in that trial/run.
    """

    # generate initial lists/vectors for the two output lists
    lAvgActivationsPerNode = {node: 0 for node in graph.nodes()}
    lAvgActivationsPerIteration = []

    # Map node IDs to list indices
    node_to_index = {node: idx for idx, node in enumerate(graph.nodes())}

    print('starting linear threshold runs')
    # loop through the runs/trials
    for i in range(trialNum):
        print('Trial/run no. {}'.format(i))

        # for each node, generate the random thresholds
        for currNode, attr in graph.nodes(data=True):
            attr['threshold'] = random.random()

        # list of active nodes
        setActive = set(lSeed)
        setLastActive = set(lSeed)
        setNewActive = set()
        # we keep looping until no more new activations
        while len(setLastActive) > 0:
            # we get all the nodes next to the current set of active nodes
            neighbourSet = set()
            for activeNode in setLastActive:
                neighbourSet.update([neighbour for neighbour in graph.successors(activeNode) if neighbour not in setActive and neighbour not in setNewActive])

            # for each of these potential neighbours to be activated, test if it will be activated
            for neighbour in neighbourSet:
                try:
                    # get the sum of weights
                    weightTotal = sum([dataDict['weight'] for (u,v, dataDict) in graph.in_edges(neighbour, data=True)])
                    # test against the node threshold
                    if graph.nodes[neighbour]['threshold'] < weightTotal:
                        setNewActive.add(neighbour)
                except KeyError as e:
                    print("Key error: {} is missing for edge".format(e, (u,v)))

            # update last active
            setLastActive = setNewActive
            # extend active set
            setActive.update(setNewActive)
            # reset new active
            setNewActive = set()

        # update the output lists
        for x in setActive:
            lAvgActivationsPerNode[x] += 1
            
        # update with total number of activations
        lAvgActivationsPerIteration.append(len(setActive))

    # we average each entry in lAvgActivationsPerNode by number of runs/trials
    # Return the dictionary instead of the list
    return {node: float(count) / trialNum for node, count in lAvgActivationsPerNode.items()}, lAvgActivationsPerIteration


In [49]:
def generateWeights(graph):
    """
    Generate weights for the edges.

    @param graph: directed graph to generate weights on the edges.
    @return: modified directed graph with weights on edges, under attribute 'weight'
    """

    for currNode in graph.nodes():
        # generate the number that the weights should sum up to
        totalWeight = random.random()
        # use dirichlet distribution to generate the weights
        aWeights = np.random.dirichlet(np.ones(graph.in_degree(currNode)), size=1) * totalWeight
        lWeights = aWeights[0].tolist()

        for i,u in enumerate(graph.predecessors(currNode)):
            graph.add_edge(u,currNode,weight=lWeights[i])

In [50]:
# fileName
sFilenameSuffix = f'{author_name}_linear_threshold.graphml'

# tree graph
undirectedTreeGraph = nx.read_graphml('puppuli_community_reddit.graphml')
treeGraph = undirectedTreeGraph.to_directed()
generateWeights(treeGraph)


# small world graph
undirectedSmallWorldGraph = nx.read_graphml('puppuli_community_reddit.graphml')
smallWorldGraph = undirectedSmallWorldGraph.to_directed()
generateWeights(smallWorldGraph)


#
# Linear threshold
#
seedNum = 3
lSeed = list(treeGraph.nodes())[:2]  # Get first two nodes as seeds
trialNum = 10


#
# TODO: complete the implementation of the linear threshold model with function
# linearThreshold()
#

#
# tree graph
#

if treeGraph != None:
    lAvgActivationsPerNode, lAvgActivationsPerIteration = linearThreshold(treeGraph, trialNum, lSeed)
    print(lAvgActivationsPerNode)
    print(lAvgActivationsPerIteration)
    if len(lAvgActivationsPerIteration) > 0:
        print('Average number of nodes activated = {} out of {}'.format(sum(lAvgActivationsPerIteration) / len(lAvgActivationsPerIteration), nx.number_of_nodes(treeGraph)))
    else:
        print('Average number of nodes activated = {} out of {}'.format(0, nx.number_of_nodes(treeGraph)))


    # average activation per node for small world graph,
    # stored in node attribute 'avgAct'
    for nodeId, avgActivation in lAvgActivationsPerNode.items():
        treeGraph.nodes[nodeId]['avgAct'] = avgActivation

    # Output modified graphs to respective files
    nx.readwrite.write_graphml(treeGraph, 'treeLT_' + sFilenameSuffix, infer_numeric_types=True)

#
# small world graph
#

if smallWorldGraph != None:
    lAvgActivationsPerNode, lAvgActivationsPerIteration = linearThreshold(smallWorldGraph, trialNum, lSeed)
    print(lAvgActivationsPerNode)
    print(lAvgActivationsPerIteration)
    if len(lAvgActivationsPerIteration) > 0:
        print('Average number of nodes activated = {} out of {}'.format(sum(lAvgActivationsPerIteration) / len(lAvgActivationsPerIteration), nx.number_of_nodes(smallWorldGraph)))
    else:
        print('Average number of nodes activated = {} out of {}'.format(0, nx.number_of_nodes(smallWorldGraph)))


    # average activation per node for small world graph,
    # stored in node attribute 'avgAct'
    for nodeId, avgActivation in lAvgActivationsPerNode.items():
        smallWorldGraph.nodes[nodeId]['avgAct'] = avgActivation

    # Output modified graphs to respective files
    nx.readwrite.write_graphml(smallWorldGraph, 'smallWorldLT_' + sFilenameSuffix, infer_numeric_types=True)


starting linear threshold runs
Trial/run no. 0
Trial/run no. 1
Trial/run no. 2
Trial/run no. 3
Trial/run no. 4
Trial/run no. 5
Trial/run no. 6
Trial/run no. 7
Trial/run no. 8
Trial/run no. 9
[1139, 1164, 1175, 1132, 1125, 2, 1126, 1198, 1138, 1155]
Average number of nodes activated = 1035.4 out of 2287
starting linear threshold runs
Trial/run no. 0
Trial/run no. 1
Trial/run no. 2
Trial/run no. 3
Trial/run no. 4
Trial/run no. 5
Trial/run no. 6
Trial/run no. 7
Trial/run no. 8
Trial/run no. 9
[1146, 1173, 1172, 1141, 1128, 1147, 1127, 1135, 1136, 1139]
Average number of nodes activated = 1144.4 out of 2287
