<a href="https://colab.research.google.com/github/CashelF/misinformation-susceptibility/blob/main/Community_Alignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pandas networkx sentence-transformers



In [6]:
import gc
import glob
import itertools
import os

import networkx as nx
import pandas as pd
from sentence_transformers import SentenceTransformer, util

# Load the pre-trained 'all-MiniLM-L6-v2' sentence-transformer once at module level.
# process_subreddit() uses this shared instance to embed submission titles and
# comment bodies, so the model is not re-created per call.
transformer_model = SentenceTransformer('all-MiniLM-L6-v2')



# Helper Function to Filter Out Deleted Users

In [3]:
def filter_deleted(data):
    """Return the rows of ``data`` whose 'author' is not the '[deleted]' placeholder.

    The original index labels are kept; the input frame is not modified.
    """
    # Boolean mask: True for every row written by a non-deleted account.
    is_live_author = data['author'].ne("[deleted]")
    return data[is_live_author]

# Recursive Alignment Function

In [4]:
def calculate_recursive_alignment(comment_id, comments_dict, parent_alignment):
    """Derive a comment's alignment from its own sentiment and its parent's alignment.

    Looks up ``comments_dict[comment_id]['sentiment']`` and compares it with
    ``parent_alignment``. When the two are equal the parent's alignment is
    returned unchanged; otherwise the negated parent alignment is returned.
    Despite the name, this function performs a single comparison and does not
    call itself.
    """
    own_sentiment = comments_dict[comment_id]['sentiment']
    agrees_with_parent = own_sentiment == parent_alignment
    # Agreement keeps the parent's side; disagreement flips it.
    return parent_alignment if agrees_with_parent else -parent_alignment


# Process One Subreddit at a Time

In [7]:
def process_subreddit(submission_file, comment_file):
    """Process one pair of submission/comment CSV files and return alignment edges.

    Steps:
      1. Load both CSVs and drop rows authored by '[deleted]'.
      2. Join each comment to its submission (comment ``link_id`` without its
         ``t3_``-style prefix against submission ``id``).
      3. Embed submission titles and comment bodies, and label each comment
         with sentiment 1 when the cosine similarity is >= 0, otherwise -1.
      4. Assign every comment an ``alignment_score`` (see the loop below).
      5. Sum alignment scores per (author, subreddit) and emit one edge for
         every pair of subreddits a single author appears in.

    Args:
        submission_file: Path to the submissions CSV. Must provide at least
            'author', 'id' and 'title'.
        comment_file: Path to the comments CSV. Must provide at least
            'author', 'id', 'link_id', 'parent_id' and 'body', plus a
            subreddit column available after the merge as 'subreddit' or
            'subreddit_comment'.

    Returns:
        A list of ``(subreddit_a, subreddit_b, weight)`` tuples, where weight
        is 1 when the author's summed alignment scores in both subreddits are
        equal and -1 otherwise. An empty list is returned when no comment can
        be matched to a submission.

    NOTE(review): edges are only produced when one author appears under more
    than one subreddit value inside this single file pair; if each file pair
    holds exactly one subreddit the result is always empty - confirm intent.
    """
    submissions = filter_deleted(
        pd.read_csv(submission_file, dtype={'author_flair_text': str}, low_memory=False)
    )
    comments = filter_deleted(
        pd.read_csv(comment_file, dtype={'parent_id': str, 'link_id': str}, low_memory=False)
    )

    # assign() builds a new frame, so the filtered slice is never written to
    # in place (avoids pandas' chained-assignment pitfalls).
    comments = comments.assign(link_id=comments['link_id'].str.split('_').str[-1])
    merged_data = pd.merge(
        comments,
        submissions,
        left_on='link_id',
        right_on='id',
        suffixes=('_comment', '_submission'),
    )

    # Comment ids become dictionary keys below; duplicated rows would make
    # to_dict('index') raise, so keep the first occurrence of each comment.
    merged_data = merged_data.drop_duplicates(subset='id_comment')

    # Nothing joined: there is nothing to embed and no edge can exist.
    if merged_data.empty:
        return []

    # Missing text is read by pandas as NaN; encode it as an empty string
    # rather than passing a float to the model.
    titles = merged_data['title'].fillna('').astype(str).tolist()
    bodies = merged_data['body'].fillna('').astype(str).tolist()

    # Encode in batches (one call per column) instead of one call per row.
    title_embeddings = transformer_model.encode(titles, convert_to_tensor=True)
    body_embeddings = transformer_model.encode(bodies, convert_to_tensor=True)

    # Row-wise cosine similarity between each comment and its own submission.
    merged_data['similarity'] = util.pairwise_cos_sim(body_embeddings, title_embeddings).tolist()
    merged_data['sentiment'] = merged_data['similarity'].apply(lambda x: 1 if x >= 0 else -1)

    # If both CSVs carry a 'subreddit' column the merge suffixes it; fall back
    # to the comment-side column in that case.
    subreddit_col = 'subreddit' if 'subreddit' in merged_data.columns else 'subreddit_comment'

    # Only the columns needed for alignment are carried into the dictionary.
    needed_columns = ['id_comment', 'parent_id', 'sentiment', 'author_comment', subreddit_col]
    comments_dict = merged_data[needed_columns].set_index('id_comment').to_dict('index')

    for comment_id, record in comments_dict.items():
        # Default: top-level comments and comments whose parent is not in the
        # data keep their own sentiment as alignment.
        alignment = record['sentiment']
        parent_id = record['parent_id']
        # parent_id can be NaN (a float) when the CSV cell is empty.
        if isinstance(parent_id, str) and parent_id.startswith('t1_'):
            parent = comments_dict.get(parent_id.split('_')[-1])
            if parent is not None:
                # NOTE(review): the parent's *sentiment* (not its computed
                # alignment_score) is passed on, so alignment is not
                # propagated down a reply chain - confirm this is intended.
                alignment = calculate_recursive_alignment(
                    comment_id, comments_dict, parent['sentiment']
                )
        record['alignment_score'] = alignment

    aligned_comments = pd.DataFrame.from_dict(comments_dict, orient='index')

    # Cumulative alignment per (author, subreddit).
    user_alignment = (
        aligned_comments
        .groupby(['author_comment', subreddit_col])['alignment_score']
        .sum()
        .reset_index()
    )

    edges = []
    for _, group in user_alignment.groupby('author_comment'):
        # Each (author, subreddit) pair is unique after the groupby-sum, so the
        # rows can be paired directly without re-filtering the group.
        per_subreddit = zip(group[subreddit_col], group['alignment_score'])
        for (sub1, align1), (sub2, align2) in itertools.combinations(per_subreddit, 2):
            # NOTE(review): compares the summed scores for exact equality, not
            # their sign; kept as in the original - confirm intended metric.
            weight = 1 if align1 == align2 else -1
            edges.append((sub1, sub2, weight))

    # Release the large intermediates before the next file pair is loaded.
    del submissions, comments, merged_data, aligned_comments, user_alignment
    del title_embeddings, body_embeddings, comments_dict
    gc.collect()

    return edges

# Save and Accumulate Edges Across Subreddits

In [None]:
import glob

# Get the list of submission and comment files (sorted for a deterministic run order)
submission_files = sorted(glob.glob('/content/drive/MyDrive/Reddit Misinformation Susceptibility/Data_Sets/*_submissions.csv'))
comment_files = sorted(glob.glob('/content/drive/MyDrive/Reddit Misinformation Susceptibility/Data_Sets/*_comments.csv'))

# glob returns files in arbitrary order, so zipping the two lists could pair a
# submissions file with another subreddit's comments. Pair them by filename instead.
available_comment_files = set(comment_files)

# Process each subreddit and save intermediate edges
for submission_file in submission_files:
    comment_file = submission_file[:-len('_submissions.csv')] + '_comments.csv'
    if comment_file not in available_comment_files:
        print(f"Skipping {submission_file}: no matching comments file found")
        continue

    edges = process_subreddit(submission_file, comment_file)

    # Save edges to a temporary file (one per subreddit), one "sub1,sub2,weight" per line
    with open(f'{submission_file}_edges.txt', 'w', encoding='utf-8') as f:
        f.writelines(f"{sub1},{sub2},{weight}\n" for sub1, sub2, weight in edges)

# Build the Final Graph from Saved Edges

In [None]:
# Edge files are written next to the submission CSVs as
# '<name>_submissions.csv_edges.txt', so look for them in the same directory.
EDGE_FILE_PATTERN = '/content/drive/MyDrive/Reddit Misinformation Susceptibility/Data_Sets/*_edges.txt'

# Initialize the final graph
G = nx.Graph()

# Load all saved edges and add them to the graph
for edge_file in sorted(glob.glob(EDGE_FILE_PATTERN)):
    with open(edge_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines instead of failing on the unpack below
                continue

            # Split from the right so the last two fields are always sub2 and weight
            sub1, sub2, weight = line.rsplit(',', 2)
            weight = int(weight)

            # Accumulate the weight when the same subreddit pair appears more than once
            if G.has_edge(sub1, sub2):
                G[sub1][sub2]['weight'] += weight
            else:
                G.add_edge(sub1, sub2, weight=weight)

# Save the final graph
nx.write_gexf(G, 'subreddit_alignment_graph.gexf')
