In [22]:
import numpy as np
from scipy.sparse import csr_matrix
import igraph as ig
import pandas as pd
import leidenalg
from math import log as logg
import matplotlib.pyplot as plt

def extract_ne(comments, n_min=3, n_max=100, tform=True, equalik=False):
    """
    Extracts nodes and edges from a list of cleaned comment strings.
    
    Parameters:
      comments : list of str
          Each string is one comment.
      n_min : int
          Minimum number of times a word must appear to be kept.
      n_max : int
          Words appearing at or above this frequency are removed.
      tform : bool
          If True, applies a TF-IDF–like transformation.
      equalik : bool
          If True, normalizes so that each document is equally weighted.
    
    Returns:
      nodes : list of dict
          Each dict contains a node id and the original document index.
      edges : list of dict
          Each dict contains source, target, and the weight of the edge.
      G : igraph.Graph
          The constructed graph.
    """
    
    # 1. Tokenize each comment (if not already tokenized)
    # Here we assume each comment is a clean string.
    pos_clean = [comment.split() for comment in comments]
    
    # 2. Build the list of unique words and a dictionary mapping word -> index.
    words = np.unique([word for tokens in pos_clean for word in tokens])
    words_dict = {word: i for i, word in enumerate(words)}
    
    num_words = len(words)
    num_docs = len(pos_clean)
    
    # 3. Build the word–document occurrence matrix (Mwd).
    #    Rows correspond to words; columns correspond to documents.
    Mwd = np.zeros((num_words, num_docs), dtype=np.int8)
    for i, tokens in enumerate(pos_clean):
        for token in tokens:
            # (if the dataset is already clean all tokens should be in words_dict)
            if token in words_dict:
                Mwd[words_dict[token], i] += 1
    
    # 4. Filter words: remove words that occur too few or too many times.
    word_counts = Mwd.sum(axis=1)
    valid_word_mask = (word_counts >= n_min) & (word_counts < n_max)
    removed_words = words[~valid_word_mask]
    if removed_words.size > 0:
        print("Removing words:", removed_words)
    
    Mwd = Mwd[valid_word_mask, :]  # keep only selected words
    words = words[valid_word_mask]
    
    # 5. Remove documents that (after word filtering) have no words.
    doc_counts = Mwd.sum(axis=0)
    valid_doc_mask = doc_counts > 0
    Mwd = Mwd[:, valid_doc_mask]
    # Keep track of the original document indices
    documents = np.arange(num_docs)[valid_doc_mask]
    
    # 6. Create a probability matrix Pwd.
    #    (You can think of this as normalizing word frequencies.)
    if equalik:
        # Each document gets equal weight.
        col_sums = Mwd.sum(axis=0)
        # Avoid division by zero (should not happen after filtering)
        col_sums[col_sums == 0] = 1
        Pwd = Mwd / col_sums / Mwd.shape[1]
    else:
        # Weight documents proportionally to their length.
        total_sum = Mwd.sum()
        Pwd = Mwd / total_sum
    
    # 7. (Optional) Apply a TF-IDF–like weighting.
    if tform:
        num_docs_current = Mwd.shape[1]
        # Compute in how many documents each word appears.
        doc_freq = np.sum(Mwd > 0, axis=1)
        # Avoid log(0)
        doc_freq[doc_freq == 0] = 1
        idf = -np.log(doc_freq / num_docs_current)
        # Multiply each word’s row by its idf weight.
        Pwd = np.diag(idf) @ Pwd
        # Normalize the entire matrix
        Pwd = Pwd / Pwd.sum()
    
    # 8. Compute a document–document (co-occurrence/similarity) matrix.
    #    We use the following formula:
    #         Pdd = (Pwd.T / pw) @ Pwd
    #    where pw is the vector of (weighted) word frequencies.
    pw = Pwd.sum(axis=1)
    # Avoid division by zero for any word that might have 0 total weight.
    pw = np.where(pw == 0, 1, pw)
    # The broadcasting division divides each column of Pwd.T by pw.
    Pdd = (Pwd.T / pw[np.newaxis, :]) @ Pwd  # Shape: (num_docs, num_docs)
    
    # 9. Build a graph from Pdd.
    #    We first create a binary adjacency matrix (an edge exists if the similarity is > 0)
    A = csr_matrix(Pdd)
    adjacency = (A > 0).toarray().tolist()
    
    # Create an undirected graph from the adjacency matrix.
    G = ig.Graph.Adjacency(adjacency, mode=ig.ADJ_UNDIRECTED)
    
    # Now assign the actual edge weights (from Pdd).
    # (Note: The order in which igraph assigns edges corresponds to the order of nonzero entries.)
    nonzero_indices = A.nonzero()
    weights = A[nonzero_indices].A1  # flatten the array
    G.es['weight'] = weights.tolist()
    
    # 10. (Optional) Save the original document id with each node.
    G.vs['doc_id'] = documents.tolist()
    
    # 11. Extract nodes and edges as lists of dictionaries.
    nodes = [{'id': v.index, 'doc_id': v['doc_id']} for v in G.vs]
    edges = []
    for e in G.es:
        source, target = e.tuple
        edges.append({
            'source': source,
            'target': target,
            'weight': e['weight']
        })
    
    return nodes, edges, G


In [23]:
bm = pd.read_csv('BeforeMilton.csv')
comments_bm = bm['0'].tolist()

am = pd.read_csv('AfterMilton.csv')
comments_am = am['0'].tolist()

cc = pd.read_csv('CC_comments.csv')
comments_cc = cc['0'].tolist()

In [24]:
nodes, edges, graph = extract_ne(comments_bm, n_min=10, n_max=100, tform=True)

print("Nodes:")
for node in nodes:
    print(node)
    
print("\nEdges:")
for edge in edges:
    print(edge)

pd.DataFrame(nodes).to_csv('nodes_bm.csv', index=False)
pd.DataFrame(edges).to_csv('edges_bm.csv', index=False)

Removing words: ['!!' '!!!!' '"fuq' '&' '(flag)' '-' '1' '1.-' '100' '100+' '113' '2'
 '24hrs' '30mph' '4' '8' ':/' 'AZ' 'Age' 'All' 'Are' 'BULLETPROOF??'
 'Biden' 'Bill' 'Bro' 'CEO' 'Chicago' 'City' 'DO' 'Didn’t' 'EA' 'East'
 'Erie,' 'Erie?' 'Every' 'FEMA' 'Free' 'GA' 'GREECE' 'Global' 'Glory'
 'Gogh' 'Government' 'HAVE' 'Helene' 'Holy' "I'm" 'INCULS' 'If' 'It'
 'It’s' 'I’m' 'Jerr' 'KY,' 'Lake' 'Like' 'MENTIONED' 'Mandela' 'Meanwhile'
 'My' 'NC,' 'NC.' 'NG' 'NOT' 'Nelly' 'Never' 'New' 'No' 'Number' 'PR'
 'Pennsylvania???' 'Phoenix' 'Pijos' 'Public' 'RAAHHHHH' 'SC,' 'SO'
 'Saves' 'Sea' 'See' 'So' 'THAT' 'TN,' 'TO' 'Texas' 'The' 'There’s' 'They'
 'This' 'UN' 'US' 'US.' 'Ukraine' 'Used' 'VA' 'Van' 'Vatican' 'Well'
 'What' 'Where' 'Whoever' 'Why' 'You' 'about' 'absolutely' 'accept'
 'added,' 'admin' 'affecting' 'after' 'air' 'all' 'allowed' 'almost'
 'alphabet' 'already' 'amount' 'animal' 'annual' 'any' 'anything' 'apart'
 'are' 'areas' 'art' 'as' 'asking,' 'at' 'be' 'because' 'beck' 'bec

In [25]:
nodes, edges, graph = extract_ne(comments_am, n_min=10, n_max=100, tform=True)

print("Nodes:")
for node in nodes:
    print(node)
    
print("\nEdges:")
for edge in edges:
    print(edge)

pd.DataFrame(nodes).to_csv('nodes_am.csv', index=False)
pd.DataFrame(edges).to_csv('edges_am.csv', index=False)

Removing words: ['!!!' '!!!!!!!!' '!?' ... '🥺🥺' '🥺🥺🥺' '🥺🥺🥺🥺🥺🥺🥺🥺']
Nodes:
{'id': 0, 'doc_id': 1}
{'id': 1, 'doc_id': 2}
{'id': 2, 'doc_id': 3}
{'id': 3, 'doc_id': 4}
{'id': 4, 'doc_id': 5}
{'id': 5, 'doc_id': 6}
{'id': 6, 'doc_id': 8}
{'id': 7, 'doc_id': 9}
{'id': 8, 'doc_id': 10}
{'id': 9, 'doc_id': 11}
{'id': 10, 'doc_id': 12}
{'id': 11, 'doc_id': 13}
{'id': 12, 'doc_id': 14}
{'id': 13, 'doc_id': 15}
{'id': 14, 'doc_id': 16}
{'id': 15, 'doc_id': 17}
{'id': 16, 'doc_id': 18}
{'id': 17, 'doc_id': 19}
{'id': 18, 'doc_id': 20}
{'id': 19, 'doc_id': 21}
{'id': 20, 'doc_id': 22}
{'id': 21, 'doc_id': 23}
{'id': 22, 'doc_id': 24}
{'id': 23, 'doc_id': 25}
{'id': 24, 'doc_id': 26}
{'id': 25, 'doc_id': 27}
{'id': 26, 'doc_id': 28}
{'id': 27, 'doc_id': 29}
{'id': 28, 'doc_id': 30}
{'id': 29, 'doc_id': 31}
{'id': 30, 'doc_id': 32}
{'id': 31, 'doc_id': 33}
{'id': 32, 'doc_id': 34}
{'id': 33, 'doc_id': 35}
{'id': 34, 'doc_id': 36}
{'id': 35, 'doc_id': 37}
{'id': 36, 'doc_id': 38}
{'id': 37, 'doc_id':

In [27]:
nodes, edges, graph = extract_ne(str(comments_cc), n_min=10, n_max=100, tform=True)

print("Nodes:")
for node in nodes:
    print(node)
    
print("\nEdges:")
for edge in edges:
    print(edge)

pd.DataFrame(nodes).to_csv('nodes_cc.csv', index=False)
pd.DataFrame(edges).to_csv('edges_cc.csv', index=False)

Removing words: ['!' '#' '$' '&' "'" '(' '*' '+' ',' '/' '0' '6' '7' '8' ':' '=' '@' 'A'
 'B' 'C' 'D' 'E' 'G' 'H' 'I' 'K' 'L' 'M' 'N' 'O' 'P' 'R' 'S' 'T' 'U' '['
 ']' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o' 'p' 'q'
 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z' '~' '¯' '°' '²' '³' '¾' 'Á' 'È' 'É'
 'Ê' 'Ñ' 'à' 'á' 'ã' 'é' 'í' 'ñ' 'ó' 'ô' 'Ž' 'ء' 'ا' 'ب' 'ت' 'ح' 'د' 'ر'
 'س' 'ص' 'ع' 'ف' 'ق' 'ك' 'م' 'ن' 'ه' 'و' 'ى' 'ي' 'ہ' 'ی' '‘' '’' '⁉' '⁵'
 '☀' '☝' '☠' '☹' '✅' '✊' '✋' '✌' '✔' '✝' '⬇' 'ツ' '️' '🇦' '🇧' '🇨' '🇩' '🇪'
 '🇫' '🇮' '🇰' '🇱' '🇲' '🇵' '🇸' '🇺' '🌀' '🌄' '🌈' '🌎' '🌞' '🌠' '🌧' '🌱' '🍀' '🍉'
 '🍿' '🎄' '🎅' '🎉' '🎺' '🏃' '🏊' '🏏' '🏡' '🏼' '🏽' '🏾' '🏿' '👀' '👁' '👄' '👇' '👈'
 '👊' '👍' '👑' '👱' '👹' '💅' '💓' '💕' '💖' '💗' '💙' '💚' '💜' '💝' '💧' '💨' '💫' '💰'
 '📉' '🔄' '🔛' '🔝' '🕊' '🕴' '🗣' '🗽' '🗿' '😀' '😃' '😄' '😆' '😇' '😈' '😉' '😊' '😋'
 '😌' '😎' '😐' '😒' '😓' '😕' '😖' '😗' '😜' '😟' '😤' '😥' '😧' '😪' '😫' '😭' '😮' '😰'
 '😱' '😵' '😶' '😹' '🙀' '🙁' '🙂' '🙃' '🙅' '🙆' '🙈' '🙏' '🚨' '🤌' '🤍' '🤐' '🤔' '🤕'
 '🤗' '🤚' '🤜' '🤦' '🤨' '🤩' '🤪' '🤬' '🤯