In [1]:
import pandas as pd
import math
import numpy as np
from collections import Counter
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
import re
import string
import networkx as nx
import community as community_louvain  # Install python-louvain package
from Seb_Folder.Louvain import NewsData, calculate_modularity
from networkx.algorithms.community import louvain_communities, modularity

In [2]:
stock_data = "Stock_prices_2021_to_2024.csv"
reddit_data = "Reddit_2021_to_2024.csv"

# Load the stock data
stock_df = pd.read_csv(stock_data)
# Load the reddit data
reddit_df = pd.read_csv(reddit_data)


# Add the title and body columns together
reddit_df['body'] = reddit_df['title'] + ' ' + reddit_df['body']


reddit = NewsData(reddit_df)


# Step 1: Build LSH and find near-duplicates
reddit.build_lsh(threshold=0.95)  # High threshold to detect near-duplicates
duplicate_groups = reddit.find_duplicates()

# Step 2: Merge near-duplicates
reddit.merge_duplicates(duplicate_groups)


In [3]:
from tqdm import tqdm

# Create the graph
G = nx.Graph()
num_docs = reddit_df.shape[0]
total_comparisons = num_docs * (num_docs - 1) // 2

reddit.build_lsh(threshold=0.4, num_perm=4048)  # Lower threshold for general similarity

# Add edges to the graph based on similarity scores
for i, j in tqdm(reddit.compute_similarity_lsh(), total=total_comparisons, desc="Building Graph"):
    G.add_edge(i, j)

# Perform Louvain community detection
partition = louvain_communities(G, resolution=1)

# Print the community detection results
print("\nLouvain Community Detection Result:")
print(partition)

print("\nModularity:", modularity(G, partition))

Building Graph:   0%|          | 11005/38206911 [00:09<9:36:51, 1103.56it/s]



Louvain Community Detection Result:
[{10, 479}, {2920, 11}, {4865, 6403, 4869, 5135, 4625, 6162, 1042, 21, 1314, 6690, 5922, 4386, 5683, 3380, 2871, 314, 1596, 5949, 5951, 74, 1359, 5200, 1363, 1109, 856, 3422, 4961, 5217, 6244, 5486, 6265, 122, 4095, 3196, 3459, 6532, 5254, 2185, 2958, 4497, 6034, 1427, 2964, 4757, 5784, 3995, 1950, 5538, 5292, 4014, 6651, 5816, 1723, 5307, 1984, 1218, 1735, 2248, 5321, 3786, 5833, 4042, 2001, 6355, 477, 5088, 1505, 6639, 4336, 3313, 498, 755, 1781, 2299, 2045, 254, 3071}, {2976, 28, 205}, {3133, 30}, {6056, 3451, 35}, {44, 5734}, {1362, 47}, {4362, 268, 4494, 5007, 5781, 3480, 3737, 3870, 1440, 548, 5287, 172, 4143, 53, 6461, 6462, 3775, 5184, 1216, 1474, 5823, 4553, 3916, 4047, 977, 5714, 2391, 6233, 6363, 4443, 5086, 5856, 609, 6369, 4449, 6634, 1648, 1776, 3568, 760, 2554}, {2195, 60, 573}, {2528, 2534, 4022, 3592, 76, 2525, 2527}, {5828, 78}, {2338, 83, 3080, 3640, 588, 1279}, {92, 2596}, {95, 2935}, {4608, 1281, 1284, 6406, 262, 4102, 1287, 129

In [4]:
print("\nNumber of Communities:", len(partition))


Number of Communities: 367


In [None]:
# Assign nodes to their corresponding communities
import netwulf as nw
import matplotlib.pyplot as plt
node_to_community = {}
for community_index, community in enumerate(partition):
    for node in community:
        node_to_community[node] = community_index

from random import randint, seed
seed(0)
colors = ['#%06X' % randint(0, 0xFFFFFF) for _ in range(len(partition))]

# Add the community data to each node as an attribute
for node in G.nodes():
    G.nodes[node]['group'] = node_to_community[node]

for n1, n2, data in G.edges(data=True): # set alpha to 0.1 for all edges
    data['weight'] = 0.1    

print("Number of nodes:", G.nodes(data=True))

nw.visualize(G, config={'zoom': 0.1, 'node_fill_color': [colors[data['group']] for _, data in G.nodes(data=True)], 'alpha':0.1, 'collisions': False})

Number of nodes: [(2, {'group': 68}), (5378, {'group': 68}), (325, {'group': 68}), (5389, {'group': 68}), (6643, {'group': 68}), (4216, {'group': 365}), (4, {'group': 156}), (1792, {'group': 156}), (6176, {'group': 156}), (1220, {'group': 156}), (5417, {'group': 156}), (5932, {'group': 156}), (12, {'group': 156}), (1917, {'group': 58}), (3997, {'group': 156}), (2258, {'group': 156}), (210, {'group': 156}), (3606, {'group': 156}), (6327, {'group': 156}), (5789, {'group': 156}), (2942, {'group': 156}), (5, {'group': 40}), (2308, {'group': 40}), (4197, {'group': 40}), (6154, {'group': 40}), (4877, {'group': 365}), (3887, {'group': 40}), (7, {'group': 88}), (929, {'group': 88}), (4547, {'group': 88}), (1683, {'group': 88}), (5662, {'group': 88}), (8, {'group': 68}), (473, {'group': 68}), (4786, {'group': 68}), (9, {'group': 156}), (2305, {'group': 156}), (2244, {'group': 156}), (581, {'group': 184}), (4031, {'group': 156}), (10, {'group': 0}), (479, {'group': 0}), (11, {'group': 1}), (2920

In [None]:

# Show 3347 and 3175 from the dataframe
print(reddit_df.iloc[209])
print(reddit_df.iloc[182])


stock                                                    Apple
timestamp                                  2023-06-05 20:44:00
title        I see investors are real happy about Apple pul...
body         I see investors are real happy about Apple pul...
source                                                  reddit
cleaned      i see investors are real happy about apple pul...
Name: 209, dtype: object
stock                                                    Apple
timestamp                                  2024-08-09 04:50:42
title        How many of you bought the dip and quit wendy’s? 
body         How many of you bought the dip and quit wendy’...
source                                                  reddit
cleaned      how many of you bought the dip and quit wendys...
Name: 182, dtype: object
