In [1]:
import pandas as pd
import math
import numpy as np
from collections import Counter
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
import re
import string
import networkx as nx
import community as community_louvain  # Install python-louvain package
from Seb_Folder.Louvain import NewsData, calculate_modularity
from networkx.algorithms.community import louvain_communities, modularity

In [2]:
stock_data = "Stock_prices_2021_to_2024.csv"
reddit_data = "Reddit_2021_to_2024.csv"

# Load the stock data
stock_df = pd.read_csv(stock_data)
# Load the reddit data
reddit_df = pd.read_csv(reddit_data)


# Add the title and body columns together
reddit_df['body'] = reddit_df['title'] + ' ' + reddit_df['body']


reddit = NewsData(reddit_df)


# Step 1: Build LSH and find near-duplicates
reddit.build_lsh(threshold=0.95)  # High threshold to detect near-duplicates
duplicate_groups = reddit.find_duplicates()

# Step 2: Merge near-duplicates
reddit.merge_duplicates(duplicate_groups)


In [3]:
from tqdm import tqdm

# Create the graph
G = nx.Graph()
num_docs = reddit_df.shape[0]
total_comparisons = num_docs * (num_docs - 1) // 2

reddit.build_lsh(threshold=0.4, num_perm=4048)  # Lower threshold for general similarity

# Add edges to the graph based on similarity scores
for i, j in tqdm(reddit.compute_similarity_lsh(), total=total_comparisons, desc="Building Graph"):
    G.add_edge(i, j)

# Perform Louvain community detection
partition = louvain_communities(G, resolution=1)

# Print the community detection results
print("\nLouvain Community Detection Result:")
print(partition)

print("\nModularity:", modularity(G, partition))

Building Graph:   0%|          | 11005/38206911 [00:05<4:54:35, 2160.90it/s]



Louvain Community Detection Result:
[{1792, 4097, 2050, 4355, 4, 1285, 765, 3851, 12, 1813, 3606, 2077, 6176, 6692, 1316, 5417, 6185, 2089, 5932, 45, 4650, 2094, 4405, 5943, 3898, 59, 5693, 3904, 832, 66, 3405, 6222, 1614, 4176, 5197, 3162, 5979, 2155, 3186, 2675, 3443, 6517, 4215, 5501, 2942, 904, 5515, 3981, 3988, 6037, 3997, 5789, 4006, 3497, 6571, 4526, 4270, 4272, 693, 6327, 4279, 4284, 4030, 4288, 5313, 1220, 4297, 4810, 4301, 4304, 4049, 210, 2258, 4051, 5591, 4055, 2776, 6619, 5344, 6371, 4325, 6377, 4329, 3819, 3832, 1532, 4349}, {10, 479}, {2920, 11}, {897, 1026, 2823, 394, 3083, 780, 14, 5647, 16, 5263, 6546, 5267, 1559, 2078, 6689, 3509, 1720, 5818, 3650, 3657, 1098, 2123, 4692, 1877, 2264, 5594, 1627, 5210, 6365, 2397, 233, 4713, 1006, 1779, 3060, 5371, 3709, 3839}, {4865, 6403, 4869, 5135, 4625, 6162, 1042, 21, 6690, 1314, 4386, 5922, 2299, 5683, 3380, 2871, 314, 1596, 5949, 5951, 74, 1359, 5200, 1363, 1109, 856, 3071, 3422, 4961, 5217, 6244, 5486, 6265, 122, 3196, 3459,

In [4]:
print("\nNumber of Communities:", len(partition))


Number of Communities: 365


In [5]:
# Assign nodes to their corresponding communities
import netwulf as nw
import matplotlib.pyplot as plt
node_to_community = {}
for community_index, community in enumerate(partition):
    for node in community:
        node_to_community[node] = community_index

from random import randint, seed
seed(0)
colors = ['#%06X' % randint(0, 0xFFFFFF) for _ in range(len(partition))]

# Add the community data to each node as an attribute
for node in G.nodes():
    G.nodes[node]['group'] = node_to_community[node]

for n1, n2, data in G.edges(data=True): # set alpha to 0.1 for all edges
    data['weight'] = 0.1    

print("Number of nodes:", G.nodes(data=True))

nw.visualize(G, config={'zoom': 0.1, 'node_fill_color': [colors[data['group']] for _, data in G.nodes(data=True)], 'alpha':0.1, 'collisions': False})

Number of nodes: [(2, {'group': 79}), (5378, {'group': 79}), (325, {'group': 79}), (5389, {'group': 79}), (6643, {'group': 79}), (4216, {'group': 135}), (4, {'group': 0}), (1792, {'group': 0}), (6176, {'group': 0}), (1220, {'group': 0}), (5417, {'group': 0}), (5932, {'group': 0}), (12, {'group': 0}), (1917, {'group': 65}), (3997, {'group': 0}), (2258, {'group': 0}), (210, {'group': 0}), (3606, {'group': 0}), (6327, {'group': 0}), (5789, {'group': 0}), (2942, {'group': 0}), (5, {'group': 298}), (2308, {'group': 298}), (4197, {'group': 298}), (6154, {'group': 298}), (4877, {'group': 135}), (3887, {'group': 298}), (7, {'group': 293}), (929, {'group': 293}), (4547, {'group': 293}), (1683, {'group': 293}), (5662, {'group': 293}), (8, {'group': 79}), (473, {'group': 79}), (4786, {'group': 79}), (9, {'group': 190}), (2305, {'group': 190}), (2244, {'group': 190}), (581, {'group': 55}), (4031, {'group': 190}), (10, {'group': 1}), (479, {'group': 1}), (11, {'group': 2}), (2920, {'group': 2}), (1

The default value will be `edges="edges" in NetworkX 3.6.


  nx.node_link_data(G, edges="links") to preserve current behavior, or
  nx.node_link_data(G, edges="edges") for forward compatibility.


({'xlim': [0, 765],
  'ylim': [0, 765],
  'linkColor': '#7c7c7c',
  'linkAlpha': 0.5,
  'nodeStrokeColor': '#555555',
  'nodeStrokeWidth': 0.6392748787206075,
  'links': [{'source': 2,
    'target': 5378,
    'width': 1.278549757441215,
    'weight': 0.1},
   {'source': 2, 'target': 325, 'width': 1.278549757441215, 'weight': 0.1},
   {'source': 2, 'target': 5389, 'width': 1.278549757441215, 'weight': 0.1},
   {'source': 2, 'target': 6643, 'width': 1.278549757441215, 'weight': 0.1},
   {'source': 2, 'target': 4216, 'width': 1.278549757441215, 'weight': 0.1},
   {'source': 5378, 'target': 1755, 'width': 1.278549757441215, 'weight': 0.1},
   {'source': 5378, 'target': 4216, 'width': 1.278549757441215, 'weight': 0.1},
   {'source': 5378, 'target': 4642, 'width': 1.278549757441215, 'weight': 0.1},
   {'source': 5378, 'target': 5120, 'width': 1.278549757441215, 'weight': 0.1},
   {'source': 5378, 'target': 6311, 'width': 1.278549757441215, 'weight': 0.1},
   {'source': 5378, 'target': 5389, 

In [None]:
# Show 3347 and 3175 from the dataframe
print(reddit_df.iloc[209])
print(reddit_df.iloc[182])


stock                                                    Apple
timestamp                                  2023-06-05 20:44:00
title        I see investors are real happy about Apple pul...
body         I see investors are real happy about Apple pul...
source                                                  reddit
cleaned      i see investors are real happy about apple pul...
Name: 209, dtype: object
stock                                                    Apple
timestamp                                  2024-08-09 04:50:42
title        How many of you bought the dip and quit wendy’s? 
body         How many of you bought the dip and quit wendy’...
source                                                  reddit
cleaned      how many of you bought the dip and quit wendys...
Name: 182, dtype: object


In [None]:
print(reddit_df.columns)
print(stock_df.columns)

Index(['stock', 'timestamp', 'title', 'body', 'source', 'cleaned'], dtype='object')
Index(['Unnamed: 0', 'Date', 'AAPL', 'MSFT', 'NVDA', 'TSLA', 'AMZN', 'GOOGL',
       'META'],
      dtype='object')


<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>


### Stock Change
- Requires 'partition' variable
- Requires reddit_df
- Requires stock_df

In [55]:
def average_gradient(df, stock, date_time, t):
    """
    Calculate the average gradient for a given stock around a specified date and time.
    
    Parameters:
    - df (DataFrame): The dataframe containing Date, stock prices as columns.
    - stock (str): The stock symbol for which to calculate the gradient (e.g., 'AAPL').
    - date_time (str or datetime): The reference date and time as a string or datetime object.
    - t (int): Number of timesteps before and after the date_time to calculate the gradient.
    
    Returns:
    - float: The average gradient (price change per timestep).
    """
    # Ensure date_time is in datetime format
    date_time = pd.to_datetime(date_time)

    #print("Important type check:", type(date_time), type(df['Date'][0]))
    
    # Determine indices for `t` steps before and `t` steps after, even if `date_time` is not in the data
    before_indices = df.index[df['Date'] <= date_time].to_numpy()[-(t[0]+1):]  # Last `t` steps before or equal
    after_indices = df.index[df['Date'] > date_time].to_numpy()[:t[1]]      # First `t` steps after

    if len(before_indices) < t[0]+1 or len(after_indices) < t[1]:
        return None, None
    
    #print(after_indices, before_indices)
    after_indices = np.sort(np.append(after_indices, before_indices[-1]))
    #print(after_indices)

    before_prices = df.loc[before_indices, stock].to_numpy()
    avg_gradient_before = np.gradient(before_prices).mean()

    after_prices = df.loc[after_indices, stock]
    avg_gradient_after = np.gradient(after_prices).mean()

    return avg_gradient_before, avg_gradient_after

stock_dict = {
    "Apple": "AAPL",
    "Microsoft": "MSFT",
    "NVIDIA": "NVDA",
    "Tesla": "TSLA",
    "Amazon": "AMZN",
    "Alphabet": "GOOGL",
    "Meta": "META"
}

def get_stock_gradient_change_reddit(stock_df, reddit_df_series, t=(2, 2)):
    # Get the stock symbol and date from the reddit data
    if reddit_df_series['stock'] not in stock_dict:
        return None

    stock_input = reddit_df_series['stock']

    if stock_input in stock_dict.values():
        stock = stock_input  # It's already a stock symbol
    elif stock_input in stock_dict:
        stock = stock_dict[stock_input]  # Convert company name to stock symbol
    else:
        return None  # Not a valid stock symbol or company name


    date_time = reddit_df_series['timestamp']

    average_gradient_before, average_gradient_after = average_gradient(stock_df, stock, date_time, t)

    if average_gradient_before is None or average_gradient_after is None:
        return None

    return average_gradient_after - average_gradient_before

def populate_reddit_df_with_stock_gradient_change(stock_df, reddit_df):
    reddit_df['stock_gradient_change'] = reddit_df.apply(lambda row: get_stock_gradient_change_reddit(stock_df, row), axis=1)
    


In [56]:
'''
    Important: The following code requires stock_df with a 'Date' column in datetime format.
'''
stock_df['Date'] = pd.to_datetime(stock_df['Date']) # Ensure the date is in datetime format

'''
    Example
'''
print("Number of communities:", len(partition))

example_community = list(partition[0])
print("Size of the first community:", len(example_community))

example = reddit_df.iloc[example_community[0]]
print("Information for the first node in the first community:")
print(type(example))
print(example)
print("(end)")

# Calculate the average gradient change for the stock around the time of the first node
stock_change = get_stock_gradient_change_reddit(stock_df, example)

print("Stock change for the first node in the first community:", stock_change)


Number of communities: 365
Size of the first community: 87
Information for the first node in the first community:
<class 'pandas.core.series.Series'>
stock                                                            Microsoft
timestamp                                              2022-05-31 17:06:53
title                    The Nature of Predators 16 [First](https://www...
body                     The Nature of Predators 16 [First](https://www...
source                                                              reddit
cleaned                  the nature of predators 16 first prev next mem...
stock_gradient_change                                                 0.96
Name: 1792, dtype: object
(end)
Stock change for the first node in the first community: 0.960000000000008


In [60]:
populate_reddit_df_with_stock_gradient_change(stock_df, reddit_df)

example = reddit_df.iloc[example_community[0]]

print(example['stock_gradient_change'])

0.960000000000008


In [None]:
# Calculate the average stock gradient change for each community
community_sorted = sorted(partition, key=len, reverse=True)

community_stock_changes = []
for community in partition:
    community_stock_changes.append(reddit_df.iloc[list(community)]['stock_gradient_change'].mean())

[np.float64(-0.01307692307692717), np.float64(-2.197500000000005), np.float64(0.832499999999996), np.float64(-0.08322580645161422), np.float64(-0.6731249999999991), np.float64(1.986666666666655), np.float64(-0.5100000000000051), np.float64(-1.9650000000000034), np.float64(3.5700000000000003), np.float64(-3.322499999999998), np.float64(-1.3854545454545444), np.float64(0.5099999999999815), np.float64(-0.5966666666666735), np.float64(-0.47299999999999043), np.float64(0.05000000000000426), np.float64(0.5699999999999908), np.float64(-0.8224999999999945), np.float64(0.1872727272727298), np.float64(-4.57500000000001), np.float64(-0.09250000000001535), np.float64(-0.3275000000000077), np.float64(2.291666666666662), np.float64(-0.7824999999999989), np.float64(0.5294444444444429), np.float64(-1.1149999999999949), np.float64(-6.032499999999999), np.float64(-1.188333333333328), np.float64(2.7249999999999943), np.float64(-0.19999999999999574), np.float64(-1.757000000000005), np.float64(1.3200000000