In [35]:
import gzip
import json
import os
import time
import networkx as nx
import math

# Folder containing the gzip-compressed JSON files; passed to analyze_file,
# which appends "<file_name>.json.gz" to it.
data_folder =  'May 23 Data/data/'

In [36]:
# NOTE: the docstrings below were AI-generated and have not been carefully reviewed.

def load_data(file_path, chatter_thresholds):
    """
    Loads user-connection data from a gzip-compressed JSON file.

    Args:
        file_path (str): Path to the ``.json.gz`` file.
        chatter_thresholds: The chatter threshold(s) about to be analysed.
            Only echoed in the progress message; it has no effect on what
            is loaded.

    Returns:
        The object decoded by ``json.load`` (callers in this notebook
        iterate it as a dict).

    Raises:
        OSError: If the file cannot be opened or is not a gzip file.
        EOFError: If the gzip stream is truncated.
        json.JSONDecodeError: If the decompressed text is not valid JSON.
    """
    st = time.time()
    print(f"opening {file_path} with {chatter_thresholds}")
    # JSON is UTF-8 by specification. Without an explicit encoding, text-mode
    # gzip uses the platform locale and can mis-decode non-ASCII user names.
    with gzip.open(file_path, 'rt', encoding='utf-8') as gz_file:
        data = json.load(gz_file)
    et = time.time()
    print(f"data loaded in {(et-st):.2f}s", end="")
    return data

def add_nodes_to_graph(G, data):
    """
    Adds every user in ``data`` as a node and every listed connection as an edge.

    Args:
        G (networkx.Graph): The graph to populate; modified in place.
        data (dict): Maps each user to an iterable of connected users.

    Returns:
        networkx.Graph: The same ``G`` object, after the additions.
    """
    st = time.time()
    print("\n00% - adding nodes to graph", end="")
    datalen = len(data)
    # Refresh the indicator about once per percent. max(1, ...) keeps the
    # modulus non-zero when there are fewer than 100 users (previously a
    # ZeroDivisionError).
    step = max(1, datalen // 100)
    for count, (user, connections) in enumerate(data.items(), start=1):
        # Added explicitly so users with no connections still become nodes.
        G.add_node(user)
        for connection in connections:
            G.add_edge(user, connection)
        if count % step == 0:
            # Capped at 99 so the value always fits the two-character "00" slot.
            percent = min(99, count * 100 // datalen)
            print(f"\r{percent:02d}", end="")
    et = time.time()
    print(f"\rGraph has added {G.number_of_edges()} edges between {G.number_of_nodes()} nodes in {(et-st):.2f}s\nStarting work... ", end="")

    return G

def calculate_thresholds(G, min_degree_threshold):
    """
    Selects the nodes whose degree is strictly greater than a threshold.

    Args:
        G (networkx.Graph): The graph whose node degrees are inspected.
        min_degree_threshold (int): Nodes must have a degree strictly greater
            than this value to be selected.

    Returns:
        list: The selected nodes, in the order ``G.degree()`` yields them.
    """
    st = time.time()
    print("\n\t00% - Calculating degrees", end="")
    threshold_streamers = []
    numnodes = G.number_of_nodes()
    # Refresh the indicator about every 10%. max(1, ...) keeps the modulus
    # non-zero for graphs with fewer than 10 nodes (previously a
    # ZeroDivisionError).
    step = max(1, numnodes // 10)
    for count, (node, degree) in enumerate(G.degree(), start=1):
        if degree > min_degree_threshold:
            threshold_streamers.append(node)
        if count % step == 0:
            # Show a real percentage, capped at 99 so it fits the two-character slot.
            percent = min(99, count * 100 // numnodes)
            print(f"\r\t{percent:02d}", end="")
    et = time.time()
    print(f"\r\tCalculated degrees in {(et-st):.2f}s{' ' * 10}", end="")
    return threshold_streamers

def calculate_overlap(threshold_streamers, G):
    """
    Computes the shared neighbours for every unordered pair of streamers.

    Args:
        threshold_streamers (list): Streamers (nodes of ``G``) to compare.
        G (networkx.Graph): The graph holding each streamer's neighbours.

    Returns:
        dict: Maps ``(node_i, node_j)``, with ``node_i`` earlier in
        ``threshold_streamers`` than ``node_j``, to the set of neighbours
        the two nodes have in common.
    """
    st = time.time()
    common_viewers_dict = {}
    print()
    total = len(threshold_streamers)
    # Build each neighbour set once. Rebuilding node_j's set inside the pair
    # loop repeated the same work once per pair.
    neighbor_sets = [set(G.neighbors(node)) for node in threshold_streamers]
    for i in range(total - 1):
        node_i = threshold_streamers[i]
        print(f"\r\t00% - Calculating overlap for {node_i}, streamer {i} out of {total}{' ' * 100}", end="")

        iset = neighbor_sets[i]
        remaining = total - (i + 1)
        # +1 keeps the modulus non-zero when fewer than 100 streamers remain.
        step = remaining // 100 + 1

        for offset in range(remaining):
            if offset % step == 0:
                # Sub-progress for this streamer; capped to fit the "00" slot.
                percent = min(99, offset * 100 // remaining)
                print(f"\r\t{percent:02d}", end="")
            j = i + 1 + offset
            common_viewers_dict[(node_i, threshold_streamers[j])] = iset & neighbor_sets[j]
    et = time.time()
    print(f"\r\tCalculated overlap in {(et-st):.2f}s{' ' * 100}", end="")

    return common_viewers_dict

def construct_weighted(weighted_graph, G, common_viewers_dict):
    """
    Builds a weighted graph from per-pair sets of common viewers.

    For every ``(node1, node2)`` key an edge is added whose ``weight`` is the
    size of the associated viewer collection, and both endpoints receive a
    ``viewer_count`` attribute equal to their number of distinct neighbours
    in ``G``.

    Args:
        weighted_graph (networkx.Graph): Graph to add edges to; modified in place.
        G (networkx.Graph): The original graph, used for neighbour counts.
        common_viewers_dict (dict): Maps ``(node1, node2)`` to the collection
            of viewers the two nodes share.

    Returns:
        networkx.Graph: The same ``weighted_graph`` object.
    """
    st = time.time()
    print()
    print("\r\t00% - Generating weighted graph", end="")
    total_pairs = len(common_viewers_dict)
    # max(1, ...) keeps the modulus non-zero when there are fewer than 100
    # pairs (previously a ZeroDivisionError).
    step = max(1, total_pairs // 100)
    # A node appears in many pairs; count its neighbours only once.
    viewer_counts = {}
    for count, ((node1, node2), viewers) in enumerate(common_viewers_dict.items(), start=1):
        if count % step == 0:
            percent = min(99, count * 100 // total_pairs)
            print(f"\r\t{percent:02d}", end="")
        weighted_graph.add_edge(node1, node2, weight=len(viewers))
        for node in (node1, node2):
            if node not in viewer_counts:
                viewer_counts[node] = len(set(G.neighbors(node)))
            weighted_graph.nodes[node]['viewer_count'] = viewer_counts[node]
    et = time.time()
    print(f"\r\tGenerated weighted graph in {(et-st):.2f}s{' ' * 100}", end="")

    return weighted_graph

def calc_overlap_and_weighted(threshold_streamers, G, weighted_graph, edge_weight_threshold):
    """
    Computes pairwise viewer overlap and adds the qualifying weighted edges in one pass.

    For every unordered pair of streamers the number of shared neighbours in
    ``G`` is counted. An edge with that count as ``weight`` is added when the
    count is strictly greater than ``edge_weight_threshold`` times the smaller
    of the two neighbour counts. Both endpoints of an added edge receive a
    ``viewer_count`` attribute (their neighbour count in ``G``); streamers
    that end up with no edge are not added to ``weighted_graph``.

    Args:
        threshold_streamers (list): Streamers (nodes of ``G``) to compare.
        G (networkx.Graph): The graph holding each streamer's neighbours.
        weighted_graph (networkx.Graph): Graph to add edges to; modified in place.
        edge_weight_threshold (float): Fraction in the open interval (0, 1).

    Returns:
        networkx.Graph or int:
            - ``-1`` (after printing a message) if ``edge_weight_threshold``
              is not strictly between 0 and 1.
            - Otherwise the same ``weighted_graph`` object.

    Note:
        Original author's note: doing both steps together was almost twice as
        fast as running the overlap and weighting steps separately.
    """
    if (edge_weight_threshold >= 1) or (edge_weight_threshold <= 0):
        print(f"edge weight threshold of {edge_weight_threshold} not allowed. It is a fraction, please keep within 0 to 1")
        return -1
    st = time.time()
    print()
    total = len(threshold_streamers)
    # Build each neighbour set once. Rebuilding node_j's set inside the pair
    # loop repeated the same work once per pair.
    neighbor_sets = [set(G.neighbors(node)) for node in threshold_streamers]
    for i in range(total - 1):
        node_i = threshold_streamers[i]
        print(f"\r\t00% - Calculating overlap/weights for {node_i}, streamer {i} out of {total}{' ' * 100}", end="")

        iset = neighbor_sets[i]
        remaining = total - (i + 1)
        # +1 keeps the modulus non-zero when fewer than 100 streamers remain.
        step = remaining // 100 + 1

        for offset in range(remaining):
            if offset % step == 0:
                # Sub-progress for this streamer; capped to fit the "00" slot.
                percent = min(99, offset * 100 // remaining)
                print(f"\r\t{percent:02d}", end="")
            j = i + 1 + offset
            jset = neighbor_sets[j]
            common_neighbors = len(iset & jset)
            if common_neighbors > min(len(iset), len(jset)) * edge_weight_threshold:
                node_j = threshold_streamers[j]
                weighted_graph.add_edge(node_i, node_j, weight=common_neighbors)
                weighted_graph.nodes[node_i]['viewer_count'] = len(iset)
                weighted_graph.nodes[node_j]['viewer_count'] = len(jset)
    et = time.time()
    print(f"\r\tCalculated overlap and weight in {(et-st):.2f}s{' ' * 100}", end="")

    return weighted_graph

In [37]:
def analyze_file(data_folder, file_name, chatter_thresholds, edge_weight_threshold):
    """
    Loads one data file and writes a weighted overlap graph per chatter threshold.

    The file ``<data_folder>/<file_name>.json.gz`` is loaded once and turned
    into a graph. For each threshold, the nodes above that degree are
    selected, their pairwise overlap graph is built, and the result is saved
    as ``weighted_graphs/<file_name>_<threshold>_<edge_weight_threshold>.gml``.

    Args:
        data_folder (str): Folder containing the ``.json.gz`` data files.
        file_name (str): File name without the ``.json.gz`` extension.
        chatter_thresholds (list): Minimum-degree thresholds to analyse.
        edge_weight_threshold (float): Fraction passed to
            ``calc_overlap_and_weighted``; must lie strictly between 0 and 1.

    Returns:
        int: 0 if every graph was written, -1 if ``calc_overlap_and_weighted``
        rejected ``edge_weight_threshold``.
    """
    # os.path.join avoids the doubled separator ("data//...") that plain
    # string formatting produced when data_folder ends with a slash.
    file_path = os.path.join(data_folder, f"{file_name}.json.gz")
    data = load_data(file_path, chatter_thresholds)
    G = nx.Graph()
    G = add_nodes_to_graph(G, data)
    # The raw mapping is no longer needed once the graph has been built.
    del data

    # write_gml does not create missing directories.
    output_dir = "weighted_graphs"
    os.makedirs(output_dir, exist_ok=True)

    for chatter_length in chatter_thresholds:
        st = time.time()
        print(f"\nmaking graph for {chatter_length}", end="")
        min_degree_threshold = chatter_length

        threshold_streamers = calculate_thresholds(G, min_degree_threshold)

        weighted_graph = calc_overlap_and_weighted(threshold_streamers, G, nx.Graph(), edge_weight_threshold)
        if not isinstance(weighted_graph, nx.Graph):
            # calc_overlap_and_weighted has already printed why it refused;
            # propagate the failure instead of crashing on an int below.
            return -1

        print(f"\n\tgraph for streamer threshold {chatter_length} has {weighted_graph.number_of_edges()} edges between {weighted_graph.number_of_nodes()} nodes")
        save_path = os.path.join(output_dir, f"{file_name}_{chatter_length}_{edge_weight_threshold}.gml")
        nx.write_gml(weighted_graph, save_path)

        et = time.time()
        print(f"\nTime taken for {chatter_length} is {(et-st):.2f}s")

    return 0

In [38]:
# TODO: is a separate weighted-graph function still needed, or can the overlap
# calculation build the weighted graph directly? (calc_overlap_and_weighted already combines both.) Revisit later.
edge_weight_threshold = 0.05
chatter_thresholds = [10000, 5000, 1000]

# analyze_file already iterates over every entry in chatter_thresholds, so a
# single call is enough. Looping over the thresholds here reloaded the data
# and rebuilt every graph once per list entry.
retval = analyze_file(data_folder, "2023-03-01", chatter_thresholds, edge_weight_threshold)
print(f"returned with value {retval}")

opening May 23 Data/data//2023-03-01.json.gz with [10000, 5000, 1000]
data loaded in 27.40s
Graph has added 12468275 edges between 7695662 nodes in 27.75s
Starting work... 
making graph for 10000
	Calculated degrees in 4.04s          
	Calculated overlap and weight in 147.71s                                                                                                                                                
	graph for streamer threshold 10000 has 945 edges between 257 nodes

Time taken for 10000 is 151.75s

making graph for 5000
	Calculated degrees in 3.93s          
	Calculated overlap and weight in 394.88s                                                                                                                                                
	graph for streamer threshold 5000 has 2311 edges between 541 nodes

Time taken for 5000 is 398.82s

making graph for 1000
	Calculated degrees in 4.16s          
	Calculated overlap and weight in 1038.04s                          

KeyboardInterrupt: 

In [None]:
# def is_corrupted(filename):
#     corrupted_dates = ["2023-03-03", "2023-03-11", "2023-03-14", "2023-03-29"]
    
#     # Extract the date part from the filename
#     date_part = filename.split('.')[0]
    
#     # Check if the date is in the list of corrupted dates
#     return date_part in corrupted_dates


In [None]:
# for filename in os.listdir(data_folder):
#     # print(filename)
#     # Skip the file if it is one of the corrupted dates; file names have the form "2023-03-<day of the month>.json.gz".
#     if is_corrupted(filename):
#         print(f"Skipping {filename} because it's a corrupted date.")
#         continue  # Skip this file
#     file_path = os.path.join(data_folder, filename)
#     if filename.endswith('01.json.gz'):
#         analyze_file(file_path)
        
# # Preliminary check: the files for the 3rd, 11th, 14th, and 29th are corrupted (EOFError).
# # TODO: check whether it's a download issue or an upload issue.