In [6]:
import gzip

# Input: gzip-compressed edge list, read as text in the edge-filtering step
# (lines starting with "#" are skipped, the rest are parsed as two integers).
graph_file = "../../YouTube_with_communities/com-youtube.ungraph.txt.gz"
# Input: gzip-compressed community file, one whitespace-separated list of
# integer node ids per line.
community_file = "../../YouTube_with_communities/com-youtube.top5000.cmty.txt.gz"
# Output: filtered edge list, written as "u<TAB>v" per line.
output_file = "youtube_top5000_subgraph.txt"
# Output: retained communities, written as tab-separated node ids per line.
comm_output_file = "communities_youtube_filtered.txt"

In [7]:
# Keep only communities with at least MIN_COMMUNITY_SIZE members (smaller ones
# are dropped), then extract the subgraph induced by the union of their nodes.
MIN_COMMUNITY_SIZE = 5

print(f"Loading top5000 communities (filter size < {MIN_COMMUNITY_SIZE})...")
top_nodes = set()
filtered_communities = []

with gzip.open(community_file, "rt", encoding="utf-8") as f:
    for line in f:
        # str.split() with no argument already ignores leading/trailing
        # whitespace, so blank lines yield [] and are dropped by the size check.
        parts = line.split()
        if len(parts) < MIN_COMMUNITY_SIZE:
            continue
        comm_nodes = [int(node) for node in parts]
        filtered_communities.append(comm_nodes)
        top_nodes.update(comm_nodes)

# Kept as its own name in case later cells read it.
valid_comm_count = len(filtered_communities)

print(f"Collected {len(top_nodes)} unique nodes from {valid_comm_count} valid communities")

print("Filtering edges from original graph...")
edges = []
filtered_nodes = set()
with gzip.open(graph_file, "rt", encoding="utf-8") as f:
    for line in f:
        stripped = line.strip()
        # Skip blank lines and header/comment lines (also when indented);
        # previously a blank line crashed the two-value unpacking below.
        if not stripped or stripped.startswith("#"):
            continue
        # Any other line that is not exactly two integers still raises
        # ValueError on purpose: malformed data should not be silently ignored.
        u, v = map(int, stripped.split())
        # Keep an edge only when both endpoints belong to a retained community.
        if u in top_nodes and v in top_nodes:
            edges.append((u, v))
            filtered_nodes.update((u, v))

print(f"Filtered graph: {len(filtered_nodes)} nodes, {len(edges)} edges")

with open(output_file, "w", encoding="utf-8") as f:
    for u, v in edges:
        f.write(f"{u}\t{v}\n")
print(f"Subgraph saved to {output_file}")

# Communities are written exactly as collected above; they are not pruned
# against filtered_nodes, so a member without any retained edge stays listed.
with open(comm_output_file, "w", encoding="utf-8") as f:
    for comm in filtered_communities:
        f.write("\t".join(map(str, comm)) + "\n")
print(f"Filtered communities saved to {comm_output_file}")

Loading top5000 communities (filter size < 5)...
Collected 34861 unique nodes from 2259 valid communities
Filtering edges from original graph...
Filtered graph: 34861 nodes, 216626 edges
Subgraph saved to youtube_top5000_subgraph.txt
Filtered communities saved to communities_youtube_filtered.txt


In [8]:

# Build a flat node -> community-id lookup from the filtered community file
# and persist it as "node<TAB>community_id" lines.
input_comm_file = "communities_youtube_filtered.txt"
output_node2comm = "node2comm_youtube.txt"

node2comm = {}

# A community's id is the 0-based index of its line in the input file.
# Later pairs overwrite earlier ones, so a node listed on several lines ends
# up with the id of the last line it appears on.
with open(input_comm_file, "r", encoding="utf-8") as comm_in:
    node2comm.update(
        (int(member), line_no)
        for line_no, row in enumerate(comm_in)
        for member in row.strip().split()
    )

with open(output_node2comm, "w", encoding="utf-8") as mapping_out:
    mapping_out.writelines(
        f"{member}\t{line_no}\n" for member, line_no in node2comm.items()
    )

print(f"Converted {len(node2comm)} nodes into node-community mapping")
print(f"Saved to {output_node2comm}")


Converted 34861 nodes into node-community mapping
Saved to node2comm_youtube.txt
