# Detecting Communities in the Reddit Hyperlink Dataset
## For ECMM447 - Social Networks and Text Analysis



In [2]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import urllib.request as req
import os.path as path

from collections.abc import Iterable
from networkx.algorithms import community

In [3]:
# Download the two Reddit Hyperlink TSV files (body and title) into ./data/,
# skipping any file that is already present on disk.
import os  # `os` itself is not bound by the import cell (only `os.path` as `path`)

body_save = "./data/soc-redditHyperlinks-body.tsv"
title_save = "./data/soc-redditHyperlinks-title.tsv"

# urlretrieve does not create missing parent directories, so make sure the
# target directory exists before the first download.
os.makedirs(path.dirname(body_save), exist_ok=True)

for url, save_path in (
    ('https://snap.stanford.edu/data/soc-redditHyperlinks-body.tsv', body_save),
    ('https://snap.stanford.edu/data/soc-redditHyperlinks-title.tsv', title_save),
):
    file_name = path.basename(save_path)
    print(f"Checking for '{file_name}'...")
    if path.exists(save_path):
        print(f"Found at Path: '{save_path}'!")
        continue
    print("File not Found.")
    print(f"Downloading '{file_name}' from URL: '{url}'...")
    # Download under a temporary name and rename on success, so an interrupted
    # transfer is never mistaken for a complete file on the next run.
    partial_path = save_path + ".part"
    req.urlretrieve(url, partial_path)
    os.replace(partial_path, save_path)
    print("Done!")

# Aliases used by the conversion step that follows.
title_path = title_save
body_path = body_save

# Load the TSV files and convert to CSV files
def _tsv_to_csv(tsv_path):
    """Load a TSV file and write it as a CSV file next to the original.

    Parameters:
        tsv_path: path of the tab-separated input file.

    Returns:
        A ``(DataFrame, csv_path)`` tuple: the loaded table and the path of
        the CSV file that was written.
    """
    frame = pd.read_table(tsv_path, sep="\t")
    # splitext removes the extension whatever its length, unlike a fixed
    # [:-4] slice that silently assumes a three-letter suffix.
    csv_path = path.splitext(tsv_path)[0] + ".csv"
    print(f"Converting TSV to CSV file at: '{csv_path}'...")
    # index=False keeps the synthetic row index out of the file; otherwise it
    # is written as an extra unnamed column that pd.read_csv loads back as data.
    frame.to_csv(csv_path, index=False)
    print("Converted!")
    return frame, csv_path


title_df, title_csv = _tsv_to_csv(title_path)
body_df, body_csv = _tsv_to_csv(body_path)

# Re-read both CSV exports, stack them row-wise and keep only the two
# endpoint columns that define an edge (source subreddit -> target subreddit)
title_df = pd.read_csv(title_csv)
body_df = pd.read_csv(body_csv)

print("Creating edgelist DataFrame...")
edge_columns = ["SOURCE_SUBREDDIT", "TARGET_SUBREDDIT"]
# ignore_index=True renumbers the stacked rows 0..n-1 in one step
reddit_df = pd.concat([title_df, body_df], ignore_index=True)[edge_columns]
print("Done!")

# Persist the edge list on its own, without the row index column
reddit_path = "./data/redditHyperlinks-subredditsOnly.csv"
print(f"Saving edge details to CSV file at: '{reddit_path}'...")
reddit_df.to_csv(reddit_path, index=False)
print("Saved!")

Checking for 'soc-redditHyperlinks-body.tsv'...
Found at Path: './data/soc-redditHyperlinks-body.tsv'!
Checking for 'soc-redditHyperlinks-title.tsv'...
Found at Path: './data/soc-redditHyperlinks-title.tsv'!
Converting TSV to CSV file at: './data/soc-redditHyperlinks-title.csv'...
Converted!
Converting TSV to CSV file at: './data/soc-redditHyperlinks-body.csv'...
Converted!
Creating edgelist DataFrame...
Done!
Saving edge details to CSV file at: './data/redditHyperlinks-subredditsOnly.csv'...
Saved!
