## __Reddit Platform: Same-sex marriage__

### __General__

#### Libraries

In [3]:
import os, sys, time, json
from datetime import datetime 
from dotenv import load_dotenv
import praw
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

sys.path.append(os.path.abspath('..'))
from utils.helpers import unique_posts_videos 

#### Functions

In [None]:
## Reddit functions
def search_greek_reddit_posts(reddit_object, keywords, limit=10, max_requests_per_min=90):
    """
    Search r/greece for Greek-language posts matching a list of keywords.

    Every keyword is searched once per sort order ("relevance", "top", "new").
    A submission is kept only when the keyword occurs in its title
    (case-insensitive) and langdetect classifies the title as Greek ("el").
    The same submission is returned more than once when several keyword/sort
    combinations match it; no de-duplication is done here.

    Args:
        reddit_object: praw.Reddit-like client exposing ``subreddit(name).search(...)``.
        keywords: Iterable of search phrases.
        limit: Maximum number of results requested per search call.
        max_requests_per_min: Number of search attempts allowed before the
            function pauses until 60 seconds have passed since the window
            started (the default stays below Reddit's 100 requests/minute).

    Returns:
        list[dict]: One dict per match with the keys "id", "title", "content",
        "subreddit", "created_utc", "like_count", "num_comments", "url",
        "author" and "username".

    Search failures and per-submission errors are printed and skipped; they
    never propagate to the caller.
    """
    thread_data = []
    request_counter = 0
    start_time = time.time()

    def check_rate_limit():
        # Pause once the budget for the current window is used up, then start a new window.
        nonlocal request_counter, start_time
        if request_counter < max_requests_per_min:
            return
        sleep_time = max(0, 60 - (time.time() - start_time))
        if sleep_time > 0:
            print(f"Rate limit hit. Sleeping for {sleep_time:.1f} seconds...")
            time.sleep(sleep_time)
        start_time = time.time()
        request_counter = 0

    sorting_options = ["relevance", "top", "new"]

    for keyword in keywords:
        for sorting_option in sorting_options:
            check_rate_limit()
            print(f"Searching for keyword: {keyword}")
            # Count the attempt up front (one per keyword/sort pair): a search
            # that ends in an error has still spent part of the request budget.
            request_counter += 1
            try:
                submissions = list(
                    reddit_object.subreddit("greece").search(keyword, sort=sorting_option, limit=limit)
                )
            except Exception as e:
                print(f"Search failed for '{keyword}': {e}")
                continue

            for submission in submissions:
                try:
                    title = submission.title
                    body = submission.selftext or ""
                    author_obj = submission.author
                    # A missing author object is reported as "[deleted]"
                    username = author_obj.name if author_obj else "[deleted]"

                    # Guard clauses: keyword must be in the title, and the title must be Greek
                    if keyword.lower() not in title.lower():
                        continue
                    if detect(title) != "el":
                        continue

                    print(f"Found Greek post: {title[:50]}...")
                    thread_data.append({
                        "id": submission.id,
                        "title": title,
                        "content": body,
                        "subreddit": str(submission.subreddit),
                        "created_utc": submission.created_utc,
                        "like_count": submission.ups,
                        "num_comments": submission.num_comments,
                        "url": submission.url,
                        "author": str(author_obj),
                        "username": username,
                    })

                except LangDetectException:
                    # langdetect could not classify the title; skip the post silently
                    continue
                except Exception as e:
                    print(f"Error processing submission: {e}")
                    continue

    return thread_data

def fetch_comments_forest(reddit_object, post_ids, max_requests_per_min=90):
    """
    Fetch the comment tree of every post in ``post_ids`` and flatten it.

    Each comment receives a hierarchical id built from its 1-based position
    among its siblings: top-level comments are "1", "2", ...; the first reply
    to comment "2" is "2.1", and so on. Comments are listed depth-first.

    Args:
        reddit_object: praw.Reddit-like client exposing ``submission(id=...)``.
        post_ids: Iterable of submission ids.
        max_requests_per_min: Number of fetch attempts allowed before the
            function pauses until 60 seconds have passed since the window
            started.

    Returns:
        list[dict]: One ``{"post_id": ..., "comments": [...]}`` entry per post
        that could be fetched. Every comment dict has the keys "hier_id",
        "reddit_id", "author" (None when the author object is missing),
        "published_at" (ISO-8601 string in UTC, with a "+00:00" offset),
        "body", "like_count", "parent_id" and "depth".

    Posts whose fetch raises are printed and skipped.
    """
    # Function-scope import: the file itself only imports `datetime` from this module.
    from datetime import timezone

    all_comments = []
    request_counter = 0
    start_time = time.time()

    def check_rate_limit():
        # Pause once the budget for the current window is used up, then start a new window.
        nonlocal request_counter, start_time
        if request_counter < max_requests_per_min:
            return
        sleep_time = max(0, 60 - (time.time() - start_time))
        if sleep_time > 0:
            print(f"[COMMENTS] Rate limit hit. Sleeping for {sleep_time:.1f} seconds...")
            time.sleep(sleep_time)
        request_counter = 0
        start_time = time.time()

    def index_comment_tree(comment_forest, prefix=""):
        # Depth-first walk that emits each comment followed by its replies.
        indexed = []
        for i, comment in enumerate(comment_forest, start=1):
            # Entries without a body (not real comments) are skipped
            if not hasattr(comment, "body"):
                continue

            hier_id = f"{prefix}.{i}" if prefix else str(i)

            indexed.append({
                "hier_id": hier_id,
                "reddit_id": comment.id,
                "author": comment.author.name if comment.author else None,
                # created_utc is a UTC epoch; convert with an explicit UTC zone so the
                # stored string does not depend on the timezone of the machine.
                "published_at": datetime.fromtimestamp(comment.created_utc, tz=timezone.utc).isoformat(),
                "body": comment.body,
                "like_count": comment.ups,
                "parent_id": comment.parent_id,
                "depth": comment.depth,
            })

            if comment.replies:
                indexed.extend(index_comment_tree(comment.replies, prefix=hier_id))
        return indexed

    for post_id in post_ids:
        check_rate_limit()
        # Count the attempt before it can fail: an errored fetch still spent quota.
        request_counter += 1
        try:
            submission = reddit_object.submission(id=post_id)

            # limit=0 drops the "load more comments" placeholders instead of expanding them
            submission.comments.replace_more(limit=0)
            all_comments.append({
                "post_id": post_id,
                "comments": index_comment_tree(submission.comments),
            })

        except Exception as e:
            print(f"Error fetching tree for post {post_id}: {e}")
            continue

    return all_comments

def login_reddit():
    """
    Build a praw.Reddit client from environment variables and return it.

    Reads REDDIT_CLIENT_ID, REDDIT_SECRET, USR_AGENT, REDDIT_USR and
    REDDIT_PWD via os.getenv, then prints the account returned by
    ``reddit.user.me()`` as a login confirmation.
    """
    # praw.Reddit keyword -> name of the environment variable that feeds it
    env_by_option = {
        "client_id": "REDDIT_CLIENT_ID",
        "client_secret": "REDDIT_SECRET",
        "user_agent": "USR_AGENT",
        "username": "REDDIT_USR",
        "password": "REDDIT_PWD",
    }
    credentials = {option: os.getenv(var) for option, var in env_by_option.items()}

    session = praw.Reddit(**credentials)
    print("Logged in as:", session.user.me())
    return session

#### Initialization

In [None]:
# Load the variables from .env first: login_reddit() reads its credentials with os.getenv
load_dotenv() # load .env project file
# Client used by the search and comment-fetching cells below
reddit_object = login_reddit() 

### __Search posts__

#### Posts/Threads

In [4]:
# Greek search phrases passed to search_greek_reddit_posts (one search per phrase and sort order)
greek_keywords = [
    "ομόφυλα ζευγάρια",
    "ομόφυλα τεκνοθεσία",
    "ισότητα στο πολιτικό γάμο",
    "γάμος ομόφυλων",
    "γάμος ομόφυλων ζευγαριών",
]

In [None]:
# Query r/greece for every keyword, asking for up to 100 results per search
posts = search_greek_reddit_posts(reddit_object, greek_keywords, limit=100)

In [6]:
# Separate the raw results by post "id".
# NOTE(review): unique_posts_videos lives in utils.helpers and is not visible here;
# presumably it returns (unique items, duplicate items) — confirm.
clean_data, duplicates = unique_posts_videos(posts, id_key="id")

In [None]:
# Sizes of the raw search results and of the two collections returned by unique_posts_videos
len(posts), len(clean_data), len(duplicates)

#### Save

Save the final data

In [8]:
# Output file: <two levels above the working directory>/outputs/api_queried/reddit_api/.
# os.path.join picks the right separator for the current OS; hard-coded "\\"
# separators only produce a valid path on Windows.
save_path = os.path.join(
    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath("reddit_api.ipynb")))),
    "outputs",
    "api_queried",
    "reddit_api",
    "reddit_scraped_post.json",
)

# NOTE(review): this writes the raw `posts` list, not `clean_data`, so posts matched
# by several keyword/sort combinations are stored more than once — confirm this is intended.
with open(save_path, "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps the Greek text readable in the JSON file
    json.dump(posts, f, ensure_ascii=False, indent=4)

Clear the Reddit login object and reset the credentials in the environment

In [9]:
### delete reddit login object
del reddit_object

### remove the Reddit credentials from the process environment
# All four secrets read by login_reddit() are removed, including the account
# username and password. The variables are deleted rather than set to "",
# because load_dotenv() does not overwrite variables that already exist (even
# empty ones) by default, so blank values would block a later reload from .env.
for _credential in ("REDDIT_CLIENT_ID", "REDDIT_SECRET", "REDDIT_USR", "REDDIT_PWD"):
    os.environ.pop(_credential, None)

### __Search comments__

In [None]:
# Location of the posts file written by the "Save" cell.
# os.path.join picks the right separator for the current OS; hard-coded "\\"
# separators only produce a valid path on Windows.
post_path = os.path.join(
    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath("reddit_api.ipynb")))),
    "outputs",
    "api_queried",
    "reddit_api",
    "reddit_scraped_post.json",
)

In [47]:
# Read the saved posts back into memory as a list of dicts
with open(post_path, mode="r", encoding="utf-8") as posts_file:
    data = json.loads(posts_file.read())

In [48]:
# Collect each post id once, keeping first-seen order. The saved file can list
# the same post several times (one hit per matching keyword/sort search), and
# every repeated id would otherwise cost another comment fetch and produce a
# duplicate entry in the comments output.
post_ids = list(dict.fromkeys(post["id"] for post in data))

In [None]:
# Number of post ids whose comments will be fetched
len(post_ids)

In [None]:
# Fetch the flattened comment tree for every collected post id.
# NOTE: the "Clear ..." cell above runs `del reddit_object`; re-run the
# Initialization cell before this one if that cleanup has already been executed.
test_comments = fetch_comments_forest(reddit_object=reddit_object, post_ids=post_ids)

In [9]:
# Destination file for the fetched comments.
# os.path.join picks the right separator for the current OS; hard-coded "\\"
# separators only produce a valid path on Windows.
comment_path = os.path.join(
    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath("reddit_api.ipynb")))),
    "outputs",
    "api_queried",
    "reddit_api",
    "reddit_scraped_comments_2.json",
)

In [10]:
# Serialise the comment trees first, then write the text out in one go;
# ensure_ascii=False keeps non-ASCII comment text readable in the file
serialized_comments = json.dumps(test_comments, ensure_ascii=False, indent=4)
with open(comment_path, mode="w", encoding="utf-8") as comments_file:
    comments_file.write(serialized_comments)