In [31]:
import pandas as pd
import ast
from google import genai
import praw
import prawcore
import re
import os
import logging
from dotenv import load_dotenv
from geopy.distance import geodesic
from praw.models import MoreComments

In [None]:
def get_comments_string(comments, level=0, printed=None, max_comments=10):
	"""Render a comment tree as an indented, bulleted, newline-joined string.

	Args:
		comments: Iterable of comment objects exposing ``.body`` and ``.replies``.
			``MoreComments`` placeholders are skipped.
		level: Current nesting depth; each level adds four spaces of indent.
		printed: Single-element list used as a mutable counter shared by every
			recursive call. Leave as ``None`` at the top level.
		max_comments: Maximum number of comments rendered across the whole tree.

	Returns:
		One ``"- <body>"`` line per rendered comment, replies indented under
		their parent. Returns ``""`` when nothing was rendered.
	"""
	if printed is None:
		printed = [0]
	result = []
	indent = "    " * level
	for comment in comments:
		if printed[0] >= max_comments:
			break
		if isinstance(comment, MoreComments):
			continue
		result.append(f"{indent}- {comment.body}")
		printed[0] += 1
		# Descend only while budget remains, and keep only non-empty output:
		# appending "" would leave blank/trailing lines in the joined string.
		if comment.replies and printed[0] < max_comments:
			nested = get_comments_string(comment.replies, level + 1, printed, max_comments)
			if nested:
				result.append(nested)
	return "\n".join(result)

def extract_index_list(text):
    """Extract the first bracketed list of integer indices from LLM output.

    The text may wrap the list in code-fence markers or surround it with
    prose; only the first ``[...]`` span is considered.

    Args:
        text: Raw model output. ``None`` or an empty string is tolerated.

    Returns:
        A list of ``int`` values in the order they appeared. Elements that are
        not plain integers (strings, floats, booleans, nested lists) are
        dropped. Returns ``[]`` when no list is found or it cannot be parsed.
    """
    if not text:
        return []
    match = re.search(r'\[.*?\]', text, re.DOTALL)
    if not match:
        return []
    try:
        parsed = ast.literal_eval(match.group())
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as e:
        print("Failed to evaluate list:", e)
        return []
    # bool is a subclass of int, so exclude it explicitly.
    return [
        item for item in parsed
        if isinstance(item, int) and not isinstance(item, bool)
    ]

def create_subreddits_df(subreddit_list, reddit, limit_=10):
	"""Collect hot posts from each subreddit into a DataFrame (pipeline step 1).

	Args:
		subreddit_list: Iterable of subreddit names.
		reddit: Client exposing ``subreddit(name)``; each result must provide
			``hot(limit=...)``.
		limit_: Maximum number of hot posts requested per subreddit.

	Returns:
		DataFrame with one row per post and the columns ``subreddit``,
		``title``, ``body``, ``num_comments`` and ``comments``. The frame has
		no columns when no post could be fetched.

	Errors raised while reading a subreddit are printed and that subreddit is
	abandoned; posts already collected from it are kept.
	"""
	posts = []
	for sub_name in subreddit_list:
		try:
			subreddit = reddit.subreddit(sub_name)
			for post in subreddit.hot(limit=limit_):
				posts.append({
					# Store the plain name, not the client object.
					'subreddit': sub_name,
					'title': post.title,
					'body': post.selftext,
					'num_comments': post.num_comments,
					# Pass the top-level forest: get_comments_string already
					# walks .replies, so a flattened list would render every
					# reply twice (once nested, once at the top level).
					'comments': get_comments_string(post.comments)
				})
		except prawcore.exceptions.NotFound:
			print(f"Subreddit not found: {sub_name}")
		except prawcore.exceptions.Forbidden:
			print(f"Access forbidden: {sub_name}")
		except Exception as e:
			print(f"Other error with {sub_name}: {e}")
	return pd.DataFrame(posts)


def filter_complaints_df(df, client, model='gemini-2.5-flash-lite-preview-06-17'):
	"""Ask the LLM which post titles look relevant and keep those rows (step 2).

	Only the ``title`` column is sent to the model; post bodies are not used.

	Args:
		df: DataFrame of posts with a ``title`` column.
		client: Object exposing ``models.generate_content(model=..., contents=...)``
			whose result has a ``.text`` attribute.
		model: Model name passed to ``generate_content``.

	Returns:
		A copy of the rows of ``df`` whose 0-based positions the model listed,
		in the order listed and without repeats. Original index labels are
		kept. An empty input yields an empty frame without calling the model.
	"""
	# An empty frame built from zero posts has no 'title' column at all.
	if df.empty or 'title' not in df.columns:
		return df.iloc[0:0].copy()

	result = client.models.generate_content(
		model=model,
		contents=f"""Based on this list of post titles from local subreddits, determine if each post might contain information that has the potential for class action litigation based on public nuisance and loss of use/enjoyment. Keep track of only the index (0-indexed) of each relevant title. For example, output 0 if the first post is titled 'Excessive odors coming from local power plant' and nothing for a post titled 'Local barbers in area?' If the title is ambiguous, assume that it is relevant. Output a only list of all the relevant indices, with no other text. Here is the list of titles: {df['title'].tolist()}"""
	)
	indices = extract_index_list(result.text)

	keep = []
	seen = set()
	for i in indices:
		# Model output is untrusted: tolerate anything that is not a plain int.
		if isinstance(i, bool) or not isinstance(i, int):
			print(f"Warning: Ignoring non-integer index {i!r}")
			continue
		if not 0 <= i < len(df):
			print(f"Warning: Index {i} is out of bounds for DataFrame of length {len(df)}")
			continue
		if i in seen:
			continue
		seen.add(i)
		keep.append(i)

	return df.iloc[keep].copy()


def find_litigation_areas(df, client, csv_path="litigation.csv", model='gemini-2.5-flash-lite-preview-06-17'):
	"""Summarize potential litigation areas from a table of posts (step 3).

	The frame is written to ``csv_path``, uploaded through ``client.files.upload``,
	and passed together with a fixed prompt to ``client.models.generate_content``.

	Args:
		df: DataFrame of posts to analyze.
		client: Object exposing ``files.upload(file=...)`` and
			``models.generate_content(model=..., contents=...)``.
		csv_path: Where the CSV is written before upload. The file is
			overwritten and left on disk.
		model: Model name passed to ``generate_content``.

	Returns:
		The ``.text`` attribute of the model response.
	"""
	df.to_csv(csv_path, index=False)

	# Upload file using its path
	uploaded_csv = client.files.upload(file=csv_path)

	# Generate content using uploaded file
	result = client.models.generate_content(
		model=model,
		contents=[
			uploaded_csv,
			'''Based on the following table of Reddit posts from this area, identify communities experiencing sustained environmental complaints over the past 3–6 months.

Focus on issues relevant to public nuisance or loss of use/enjoyment, and summarize results briefly to conserve output tokens.

For each issue or cluster, include:

Community name or general area (e.g., "Downtown Palo Alto")

Type of issue (e.g., odor, noise, air pollution)

Estimated residents affected (e.g., 100+, 1000+)

Responsible party, if known

Mitigation mentioned? (Yes/No)

Return the results in a compact, bulleted pointed format. Do not provide explanations or narrative.
Please include as many relevant entries as possible, up to 10.'''
		]
	)

	return result.text


In [33]:
def filter_subreddits_by_distance(center_lat, center_lon, radius_miles, csv_path='files/reddit_data.csv'):
	"""Return the rows of the subreddit table that lie within a radius of a point.

	Args:
		center_lat: Latitude of the center point.
		center_lon: Longitude of the center point.
		radius_miles: Inclusive radius, compared against ``geodesic(...).miles``.
		csv_path: CSV file to read. It must contain ``Latitude`` and
			``Longitude`` columns.

	Returns:
		DataFrame of the matching rows with a fresh 0-based index. All columns
		of the CSV are preserved.
	"""
	df = pd.read_csv(csv_path)
	origin = (center_lat, center_lon)

	# Rows without coordinates cannot be measured, so drop them up front.
	located = df.dropna(subset=['Latitude', 'Longitude'])

	# Build the mask as an explicit boolean Series aligned to the rows, so an
	# empty table still yields an empty frame with its columns intact.
	within = pd.Series(
		[
			geodesic(origin, (lat, lon)).miles <= radius_miles
			for lat, lon in zip(located['Latitude'], located['Longitude'])
		],
		index=located.index,
		dtype=bool,
	)
	return located.loc[within].reset_index(drop=True)

In [34]:

def keyword_filter(df, keywords, log_file="keyword_filter_log.txt"):
    """Keep rows whose title or body contains any keyword (substring match).

    Matching is case-insensitive on both the text and the keywords. Every
    match and the final per-keyword totals are written to ``log_file``, which
    is overwritten on each call.

    Args:
        df: DataFrame with ``title`` and ``body`` columns.
        keywords: Iterable of keyword strings.
        log_file: Path of the UTF-8 text log to write.

    Returns:
        Tuple ``(filtered_df, hit_counts)``: the matching rows with a fresh
        0-based index, and a dict mapping each keyword (as given) to the number
        of rows it matched.
    """
    hit_counts = {kw: 0 for kw in keywords}
    # Lowercase each keyword once; the text is lowercased too, so a keyword
    # containing uppercase would otherwise never match.
    needles = [(kw, str(kw).lower()) for kw in hit_counts]
    matched_positions = []

    def _as_text(value):
        # Missing values become empty text instead of the literal "nan".
        return "" if pd.isna(value) else str(value).lower()

    # Post titles routinely contain non-ASCII characters, so the log encoding
    # is fixed rather than left to the platform default.
    with open(log_file, "w", encoding="utf-8") as log:
        for pos, (idx, row) in enumerate(df.iterrows()):
            title = _as_text(row['title'])
            body = _as_text(row['body'])
            matched_keywords = [kw for kw, needle in needles if needle in title or needle in body]
            if matched_keywords:
                matched_positions.append(pos)
                for kw in matched_keywords:
                    hit_counts[kw] += 1
                log.write(f"Row {idx}: Matched keywords {matched_keywords} in title/body: \"{row['title']}\"\n")
        log.write("\nKeyword hit counts:\n")
        for kw, count in hit_counts.items():
            log.write(f"{kw}: {count}\n")

    # Select by position: label-based selection would duplicate rows whenever
    # the index contains repeated labels.
    filtered_df = df.iloc[matched_positions].reset_index(drop=True)
    return filtered_df, hit_counts

In [35]:
def main(center_lat, center_lon, radius_miles, lim=10):
	"""Run the whole pipeline for one location and return its intermediate results.

	Steps: load credentials from ``.env``, select subreddits within
	``radius_miles`` of the center, fetch up to ``lim`` hot posts from each,
	log keyword hits, let the LLM screen titles, then let the LLM summarize
	the screened posts.

	Args:
		center_lat: Latitude of the center point.
		center_lon: Longitude of the center point.
		radius_miles: Search radius around the center, in miles.
		lim: Maximum number of hot posts fetched per subreddit.

	Returns:
		Tuple ``(sub_df, sub_df2, final)``: all fetched posts, the posts the
		LLM flagged as potentially relevant, and the LLM summary text. When
		there is nothing to analyze, ``final`` is a short explanatory message.
	"""
	# First create the reddit and genai clients using the keys from the .env file
	load_dotenv()
	reddit = praw.Reddit(
		client_id=os.getenv('REDDIT_CLIENT_ID'),
		client_secret=os.getenv('REDDIT_CLIENT_SECRET'),
		user_agent=os.getenv('REDDIT_USER_AGENT')
	)

	client = genai.Client(api_key=os.getenv('GEMINI_API_KEY'))

	# Next, filter the list of subreddits by location
	print("Filtering subreddits by location...")
	sub_list = filter_subreddits_by_distance(center_lat, center_lon, radius_miles)['Subreddit'].tolist()
	print("Done.")

	# Grab all the recent posts from those subreddits and condense into dataframe
	print("Fetching posts from subreddits...")
	sub_df = create_subreddits_df(sub_list, reddit, lim)
	print("Done.")

	# With zero posts the frame has no 'title' column, so later steps cannot run.
	if sub_df.empty:
		message = "No posts were fetched; nothing to analyze."
		print(message)
		return sub_df, sub_df.copy(), message

	# Keyword pass: its result is only logged and reported here.
	# NOTE(review): the LLM screen below still sees every post, not just the
	# keyword matches - confirm whether the keyword filter should gate it.
	KEYWORDS = [
		"odor", "smell", "noise", "loud", "pollution", "toxic", "contamination",
		"smoke", "dust", "vibration", "hazard", "waste", "dump", "spill",
		"illegal", "danger", "health", "asthma", "allergy", "air quality",
		"water quality", "sewage", "garbage", "trash", "rats", "vermin",
		"mold", "leak", "chemical", "factory", "plant", "refinery", "power plant"
	]
	print("Applying keyword filter...")
	keyword_filtered_df, keyword_hits = keyword_filter(sub_df, KEYWORDS)
	print(f"Keyword filter found {len(keyword_filtered_df)} posts. See 'keyword_filter_log.txt' for details.")
	print("Top keyword hits:", {k: v for k, v in keyword_hits.items() if v > 0})

	# Now filter the posts by title to grab only potentially relevant posts
	print("Filtering posts...")
	sub_df2 = filter_complaints_df(sub_df, client)
	print("Done.")

	if sub_df2.empty:
		message = "No potentially relevant posts were found; skipping analysis."
		print(message)
		return sub_df, sub_df2, message

	print("Analyzing posts...")
	# Analyze the *filtered* posts; previously the unfiltered frame was sent,
	# which made the screening step above have no effect on the summary.
	final = find_litigation_areas(sub_df2, client)
	return sub_df, sub_df2, final


In [36]:
# Run the pipeline for a 10-mile radius around Palo Alto and show the summary.
all2, filtered2, final2 = main(
	center_lat=37.4419,
	center_lon=-122.1430,
	radius_miles=10,
)
print(final2)

Filtering subreddits by location...
Done.
Fetching posts from subreddits...
Subreddit not found: losaltoshills
Done.
Applying keyword filter...
Keyword filter found 7 posts. See 'keyword_filter_log.txt' for details.
Top keyword hits: {'smell': 1, 'noise': 1, 'loud': 1, 'dust': 1, 'dump': 1, 'danger': 1, 'health': 2}
Filtering posts...
Done.
Analyzing posts...
* **NewarkCA**
    * **Type of issue:** Noise (trains)
    * **Estimated residents affected:** 1000+
    * **Responsible party:** Unknown (railroad company)
    * **Mitigation mentioned:** Yes (Quiet Zone project, soundproofing, earplugs)
* **SanCarlos**
    * **Type of issue:** Noise (trains)
    * **Estimated residents affected:** 1000+
    * **Responsible party:** Unknown (railroad company)
    * **Mitigation mentioned:** Yes (earplugs, soundproofing)
* **RedwoodCity**
    * **Type of issue:** Noise (trains)
    * **Estimated residents affected:** 1000+
    * **Responsible party:** Unknown (railroad company)
    * **Mitigation 

In [22]:
# Run the pipeline for a 50-mile radius around Chicago and show the summary.
# NOTE(review): binding `all` here shadows Python's builtin all() in the
# notebook namespace; the name is kept because later cells may rely on it.
all, filtered, final = main(
	center_lat=41.887442,
	center_lon=-87.635806,
	radius_miles=50,
)
print(final)

Filtering subreddits by location...
Done.
Fetching posts from subreddits...
Access forbidden: Batavia
Subreddit not found: elmwoodpark
Subreddit not found: MichiganCityIndiana
Access forbidden: NorthwestIndiana
Access forbidden: McHenry
Subreddit not found: CrownPointIndiana
Access forbidden: barringtonil
Access forbidden: PortagePark
Other error with roundlake: received 429 HTTP response
Other error with StCharlesIL: received 429 HTTP response
Other error with elgin: received 429 HTTP response
Other error with grayslake: received 429 HTTP response
Other error with auroraillinois: received 429 HTTP response
Other error with michiana: received 429 HTTP response
Other error with oakparkil: received 429 HTTP response
Other error with BartlettIllinois: received 429 HTTP response
Done.
Applying keyword filter...
Keyword filter found 29 posts. See 'keyword_filter_log.txt' for details.
Top keyword hits: {'smell': 1, 'noise': 2, 'loud': 4, 'pollution': 1, 'smoke': 1, 'dust': 4, 'waste': 2, 'sp