In [None]:
import praw
import pandas as pd
from datetime import datetime
from tqdm import tqdm

import os

# Initialize Reddit API.
# Credentials are read from the environment so that secrets never have to be
# typed into (or saved with) the notebook. An unset variable falls back to an
# empty string.
reddit = praw.Reddit(
    client_id=os.environ.get("REDDIT_CLIENT_ID", ""),
    client_secret=os.environ.get("REDDIT_CLIENT_SECRET", ""),
    user_agent=os.environ.get("REDDIT_USER_AGENT", "")
)

# Full list of search terms
search_terms = [
    "flu", "fever", "rash", "covid", "epidemic",
    "cough", "RSV", "vaccine", "pneumonia", "chills",
    "diarrhea", "virus", "fatigue", "contagious", "illness"
]

# Search parameters: which subreddit to query and how many posts to request
# per search term.
SUBREDDIT_NAME = 'health'
POSTS_PER_QUERY = 150

# Keys of every raw record. Passing them to the DataFrame constructor keeps the
# columns present even when the searches return no posts at all, so the column
# arithmetic below cannot fail with a KeyError.
RAW_COLUMNS = [
    "query", "title", "body", "score",
    "subreddit", "created_utc", "url", "platform",
]

reddit_data = []

# Loop through queries with progress bar
for query in tqdm(search_terms, desc="🔍 Fetching Reddit posts"):
    for submission in reddit.subreddit(SUBREDDIT_NAME).search(query, limit=POSTS_PER_QUERY, sort='new'):
        reddit_data.append({
            "query": query,
            "title": submission.title,
            "body": submission.selftext,
            "score": submission.score,
            "subreddit": submission.subreddit.display_name,
            # Epoch seconds -> timezone-naive UTC timestamp. Replaces
            # datetime.utcfromtimestamp, which is deprecated since Python 3.12.
            "created_utc": pd.Timestamp(submission.created_utc, unit="s"),
            "url": submission.url,
            "platform": "Reddit"
        })

# Create DataFrame
reddit_df = pd.DataFrame(reddit_data, columns=RAW_COLUMNS)
reddit_df = reddit_df.rename(columns={'created_utc': 'timestamp'})
# One free-text field per post: title and body joined by a single space.
reddit_df["text"] = reddit_df["title"] + " " + reddit_df["body"]
# Keep only the columns used downstream. The explicit copy makes the result an
# independent frame, so later cells can add columns to it without pandas
# chained-assignment warnings.
reddit_df = reddit_df[["query", "timestamp", "platform", "text", "score", "url"]].copy()

print(f"\n✅ Reddit posts collected: {len(reddit_df)}")
print(reddit_df.head())


🔍 Fetching Reddit posts: 100%|████████████████████████████████████████████████████████| 15/15 [00:45<00:00,  3.02s/it]


✅ Reddit posts collected: 2041
  query           timestamp platform  \
0   flu 2025-04-14 14:48:16   Reddit   
1   flu 2025-04-14 09:20:59   Reddit   
2   flu 2025-04-12 20:28:33   Reddit   
3   flu 2025-04-09 10:09:46   Reddit   
4   flu 2025-04-08 18:30:32   Reddit   

                                                text  score  \
0  A new biosensor can detect bird flu in five mi...     13   
1  Measles, beef fat, and bird flu: What European...     25   
2  North Carolina flu-related deaths at all-time ...     37   
3  3-year-old girl in Mexico dies of bird flu in ...     25   
4        Bird flu: Mexico reports first human death      44   

                                                 url  
0  https://www.technologyreview.com/2025/04/09/11...  
1  https://www.euronews.com/health/2025/04/12/mea...  
2  https://www.nbcnews.com/news/us-news/north-car...  
3  https://www.euronews.com/health/2025/04/09/3-y...  
4  https://www.ctvnews.ca/health/article/mexico-r...  





In [None]:
import folium
import pandas as pd
import spacy
from folium.plugins import HeatMap
from geopy.exc import GeocoderServiceError, GeocoderTimedOut
from geopy.geocoders import Nominatim
from tqdm import tqdm

# This cell works on the `reddit_df` DataFrame and expects it to exist already,
# for example after loading it with: reddit_df = pd.read_csv("reddit_data.csv")

# Identifiers of the external resources used by this cell
SPACY_MODEL = "en_core_web_sm"
GEOCODER_USER_AGENT = "epi_geo"

# spaCy language pipeline (used for entity extraction) and Nominatim geocoder
nlp = spacy.load(SPACY_MODEL)
geolocator = Nominatim(user_agent=GEOCODER_USER_AGENT)

def extract_location(text):
    """Return the first geopolitical entity (spaCy label ``GPE``) found in ``text``.

    The text is run through the module-level ``nlp`` pipeline and its entities
    are scanned in order; the text of the first one labelled ``GPE`` is
    returned. When no such entity exists the result is ``None``.
    """
    gpe_mentions = (entity.text for entity in nlp(text).ents if entity.label_ == "GPE")
    # next() with a default yields the first match, or None if there is none.
    return next(gpe_mentions, None)

# Results of completed lookups, keyed by the query string, so that a place name
# occurring in many posts is sent to the geocoding service only once.
_GEOCODE_CACHE = {}

def geocode_location(location):
    """Resolve a place name to a ``(latitude, longitude)`` tuple.

    Parameters
    ----------
    location : str
        Place name to look up with the module-level ``geolocator``.

    Returns
    -------
    tuple of (float, float) or None
        The coordinates of the match, or ``None`` when ``location`` is not a
        non-blank string, when the service finds no match, or when the
        geocoding service raises an error.

    Notes
    -----
    Completed lookups (matches and "no match" answers) are remembered in
    ``_GEOCODE_CACHE``. Service errors are not remembered, so a later call
    for the same name tries the service again.
    """
    # Missing values (None / NaN) and blank strings can never be geocoded.
    if not isinstance(location, str):
        return None
    query = location.strip()
    if not query:
        return None

    if query in _GEOCODE_CACHE:
        return _GEOCODE_CACHE[query]

    try:
        loc = geolocator.geocode(query, timeout=10)
    except GeocoderServiceError:
        # Base class of geopy's service-side failures, including
        # GeocoderTimedOut. A single failed lookup must not abort a long batch
        # run; treat it as "no coordinates" and leave it uncached.
        return None

    coords = (loc.latitude, loc.longitude) if loc else None
    _GEOCODE_CACHE[query] = coords
    return coords

# Register `progress_apply` on pandas objects, with a labelled progress bar
tqdm.pandas(desc="🌍 Extracting and Geocoding Locations")

# Step 1: extract one location name (or None) from the text of every post
reddit_df["location"] = reddit_df["text"].progress_apply(extract_location)

# Step 2: geocode every *distinct* location name once and map the result back
# onto the rows. Many posts mention the same place, so this issues far fewer
# requests to the geocoding service than one lookup per row.
unique_locations = reddit_df["location"].dropna().unique()
coords_by_location = {
    place: geocode_location(place)
    for place in tqdm(unique_locations, desc="🌍 Extracting and Geocoding Locations")
}
# dict.get returns None for rows without a location, matching failed lookups.
reddit_df["coords"] = reddit_df["location"].map(coords_by_location.get)

# Filter out rows without valid coordinates. The explicit copy makes the result
# an independent frame rather than a slice of the unfiltered one.
reddit_df = reddit_df[reddit_df["coords"].notnull()].copy()

# Create heatmap: one (latitude, longitude) point per geo-located post
heatmap_data = reddit_df["coords"].tolist()
m = folium.Map(location=[20, 0], zoom_start=2)
HeatMap(heatmap_data).add_to(m)
m.save("reddit_heatmap.html")

print(f"\n📍 Geo-located posts: {len(reddit_df)}")
print("✅ Heatmap saved as: reddit_heatmap.html")


🌍 Extracting and Geocoding Locations: 100%|███████████████████████████████████████| 2041/2041 [00:51<00:00, 39.35it/s]
🌍 Extracting and Geocoding Locations:  89%|██████████████████████████████████▊    | 1819/2041 [08:46<01:10,  3.14it/s]