# *Exercise 2: Reddit API Data Collection & Sentiment Analysis*
### Objective: Collect post and comment data from political subreddits using the Reddit API (PRAW), identify the most common posts and their comments

In [36]:
pip install praw

Note: you may need to restart the kernel to use updated packages.


In [None]:
import praw
import pandas as pd
from pandas import read_csv
reddit = praw.Reddit(
    client_id="80zoYkpK0w1Mhd9Wndupsw",
    client_secret="WsiRw3nw1Qi7ib4aZ7tP8AdVBw5mUw",
    password="Minina15.@",
    user_agent="Unable-Example-9251",
    username="Unable-Example-9251",
)


In [59]:
print(reddit.user.me())

Unable-Example-9251


In [65]:

from pathlib import Path
from dotenv import load_dotenv
import os
import pandas as pd
import praw
import prawcore

# ===========================================================
# 2) Función: obtener EXACTAMENTE 'limit' posts por subreddit
#    - Ignora hilos stickied
#    - Sobremuestrea y usa un while hasta completar 'limit'
# ===========================================================
def fetch_subreddit_posts_exact(
    name: str,
    limit: int = 20,
    mode: str = "hot",              # 'hot' | 'top'
    time_filter: str = "month",     # si mode='top': 'day'|'week'|'month'|'year'|'all'
    start_factor: int = 3,          # factor de sobremuestreo inicial
    max_factor: int = 10            # límite superior para evitar loops excesivos
) -> pd.DataFrame:
    """
    Devuelve un DataFrame con EXACTAMENTE 'limit' posts válidos (sin stickies),
    o menos si realmente no hay suficientes posts disponibles.
    """
    rows = []
    seen = set()   # evita duplicados por id
    try:
        sub = reddit.subreddit(name)
        factor = start_factor

        while len(rows) < limit and factor <= max_factor:
            # Sobremuestrea (p.ej., 20 * 3 = 60) y corta al alcanzar 'limit'
            if mode == "hot":
                listing = sub.hot(limit=limit * factor)
            else:
                listing = sub.top(time_filter=time_filter, limit=limit * factor)

            for s in listing:
                if getattr(s, "stickied", False):
                    continue
                if s.id in seen:
                    continue
                rows.append({
                    "subreddit": name,
                    "title": s.title,
                    "score": int(s.score),
                    "num_comments": int(s.num_comments),
                    "id": s.id,
                    "url": s.url,
                })
                seen.add(s.id)
                if len(rows) >= limit:
                    break

            if len(rows) < limit:
                # si aún faltan, aumenta el factor y vuelve a intentar
                factor *= 2

        # Recorta por si, en el último batch, pasamos de 'limit'
        rows = rows[:limit]
        return pd.DataFrame(rows)

    except prawcore.exceptions.ResponseException as e:
        code = getattr(e.response, "status_code", None)
        msg = f"[HTTP {code}] Error en r/{name}"
        if code == 401:
            msg += " (credenciales requeridas: username/password o refresh token)"
        print(msg)
        return pd.DataFrame()

    except Exception as e:
        print(f"[Error] r/{name}: {e!r}")
        return pd.DataFrame()

In [66]:
# ============================================================
# 3) Recolectar los 3 subreddits y unirlos en un solo DataFrame
# ============================================================
targets = ["politics", "PoliticalDiscussion", "worldnews"]
MODE = "hot"          # o "top"
TIME_FILTER = "month" # si MODE == 'top': 'day'|'week'|'month'|'year'|'all'

dfs = []
for name in targets:
    df_one = fetch_subreddit_posts_exact(
        name=name,
        limit=20,
        mode=MODE,
        time_filter=TIME_FILTER,
        start_factor=3,
        max_factor=10
    )
    print(f"r/{name} -> {len(df_one)} filas")
    dfs.append(df_one)

posts_df = pd.concat(dfs, ignore_index=True)

# Quita duplicados por seguridad (subreddit, id)
posts_df.drop_duplicates(subset=["subreddit", "id"], inplace=True)

print("Total filas unificadas:", len(posts_df))
print(posts_df.head(3))

r/politics -> 20 filas
r/PoliticalDiscussion -> 20 filas
r/worldnews -> 20 filas
Total filas unificadas: 60
  subreddit                                              title  score  \
0  politics        Trump, 79, Goes on Bizarre AI Posting Spree   4781   
1  politics  Author Stephen King says people will deny they...   2067   
2  politics  Mike Johnson Totally Deflects When Asked About...   1567   

   num_comments       id                                                url  
0           473  1n4u384  https://www.thedailybeast.com/trump-79-goes-on...  
1           200  1n4x1nn  https://www.the-independent.com/bulletin/cultu...  
2            68  1n4uee7  https://newrepublic.com/post/199857/mike-johns...  


In [64]:
# Subreddits objetivo
targets = ["politics", "PoliticalDiscussion", "worldnews"]

# Modo de recolección (elige uno)
MODE = "hot"          # o "top"
TIME_FILTER = "month" # si MODE=="top": 'day'|'week'|'month'|'year'|'all'

# Recolección individual
dfs = []
for name in targets:
    df_one = fetch_subreddit_posts(name, limit=20, mode=MODE, time_filter=TIME_FILTER)
    print(f"r/{name} -> {len(df_one)} filas")
    dfs.append(df_one)

# Unión en un solo DataFrame
posts_df = pd.concat(dfs, ignore_index=True)

# Opcional: de-duplicar por (subreddit, id), por si repites ejecuciones
posts_df.drop_duplicates(subset=["subreddit", "id"], inplace=True)

print("Total filas unificadas:", len(posts_df))
posts_df.head()


r/politics -> 19 filas
r/PoliticalDiscussion -> 18 filas
r/worldnews -> 19 filas
Total filas unificadas: 56


Unnamed: 0,subreddit,title,score,num_comments,id,url
0,politics,"Trump, 79, Goes on Bizarre AI Posting Spree",3877,390,1n4u384,https://www.thedailybeast.com/trump-79-goes-on...
1,politics,Author Stephen King says people will deny they...,927,91,1n4x1nn,https://www.the-independent.com/bulletin/cultu...
2,politics,Mike Johnson Totally Deflects When Asked About...,1306,64,1n4uee7,https://newrepublic.com/post/199857/mike-johns...
3,politics,Newsom Trolls Trump Over Bruised Hands and Gol...,1362,95,1n4svh5,https://www.thedailybeast.com/newsom-trolls-tr...
4,politics,Netflix co-founder drops $2 million into Gavin...,31949,569,1n4fe1w,https://www.politico.com/news/2025/08/29/netfl...


In [67]:
# =======================================
# 4) Guardar en CSV en la carpeta /output
posts_df.to_csv("../output/Reddit.csv", index=False) # Lo mandamos a nuestra carpeta output

In [69]:
posts_df.count()
posts_df.head(5)

Unnamed: 0,subreddit,title,score,num_comments,id,url
0,politics,"Trump, 79, Goes on Bizarre AI Posting Spree",4781,473,1n4u384,https://www.thedailybeast.com/trump-79-goes-on...
1,politics,Author Stephen King says people will deny they...,2067,200,1n4x1nn,https://www.the-independent.com/bulletin/cultu...
2,politics,Mike Johnson Totally Deflects When Asked About...,1567,68,1n4uee7,https://newrepublic.com/post/199857/mike-johns...
3,politics,Newsom Trolls Trump Over Bruised Hands and Gol...,1489,96,1n4svh5,https://www.thedailybeast.com/newsom-trolls-tr...
4,politics,Netflix co-founder drops $2 million into Gavin...,32115,572,1n4fe1w,https://www.politico.com/news/2025/08/29/netfl...


In [None]:
def fetch_top_comments_for_post(post_id: str, n: int = 5): # Habia un comentario de advertencia de un bot
    comments_data = []
    try:
        submission = reddit.submission(id=post_id)
        submission.comments.replace_more(limit=0)

        for c in submission.comments[:n*2]:  # tomo más y filtro después
            # Ignorar comentarios vacíos, de bots o moderación
            if not c.body or c.body.lower().startswith(("users often report", "[removed]", "[deleted]")):
                continue

            comments_data.append({
                "post_id": post_id,
                "body": c.body.strip(),
                "score": c.score,
            })

            # rompo cuando ya tengo n válidos
            if len(comments_data) >= n:
                break

    except Exception as e:
        print(f"[Error comentarios] Post {post_id}: {e!r}")
    return comments_data


In [78]:
df = read_csv("../output/Reddit.csv")
df_1 = df.sort_values(by=['score'],ascending=False).head(5)

In [None]:
all_comments = []
for pid in df_1["id"]:
    coms = fetch_top_comments_for_post(str(pid), n=5)
    all_comments.extend(coms)

comments_df = pd.DataFrame(all_comments)

comments_df.head(5)


Unnamed: 0,post_id,body,score
0,1n457kx,"Ah yes, the sign of a true winner: obsessively...",16365
1,1n457kx,Trump invited modi over to white house when Mo...,5236
2,1n457kx,First candidate in history to beg for noble prize,2234
3,1n457kx,Nobel POS prize.,518
4,1n457kx,What a petulant child,1364


In [86]:
comments_df.count()

post_id    25
body       25
score      25
dtype: int64

In [87]:
comments_df.to_csv("../output/reddit_comments.csv", index=False) # Lo mandamos a nuestra carpeta output

In [91]:
comments_df.head(25)

Unnamed: 0,post_id,body,score
0,1n457kx,"Ah yes, the sign of a true winner: obsessively...",16365
1,1n457kx,Trump invited modi over to white house when Mo...,5236
2,1n457kx,First candidate in history to beg for noble prize,2234
3,1n457kx,Nobel POS prize.,518
4,1n457kx,What a petulant child,1364
5,1n4fe1w,"As a reminder, this subreddit [is for civil di...",1
6,1n4fe1w,Good but other democratic states better start ...,4210
7,1n4fe1w,Excellent!!,1143
8,1n4fe1w,Do Democrats even stand a chance if all red st...,426
9,1n4fe1w,I hate the concept of corporate money in polit...,115


In [98]:
posts_df = pd.read_csv("../output/Reddit.csv")

# Chequeo rápido: ¿todos los post_id de comments existen en posts?
missing = set(comments_df["post_id"]) - set(posts_df["id"])
print(f"Post_id sin match en posts_df: {len(missing)}")

comments_with_post = comments_df.merge(
    posts_df[["id", "subreddit", "title", "url"]],  # agregamos url
    left_on="post_id", right_on="id", how="left"
).drop(columns=["id"])

comments_with_post.head(25)


Post_id sin match en posts_df: 0


Unnamed: 0,post_id,body,score,subreddit,title,url
0,1n457kx,"Ah yes, the sign of a true winner: obsessively...",16365,worldnews,"Trump asked Modi for Nobel backing, his no bro...",https://www.financialexpress.com/business/modi...
1,1n457kx,Trump invited modi over to white house when Mo...,5236,worldnews,"Trump asked Modi for Nobel backing, his no bro...",https://www.financialexpress.com/business/modi...
2,1n457kx,First candidate in history to beg for noble prize,2234,worldnews,"Trump asked Modi for Nobel backing, his no bro...",https://www.financialexpress.com/business/modi...
3,1n457kx,Nobel POS prize.,518,worldnews,"Trump asked Modi for Nobel backing, his no bro...",https://www.financialexpress.com/business/modi...
4,1n457kx,What a petulant child,1364,worldnews,"Trump asked Modi for Nobel backing, his no bro...",https://www.financialexpress.com/business/modi...
5,1n4fe1w,"As a reminder, this subreddit [is for civil di...",1,politics,Netflix co-founder drops $2 million into Gavin...,https://www.politico.com/news/2025/08/29/netfl...
6,1n4fe1w,Good but other democratic states better start ...,4210,politics,Netflix co-founder drops $2 million into Gavin...,https://www.politico.com/news/2025/08/29/netfl...
7,1n4fe1w,Excellent!!,1143,politics,Netflix co-founder drops $2 million into Gavin...,https://www.politico.com/news/2025/08/29/netfl...
8,1n4fe1w,Do Democrats even stand a chance if all red st...,426,politics,Netflix co-founder drops $2 million into Gavin...,https://www.politico.com/news/2025/08/29/netfl...
9,1n4fe1w,I hate the concept of corporate money in polit...,115,politics,Netflix co-founder drops $2 million into Gavin...,https://www.politico.com/news/2025/08/29/netfl...


In [99]:
comments_with_post.to_csv("../output/reddit_comments_with_post.csv", index=False) # Lo mandamos a nuestra carpeta output

In [97]:
comments_with_post.columns.to_list()

['post_id', 'body', 'score', 'subreddit', 'title']