# *Exercise 2: Reddit API Data Collection & Sentiment Analysis*
### Objective: Collect post and comment data from political subreddits using the Reddit API (PRAW), identify the most common posts and their comments

In [1]:
pip install praw

Note: you may need to restart the kernel to use updated packages.


In [2]:
import praw
import pandas as pd
from pandas import read_csv
from pathlib import Path
from dotenv import load_dotenv
import os
import prawcore
reddit = praw.Reddit(
    client_id="80zoYkpK0w1Mhd9Wndupsw",
    client_secret="WsiRw3nw1Qi7ib4aZ7tP8AdVBw5mUw",
    password="Minina15.@",
    user_agent="Unable-Example-9251",
    username="Unable-Example-9251",
)


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
print(reddit.user.me())

Unable-Example-9251


In [4]:
# 2) Función: obtener EXACTAMENTE 'limit' posts por subreddit
#    - Ignora hilos stickied
#    - Sobremuestrea y usa un while hasta completar 'limit'
def fetch_subreddit_posts_exact(
    name: str,
    limit: int = 20,
    mode: str = "hot",              # 'hot' | 'top'
    time_filter: str = "month",     # si mode='top': 'day'|'week'|'month'|'year'|'all'
    start_factor: int = 3,          # factor de sobremuestreo inicial
    max_factor: int = 10            # límite superior para evitar loops excesivos
) -> pd.DataFrame:
    """
    Devuelve un DataFrame con EXACTAMENTE 'limit' posts válidos (sin stickies),
    o menos si realmente no hay suficientes posts disponibles.
    """
    rows = []
    seen = set()   # evita duplicados por id
    try:
        sub = reddit.subreddit(name)
        factor = start_factor

        while len(rows) < limit and factor <= max_factor:
            # Sobremuestrea (p.ej., 20 * 3 = 60) y corta al alcanzar 'limit'
            if mode == "hot":
                listing = sub.hot(limit=limit * factor)
            else:
                listing = sub.top(time_filter=time_filter, limit=limit * factor)

            for s in listing:
                if getattr(s, "stickied", False):
                    continue
                if s.id in seen:
                    continue
                rows.append({
                    "subreddit": name,
                    "title": s.title,
                    "score": int(s.score),
                    "num_comments": int(s.num_comments),
                    "id": s.id,
                    "url": s.url,
                })
                seen.add(s.id)
                if len(rows) >= limit:
                    break

            if len(rows) < limit:
                # si aún faltan, aumenta el factor y vuelve a intentar
                factor *= 2

        # Recorta por si, en el último batch, pasamos de 'limit'
        rows = rows[:limit]
        return pd.DataFrame(rows)

    except prawcore.exceptions.ResponseException as e:
        code = getattr(e.response, "status_code", None)
        msg = f"[HTTP {code}] Error en r/{name}"
        if code == 401:
            msg += " (credenciales requeridas: username/password o refresh token)"
        print(msg)
        return pd.DataFrame()

    except Exception as e:
        print(f"[Error] r/{name}: {e!r}")
        return pd.DataFrame()

In [5]:
# ============================================================
# 3) Recolectar los 3 subreddits y unirlos en un solo DataFrame
# ============================================================
targets = ["politics", "PoliticalDiscussion", "worldnews"]
MODE = "hot"          # o "top"
TIME_FILTER = "month" # si MODE == 'top': 'day'|'week'|'month'|'year'|'all'

dfs = []
for name in targets:
    df_one = fetch_subreddit_posts_exact(
        name=name,
        limit=20,
        mode=MODE,
        time_filter=TIME_FILTER,
        start_factor=3,
        max_factor=10
    )
    print(f"r/{name} -> {len(df_one)} filas")
    dfs.append(df_one)

posts_df = pd.concat(dfs, ignore_index=True)

# Quita duplicados por seguridad (subreddit, id)
posts_df.drop_duplicates(subset=["subreddit", "id"], inplace=True)

print("Total filas unificadas:", len(posts_df))
print(posts_df.head(3))

r/politics -> 20 filas
r/PoliticalDiscussion -> 20 filas
r/worldnews -> 20 filas
Total filas unificadas: 60
  subreddit                                              title  score  \
0  politics  Trump faces returning $100bn in tariffs after ...   5096   
1  politics  Bernie Sanders breaks with Democrats and endor...  19153   
2  politics               Donald Trump is weaker than he looks   4562   

   num_comments       id                                                url  
0           250  1n5n1rf  https://www.thetimes.com/article/a09594e1-46f2...  
1           589  1n58uee  https://www.the-independent.com/news/world/ame...  
2           437  1n5d9ku  https://www.reuters.com/commentary/breakingvie...  


In [6]:
# =======================================
# 4) Guardar en CSV en la carpeta /output
posts_df.to_csv("../output/Reddit.csv", index=False) # Lo mandamos a nuestra carpeta output

In [7]:
posts_df.count()
posts_df.head(5)

Unnamed: 0,subreddit,title,score,num_comments,id,url
0,politics,Trump faces returning $100bn in tariffs after ...,5096,250,1n5n1rf,https://www.thetimes.com/article/a09594e1-46f2...
1,politics,Bernie Sanders breaks with Democrats and endor...,19153,589,1n58uee,https://www.the-independent.com/news/world/ame...
2,politics,Donald Trump is weaker than he looks,4562,437,1n5d9ku,https://www.reuters.com/commentary/breakingvie...
3,politics,Donald Trump posting week-old photo raises eye...,562,118,1n5odqt,https://www.newsweek.com/donald-trump-health-p...
4,politics,Judge Tosses D.C. Case From Trump Prosecutor—C...,1521,42,1n5igdm,https://newrepublic.com/post/199856/judge-toss...


In [8]:
def fetch_top_comments_for_post(post_id: str, n: int = 5): # Habia un comentario de advertencia de un bot
    comments_data = []
    try:
        submission = reddit.submission(id=post_id)
        submission.comments.replace_more(limit=0)

        for c in submission.comments[:n*2]:  # tomo más y filtro después
            # Ignorar comentarios vacíos, de bots o moderación
            if not c.body or c.body.lower().startswith(("users often report", "[removed]", "[deleted]")):
                continue

            comments_data.append({
                "post_id": post_id,
                "body": c.body.strip(),
                "score": c.score,
            })

            # rompo cuando ya tengo n válidos
            if len(comments_data) >= n:
                break

    except Exception as e:
        print(f"[Error comentarios] Post {post_id}: {e!r}")
    return comments_data


In [9]:
df = read_csv("../output/Reddit.csv")
df_1 = df.sort_values(by=['score'],ascending=False).head(5)

In [10]:
all_comments = []
for pid in df_1["id"]:
    coms = fetch_top_comments_for_post(str(pid), n=5)
    all_comments.extend(coms)

comments_df = pd.DataFrame(all_comments)

comments_df.head(5)


Unnamed: 0,post_id,body,score
0,1n5asa1,"Is it the third or the fourth ""two weeks"" ?",5124
1,1n5asa1,"""Day one"" is also a while back.",1427
2,1n5asa1,Gonna call Zelenskyy nasty for his ability to ...,1087
3,1n5asa1,“I’ll get that done within 24 hours. Everyone ...,529
4,1n5asa1,"Wow, not a thank you?? 🙄",404


In [11]:
comments_df.count()

post_id    25
body       25
score      25
dtype: int64

In [12]:
comments_df.to_csv("../output/reddit_comments.csv", index=False) # Lo mandamos a nuestra carpeta output

In [13]:
comments_df.head(25)

Unnamed: 0,post_id,body,score
0,1n5asa1,"Is it the third or the fourth ""two weeks"" ?",5124
1,1n5asa1,"""Day one"" is also a while back.",1427
2,1n5asa1,Gonna call Zelenskyy nasty for his ability to ...,1087
3,1n5asa1,“I’ll get that done within 24 hours. Everyone ...,529
4,1n5asa1,"Wow, not a thank you?? 🙄",404
5,1n53fzg,"As a reminder, this subreddit [is for civil di...",1
6,1n53fzg,Just in case any Americans are looking for rel...,3024
7,1n53fzg,I don’t trust any agency under Trump’s control...,635
8,1n53fzg,Everything's become an opinion piece from cons...,1381
9,1n53fzg,**Step 1:** Train your populace to not trust i...,275


In [14]:
posts_df = pd.read_csv("../output/Reddit.csv")

# Chequeo rápido: ¿todos los post_id de comments existen en posts?
missing = set(comments_df["post_id"]) - set(posts_df["id"])
print(f"Post_id sin match en posts_df: {len(missing)}")

comments_with_post = comments_df.merge(
    posts_df[["id", "subreddit", "title", "url"]],  # agregamos url
    left_on="post_id", right_on="id", how="left"
).drop(columns=["id"])

comments_with_post.head(25)


Post_id sin match en posts_df: 0


Unnamed: 0,post_id,body,score,subreddit,title,url
0,1n5asa1,"Is it the third or the fourth ""two weeks"" ?",5124,worldnews,Zelenskyy points out that Trump’s “two weeks” ...,https://www.pravda.com.ua/eng/news/2025/08/31/...
1,1n5asa1,"""Day one"" is also a while back.",1427,worldnews,Zelenskyy points out that Trump’s “two weeks” ...,https://www.pravda.com.ua/eng/news/2025/08/31/...
2,1n5asa1,Gonna call Zelenskyy nasty for his ability to ...,1087,worldnews,Zelenskyy points out that Trump’s “two weeks” ...,https://www.pravda.com.ua/eng/news/2025/08/31/...
3,1n5asa1,“I’ll get that done within 24 hours. Everyone ...,529,worldnews,Zelenskyy points out that Trump’s “two weeks” ...,https://www.pravda.com.ua/eng/news/2025/08/31/...
4,1n5asa1,"Wow, not a thank you?? 🙄",404,worldnews,Zelenskyy points out that Trump’s “two weeks” ...,https://www.pravda.com.ua/eng/news/2025/08/31/...
5,1n53fzg,"As a reminder, this subreddit [is for civil di...",1,politics,Fmr. CDC director says you can no longer trust...,https://www.msnbc.com/the-weekend-primetime/wa...
6,1n53fzg,Just in case any Americans are looking for rel...,3024,politics,Fmr. CDC director says you can no longer trust...,https://www.msnbc.com/the-weekend-primetime/wa...
7,1n53fzg,I don’t trust any agency under Trump’s control...,635,politics,Fmr. CDC director says you can no longer trust...,https://www.msnbc.com/the-weekend-primetime/wa...
8,1n53fzg,Everything's become an opinion piece from cons...,1381,politics,Fmr. CDC director says you can no longer trust...,https://www.msnbc.com/the-weekend-primetime/wa...
9,1n53fzg,**Step 1:** Train your populace to not trust i...,275,politics,Fmr. CDC director says you can no longer trust...,https://www.msnbc.com/the-weekend-primetime/wa...


In [15]:
comments_with_post.to_csv("../output/reddit_comments_with_post.csv", index=False) # Lo mandamos a nuestra carpeta output