# Extract data from subreddits 

### Imports

In [161]:
import praw 
import pandas as pd
import matplotlib as m
from dotenv import load_dotenv
from pathlib import Path
import os

### Loading creds from .env

In [162]:
# Pull variables from the .env file into the process environment so the
# lookups below can find them.
load_dotenv()

# Reddit API credentials; each name is None when its variable is not set.
client_id, client_secret, user_agent = (
    os.getenv(var_name)
    for var_name in ("REDDIT_CLIENT_ID", "REDDIT_CLIENT_SECRET", "REDDIT_USER_AGENT")
)

### Connecting to Reddit API

In [163]:
# Gather the credentials read from the environment and hand them to PRAW in
# one go.
_reddit_kwargs = {
    "client_id": client_id,
    "client_secret": client_secret,
    "user_agent": user_agent,
}
reddit = praw.Reddit(**_reddit_kwargs)

# Handle to the subreddit that the ingestion step queries.
# NOTE(review): PRAW appears to build this object lazily, so this line on its
# own presumably does not hit the API; un-comment the prints below to actually
# exercise the connection. Confirm against the PRAW docs.
subreddit = reddit.subreddit("dataengineering")



# print(f"Subreddit Title: {subreddit.title}")
# print(f"Subreddit Description: {subreddit.description}")
# print(f"Subreddit Subscribers: {subreddit.subscribers}")

### Ingest data from this week's top 15 posts

In [None]:
# Ask Reddit for the 15 highest-ranked posts of the past week.
posts = subreddit.top(time_filter="week", limit=15)

data = []          # posts that pass every filter
removed_data = []  # posts dropped for low engagement
meme_data = []     # posts dropped because of their flair

# Explicit column lists keep every DataFrame's schema stable even when its
# bucket is empty (otherwise an empty list yields a frame with no columns).
summary_columns = ["title", "score", "num_comments"]
post_columns = ["title", "score", "url", "num_comments", "created_utc"]


def _summarize(post):
    """Return the short record kept for a post that was filtered out."""
    return {
        "title": post.title,
        "score": post.score,
        "num_comments": post.num_comments,
    }


# Keep only posts with at least 20 upvotes and at least 10 comments, and drop
# any post whose flair is "meme". The flair can be None, and it is compared
# ignoring case and surrounding whitespace.
for post in posts:
    if post.score < 20 or post.num_comments < 10:
        removed_data.append(_summarize(post))
    elif (post.link_flair_text or "").strip().casefold() == "meme":
        meme_data.append(_summarize(post))
    else:
        data.append({
            "title": post.title,
            "score": post.score,
            "url": post.url,
            "num_comments": post.num_comments,
            "created_utc": post.created_utc,
        })

# Print the data
df = pd.DataFrame(data, columns=post_columns)
df_delete = pd.DataFrame(removed_data, columns=summary_columns)
df_meme = pd.DataFrame(meme_data, columns=summary_columns)
# len(df) works for an empty frame too; indexing a missing 'title' column
# used to raise KeyError when nothing passed the filters.
print("Number of titles saved:", len(df))
print("\n------------------------")
print("Removed following posts:")
print("\n", df_delete)
print("\n------------------------")
print("Removed following memes:")
print("\n", df_meme)

### Convert to .csv format and save to Bronze Layer

In [165]:
# Path() is the current working directory, so BASE_DIR is the directory one
# level above wherever this notebook is run from.
BASE_DIR = Path().resolve().parent

output_path = BASE_DIR / 'data' / 'bronze' / 'raw_reddit_posts.csv'

# Create data/bronze on first run; to_csv does not create missing directories
# and would fail with an OSError otherwise.
output_path.parent.mkdir(parents=True, exist_ok=True)

# Write the kept posts without the DataFrame index column.
df.to_csv(output_path, index=False)