In [28]:
import os
import time
from datetime import datetime

import pandas as pd
import praw
from CONFIG import SUBMISSION_DATA_FIELDS
from dotenv import find_dotenv, load_dotenv

In [29]:
# Load environment variables
load_dotenv(find_dotenv())

# Fetch Reddit API credentials from the environment
CLIENT_ID = os.getenv("CLIENT_ID")
SECRET_TOKEN = os.getenv("SECRET_TOKEN")
USERNAME = os.getenv("USERNAME")
PASSWORD = os.getenv("PASSWORD")

# Set up the Reddit API client with PRAW
reddit = praw.Reddit(client_id=CLIENT_ID,
                     client_secret=SECRET_TOKEN,
                     password=PASSWORD,
                     username=USERNAME,
                     user_agent='Tutorial')

# Print the authenticated user to confirm successful connection
print(reddit.user.me())

andreaLolli


In [30]:
def search_and_list_subreddits(query, limit=10):
    """
    Performs a search across all subreddits for a specific query and lists unique subreddits related to the query.

    Args:
    query (str): Search keyword.
    limit (int): Maximum number of results to return.

    Returns:
    set: Unique subreddit names from the search results.
    """
    # Conduct search and collect subreddit names from results
    search_results = reddit.subreddit('all').search(query, limit=limit)
    return {result.subreddit.display_name for result in search_results}

In [31]:
# Search for subreddits related to "Drake OR Kendrick Lamar" and limit the search to 50 results
subreddits_names = search_and_list_subreddits(query="Drake OR Kendrick Lamar", limit=50)

# Display the unique names of subreddits found
subreddits_names

{'BlackPeopleTwitter',
 'Destiny',
 'Drizzy',
 'Fauxmoi',
 'Hiphopcirclejerk',
 'KendrickLamar',
 'Music',
 'NonPoliticalTwitter',
 'TeamSESH',
 'TwoBestFriendsPlay',
 'conspiracy',
 'entertainment',
 'h3h3productions',
 'hiphop101',
 'hiphopheads',
 'interestingasfuck',
 'playboicarti',
 'popculturechat',
 'popheads',
 'rap',
 'tumblr',
 'ufc'}

In [32]:
def fetch_submission_data(subreddit_name, keywords, start_timestamp, fields=SUBMISSION_DATA_FIELDS, limit=100):
    """
    Fetches submission data from a given subreddit based on search keywords.
    Filters submissions based on their creation time to include only those newer than a given timestamp.
    Retrieves specified data fields for each qualifying submission.

    Args:
        subreddit_name (str): The name of the subreddit to search in.
        keywords (str): The search keywords used to find relevant submissions.
        start_timestamp (int): Unix timestamp; submissions created before this time are ignored.
        fields (list): List of field names to extract data from each submission.
        limit (int): The maximum number of search results to fetch.

    Returns:
        list: A list of dictionaries, each containing data from a submission that meets the search criteria.
    """
    submissions_data = []  # Initialize a list to store data from each submission.

    # Perform a search within the specified subreddit using the provided keywords and limit.
    for submission in reddit.subreddit(subreddit_name).search(keywords, limit=limit):
        submission_data = {}  # Initialize a dictionary to store data from the current submission.
        
        # Filter submissions to only include those newer than the provided start_timestamp.
        if submission.created_utc > start_timestamp:
            # Loop over each field required to be fetched from the submission.
            for field in fields:
                # Safely fetch the attribute; if it's missing, default to an empty string.
                try:
                    submission_data[field] = getattr(submission, field, '')
                except AttributeError:
                    submission_data[field] = ''

            # Add the dictionary containing the fetched data to the list.
            submissions_data.append(submission_data)

    # Return the list of dictionaries containing the data from each relevant submission.
    return submissions_data

In [33]:
# Prepare to collect data on posts.
submissions_data_raw = []

# Define the start date from which to fetch posts.
start_date = "2024-03-22"
# Convert the start date string into a time structure.
time_struct = time.strptime(start_date, "%Y-%m-%d")
# Convert the time structure into a UNIX timestamp.
start_timestamp = int(time.mktime(time_struct))

# Iterate over each unique subreddit name obtained from previous searches.
for subreddit_name in set(subreddits_names):
    print(f"Fetching {subreddit_name} ...")  # Informative print statement indicating current subreddit being processed.
    
    # Fetch submission data from the current subreddit, filtered by keywords and date.
    data_raw = fetch_submission_data(
        subreddit_name=subreddit_name,
        start_timestamp=start_timestamp,
        keywords="Kendrick Lamar OR Drake",
        limit=10000  # Attempt to fetch up to 10,000 submissions.
    )
    
    # Extend the main list with the data fetched from the current subreddit.
    submissions_data_raw.extend(data_raw)

Fetching Hiphopcirclejerk ...
Fetching Music ...
Fetching BlackPeopleTwitter ...
Fetching h3h3productions ...
Fetching NonPoliticalTwitter ...
Fetching Destiny ...
Fetching TeamSESH ...
Fetching popheads ...
Fetching conspiracy ...
Fetching popculturechat ...
Fetching tumblr ...
Fetching entertainment ...
Fetching hiphopheads ...
Fetching KendrickLamar ...
Fetching interestingasfuck ...
Fetching Drizzy ...
Fetching ufc ...
Fetching rap ...
Fetching TwoBestFriendsPlay ...
Fetching Fauxmoi ...
Fetching hiphop101 ...
Fetching playboicarti ...


Unnamed: 0,author,author_flair_text,clicked,comments,created_utc,distinguished,edited,id,is_original_content,is_self,...,poll_data,saved,score,selftext,spoiler,stickied,subreddit,title,upvote_ratio,url
0,Me_Zebra,,False,"(l4lm3nd, l4l61ze, l4liyn5, l4limap, l4lmcuz, ...",1716032000.0,,False,1cuvdhs,False,False,...,,False,2056,"What’s up Reddit, it’s Kendrick Lamar.\n\nI ha...",False,False,Hiphopcirclejerk,"I am Kendrick Lamar, AMA",0.99,https://i.redd.it/wmwtpt51961d1.jpeg
1,Pakiman1432,,False,"(l3sn4s4, l3svfm3, l3shfhf, l3sw5rn, l3threp, ...",1715561000.0,,False,1cqmjpk,False,False,...,,False,967,,False,False,Hiphopcirclejerk,Kendrick lamar sub is an underrated goldmine,0.98,https://i.redd.it/gvwlwq56b30d1.jpeg
2,kanyetookthekids,,False,"(l222zqx, l22xj2q, l22glgb, l226sh4, l22ae1e, ...",1714542000.0,,False,1chehpz,False,False,...,,False,1967,,False,False,Hiphopcirclejerk,Kendrick Lamar goes post-rock????,1.0,https://i.redd.it/ea4fb2b16rxc1.jpeg
3,puerdestellae,,False,"(l3w7gw2, l3wp6z6, l3wer2s, l3wuujy, l3wu6c9, ...",1715628000.0,,False,1cr7qde,False,False,...,,False,2430,,False,False,Hiphopcirclejerk,Why hasn't Drake addressed Kendrick's Prematur...,0.99,https://i.redd.it/iug4kuk9u80d1.png
4,MxCxVA,bought streams for French Montana,False,"(l3rl751, l3roi45, l3rwvt3, l3s007a, l3roy1z, ...",1715550000.0,,1715577744.0,1cqioo2,False,True,...,,False,1411,I have been laying down in bed for the last tw...,False,False,Hiphopcirclejerk,Drake won the beef,0.85,https://www.reddit.com/r/Hiphopcirclejerk/comm...


In [None]:
# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(submissions_data_raw)

# Display the first few rows of the DataFrame to verify its structure and content
print(df.head())

# Print the shape of the DataFrame to see the number of rows and columns
print(df.shape)

# Ensure the target directory exists and save the DataFrame to a CSV file
os.makedirs('data', exist_ok=True)  # This will create the directory if it does not exist, avoiding FileNotFoundError
df.to_csv("data/rap_beef.csv", index=False) 