In [50]:
import requests
import json
from bs4 import BeautifulSoup
from datetime import datetime
from transformers import pipeline
import matplotlib.pyplot as plt

In [51]:
def scrape_reddit_topics(search_query):
    # Use search endpoint with query
    url = f"https://www.reddit.com/search.json?q={search_query}&sort=relevance"
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.3'
    }
    try:
        # Get search results
        response = requests.get(url, headers=headers)
        data = response.json()
        
        posts_with_comments = []
        
        for post in data['data']['children']:
            post_data = post['data']
            
            # Get comments for this post
            comments_url = f"https://www.reddit.com{post_data['permalink']}.json"
            comments_response = requests.get(comments_url, headers=headers)
            comments_data = comments_response.json()
            
            # Extract comments
            comments = []
            if len(comments_data) > 1:  # Check if there are comments
                for comment in comments_data[1]['data']['children']:
                    if 'body' in comment['data']:
                        comments.append({
                            'author': comment['data'].get('author', '[deleted]'),
                            'body': comment['data']['body'],
                            'score': comment['data'].get('score', 0),
                            'created_utc': datetime.fromtimestamp(comment['data']['created_utc']).isoformat()
                        })
            
            # Combine post and comments data
            posts_with_comments.append({
                'title': post_data['title'],
                'author': post_data['author'],
                'score': post_data['score'],
                'url': post_data['url'],
                'created_utc': datetime.fromtimestamp(post_data['created_utc']).isoformat(),
                'num_comments': post_data['num_comments'],
                'selftext': post_data.get('selftext', ''),
                'comments': comments
            })
        
        # Save to JSON file
            
        return posts_with_comments
        
    except Exception as e:
        print(f"Error occurred: {str(e)}")
        return None

In [52]:
#webscraping particular subreddits
print("Hi! Welcome to the Reddit Web Scraper!")
subreddit = input("Please enter the subreddit you would like to scrape: ")
posts = scrape_reddit_topics(subreddit)

Hi! Welcome to the Reddit Web Scraper!


In [53]:
#Scraped data is in JSON format


In [54]:
def extract_posts_and_comments(json_obj):
    texts = []
    for entry in json_obj:
        # Extract post content (selftext)
        if 'selftext' in entry and entry['selftext']:
            texts.append(entry['selftext'])
        
        # Extract comments
        if 'comments' in entry:
            for comment in entry['comments']:
                if 'body' in comment:
                    texts.append(comment['body'])
    return texts

posts_and_comments = extract_posts_and_comments(posts)



In [None]:
#NLP stuff 
classifier = pipeline('sentiment-analysis', model='allenai/longformer-base-4096')

results=[]
for text in posts_and_comments:
    result = classifier(text)
    results.append(result)

#label 0: negative, label 1: positive
negative_count = 0
positive_count = 0
for count in results:
    if count[0]['label'] == 'LABEL_1':
        negative_count += 1
    elif count[0]['label'] == 'LABEL_0':
        positive_count += 1

Some layers from the model checkpoint at allenai/longformer-base-4096 were not used when initializing TFLongformerForSequenceClassification: ['lm_head']
- This IS expected if you are initializing TFLongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFLongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFLongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use 0


In [None]:
#visualizing the NLP data
labels = ['Positive', 'Negative']
sizes = [positive_count, negative_count]
colors = ['gold', 'yellowgreen']
explode = (0.1, 0) 
plt.pie(sizes, explode=explode, labels=labels, colors=colors, autopct='%1.1f%%', shadow=True, startangle=140)
plt.axis('equal')
