In [7]:
!pip install textblob

Collecting textblob
  Downloading textblob-0.19.0-py3-none-any.whl.metadata (4.4 kB)
Collecting nltk>=3.9 (from textblob)
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Downloading textblob-0.19.0-py3-none-any.whl (624 kB)
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
    --------------------------------------- 10.2/624.3 kB ? eta -:--:--
    --------------------------------------- 10.2/624.3 kB ? eta -:--:--
   - -------------------------------------- 20.5/624.3 kB 93.9 kB/s eta 0:00:07
   --- ----------------------------------- 61.4/624.3 kB 328.2 kB/s eta 0:00:02
   ---- ---------------------------------- 71.7/624.3 kB 302.7 kB/s eta 0:00:02
   ---- ---------------------------------- 71.7/624.3 kB 302.7 kB/s eta 0:00:02
   ---- ---------------------------------- 71.7/624.3 kB 302.7 kB/s eta 0:00:02
   ---- ---------------------------------- 71.7/624.3 kB 302.7 kB/s eta 0:00:02
   ------------ ------------------------- 204.8/624.3 kB 541.9 kB/s e

In [9]:
!python -m textblob.download_corpora

Finished.


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\Guppa\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\brown.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Guppa\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Guppa\AppData\Roaming\nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Guppa\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package conll2000 to
[nltk_data]     C:\Users\Guppa\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\conll2000.zip.
[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\Guppa\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\movie_reviews.zip.


In [3]:
import praw
import json
import time
import random
import os
from datetime import datetime, timezone
from textblob import TextBlob
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

#Function for Sentiment Analysis
def get_sentiment(text):
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0.1:
        return "positive"
    elif polarity < -0.1:
        return "negative"
    else:
        return "neutral"

#Scrapper
class RedditScraper:
    def __init__(self, client_id, client_secret, user_agent, subreddits, post_limit=500, chunk_size=100, include_comments=True):
        self.reddit = praw.Reddit(
            client_id=client_id,
            client_secret=client_secret,
            user_agent=user_agent
        )
        self.subreddits = subreddits
        self.post_limit = post_limit
        self.chunk_size = chunk_size
        self.include_comments = include_comments
        self.output_dir = "reddit_scrape_chunks"
        os.makedirs(self.output_dir, exist_ok=True)
        self.all_data = []
        self.file_index = 1

    def scrape(self):
        for sub in self.subreddits:
            print(f" Scraping r/{sub}...")
            try:
                for count, post in enumerate(self.reddit.subreddit(sub).top(limit=self.post_limit), 1):
                    try:
                        post_data = self._extract_post_data(post, sub)
                        self.all_data.append(post_data)

                        if count % self.chunk_size == 0:
                            self._save_chunk(sub)
                            self.all_data = []
                            self.file_index += 1

                        time.sleep(0.5)

                    except Exception as post_err:
                        print(f" Error on post {post.id}: {post_err}")
                        time.sleep(2)

                sleep_time = random.uniform(6, 12)
                print(f" Sleeping {sleep_time:.2f}s to avoid rate limit...")
                time.sleep(sleep_time)

            except Exception as e:
                print(f" Error in r/{sub}: {e}")
                if "429" in str(e):
                    print(" Hit rate limit. Sleeping for 60s...")
                    time.sleep(60)
                else:
                    time.sleep(5)

        if self.all_data:
            self._save_chunk(sub)

    def _extract_post_data(self, post, subreddit):
        post_data = {
            'subreddit': subreddit,
            'title': post.title,
            'id': post.id,
            'url': post.url,
            'score': post.score,
            'text': post.selftext,
            'created_utc': post.created_utc,
            'created_at': datetime.fromtimestamp(post.created_utc, timezone.utc).strftime('%Y-%m-%d %H:%M:%S'),
            'num_comments': post.num_comments,
            'comments': []
        }

        if self.include_comments:
            try:
                post.comments.replace_more(limit=0)
                post_data['comments'] = [{
                    'id': comment.id,
                    'author': str(comment.author),
                    'body': comment.body,
                    'score': comment.score,
                    'created_utc': comment.created_utc,
                    'created_at': datetime.fromtimestamp(comment.created_utc, timezone.utc).strftime('%Y-%m-%d %H:%M:%S'),
                    'sentiment': get_sentiment(comment.body)
                } for comment in post.comments.list()]

            except Exception as e:
                print(f" Error fetching comments for post {post.id}: {e}")
                time.sleep(2)

        return post_data

    def _save_chunk(self, subreddit):
        file_path = os.path.join(self.output_dir, f"{subreddit}_batch_{self.file_index}.json")
        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(self.all_data, f, indent=2)
        print(f" Saved {len(self.all_data)} posts to {file_path}")

#Common points function 
class RedditCommentClusterer:
    def __init__(self, json_folder='reddit_scrape_chunks', n_clusters=5):
        self.json_folder = json_folder
        self.n_clusters = n_clusters
        self.comments = []
        self.vectorizer = TfidfVectorizer(stop_words='english', max_df=0.9)
        self.kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        self.labels = []

    def load_comments(self):
        print("Loading comments...")
        for filename in os.listdir(self.json_folder):
            if filename.endswith(".json"):
                with open(os.path.join(self.json_folder, filename), "r", encoding="utf-8") as f:
                    posts = json.load(f)
                    for post in posts:
                        for comment in post.get("comments", []):
                            text = comment.get("body", "").strip()
                            if text:
                                self.comments.append(text)
        print(f" Loaded {len(self.comments)} comments.")

    def vectorize_comments(self):
        print(" Vectorizing comments...")
        return self.vectorizer.fit_transform(self.comments)

    def perform_clustering(self, X):
        print(" Performing KMeans clustering...")
        self.kmeans.fit(X)
        self.labels = self.kmeans.labels_

    def get_top_words_per_cluster(self, n_words=10):
        print(" Extracting top words per cluster...")
        terms = self.vectorizer.get_feature_names_out()
        order_centroids = self.kmeans.cluster_centers_.argsort()[:, ::-1]
        clusters = {}
        for i in range(self.n_clusters):
            top_words = [terms[ind] for ind in order_centroids[i, :n_words]]
            clusters[i] = top_words
        return clusters

    def get_sample_comments_per_cluster(self, samples_per_cluster=3):
        print(" Collecting sample comments...")
        clustered_comments = {i: [] for i in range(self.n_clusters)}
        for idx, label in enumerate(self.labels):
            if len(clustered_comments[label]) < samples_per_cluster:
                clustered_comments[label].append(self.comments[idx])
        return clustered_comments

    def run(self):
        self.load_comments()
        X = self.vectorize_comments()
        self.perform_clustering(X)
        top_words = self.get_top_words_per_cluster()
        sample_comments = self.get_sample_comments_per_cluster()

        # Display results as DataFrame
        df = pd.DataFrame({
            "Top Words Per Cluster": [', '.join(top_words[i]) for i in range(self.n_clusters)],
            "Sample Comments": ['\n\n'.join(sample_comments[i]) for i in range(self.n_clusters)]
        })
        return df


clusterer = RedditCommentClusterer(json_folder="reddit_scrape_chunks", n_clusters=5)
result_df = clusterer.run()

# Display result 
import IPython.display as display
display.display(result_df)



scraper = RedditScraper(
    client_id='94-KDboZSbfIo3SK3FDSzg',
    client_secret='6ktVmT5--Uj7CeubVKIJJCII2tFsEQ',
    user_agent='DataScrapper1811',
    subreddits=[
        'CryptoCurrency', 'CryptoMarkets',
        'Bitcoin', 'BitcoinBeginners', 'btc',
        'ethtrader', 'ethereum',
        'XRP', 'Ripple',
        'binance',
        'solana',
        'Tronix',
        'dogecoin',
        'cardano'
    ],
    post_limit=500,
    chunk_size=100,
    include_comments=True
)

scraper.scrape()


Loading comments...
 Loaded 912467 comments.
 Vectorizing comments...
 Performing KMeans clustering...
 Extracting top words per cluster...
 Collecting sample comments...


Unnamed: 0,Top Words Per Cluster,Sample Comments
0,"bitcoin, cash, https, people, just, buy, like,...",This subreddit is not a place where companies ...
1,"way, buy, lol, think, like, sell, good, better...",You don’t. You HODL. BTC is worth way more.\n\...
2,"just, like, crypto, people, lol, buy, good, mo...",----------------------------------\n\n**Some f...
3,"don, know, thanks, people, just, think, like, ...","Once in a lifetime, you hear a story. You just..."
4,"deleted, removed, post, comment, did, account,...",[removed]\n\n[removed]\n\n[removed]


 Scraping r/CryptoCurrency...
 Saved 100 posts to reddit_scrape_chunks\CryptoCurrency_batch_1.json
 Saved 100 posts to reddit_scrape_chunks\CryptoCurrency_batch_2.json
 Saved 100 posts to reddit_scrape_chunks\CryptoCurrency_batch_3.json
 Saved 100 posts to reddit_scrape_chunks\CryptoCurrency_batch_4.json
 Saved 100 posts to reddit_scrape_chunks\CryptoCurrency_batch_5.json
 Sleeping 6.16s to avoid rate limit...
 Scraping r/CryptoMarkets...
 Saved 100 posts to reddit_scrape_chunks\CryptoMarkets_batch_6.json
 Saved 100 posts to reddit_scrape_chunks\CryptoMarkets_batch_7.json
 Saved 100 posts to reddit_scrape_chunks\CryptoMarkets_batch_8.json
 Saved 100 posts to reddit_scrape_chunks\CryptoMarkets_batch_9.json
 Saved 100 posts to reddit_scrape_chunks\CryptoMarkets_batch_10.json
 Sleeping 10.22s to avoid rate limit...
 Scraping r/Bitcoin...
 Saved 100 posts to reddit_scrape_chunks\Bitcoin_batch_11.json
 Saved 100 posts to reddit_scrape_chunks\Bitcoin_batch_12.json
 Saved 100 posts to reddit_