**Cleaned and processed Bitcoin & Ethereum data on Reddit**

**praw** – Interacts with Reddit’s API to fetch posts and comments.
**logging** – Tracks errors, warnings, and events for debugging.
**dotenv** – Loads API keys securely from a .env file.

**nltk** – For text processing and NLP tasks where:
    stopwords – Removes common words like "the", "is", etc.
    word_tokenize – Splits text into words for analysis.

In [1]:
import logging
import os
import re
from datetime import datetime, timezone

import pandas as pd
import praw
from dotenv import load_dotenv

Tokens and stopwords need to be downloaded.
The client ID and secret need to be loaded from the environment.

In [2]:
# Small hand-picked stopword set, used by the text cleaner in place of
# NLTK's English stopword list.
CUSTOM_STOPWORDS = {
    "the", "is", "and", "in",
    "to", "of", "it", "on",
    "this", "for", "with", "as",
    "was", "that", "by", "at",
}


Logging configuration to set the level of logging. 

In [26]:
#nltk.data.path.append("C:/Users/Ajay/nltk_data")

In [3]:
# Load settings from a .env file so the REDDIT_* credentials can later be
# read with os.getenv() when the Reddit client is created.
load_dotenv()

True

In [4]:
# Root logger: show INFO and above, each message prefixed with its timestamp.
logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)

In [5]:
class RedditScraper:
    """Thin wrapper around a PRAW client for keyword searches in a subreddit.

    The client is configured from the ``REDDIT_CLIENT_ID``,
    ``REDDIT_CLIENT_SECRET`` and ``REDDIT_USER_AGENT`` environment variables.
    """

    def __init__(self):
        """Create the PRAW client from environment variables."""
        self.reddit = praw.Reddit(
            client_id=os.getenv('REDDIT_CLIENT_ID'),
            client_secret=os.getenv('REDDIT_CLIENT_SECRET'),
            user_agent=os.getenv('REDDIT_USER_AGENT'),
        )
        logging.info("Connected to Reddit API")

    def get_posts(self, keyword, subreddit='cryptocurrency', limit=5):
        """Search a subreddit for ``keyword`` and return the newest matches.

        Args:
            keyword: Search query; it is also stored on every returned row.
            subreddit: Name of the subreddit to search.
            limit: Maximum number of posts to request.

        Returns:
            A list of dicts with the keys ``keyword``, ``title``, ``content``,
            ``score``, ``url`` and ``created_at`` (a timezone-aware UTC
            ``datetime``). If anything fails, the error is logged and an
            empty list is returned instead of raising.
        """
        try:
            # The ordering has to go through search()'s own ``sort`` argument.
            # A sort passed inside ``params`` is overwritten by that
            # argument's default ('relevance'), so results would not actually
            # be newest-first.
            posts = self.reddit.subreddit(subreddit).search(
                query=keyword,
                sort='new',
                limit=limit,
            )

            # Consume the results inside the ``try`` so that errors raised
            # while iterating are handled as well.
            results = [
                {
                    'keyword': keyword,
                    'title': post.title,
                    'content': post.selftext,
                    'score': post.score,
                    'url': post.url,
                    # ``created_utc`` is a UTC epoch; keep it in UTC instead
                    # of converting to the machine's local (naive) time.
                    'created_at': datetime.fromtimestamp(
                        post.created_utc, tz=timezone.utc
                    ),
                }
                for post in posts
            ]
        except Exception as e:
            # Deliberate best-effort: a failed search yields no rows.
            logging.error(f"Error getting posts: {e}")
            return []

        logging.info(f"Found {len(results)} posts for {keyword}")
        return results

After cleaning, tokenization, and missing-data handling, the data needs to be saved to a CSV file.

In [6]:
class DataManager:
    """Text-cleaning and CSV-export helpers for scraped posts."""

    # Compiled once when the class is created instead of on every call.
    _URL_RE = re.compile(r'http\S+')
    _HTML_TAG_RE = re.compile(r'<.*?>')
    _NON_WORD_RE = re.compile(r'[^\w\s]')

    @staticmethod
    def clean_text(text):
        """Normalize a post body into lowercase, stopword-free tokens.

        URLs, HTML tags and every character that is neither a word character
        nor whitespace are removed. The remainder is lowercased, split on
        whitespace and filtered against ``CUSTOM_STOPWORDS``.

        Args:
            text: Raw text. Missing values (``None``, NaN, empty or blank
                strings) and any other non-string value are accepted.

        Returns:
            The remaining tokens joined by single spaces, or the placeholder
            ``'no_content'`` when the input is missing or nothing is left
            after cleaning.
        """
        # Non-strings are treated as missing. This covers NaN, the float
        # pandas uses for gaps, which is truthy and has no ``.strip()``.
        if not isinstance(text, str) or not text.strip():
            return 'no_content'

        text = DataManager._URL_RE.sub('', text)        # URLs
        text = DataManager._HTML_TAG_RE.sub('', text)   # HTML tags
        text = DataManager._NON_WORD_RE.sub('', text)   # Special characters/emojis

        # Simple whitespace tokenization followed by stopword removal.
        tokens = [
            word for word in text.lower().split()
            if word not in CUSTOM_STOPWORDS
        ]

        return ' '.join(tokens) if tokens else 'no_content'

    @staticmethod
    def save_data(data, name):
        """Clean the ``content`` column of ``data`` and write it to a CSV file.

        The file is written to
        ``cleaned_data/{name}_clean_{YYYY-MM-DD}.csv`` using today's date;
        the ``cleaned_data`` directory is created if it does not exist.

        Args:
            data: Rows to save, in a form ``pandas.DataFrame`` accepts (for
                example a list of dicts). A ``content`` column is required.
            name: Prefix used in the output file name.

        Returns:
            None. When ``data`` is empty a warning is logged and nothing is
            written.
        """
        if not data:
            logging.warning("No data to save")
            return

        df = pd.DataFrame(data)
        df['content'] = df['content'].apply(DataManager.clean_text)

        os.makedirs('cleaned_data', exist_ok=True)
        filename = f"cleaned_data/{name}_clean_{datetime.now().date()}.csv"
        df.to_csv(filename, index=False)
        logging.info(f"Saved cleaned data to {filename}")

In [7]:
if __name__ == "__main__":
    scraper = RedditScraper()

    # Bitcoin: fetch the posts, then write the cleaned CSV.
    bitcoin_posts = scraper.get_posts(keyword='Bitcoin', limit=5)
    DataManager.save_data(data=bitcoin_posts, name='bitcoin')

    # Ethereum: same two steps.
    ethereum_posts = scraper.get_posts(keyword='Ethereum', limit=5)
    DataManager.save_data(data=ethereum_posts, name='ethereum')

2025-01-31 11:41:24,408 - Connected to Reddit API
2025-01-31 11:41:25,101 - Found 5 posts for Bitcoin
2025-01-31 11:41:25,117 - Saved cleaned data to cleaned_data/bitcoin_clean_2025-01-31.csv
2025-01-31 11:41:25,397 - Found 5 posts for Ethereum
2025-01-31 11:41:25,415 - Saved cleaned data to cleaned_data/ethereum_clean_2025-01-31.csv
