In [1]:
import os
import subprocess
import time
import threading
import re
import praw
import nltk
from nltk.corpus import words
import sys, findspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, current_timestamp

In [2]:
# ---------------- NLTK SETUP ----------------
# Fetch every corpus this notebook reads, not just 'words':
#   - 'words'     -> english_words vocabulary built below
#   - 'stopwords' -> stopwords.words('english') in the text-cleaning cell
#   - 'wordnet'   -> WordNetLemmatizer.lemmatize in the text-cleaning cell
#   - 'omw-1.4'   -> NOTE(review): only some NLTK releases need this for the
#                    lemmatizer; it is included defensively.
# nltk.download reports "already up-to-date" when a corpus is already present.
for _corpus in ("words", "stopwords", "wordnet", "omw-1.4"):
    nltk.download(_corpus)

# Lower-cased English vocabulary used as a dictionary filter by the cleaners.
english_words = set(w.lower() for w in words.words())

[nltk_data] Downloading package words to C:\Users\Adarsh
[nltk_data]     Ranjan\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [3]:
# ---------------- PATHS ----------------
# Local staging folders: one for raw (topic-modeling) batches, one for
# filtered (sentiment) batches. Both are created up front if missing.
raw_batches = r"E:\Coding\BDA-PySpark\realtime-pipeline\reddit_streaming\raw_batches"
filtered_batches = r"E:\Coding\BDA-PySpark\realtime-pipeline\reddit_streaming\filtered_batches"
for _local_batch_dir in (raw_batches, filtered_batches):
    os.makedirs(_local_batch_dir, exist_ok=True)

# HDFS targets mirroring the two local folders.
hdfs_uri = "hdfs://localhost:9000"
hdfs_raw_dir = f"{hdfs_uri}/user/adarsh/realtime_pipeline/raw_batches"
hdfs_filtered_dir = f"{hdfs_uri}/user/adarsh/realtime_pipeline/filtered_batches"

# Launcher script used for every `hdfs dfs ...` subprocess call.
HDFS_CMD = r"E:\hadoop\bin\hdfs.cmd"

# Spark checkpoint location.
checkpoint_dir = r"E:\Coding\BDA-PySpark\realtime-pipeline\checkpoints"

# User name exported to the environment for the Hadoop client tools.
os.environ.update({'HADOOP_USER_NAME': 'AdarshRanjan'})

In [4]:
# ---------------- REDDIT API ----------------
# Credentials are taken from the environment so real secrets never have to be
# typed into (and saved with) the notebook. The original placeholder literals
# remain as fallbacks, so behaviour is unchanged when the variables are unset.
reddit = praw.Reddit(
    client_id=os.environ.get("REDDIT_CLIENT_ID", "ID"),
    client_secret=os.environ.get("REDDIT_CLIENT_SECRET", "secret"),
    user_agent="LiveRedditStream/1.0 by /u/bda_pipeline"
)
# NOTE(review): this only confirms the client object was constructed; whether
# the credentials are valid is presumably not known until the first request.
print(" Reddit API connected.")



 Reddit API connected.


In [5]:
import shutil

# ---------------- Directories ----------------

log = r"E:\Coding\BDA-PySpark\realtime-pipeline\logs"

# Relies on names defined in the PATHS cell:
# raw_batches, filtered_batches, hdfs_raw_dir, hdfs_filtered_dir, HDFS_CMD, checkpoint_dir

# ---------------- CLEANUP AND PLACEHOLDERS ----------------
confirm = input("This will delete local batch files, logs, checkpoints, and HDFS data. Continue? (y/n): ")
# Parse the answer once; tolerate surrounding whitespace and upper case.
confirmed = confirm.strip().lower() == 'y'

for local_dir, hdfs_dir in [(raw_batches, hdfs_raw_dir), (filtered_batches, hdfs_filtered_dir)]:
    # ---------------- Local batch cleanup ----------------
    if confirmed and os.path.exists(local_dir):
        deleted_files = 0
        for f in os.listdir(local_dir):
            # Only batch files produced by the streamer are removed.
            if f.startswith(("r_batch", "f_batch")) and f.endswith(".txt"):
                os.remove(os.path.join(local_dir, f))
                deleted_files += 1
        print(f"[LOCAL] Deleted {deleted_files} old batch files in {local_dir}")
    else:
        print(f"[LOCAL] Skipped deletion of local files in {local_dir}")

    # ---------------- HDFS cleanup ----------------
    if confirmed:
        try:
            subprocess.run([HDFS_CMD, "dfs", "-rm", "-r", "-skipTrash", hdfs_dir],
                           check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
            print(f"[HDFS] Deleted existing HDFS folder {hdfs_dir}")
        except subprocess.CalledProcessError:
            # Best effort: a missing folder is expected on a first run.
            print(f"[HDFS] HDFS folder {hdfs_dir} did not exist or could not be deleted")
    else:
        print(f"[HDFS] Skipped deletion of HDFS folder {hdfs_dir}")

    # ---------------- Ensure local folder exists ----------------
    os.makedirs(local_dir, exist_ok=True)
    print(f"[LOCAL] Ensured local folder exists: {local_dir}")

    # ---------------- Create placeholder file ----------------
    placeholder_file = os.path.join(local_dir, "Placeholder.txt")
    with open(placeholder_file, "w", encoding="utf-8") as f:
        f.write("placeholder\n")
    print(f"[LOCAL] Created placeholder file: {placeholder_file}")

    # ---------------- Ensure HDFS folder exists ----------------
    subprocess.run([HDFS_CMD, "dfs", "-mkdir", "-p", hdfs_dir], check=True)
    print(f"[HDFS] Ensured HDFS folder exists: {hdfs_dir}")

    # ---------------- Upload placeholder to HDFS ----------------
    subprocess.run([HDFS_CMD, "dfs", "-put", "-f", placeholder_file, hdfs_dir], check=True)
    print(f"[HDFS] Uploaded placeholder file to HDFS: {hdfs_dir}")

# ---------------- Clean up logs ----------------
if confirmed:
    if os.path.exists(log):
        try:
            shutil.rmtree(log)
            print(f"[LOCAL] Deleted entire logs directory: {log}")

            # Recreate empty directory
            os.makedirs(log, exist_ok=True)
            print(f"[LOCAL] Recreated empty logs directory: {log}")

        except Exception as e:
            print(f"[LOCAL] Failed to delete logs directory: {e}")
    else:
        print(f"[LOCAL] Logs directory doesn't exist: {log}")
else:
    print(f"[LOCAL] Skipped deletion (no recreation)")

# ---------------- Clean up checkpoints ----------------
# The confirmation prompt promises that checkpoints are deleted, so do it here.
if confirmed:
    if os.path.exists(checkpoint_dir):
        try:
            shutil.rmtree(checkpoint_dir)
            print(f"[LOCAL] Deleted checkpoint directory: {checkpoint_dir}")
        except OSError as e:
            print(f"[LOCAL] Failed to delete checkpoint directory: {e}")
    else:
        print(f"[LOCAL] Checkpoint directory doesn't exist: {checkpoint_dir}")
else:
    print(f"[LOCAL] Skipped deletion of checkpoint directory: {checkpoint_dir}")

print("\n Cleanup and placeholder setup complete.")

[LOCAL] Deleted 5 old batch files in E:\Coding\BDA-PySpark\realtime-pipeline\reddit_streaming\raw_batches
[HDFS] Deleted existing HDFS folder hdfs://localhost:9000/user/adarsh/realtime_pipeline/raw_batches
[LOCAL] Ensured local folder exists: E:\Coding\BDA-PySpark\realtime-pipeline\reddit_streaming\raw_batches
[LOCAL] Created placeholder file: E:\Coding\BDA-PySpark\realtime-pipeline\reddit_streaming\raw_batches\Placeholder.txt
[HDFS] Ensured HDFS folder exists: hdfs://localhost:9000/user/adarsh/realtime_pipeline/raw_batches
[HDFS] Uploaded placeholder file to HDFS: hdfs://localhost:9000/user/adarsh/realtime_pipeline/raw_batches
[LOCAL] Deleted 5 old batch files in E:\Coding\BDA-PySpark\realtime-pipeline\reddit_streaming\filtered_batches
[HDFS] Deleted existing HDFS folder hdfs://localhost:9000/user/adarsh/realtime_pipeline/filtered_batches
[LOCAL] Ensured local folder exists: E:\Coding\BDA-PySpark\realtime-pipeline\reddit_streaming\filtered_batches
[LOCAL] Created placeholder file: E:\

In [6]:
# ---------------- Reddit Emoji Dictionary ----------------
# Maps an emoji to the plain word substituted for it before text cleaning.
emoji_dict = {
    # Joy / happiness
    "😂": "joy", "🤣": "laugh", "😊": "happy", "😄": "smile", "😁": "grin", "😆": "laugh",
    "😃": "smile", "😎": "cool", "👍": "like", "❤️": "love", "💖": "love", "💯": "great",

    # Sadness / disappointment
    "😢": "sad", "😭": "cry", "🙁": "sad", "😔": "disappointed", "😟": "worried", "😞": "sad",
    "😩": "tired", "😫": "tired",

    # Anger / frustration
    "😡": "angry", "😠": "angry", "👿": "angry", "💀": "death", "🔥": "fire", "⚡": "shock",

    # Love / admiration
    "😍": "love", "😘": "kiss", "🥰": "love", "💘": "love", "💓": "love", "💝": "love",

    # Surprise / excitement
    "😲": "surprise", "😮": "surprise", "😳": "shocked", "😱": "shocked", "🤯": "mindblown",

    # Sleep / relaxation
    "😴": "sleepy", "💤": "sleep", "😌": "relieved", "😪": "sleepy", "😇": "innocent",

    # Misc positive
    "👏": "clap", "🤝": "handshake", "✨": "sparkle", "🌟": "star", "🎉": "celebration", "🎊": "celebration",

    # Misc negative
    "👎": "dislike", "💔": "heartbroken", "☹️": "sad", "🤢": "disgust", "🤮": "disgust",

    # Misc reactions
    "🤔": "thinking", "🙄": "eyeroll", "🤷": "shrug", "😐": "neutral", "😶": "silent", "😬": "nervous"
}

# U+FE0F (variation selector-16) is an invisible code point that may or may not
# follow an emoji such as the red heart. Matching is done with it removed from
# both the keys and the input so either spelling is recognised.
_VARIATION_SELECTOR_16 = "\ufe0f"
_emoji_lookup = {
    emoji.replace(_VARIATION_SELECTOR_16, ""): meaning
    for emoji, meaning in emoji_dict.items()
}


def replace_emojis(text):
    """Replace every emoji known to ``emoji_dict`` with its word, padded by spaces.

    Args:
        text: input string.

    Returns:
        The string with each known emoji replaced by ``" <meaning> "``.
        Variation selectors (U+FE0F) are dropped from the result; unknown
        emoji and all other characters are left untouched.
    """
    text = text.replace(_VARIATION_SELECTOR_16, "")
    for emoji, meaning in _emoji_lookup.items():
        text = text.replace(emoji, f" {meaning} ")
    return text


In [7]:
# ---------------- UTILITY FUNCTIONS ----------------
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
import re

# Lemmatizer instance shared by the cleaning helpers.
lemmatizer = WordNetLemmatizer()

# Deliberately short stop-word list (function words plus a few chat fillers)
# so that more meaningful words survive for topic modeling.
# Note: `lemmatizer` and `stop_words` are both rebound further down in this cell.
stop_words = set(
    "a an the and or is are to of in that this "
    "it on for with as was at by be from has have "
    "u im yeah oh".split()
)

# Apostrophe-less spellings and the contraction each one is restored to.
_RAW_CONTRACTIONS = {
    "dont": "don't", "cant": "can't", "wont": "won't", "didnt": "didn't",
    "doesnt": "doesn't", "im": "i'm", "ive": "i've", "youre": "you're",
    "theyre": "they're", "thats": "that's",
}
# Whole-word match only, so "time" or "five" are never rewritten.
_RAW_CONTRACTION_RE = re.compile(r"\b(" + "|".join(_RAW_CONTRACTIONS) + r")\b")


def clean_raw_text(text, min_words=50, chunk_size=150):
    """
    Clean a comment for the raw batches (topic modeling), kept as ONE document.

    NOTE: a second ``clean_raw_text`` is defined further down in this same
    cell and rebinds the name, so on a top-to-bottom run the later definition
    is the one callers get.

    Steps: emoji -> words, drop URLs / mentions / hashtags, lowercase, keep
    only a-z, whitespace and . , ! ? ' -, normalise apostrophes, restore common
    contractions, drop single-character tokens.

    Args:
        text: raw comment string.
        min_words: minimum number of surviving tokens for the comment to be kept.
        chunk_size: unused here; kept so the signature matches the other definition.

    Returns:
        A one-element list holding the cleaned comment, or None when fewer
        than ``min_words`` tokens remain.
    """
    # Replace emojis before anything strips non-ASCII characters.
    text = replace_emojis(text)

    # Remove URLs, mentions/hashtags
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"[@#]\w+", "", text)

    text = text.lower()

    # Keep letters plus the punctuation that carries sentence structure.
    text = re.sub(r"[^a-z\s.,!?'-]", "", text)
    text = re.sub(r"\s+", " ", text).strip()

    # Collapse apostrophes inside words so "don't" and "dont" become one
    # spelling before contractions are restored below.
    text = re.sub(r"(\w)'m(\w)", r"\1m\2", text)
    text = re.sub(r"(\w)'(\w)", r"\1\2", text)

    # Restore contractions on whole words only. The previous substring
    # str.replace calls also rewrote the inside of unrelated words
    # ("time" -> "ti'me", "five" -> "fi've").
    text = _RAW_CONTRACTION_RE.sub(lambda m: _RAW_CONTRACTIONS[m.group(1)], text)

    # Only single-character tokens are discarded; no stop-word removal or
    # lemmatization happens in this version.
    words_list = [w for w in text.split() if len(w) > 1]

    # Too short to be a useful document.
    if len(words_list) < min_words:
        return None

    return [" ".join(words_list)]


# ---------------- UTILITY FUNCTIONS ----------------
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
import re

# Lemmatizer instance used by clean_raw_text below.
lemmatizer = WordNetLemmatizer()

# NLTK's English stop words extended with chat fillers and common function words.
stop_words = set(stopwords.words('english'))
stop_words.update([
    "u", "im", "dont", "cant", "didnt", "ive", "yeah", "oh", "like", "know",
    "a", "an", "the", "and", "or", "is", "are", "to", "of", "in", "that", "this",
    "it", "on", "for", "with", "as", "was", "at", "by", "be", "from", "has", "have",
])

def clean_raw_text(text, min_words=50, chunk_size=150):
    """
    Clean a comment for the raw batches (topic modeling).

    Steps: lowercase, emoji -> words, drop URLs / mentions / hashtags,
    squeeze runs of 3+ identical characters to 2, drop non-ASCII, keep only
    a-z, whitespace and . , ! ?, then keep the tokens that are not in
    ``stop_words`` and are present in ``english_words``, lemmatized.

    Args:
        text: raw comment string.
        min_words: comments with fewer surviving tokens are returned as a
            single paragraph instead of being chunked.
        chunk_size: number of tokens per paragraph for longer comments.

    Returns:
        A list of paragraph strings. The list is empty when no token
        survives, so callers testing ``if result:`` skip the comment instead
        of buffering an empty line.
    """

    # Lowercase
    text = text.lower()

    # Replace emojis
    text = replace_emojis(text)

    # Remove URLs, mentions/hashtags, repeated chars, non-ASCII
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"[@#]\w+", "", text)
    text = re.sub(r"(.)\1{2,}", r"\1\1", text)
    text = re.sub(r"[^\x00-\x7F]+", "", text)

    # Keep punctuation . , ! ?
    text = re.sub(r"[^a-z\s.,!?]", "", text)
    text = re.sub(r"\s+", " ", text).strip()

    # Tokenize, drop stop words and out-of-vocabulary tokens, lemmatize.
    # NOTE(review): a token that still has punctuation attached (e.g. "great!")
    # will presumably not be found in english_words and is dropped with it -
    # confirm that is intended.
    words_list = [
        lemmatizer.lemmatize(w)
        for w in text.split()
        if w not in stop_words and w in english_words
    ]

    # Nothing survived: return an empty (falsy) list rather than [""].
    if not words_list:
        return []

    # If too short, keep as one paragraph
    if len(words_list) < min_words:
        return [" ".join(words_list)]

    # Split into multiple paragraphs of chunk_size words
    paragraphs = [
        " ".join(words_list[i:i+chunk_size])
        for i in range(0, len(words_list), chunk_size)
    ]
    return paragraphs


    

In [8]:
# Apostrophe-less spellings and the contraction each one is restored to.
_FILTERED_CONTRACTIONS = {
    "dont": "don't", "cant": "can't", "wont": "won't", "didnt": "didn't",
    "doesnt": "doesn't", "isnt": "isn't", "arent": "aren't", "wasnt": "wasn't",
    "werent": "weren't", "hasnt": "hasn't", "havent": "haven't", "hadnt": "hadn't",
    "wouldnt": "wouldn't", "shouldnt": "shouldn't", "couldnt": "couldn't",
    "im": "i'm", "ive": "i've", "youre": "you're", "youve": "you've",
    "theyre": "they're", "theyve": "they've", "its": "it's", "thats": "that's",
    "whats": "what's", "heres": "here's", "theres": "there's",
}
# Whole words only, any letter case.
_FILTERED_CONTRACTION_RE = re.compile(
    r"\b(" + "|".join(_FILTERED_CONTRACTIONS) + r")\b", re.IGNORECASE
)


def _restore_contraction(match):
    """Return the contraction for a matched word, mirroring its capitalisation."""
    word = match.group(0)
    fixed = _FILTERED_CONTRACTIONS[word.lower()]
    if word.isupper():
        return fixed.upper()
    if word[0].isupper():
        return fixed[0].upper() + fixed[1:]
    return fixed


def clean_filtered_text(text, min_words=30):
    """
    Minimal cleaning for sentiment analysis; the comment stays ONE document.

    - Emojis become words; URLs, /r/ and /u/ references, @mentions and
      #hashtags are removed.
    - Runs of 4+ identical characters are squeezed to 2.
    - Common contractions written without an apostrophe are restored
      (negations matter for sentiment).
    - Word characters, whitespace and . , ! ? ' - are kept; no stop-word removal.

    Args:
        text: raw comment string.
        min_words: minimum whitespace-separated tokens for the comment to be kept.

    Returns:
        A one-element list with the cleaned comment, or None when it has
        fewer than ``min_words`` tokens.
    """

    # Basic cleanup
    text = text.strip()

    # Replace emojis with text
    text = replace_emojis(text)

    # Remove URLs and Reddit patterns
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"/[ru]/\w+", "", text)
    text = re.sub(r"[@#]\w+", "", text)

    # Remove excessive repeated characters (loooove -> loove)
    text = re.sub(r"(.)\1{3,}", r"\1\1", text)

    # Typographic apostrophes would be deleted by the character filter below,
    # turning "don’t" into "dont"; convert them to ASCII first.
    text = text.replace("\u2019", "'").replace("\u2018", "'")

    # Restore contractions on whole words only. The earlier substring
    # str.replace calls rewrote the inside of unrelated words ("time" ->
    # "ti'me"), and the apostrophe-stripping pass that followed removed every
    # apostrophe again, cancelling the fix. With whole-word matching that
    # repair pass is unnecessary and has been dropped.
    text = _FILTERED_CONTRACTION_RE.sub(_restore_contraction, text)

    # Keep punctuation - only remove extreme special chars
    text = re.sub(r"[^\w\s.,!?'-]", "", text)

    # Normalize whitespace
    text = re.sub(r"\s+", " ", text).strip()

    # Skip very short comments
    word_count = len(text.split())
    if word_count < min_words:
        return None

    # Return as single document
    return [text]


In [9]:
def get_next_batch_number(folder, prefix="batch", suffix=".txt"):
    """Return the next free batch number for files named ``<prefix><N>...<suffix>``.

    Args:
        folder: directory to scan (must exist).
        prefix: file-name prefix that identifies a batch file.
        suffix: file-name suffix (extension) that identifies a batch file.

    Returns:
        One more than the highest number found after the prefix, or 1 when no
        matching file carries a number.
    """
    highest = 0
    for name in os.listdir(folder):
        if not (name.startswith(prefix) and name.endswith(suffix)):
            continue
        # Only look between prefix and suffix, so digits that are part of the
        # prefix or the extension are never mistaken for the batch number.
        end = len(name) - len(suffix) if suffix else len(name)
        match = re.search(r"\d+", name[len(prefix):end])
        if match:
            highest = max(highest, int(match.group()))
    # highest stays 0 when nothing parsed, which yields the first batch number.
    return highest + 1


In [10]:
from datetime import datetime
import time
import threading
import subprocess
import os
import pandas as pd

# ---------------- LOG FUNCTION ----------------
def append_log_to_csv(log_entry, log_csv_path=None):
    """Append one log entry as a row of the upload-log CSV.

    Args:
        log_entry: dict mapping column name to value for a single row.
        log_csv_path: CSV file to append to. Defaults to the pipeline's
            ``logs\\upload_logs.csv`` when omitted.

    The parent directory is created if needed. The header row is written only
    when the file does not exist yet or is empty.
    """
    if log_csv_path is None:
        log_csv_path = r"E:\Coding\BDA-PySpark\realtime-pipeline\logs\upload_logs.csv"

    log_dir = os.path.dirname(log_csv_path)
    if log_dir:  # a bare file name has no directory component to create
        os.makedirs(log_dir, exist_ok=True)

    needs_header = (
        not os.path.exists(log_csv_path) or os.path.getsize(log_csv_path) == 0
    )
    pd.DataFrame([log_entry]).to_csv(
        log_csv_path, mode='a', header=needs_header, index=False
    )

# ---------------- REDDIT STREAMING ----------------
# Module-level state shared with stream_reddit_comments (declared `global` there).

# Cleaned lines waiting to be written out as the next batch.
buffer_raw = []
buffer_filtered = []

# Time of the last flush per stream, used for the time-based flush.
last_write_time_raw = time.time()
last_write_time_filtered = time.time()

# Continue numbering after any batch files already on disk.
batch_count_raw = get_next_batch_number(raw_batches, prefix="r_batch")
batch_count_filtered = get_next_batch_number(filtered_batches, prefix="f_batch")

def _flush_batch(lines, batch_number, file_prefix, separator, local_dir, hdfs_dir, folder_type):
    """Write one batch locally, upload it to HDFS and record it in the CSV log.

    Args:
        lines: cleaned text lines making up the batch.
        batch_number: sequence number used in the file name and the log row.
        file_prefix: file-name prefix ("r_batch" or "f_batch").
        separator: string written between consecutive lines.
        local_dir: local folder the batch file is written to.
        hdfs_dir: HDFS folder the file is uploaded to.
        folder_type: label used in the console message and the log row.

    Raises:
        subprocess.CalledProcessError: if the HDFS upload command fails.
    """
    start_time = time.time()

    filename = f"{file_prefix}{batch_number}.txt"
    local_path = os.path.join(local_dir, filename)

    # Write first so the file exists when its size is read.
    with open(local_path, "w", encoding="utf-8") as f:
        f.write(separator.join(lines))
    file_size = os.path.getsize(local_path)

    # Upload to HDFS with the launcher configured in the PATHS cell.
    subprocess.run([HDFS_CMD, "dfs", "-put", "-f", local_path, hdfs_dir], check=True)

    duration = time.time() - start_time

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"[{timestamp}] Uploaded {filename} to HDFS folder: {folder_type} | Lines: {len(lines)}")

    append_log_to_csv({
        "timestamp": timestamp,
        "filename": filename,
        "folder_type": folder_type,
        "batch_number": batch_number,
        "lines_count": len(lines),
        "file_size_bytes": file_size,
        "file_size_kb": round(file_size / 1024, 2),
        "processing_duration_sec": round(duration, 3),
        "status": "success"
    })


def stream_reddit_comments():
    """Consume the r/all comment stream forever, batching cleaned text to HDFS.

    Each comment is cleaned twice (raw and filtered variants) and appended to
    the matching module-level buffer. A buffer is flushed when it holds at
    least 50 lines or its flush interval has elapsed (90 s raw, 70 s
    filtered), provided it is not empty. Any exception is printed and the
    stream is reopened after 5 seconds; the buffers keep their content, so a
    failed batch is retried on a later comment.
    """
    global buffer_raw, buffer_filtered
    global last_write_time_raw, last_write_time_filtered
    global batch_count_raw, batch_count_filtered

    max_lines = 50            # flush once a buffer holds this many lines
    raw_interval_sec = 90     # ... or after this long for raw batches
    filtered_interval_sec = 70  # ... or after this long for filtered batches

    while True:
        try:
            for comment in reddit.subreddit("all").stream.comments(skip_existing=True):
                text = comment.body.replace("\n", " ").strip()

                lines_raw = clean_raw_text(text)
                lines_filtered = clean_filtered_text(text)

                if lines_raw:
                    buffer_raw.extend(lines_raw)
                if lines_filtered:
                    buffer_filtered.extend(lines_filtered)

                current_time = time.time()

                # ---------------- Raw batch writing ----------------
                raw_due = (len(buffer_raw) >= max_lines
                           or current_time - last_write_time_raw >= raw_interval_sec)
                if raw_due and buffer_raw:
                    # Blank line between comments in raw batches.
                    _flush_batch(buffer_raw, batch_count_raw, "r_batch", "\n\n",
                                 raw_batches, hdfs_raw_dir, "raw_batches")
                    buffer_raw.clear()
                    last_write_time_raw = current_time
                    batch_count_raw += 1

                # ---------------- Filtered batch writing ----------------
                filtered_due = (len(buffer_filtered) >= max_lines
                                or current_time - last_write_time_filtered >= filtered_interval_sec)
                if filtered_due and buffer_filtered:
                    _flush_batch(buffer_filtered, batch_count_filtered, "f_batch", "\n",
                                 filtered_batches, hdfs_filtered_dir, "filtered_batches")
                    buffer_filtered.clear()
                    last_write_time_filtered = current_time
                    batch_count_filtered += 1

                # Throttle: at most one comment is processed per second.
                time.sleep(1)

        except Exception as e:
            timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            print(f"[{timestamp}] Stream Error: {e}")
            time.sleep(5)

# Run the Reddit consumer in the background so the notebook stays usable.
# daemon=True means the thread will not keep the interpreter alive on exit.
reddit_thread = threading.Thread(
    target=stream_reddit_comments,
    daemon=True,
)
reddit_thread.start()


[2025-10-27 00:46:08] Uploaded f_batch1.txt to HDFS folder: filtered_batches | Lines: 10
[2025-10-27 00:46:28] Uploaded r_batch1.txt to HDFS folder: raw_batches | Lines: 3
[2025-10-27 00:47:18] Uploaded f_batch2.txt to HDFS folder: filtered_batches | Lines: 7
[2025-10-27 00:47:58] Uploaded r_batch2.txt to HDFS folder: raw_batches | Lines: 4
[2025-10-27 00:48:29] Uploaded f_batch3.txt to HDFS folder: filtered_batches | Lines: 17
[2025-10-27 00:49:29] Uploaded r_batch3.txt to HDFS folder: raw_batches | Lines: 10
[2025-10-27 00:49:39] Uploaded f_batch4.txt to HDFS folder: filtered_batches | Lines: 8
[2025-10-27 00:50:49] Uploaded f_batch5.txt to HDFS folder: filtered_batches | Lines: 20
[2025-10-27 00:51:00] Uploaded r_batch4.txt to HDFS folder: raw_batches | Lines: 10
[2025-10-27 00:51:59] Uploaded f_batch6.txt to HDFS folder: filtered_batches | Lines: 8
[2025-10-27 00:52:30] Uploaded r_batch5.txt to HDFS folder: raw_batches | Lines: 8
[2025-10-27 00:53:10] Uploaded f_batch7.txt to HDFS 

In [11]:
# ---------------- SPARK STREAMING ----------------
# Point findspark at the local Spark distribution and make the driver and the
# workers use the same interpreter as this kernel.
os.environ.update({
    "SPARK_HOME": r"E:\Coding\BDA-PySpark\spark-3.4.1-bin-hadoop3",
    "PYSPARK_PYTHON": sys.executable,
    "PYSPARK_DRIVER_PYTHON": sys.executable,
})
findspark.init()

# Local session using all available cores; reuses an existing session if any.
spark = (
    SparkSession.builder
    .appName("RedditRealTimeProcessor")
    .master("local[*]")
    .getOrCreate()
)
spark.sparkContext.setLogLevel("WARN")



In [None]:
# Block this cell until a streaming query managed by `spark` terminates.
# NOTE(review): no streaming query is started in the cells visible here -
# confirm one is started elsewhere before relying on this call.
spark.streams.awaitAnyTermination()