In [7]:
import os
import pandas as pd
import numpy as np
from rake_nltk import Rake
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import subprocess
import glob

In [8]:
# CONFIG

input_path = r"hdfs://localhost:9000/user/adarsh/realtime_pipeline/raw_batches"
local_fallback_path = r"E:\Coding\BDA-PySpark\realtime-pipeline\reddit_streaming\raw_batches"  
hadoop_bin = r"E:\hadoop\bin\hdfs.cmd" 

output_dir = r"E:\Coding\BDA-PySpark\realtime-pipeline\results_spark"
final_output_file = os.path.join(output_dir, "keywords_spark.csv")


os.makedirs(output_dir, exist_ok=True)

In [9]:
# HELPER: READ FILES FROM HDFS OR LOCAL

def read_hdfs_texts(hdfs_path):
   
    try:
        print(f"Attempting to read from HDFS: {hdfs_path}")
        result = subprocess.run(
            [hadoop_bin, "dfs", "-cat", f"{hdfs_path}/*.txt"],
            capture_output=True,
            text=True,
            check=True
        )
        lines = [line.strip() for line in result.stdout.split("\n") if line.strip()]
        print(f"Loaded {len(lines)} comments from HDFS.")
        return lines

    except FileNotFoundError:
        print("Hadoop CLI not found. Falling back to local folder..")
    except subprocess.CalledProcessError as e:
        print(f"Error reading from HDFS: {e}. Falling back to local folder..")

    # Local Fallback 
    local_files = glob.glob(os.path.join(local_fallback_path, "*.txt"))
    if not local_files:
        print("No local batch files found either.")
        return []
    all_lines = []
    for f in local_files:
        with open(f, "r", encoding="utf-8", errors="ignore") as infile:
            all_lines.extend([line.strip() for line in infile if line.strip()])
    print(f"Loaded {len(all_lines)} comments from local batches.")
    return all_lines


In [10]:
# RAKE + TF-IDF SETUP
RAKE = Rake(min_length=1, max_length=4, include_repeated_phrases=False)

# Custom stopwords
CUSTOM_STOPWORDS = {
    'i','me','my','mine','you','your','yours','he','him','his','she','her','hers','it','its','we','us','our','ours',
    'they','them','their','theirs','a','an','the','and','but','or','if','because','as','while','of','at','by','for',
    'with','about','against','between','into','through','during','before','after','above','below','to','from','up',
    'down','in','out','on','off','over','under','what','where','when','who','whom','which','why','how',
    'hi','hello','hey','thanks','thank','please','welcome','sure','yeah','ok','okay','alright','fine','right','well',
    'oh','hmm','ah','uh','um','huh','wow','oops','hahaha','haha','lol','really','very','just','actually','maybe',
    'probably','basically','literally','clearly','obviously','definitely','simply','honestly','apparently','anyway',
    'somehow','sometimes','often','always','lot','lots','thing','things','stuff','something','anything','everything',
    'nothing','some','many','much','more','less','few','kind','kinda','sort','sorta','type','types','group','bunch',
    'can','cannot','could','should','would','may','might','must','shall','will','did','does','doing','done','make',
    'makes','made','take','takes','taken','say','says','said','see','seen','seems','know','knows','knew','think',
    'thought','want','wants','wanted','go','goes','went','gone','get','gets','got','gotten','come','came','coming',
    'put','puts','putting','try','trying','tried','use','used','using','feel','feels','felt','give','gave','given',
    'edit','deleted','removed','update','reddit','comment','post','thread','user','bot','mod','mods','report','flair',
    'image','link','url','amp','nbsp','lt','gt','http','https','com'
}

STOPWORDS = list(ENGLISH_STOP_WORDS.union(CUSTOM_STOPWORDS))

In [11]:
# KEYWORD EXTRACTION FUNCTIONS
def filter_topic_keywords(ranked_phrases_with_scores, min_score=1.0):
    filtered = []
    for item in ranked_phrases_with_scores:
        if isinstance(item, tuple) and len(item) == 2:
            score, phrase = item
            phrase_words = phrase.lower().split()
            if isinstance(score, float) and score >= min_score and all(w not in STOPWORDS for w in phrase_words):
                filtered.append(phrase)
        elif isinstance(item, str):
            phrase_words = item.lower().split()
            if all(w not in STOPWORDS for w in phrase_words):
                filtered.append(item)
    return filtered[:10]

def extract_tfidf_keywords(texts, n_keywords=5):
    try:
        vectorizer = TfidfVectorizer(
            max_features=1000,
            stop_words=STOPWORDS,
            ngram_range=(1, 3),
            min_df=2,
            max_df=0.8
        )
        X = vectorizer.fit_transform(texts)
        feature_names = vectorizer.get_feature_names_out()
        avg_scores = np.asarray(X.mean(axis=0)).flatten()
        top_indices = avg_scores.argsort()[::-1][:n_keywords]
        return [feature_names[i] for i in top_indices]
    except Exception as e:
        print(f"TF-IDF extraction error: {e}")
        return []

def run_keyword_extraction(comments, use_tfidf=True):
    result_comments, result_keywords = [], []

    for text in comments:
        text = text.strip()
        if not text or len(text.split()) < 5:
            continue
        try:
            RAKE.extract_keywords_from_text(text)
            ranked_phrases = RAKE.get_ranked_phrases_with_scores()
            topic_keywords = filter_topic_keywords(ranked_phrases)
            topic_keywords = [kw.replace(",", "") for kw in topic_keywords]
            keywords_str = " ".join(topic_keywords) if topic_keywords else "NoKeywords"
            result_comments.append(text)
            result_keywords.append(keywords_str)
        except Exception as e:
            print(f"Error extracting keywords: {e}")
            result_comments.append(text)
            result_keywords.append("Error processing")

    df = pd.DataFrame({"comment": result_comments, "keywords": result_keywords})

    # Compute summary
    all_keywords = []
    for kw_str in result_keywords:
        if kw_str and kw_str not in ["No keywords found", "Error processing"]:
            all_keywords.extend(kw_str.split())
    summary = pd.Series(all_keywords).value_counts() if all_keywords else pd.Series(dtype=int)
    return df, summary

def extract_topic_themes(df, n_topics=5):
    """Return the most frequent keywords (topic themes)."""
    all_keywords = []
    for kw_str in df['keywords']:
        if kw_str and kw_str not in ["No keywords found", "Error processing"]:
            all_keywords.extend(kw_str.split())
    keyword_counts = Counter(all_keywords)
    top_themes = keyword_counts.most_common(n_topics)
    return top_themes



In [12]:
# MAIN EXECUTION

if __name__ == "__main__":
    print("Starting keyword extraction pipeline...")

    comments = read_hdfs_texts(input_path)
    if not comments:
        print("No data found in HDFS or local path.")
        exit(0)

    df_keywords, summary = run_keyword_extraction(comments, use_tfidf=True)

    # Save results
    df_keywords.to_csv(final_output_file, index=False, encoding='utf-8')
    print(f"Keywords saved to: {final_output_file}")

    # Display top topic themes
    extract_topic_themes(df_keywords, n_topics=5)


Starting keyword extraction pipeline...
Attempting to read from HDFS: hdfs://localhost:9000/user/adarsh/realtime_pipeline/raw_batches
Loaded 134 comments from HDFS.
Keywords saved to: E:\Coding\BDA-PySpark\realtime-pipeline\results_spark\keywords_spark.csv
