In [1]:
import os
import pandas as pd
import spacy
import subprocess
import glob


In [2]:

# CONFIG

# --- Inputs -----------------------------------------------------------------
# HDFS directory passed to read_hdfs_texts() as the primary source.
input_path = r"hdfs://localhost:9000/user/adarsh/realtime_pipeline/raw_batches"
# Local directory scanned for *.txt batches when the HDFS read fails.
local_fallback_path = r"E:\Coding\BDA-PySpark\realtime-pipeline\reddit_streaming\raw_batches"
# Full path to the hdfs.cmd launcher invoked through subprocess.
hadoop_bin = r"E:\hadoop\bin\hdfs.cmd"

# --- Outputs ----------------------------------------------------------------
# Make sure the results directory exists before anything tries to write to it.
output_dir = r"E:\Coding\BDA-PySpark\realtime-pipeline\results_spark"
os.makedirs(output_dir, exist_ok=True)
# CSV file that receives the per-comment entity table.
final_output_file = os.path.join(output_dir, "entity_spark.csv")

In [3]:
# HELPER: READ FILES FROM HDFS OR LOCAL

def read_hdfs_texts(hdfs_path, hadoop_cmd=None, fallback_dir=None):
    """Load non-empty comment lines from HDFS, falling back to local batch files.

    The function runs ``<hadoop_cmd> dfs -cat <hdfs_path>/*.txt`` and returns the
    stripped, non-empty lines of its output. If the command cannot be launched
    or exits with a non-zero status, every ``*.txt`` file in ``fallback_dir`` is
    read instead.

    Args:
        hdfs_path: HDFS directory whose ``*.txt`` files should be concatenated.
        hadoop_cmd: Path to the ``hdfs`` launcher. Defaults to the module-level
            ``hadoop_bin`` setting.
        fallback_dir: Local directory holding ``*.txt`` batches. Defaults to the
            module-level ``local_fallback_path`` setting.

    Returns:
        list[str]: Stripped, non-empty lines. An empty list is returned when the
        HDFS read fails and no local batch files exist.
    """
    # Resolve defaults lazily so the configuration is only required when the
    # caller does not supply explicit values.
    if hadoop_cmd is None:
        hadoop_cmd = hadoop_bin
    if fallback_dir is None:
        fallback_dir = local_fallback_path

    try:
        print(f"Attempting to read from HDFS: {hdfs_path}")
        result = subprocess.run(
            [hadoop_cmd, "dfs", "-cat", f"{hdfs_path}/*.txt"],
            capture_output=True,
            # Decode explicitly as UTF-8 (same policy as the local fallback)
            # instead of the platform locale, which would garble or reject
            # non-ASCII comments on non-UTF-8 systems.
            encoding="utf-8",
            errors="ignore",
            check=True,
        )
        lines = [line.strip() for line in result.stdout.split("\n") if line.strip()]
        print(f"Loaded {len(lines)} comments from HDFS.")
        return lines

    except FileNotFoundError:
        print("Hadoop CLI not found. Falling back to local folder...")
    except subprocess.CalledProcessError as e:
        print(f"Error reading from HDFS: {e}. Falling back to local folder")
    except OSError as e:
        # The launcher exists but could not be started (e.g. not executable).
        print(f"Could not run Hadoop CLI: {e}. Falling back to local folder")

    # Local fallback. Sort the paths: glob order is arbitrary, and a stable
    # order keeps the resulting list reproducible between runs.
    local_files = sorted(glob.glob(os.path.join(fallback_dir, "*.txt")))
    if not local_files:
        print("No local batch files found.")
        return []
    all_lines = []
    for path in local_files:
        with open(path, "r", encoding="utf-8", errors="ignore") as infile:
            all_lines.extend(line.strip() for line in infile if line.strip())
    print(f"Loaded {len(all_lines)} comments from local batches.")
    return all_lines

# ENTITY EXTRACTION LOGIC

# Load the spaCy "en_core_web_sm" pipeline once at module level;
# run_entity_extraction calls NLP(text) on each comment and reads doc.ents.
NLP = spacy.load("en_core_web_sm")

def run_entity_extraction(comments, nlp=None, min_words=5):
    """Extract named entities from each comment and tally entity labels.

    Args:
        comments: Iterable of comment strings.
        nlp: Callable that takes a string and returns an object with an
            ``ents`` iterable whose items expose ``text`` and ``label_``.
            Defaults to the module-level ``NLP`` pipeline.
        min_words: Comments with fewer whitespace-separated words than this
            are skipped. Defaults to 5.

    Returns:
        tuple: ``(df, summary)`` where ``df`` is a DataFrame with columns
        ``comment`` and ``entities`` (entities rendered as
        ``"text:label, text:label"``, or ``"No entities found"`` /
        ``"Error processing"``), and ``summary`` is a Series mapping each
        entity label to the number of times it occurred (empty int Series
        when no entities were found).
    """
    if nlp is None:
        nlp = NLP

    result_comments = []
    result_entities = []
    entity_types = {}

    for text in comments:
        text = text.strip()
        if not text or len(text.split()) < min_words:
            continue
        try:
            doc = nlp(text)
            ents = [(ent.text, ent.label_) for ent in doc.ents]
        except Exception as e:
            # Best effort: keep the comment in the output and flag the failure.
            print(f"Error processing text: {e}")
            result_comments.append(text)
            result_entities.append("Error processing")
            continue

        if ents:
            entities_str = ", ".join(f"{ent_text}:{label}" for ent_text, label in ents)
        else:
            entities_str = "No entities found"
        result_comments.append(text)
        result_entities.append(entities_str)

        # Count labels straight from the entities. Re-parsing the joined
        # display string is unsafe: entity text may itself contain ", " or
        # ":" (e.g. "3:00 pm, Monday"), which would produce bogus labels.
        for _, label in ents:
            entity_types[label] = entity_types.get(label, 0) + 1

    df = pd.DataFrame({
        "comment": result_comments,
        "entities": result_entities
    })

    summary = pd.Series(entity_types) if entity_types else pd.Series(dtype=int)
    return df, summary



   

In [4]:
# MAIN EXECUTION
if __name__ == "__main__":
    # Pipeline: load comments (HDFS first, local fallback), extract entities,
    # then write the per-comment entity table to final_output_file.
    print("Starting entity extraction pipeline...")

    comments = read_hdfs_texts(input_path)
    if comments:
        # `summary` (entity-label counts) stays available for later inspection.
        df_entities, summary = run_entity_extraction(comments)

        # Save results
        df_entities.to_csv(final_output_file, index=False, encoding="utf-8")
        print(f"Entities saved to: {final_output_file}")
    else:
        # Branch instead of calling exit(): exit() is an interactive-shell
        # helper and shuts the kernel down when run inside a notebook.
        print("No data found in HDFS or local path.")


Starting entity extraction pipeline...
Attempting to read from HDFS: hdfs://localhost:9000/user/adarsh/realtime_pipeline/raw_batches
Loaded 134 comments from HDFS.
Entities saved to: E:\Coding\BDA-PySpark\realtime-pipeline\results_spark\entity_spark.csv
