In [1]:
import json
import os
import time

import praw
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Step 1: Configure Reddit API
# Credentials are read from the environment so that no secret is stored in the
# notebook. A client id/secret pair that was previously hard-coded here must be
# treated as leaked: revoke it in the Reddit app settings and issue a new one.
# A missing variable raises KeyError immediately instead of failing later
# with an opaque authentication error.
REDDIT_CLIENT_ID = os.environ["REDDIT_CLIENT_ID"]
REDDIT_SECRET = os.environ["REDDIT_SECRET"]
# The user agent is not a secret; the previous value stays as the default.
REDDIT_USER_AGENT = os.environ.get("REDDIT_USER_AGENT", "Puzzleheaded_Oil288")

# Initialize Reddit API
reddit = praw.Reddit(
    client_id=REDDIT_CLIENT_ID,
    client_secret=REDDIT_SECRET,
    user_agent=REDDIT_USER_AGENT,
    # PRAW setting: upper bound (seconds) on rate-limit waits it will sleep
    # through itself -- see the PRAW configuration docs for exact semantics.
    ratelimit_seconds=600,
)

# Step 2: Initialize Spark Session
# getOrCreate() hands back the active session when one already exists (e.g. on
# a notebook re-run); otherwise a new one is built against the given master URL.
spark = (
    SparkSession.builder
    .appName("RedditClustering")
    .master("spark://spark-master:7077")
    .getOrCreate()
)

# Step 3: Define Reddit Streaming Parameters
# Subreddits whose submissions are read by fetch_reddit_data.
subreddits = [
    "Bitcoin",
    "CryptoCurrency",
    "BitcoinMarkets",
]
# A submission is kept when its title contains any of these substrings
# (the comparison is case-insensitive).
keywords = [
    "btc",
    "bitcoin",
    "whale alert",
    "crypto",
    "market sentiment",
    "bullish",
    "bearish",
]

# Step 4: Function to Fetch Data from Reddit
def fetch_reddit_data(duration_minutes):
    """Collect keyword-matching submissions from all configured subreddits.

    A single combined stream over every name in ``subreddits`` is read until
    ``duration_minutes`` have elapsed. Streaming the subreddits one after the
    other does not work: a PRAW submission stream never terminates, so only
    the first subreddit would ever be read.

    Args:
        duration_minutes: Length of the collection window, in minutes. A
            value of zero or less returns an empty list without contacting
            Reddit.

    Returns:
        A list of dicts with the keys ``id``, ``title``, ``score``,
        ``num_comments``, ``created_utc`` and ``subreddit``, one per
        submission whose title contains any entry of ``keywords``
        (case-insensitive).

    Note:
        The deadline is checked between stream items, so the call can overrun
        by the duration of one in-flight request to Reddit.
    """
    collected_posts = []
    if duration_minutes <= 0 or not subreddits:
        return collected_posts

    # Monotonic clock: the window is unaffected by wall-clock adjustments.
    deadline = time.monotonic() + duration_minutes * 60
    # Lower-case the keywords once instead of once per submission.
    lowered_keywords = [keyword.lower() for keyword in keywords]
    # Pause between polls when the stream reports that nothing new arrived.
    idle_pause_seconds = 1.0

    # "a+b+c" addresses several subreddits as one listing.
    combined = reddit.subreddit("+".join(subreddits))
    print(f"[INFO] Fetching real-time posts from {', '.join(subreddits)}...")

    # pause_after=0 makes the stream yield None whenever a poll returns no new
    # submission, so the deadline is still checked while the stream is quiet.
    for submission in combined.stream.submissions(pause_after=0):
        if time.monotonic() >= deadline:
            break

        if submission is None:
            # Nothing new: wait briefly, but never past the deadline.
            remaining = deadline - time.monotonic()
            time.sleep(max(0.0, min(idle_pause_seconds, remaining)))
            continue

        title = submission.title
        lowered_title = title.lower()
        if any(keyword in lowered_title for keyword in lowered_keywords):
            post = {
                "id": submission.id,
                "title": title,
                "score": submission.score,
                "num_comments": submission.num_comments,
                "created_utc": submission.created_utc,
                # The combined stream mixes subreddits, so take the name from
                # the submission itself.
                "subreddit": submission.subreddit.display_name,
            }
            collected_posts.append(post)
            print(f"[SENT] {title[:50]}... | Score: {submission.score}")

    print("[INFO] Time limit reached. Stopping fetch.")
    return collected_posts

# Step 5: Convert Reddit Data to Spark DataFrame
def process_reddit_data(spark, reddit_data):
    """Turn the collected post dicts into a Spark DataFrame.

    Each post is serialised to a JSON string, the strings are distributed as
    an RDD, and Spark's JSON reader infers the schema from them. The schema
    and the first 10 rows are printed before the DataFrame is returned.

    Args:
        spark: Session used to parallelize and read the records.
        reddit_data: Sequence of JSON-serialisable post dicts.

    Returns:
        The resulting DataFrame, or ``None`` when ``reddit_data`` is empty
        (or otherwise falsy).
    """
    if reddit_data:
        serialized_posts = list(map(json.dumps, reddit_data))
        posts_rdd = spark.sparkContext.parallelize(serialized_posts)
        posts_df = spark.read.json(posts_rdd)

        print("[INFO] Data Schema:")
        posts_df.printSchema()
        posts_df.show(10, truncate=False)
        return posts_df

    print("[INFO] No data fetched from Reddit.")
    return None

# Step 6: Perform K-means Clustering
def perform_kmeans_clustering(df, k=3, seed=42):
    """Cluster posts by ``score`` and ``num_comments`` with K-means.

    The two numeric columns are assembled into a ``features`` vector, a
    K-means model is fitted on it, and each row is assigned to a cluster.
    The first 10 clustered rows are printed.

    Args:
        df: DataFrame with at least the columns ``id``, ``score`` and
            ``num_comments``.
        k: Number of clusters to request. Defaults to 3, the value that was
            previously hard-coded.
        seed: Random seed for centroid initialisation. Defaults to 42, the
            value that was previously hard-coded.

    Returns:
        A DataFrame with the columns ``id``, ``features`` and ``cluster``.
    """
    vector_assembler = VectorAssembler(
        inputCols=["score", "num_comments"],
        outputCol="features",
    )
    # Only the id and the feature vector are carried into the model.
    df_vectorized = vector_assembler.transform(df).select("id", "features")

    kmeans = KMeans(k=k, seed=seed, featuresCol="features", predictionCol="cluster")
    model = kmeans.fit(df_vectorized)
    clustered_df = model.transform(df_vectorized)

    print("[INFO] Clustered Data:")
    clustered_df.show(10, truncate=False)
    return clustered_df

# Step 7: Run the Reddit Fetch and Cluster Process
duration_minutes = 2  # length of the collection window, in minutes
reddit_data = fetch_reddit_data(duration_minutes)
reddit_df = process_reddit_data(spark, reddit_data)

# process_reddit_data returns None when nothing was collected. Compare against
# None explicitly instead of relying on the truthiness of a DataFrame object.
if reddit_df is not None:
    clustered_df = perform_kmeans_clustering(reddit_df)

25/02/09 05:28:59 WARN SparkConf: The configuration key 'spark.executor.port' has been deprecated as of Spark 2.0.0 and may be removed in the future. Not used anymore
25/02/09 05:28:59 WARN SparkConf: The configuration key 'spark.executor.port' has been deprecated as of Spark 2.0.0 and may be removed in the future. Not used anymore
25/02/09 05:28:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


25/02/09 05:28:59 WARN SparkConf: The configuration key 'spark.executor.port' has been deprecated as of Spark 2.0.0 and may be removed in the future. Not used anymore
25/02/09 05:28:59 WARN SparkConf: The configuration key 'spark.executor.port' has been deprecated as of Spark 2.0.0 and may be removed in the future. Not used anymore
25/02/09 05:28:59 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
[INFO] Fetching real-time posts from Bitcoin...
[SENT] This Is About You: Upvotes and Upticks, How Reddit... | Score: 15
[SENT] This would never happen with Bitcoin... | Score: 42
[SENT] This week in Bitcoin, wrapped into 4 minutes.... | Score: 4
[SENT] How much conviction do you have for bitcoin?... | Score: 88
[SENT] .01 BTC acquired!! lets go!... | Score: 859
[SENT] El Salvador 🇸🇻 + bitcoin + golf 🏌️‍♂️ = MAXimum vi... | Score: 0
[SENT] How porn links and Ben Be

                                                                                

[INFO] Data Schema:
root
 |-- created_utc: double (nullable = true)
 |-- id: string (nullable = true)
 |-- num_comments: long (nullable = true)
 |-- score: long (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- title: string (nullable = true)

25/02/09 05:36:56 WARN TaskSetManager: Lost task 1.1 in stage 2.0 (TID 11) (172.18.0.7 executor 1): TaskKilled (Stage finished)
+-------------+-------+------------+-----+---------+---------------------------------------------------------------------------------+
|created_utc  |id     |num_comments|score|subreddit|title                                                                            |
+-------------+-------+------------+-----+---------+---------------------------------------------------------------------------------+
|1.738949127E9|1ik0ebi|0           |15   |Bitcoin  |This Is About You: Upvotes and Upticks, How Reddit’s Chatter Moves Crypto Markets|
|1.73894915E9 |1ik0en1|43          |42   |Bitcoin  |This would never happe

                                                                                

25/02/09 05:37:04 WARN InstanceBuilder$JavaBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
25/02/09 05:37:04 WARN TaskSetManager: Lost task 0.0 in stage 12.0 (TID 84) (172.18.0.8 executor 0): TaskKilled (Stage finished)
25/02/09 05:37:05 WARN SparkConf: The configuration key 'spark.executor.port' has been deprecated as of Spark 2.0.0 and may be removed in the future. Not used anymore


                                                                                

[INFO] Clustered Data:
+-------+-------------+-------+
|id     |features     |cluster|
+-------+-------------+-------+
|1ik0ebi|[15.0,0.0]   |0      |
|1ik0en1|[42.0,43.0]  |0      |
|1ik0uhp|[4.0,0.0]    |0      |
|1ik1t7x|[88.0,161.0] |0      |
|1ik2udm|[859.0,125.0]|2      |
|1ik2v1c|(2,[],[])    |0      |
|1ik2wvh|[0.0,2.0]    |0      |
|1ik382d|[9.0,4.0]    |0      |
|1ik3k58|[0.0,5.0]    |0      |
|1ik3nqv|[45.0,1.0]   |0      |
+-------+-------------+-------+
only showing top 10 rows

