In [None]:
# =============================================================================
# Import Libraries
# =============================================================================
import re
import os
import requests                     # For making API calls to CoinMarketCap
import pandas as pd                 # For data manipulation and analysis
import numpy as np                  # For numerical operations
import matplotlib.pyplot as plt     # For plotting graphs
import praw
import datetime                     # For handling date and time information
import nltk                         # For Natural Language Processing
import time
from nltk.sentiment.vader import SentimentIntensityAnalyzer  # VADER for sentiment analysis
from sklearn.linear_model import LinearRegression           # For predictive modeling 
from sklearn.metrics import mean_absolute_error, r2_score      # For model evaluation


# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

In [3]:

# Sample data (replace with your actual DataFrame)
df = pd.DataFrame({
    'created_utc': pd.to_datetime([
        "2025-07-07 23:59:53",
        "2025-07-07 23:50:14",
        "2025-07-07 23:30:48",
        "2025-07-07 22:41:58",
        "2025-07-07 21:47:55",
    ]),
    'text': [
        "What just happened in my account?? Scammed?",
        "Securing IoMT data with Algorand blockchain, X chain focus",
        "Has anyone seen these chains/coins before? I’m curious about ADA performance.",
        "I paid 6.7 Trillion to ones who got Goxxed",
        "U.S. government moves Ethereum to Coinbase; Insights and analysis.",
    ]
})

# Initialize zero-shot classification pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Define candidate labels for relevance
candidate_labels = [
    "buy signal", 
    "sell signal", 
    "market analysis",
    "security issue",
    "technical update",
    "regulatory news"
]

def classify_relevance(text, threshold=0.5):
    """
    Classify a text for relevance to cryptocurrency trading signals/context.
    Returns True if any label has score >= threshold.
    """
    result = classifier(text, candidate_labels)
    # Check if any label score exceeds the threshold
    return any(score >= threshold for score in result['scores'])

# Apply classification to DataFrame
df['is_relevant'] = df['text'].apply(classify_relevance)

# Show filtered results
filtered_df = df[df['is_relevant']].copy()

import ace_tools as tools; tools.display_dataframe_to_user(name="AI-Filtered Relevant Posts", dataframe=filtered_df)


NameError: name 'pipeline' is not defined

In [4]:
# =============================================================================
# Function: get_reddit_posts
# =============================================================================
reddit = praw.Reddit(
        client_id="EONAlz7JyvxOCdJJ8xpe6A",
        client_secret="X-FWW2OvuBV-Pll7OH3c4RNLzYah4Q",
        user_agent="crypto-sentiment-bot/0.1 (by /u/BuzzKG)")

def get_reddit_posts_praw(subreddit, limit=200, sleep_between=1.0):
    """
    Fetches the newest posts from a subreddit using PRAW (Reddit's official API).
    Respects Reddit rate limits by sleeping between requests.

    Parameters:
        subreddit (str): Name of the subreddit (e.g., 'cryptocurrency').
        limit (int): Number of posts to retrieve.
        sleep_between (float): Seconds to sleep between batch fetches.

    Returns:
        pd.DataFrame: DataFrame with 'created_utc' and 'text' columns.
    """
    reddit = praw.Reddit(
        client_id="EONAlz7JyvxOCdJJ8xpe6A",
        client_secret="X-FWW2OvuBV-Pll7OH3c4RNLzYah4Q",
        user_agent="crypto-sentiment-bot/0.1 (by /u/BuzzKG)"
    )
    

def fetch_flair_posts(subreddit_name, limit=5000, flairs=None, sleep=0.5):
    """
    Fetch up to `limit` latest posts whose flair is in `flairs`.
    Returns a DataFrame with id, created_utc, title, selftext, flair, score, num_comments.
    """
    records = []
    sub = reddit.subreddit(subreddit_name)
    for submission in sub.new(limit=limit):
        flair = (submission.link_flair_text or "").strip()
        if flairs and flair not in flairs:
            continue

        records.append({
            "id":           submission.id,
            "created_utc":  pd.to_datetime(submission.created_utc, unit="s"),
            "flair":        flair,
            "title":        submission.title,
            "selftext":     submission.selftext or "",
            "score":        submission.score,
            "num_comments": submission.num_comments
        })
        time.sleep(sleep)
    return pd.DataFrame(records)

# Define the four target flairs (adjust exact text as the sub uses it)
TARGET_FLAIRS = {"MEME"}

df = fetch_flair_posts("CryptoCurrency", limit=3000, flairs=TARGET_FLAIRS)
print(f"Fetched {len(df)} posts with target flairs.")


Fetched 29 posts with target flairs.


In [5]:
print(df.head)

<bound method NDFrame.head of          id         created_utc flair  \
0   1mphuym 2025-08-13 21:52:50  MEME   
1   1mpggdb 2025-08-13 20:58:15  MEME   
2   1mpao1m 2025-08-13 17:23:10  MEME   
3   1mp90j9 2025-08-13 16:22:41  MEME   
4   1moxrn9 2025-08-13 07:19:52  MEME   
5   1mofkek 2025-08-12 17:46:03  MEME   
6   1mne84g 2025-08-11 14:17:48  MEME   
7   1mn8wzg 2025-08-11 10:03:37  MEME   
8   1mn1fpx 2025-08-11 02:36:44  MEME   
9   1mm1357 2025-08-09 21:40:23  MEME   
10  1ml8izs 2025-08-08 22:06:13  MEME   
11  1ml66ne 2025-08-08 20:30:29  MEME   
12  1mkpyff 2025-08-08 08:44:31  MEME   
13  1mkb30j 2025-08-07 20:27:48  MEME   
14  1mk0hs9 2025-08-07 13:44:25  MEME   
15  1mjbn4t 2025-08-06 17:53:32  MEME   
16  1mja1ee 2025-08-06 16:54:03  MEME   
17  1mimi7f 2025-08-05 21:49:59  MEME   
18  1miclha 2025-08-05 15:39:59  MEME   
19  1mi0zi2 2025-08-05 05:42:38  MEME   
20  1mgytw8 2025-08-04 00:11:16  MEME   
21  1mgbt7v 2025-08-03 05:59:51  MEME   
22  1mg38zv 2025-08-02 22:3

In [6]:
# =============================================================================
# Load Reddit Data from Kaggle
# =============================================================================
# Read the CSV file containing Reddit posts. The file should include columns like:
# 'created_utc', 'title', and 'selftext'.
reddit_kaggle_df = pd.read_csv('reddit_cryptocurrency_posts.csv')

# If there is no 'text' column, combine 'title' and 'selftext' to form a complete post text.
if 'text' not in reddit_kaggle_df.columns:
    reddit_kaggle_df['text'] = reddit_kaggle_df['title'].fillna('') + " " + reddit_kaggle_df['selftext'].fillna('')

# Convert the 'created_utc' column to datetime.
# If the values are UNIX epoch seconds, specify unit='s'; otherwise, let pandas infer the format.
if reddit_kaggle_df['created_utc'].dtype in [int, float]:
    reddit_kaggle_df['created_utc'] = pd.to_datetime(reddit_kaggle_df['created_utc'], unit='s')
else:
    reddit_kaggle_df['created_utc'] = pd.to_datetime(reddit_kaggle_df['created_utc'])

print("Loaded Reddit Data from Kaggle:")
print(reddit_kaggle_df.head())


FileNotFoundError: [Errno 2] No such file or directory: 'reddit_cryptocurrency_posts.csv'