In [30]:
import praw
import pandas as pd
import re
from datetime import datetime

# Set up Reddit API
reddit = praw.Reddit(
    client_id='yZgp5fHdkhZQwGSCQ6Of4Q',
    client_secret='yZgmiMH34SQlf2efwsf1zIeqWXEvoQ',
    user_agent='Bitcoin Sentiment Analysis'
)

# List of subreddits
subreddits = ['Bitcoin', 'CryptoCurrency', 'BitcoinMarkets', 'btc']  
posts = []

# Clean text utility
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^a-zA-Z0-9\s.,!?]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.lower().strip()

# Loop through each subreddit
for subreddit_name in subreddits:
    subreddit = reddit.subreddit(subreddit_name)
    
    # Fetch 50 posts per subreddit with keyword "Bitcoin"
    for submission in subreddit.search('Bitcoin', limit=50):
        try:
            submission.comments.replace_more(limit=0)
            # Extract up to 20 top-level comments
            top_comments = [comment.body for comment in submission.comments.list()[:20]]
            comment_text = " ".join([clean_text(comment) for comment in top_comments])
            
            title_clean = clean_text(submission.title)
            selftext_clean = clean_text(submission.selftext)
            combined_text = f"{title_clean} {selftext_clean} {comment_text}"
            
            posts.append([
                title_clean,
                selftext_clean,
                comment_text,
                combined_text,
                datetime.utcfromtimestamp(submission.created_utc),
                subreddit_name
            ])
        except Exception as e:
            print(f"⚠️ Skipping a post in r/{subreddit_name} due to error: {e}")
            continue

# Convert to DataFrame
df_reddit = pd.DataFrame(posts, columns=['title', 'selftext', 'comments', 'content', 'created_utc', 'subreddit'])
print(f"✅ Collected and cleaned {len(df_reddit)} posts (with comments) from {len(subreddits)} subreddits.")

 



✅ Collected and cleaned 200 posts (with comments) from 4 subreddits.


In [31]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
import torch

tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")

def get_finbert_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**inputs).logits
    scores = softmax(logits.numpy()[0])
    sentiment = scores[2] - scores[0]  # Positive - Negative
    return sentiment

df_reddit['content'] = df_reddit['title'] + ' ' + df_reddit['selftext'] + ' ' + df_reddit['comments']

df_reddit['sentiment'] = df_reddit['content'].apply(get_finbert_sentiment)


In [None]:
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta

# Define time range
end_date = datetime.now()
start_date = end_date - timedelta(days=30)

# Download BTC data
btc_data = yf.download('BTC-USD', start=start_date, end=end_date, interval='1h')

# Fix multilevel columns
if isinstance(btc_data.columns, pd.MultiIndex):
    btc_data.columns = btc_data.columns.get_level_values(0)  # Keep just the first level

# Reset index
btc_data.reset_index(inplace=True)

# Rename 'index' or confirm 'Datetime' exists
btc_data.rename(columns={'index': 'Datetime'}, inplace=True)

# Confirm column structure
print("btc_data.columns:", btc_data.columns)

print("\nBitcoin Price Data (Last 30 Days - Hourly):")
print(btc_data)


[*********************100%***********************]  1 of 1 completed

btc_data.columns: Index(['Datetime', 'Close', 'High', 'Low', 'Open', 'Volume'], dtype='object', name='Price')

Bitcoin Price Data (Last 10 Days - Hourly):
Price                  Datetime          Close           High            Low  \
0     2025-04-30 18:00:00+00:00   94103.210938   94447.710938   94047.625000   
1     2025-04-30 19:00:00+00:00   94237.242188   94237.242188   93769.265625   
2     2025-04-30 20:00:00+00:00   94653.367188   94721.539062   94168.609375   
3     2025-04-30 21:00:00+00:00   94419.125000   94733.960938   94419.125000   
4     2025-04-30 22:00:00+00:00   94120.554688   94700.273438   94105.445312   
..                          ...            ...            ...            ...   
714   2025-05-30 12:00:00+00:00  105738.054688  105969.914062  105310.445312   
715   2025-05-30 13:00:00+00:00  105535.015625  105956.718750  105439.250000   
716   2025-05-30 14:00:00+00:00  105411.601562  105825.414062  105120.132812   
717   2025-05-30 15:00:00+00:00  105571.35937




In [34]:
# Remove timezone from Reddit timestamps
df_reddit['created_utc'] = pd.to_datetime(df_reddit['created_utc']).dt.tz_localize(None)
btc_data['Datetime'] = pd.to_datetime(btc_data['Datetime']).dt.tz_localize(None)

# Create hourly timestamps
df_reddit['hour'] = df_reddit['created_utc'].dt.floor('H')
btc_data['hour'] = btc_data['Datetime'].dt.floor('H')

# Group Reddit sentiment by hour
sentiment_hourly = df_reddit.groupby('hour', as_index=False)['sentiment'].mean()

# ✅ Ensure btc_data has no MultiIndex
btc_data.columns = [col if isinstance(col, str) else col[0] for col in btc_data.columns]

# ✅ Merge on the hour column
data = pd.merge(btc_data, sentiment_hourly, on='hour', how='left')

# Fill missing sentiment with neutral
data['sentiment'].fillna(0, inplace=True)

# Select required columns
data = data[['Datetime', 'Close', 'sentiment']]

# Normalize
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
data[['Close', 'sentiment']] = scaler.fit_transform(data[['Close', 'sentiment']])


  df_reddit['hour'] = df_reddit['created_utc'].dt.floor('H')
  btc_data['hour'] = btc_data['Datetime'].dt.floor('H')
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['sentiment'].fillna(0, inplace=True)


In [None]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.layers import Dropout

def create_sequences(data, time_steps=90):
    X, y = [], []
    for i in range(len(data) - time_steps):
        X.append(data[i:i + time_steps])
        y.append(data[i + time_steps, 0])  # Close price
    return np.array(X), np.array(y)

dataset = data[['Close', 'sentiment']].values
time_steps = 90
X, y = create_sequences(dataset,time_steps)

split = int(0.8 * len(X))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

model = Sequential([
    LSTM(128, return_sequences=True, input_shape=(X.shape[1], X.shape[2])),
    Dropout(0.1),
    LSTM(128),
    Dropout(0.1),
    Dense(1)
])
from tensorflow.keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test))


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x2ada905bb20>

In [36]:
# Get forecast horizon from the user
forecast_hours = int(input("Enter how many hours ahead you'd like to forecast (e.g., 24, 48, 168): "))

# Forecast loop
last_sequence = dataset[-time_steps:]
forecast_input = last_sequence.copy()
forecast_prices = []

for _ in range(forecast_hours):
    input_seq = np.expand_dims(forecast_input[-time_steps:], axis=0)
    pred = model.predict(input_seq, verbose=0)[0][0]
    forecast_prices.append(pred)
    forecast_input = np.vstack([forecast_input, [pred, forecast_input[-1][1]]])  # Use last sentiment

# Decode forecasted prices
decoded_prices = [scaler.inverse_transform([[p, 0]])[0][0] for p in forecast_prices]
last_actual_price = scaler.inverse_transform([[dataset[-1][0], 0]])[0][0]

# Calculate percentage fluctuations hour by hour
print("\n📈 Hourly Forecasted BTC Price Fluctuations:\n")
prev_price = last_actual_price
for i, price in enumerate(decoded_prices):
    change = ((price - prev_price) / prev_price) * 100
    direction = "↑" if change > 0 else "↓" if change < 0 else "→"
    print(f"Hour {i+1:>2}: Predicted Price = ${price:,.2f} | Change = {change:+.2f}% {direction}")
    prev_price = price

# Final summary
final_predicted_price = decoded_prices[-1]
percentage_change = ((final_predicted_price - last_actual_price) / last_actual_price) * 100

print("\n🔮 Final Forecast Summary:")

if percentage_change > 0.1:
    print(f"✅ Yes, the price is increasing by {percentage_change:.2f}% in the next {forecast_hours} hours.")
elif percentage_change < -0.1:
    print(f"❌ No, the price is decreasing by {abs(percentage_change):.2f}% in the next {forecast_hours} hours.")
else:
    print(f"⚖️ The predicted change is negligible ({percentage_change:.2f}%) in the next {forecast_hours} hours.")





📈 Hourly Forecasted BTC Price Fluctuations:

Hour  1: Predicted Price = $105,546.93 | Change = +0.27% ↑
Hour  2: Predicted Price = $105,536.26 | Change = -0.01% ↓
Hour  3: Predicted Price = $105,546.27 | Change = +0.01% ↑
Hour  4: Predicted Price = $105,565.45 | Change = +0.02% ↑
Hour  5: Predicted Price = $105,589.52 | Change = +0.02% ↑
Hour  6: Predicted Price = $105,616.53 | Change = +0.03% ↑
Hour  7: Predicted Price = $105,645.25 | Change = +0.03% ↑
Hour  8: Predicted Price = $105,674.80 | Change = +0.03% ↑
Hour  9: Predicted Price = $105,704.54 | Change = +0.03% ↑
Hour 10: Predicted Price = $105,733.97 | Change = +0.03% ↑
Hour 11: Predicted Price = $105,762.78 | Change = +0.03% ↑
Hour 12: Predicted Price = $105,790.72 | Change = +0.03% ↑
Hour 13: Predicted Price = $105,817.69 | Change = +0.03% ↑
Hour 14: Predicted Price = $105,843.63 | Change = +0.02% ↑
Hour 15: Predicted Price = $105,868.57 | Change = +0.02% ↑
Hour 16: Predicted Price = $105,892.54 | Change = +0.02% ↑
Hour 17: P

In [37]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Predict on test data
y_pred = model.predict(X_test)

# Inverse transform
y_pred_inv = scaler.inverse_transform(np.c_[y_pred, np.zeros(len(y_pred))])[:, 0]
y_test_inv = scaler.inverse_transform(np.c_[y_test, np.zeros(len(y_test))])[:, 0]

# Evaluation metrics
mse = mean_squared_error(y_test_inv, y_pred_inv)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_inv, y_pred_inv)

def get_confidence_score(r2_score):
    if r2_score < 0:
        return 0
    elif r2_score > 1:
        return 100
    else:
        return round(r2_score * 100, 2)

confidence_score = get_confidence_score(r2)

print(f"\n📊 Evaluation Metrics:")
print(f"MSE:  {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R² Score: {r2:.4f}")
print(f"🔐 Model Confidence Score: {confidence_score:.2f}%")



📊 Evaluation Metrics:
MSE:  227553.6966
RMSE: 477.0259
R² Score: 0.8918
🔐 Model Confidence Score: 89.18%
