### installations and imports

In [13]:
from datetime import datetime
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer, pipeline
import praw
import requests
import pandas as pd
import time
import numpy as np
import torch
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

### fetch bitcoin data from 2017 to 2024

In [None]:
def get_binance_data(symbol, interval, start_str, end_str=None):
    base_url = "https://api.binance.com"
    endpoint = "/api/v3/klines"
    
    start_ts = int(pd.to_datetime(start_str).timestamp() * 1000)
    end_ts = int(pd.to_datetime(end_str).timestamp() * 1000) if end_str else None
    
    data = []
    count = 0
    
    while True:
        params = {
            'symbol': symbol,
            'interval': interval,
            'startTime': start_ts,
            'limit': 1000
        }
        if end_ts:
            params['endTime'] = end_ts
        
        try:
            response = requests.get(base_url + endpoint, params=params)
            response.raise_for_status()
            
            results = response.json()
            
            if not results:
                break
            
            data.extend(results)
            
            start_ts = results[-1][0] + 1
            
            count += len(results)
            last_timestamp = pd.to_datetime(results[-1][0], unit='ms')
            print(f"Fetched {count} records, up to {last_timestamp}")
            
            if len(results) < 1000:
                break
            
        except requests.exceptions.HTTPError as err:
            if response.status_code == 429:
                print("Rate limit exceeded. Waiting before retrying...")
                time.sleep(60)  # Wait 1 minute before retrying
            else:
                raise err
    
    return data

def save_to_csv(data, filename):
    columns = [
        'open_time', 'open', 'high', 'low', 'close', 'volume', 'close_time', 
        'quote_asset_volume', 'number_of_trades', 'taker_buy_base_asset_volume', 
        'taker_buy_quote_asset_volume', 'ignore'
    ]
    df = pd.DataFrame(data, columns=columns)
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
    df.to_csv(filename, index=False)

if __name__ == "__main__":
    symbol = "BTCUSDT"
    interval = "1m"
    start_date = "2010-01-01"
    end_date = "2024-05-27"
    
    print("Starting data fetch...")
    data = get_binance_data(symbol, interval, start_date, end_date)
    print("Data fetch complete. Saving to CSV...")
    save_to_csv(data, 'bitcoin_dataset.csv')
    print("Data saved to bitcoin_1min_data.csv")

### load transformer model

In [4]:
# Initialize sentiment analysis model
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
max_length = 512  # Roberta's maximum sequence length


All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Total posts processed: 146


### configuration of reddit api

In [None]:
# Reddit API credentials
client_id = '-DikpAFUeeajlFFyWTBqUg'
client_secret = 'yZu1rPuBINVPuI7LuugQybGwtX1Cdg'
user_agent = 'bitcoin data'
username = '73malik'
password = 'Stapler437581'

# Create Reddit instance
reddit = praw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent,
    username=username,
    password=password
)

# Subreddit and Keyword definition
subreddit_name = 'all'
keywords = ['bitcoin price prediction', 'bitcoin market analysis', 'bitcoin trend', 'bitcoin forecast', 'bitcoin news']


### retrieve reddit posts

In [None]:
# Fetch and analyze Reddit posts
def fetch_reddit_posts(subreddit_name, keywords, total_limit=1000, score_threshold=10):
    posts = []
    counter = 0
    
    for keyword in keywords:
        for submission in reddit.subreddit(subreddit_name).search(keyword, sort='new', limit=total_limit):
            if counter >= total_limit:
                break
            if submission.score > score_threshold:
                created_date = datetime.fromtimestamp(submission.created_utc).strftime('%Y-%m-%d %H:%M:%S')
                text = submission.title + " " + submission.selftext
                truncated_text = tokenizer.decode(tokenizer(text, truncation=True, max_length=max_length)["input_ids"], skip_special_tokens=True)
                
                # Get the sentiment
                sentiment = sentiment_pipeline(truncated_text)[0]['label']
                
                posts.append({
                    'created': created_date,
                    'title': submission.title,
                    'score': submission.score,
                    'url': submission.url,
                    'content': submission.selftext,
                    'sentiment': sentiment
                })
                counter += 1
                
    return pd.DataFrame(posts)

posts_df = fetch_reddit_posts(subreddit_name, keywords)
posts_df.to_csv('reddit_posts.csv', index=False)
print(f"Total posts processed: {len(posts_df)}")

### prepare data

In [15]:
def prepare_data(price_df, sentiment_df, sequence_length=60):
    # Convert timestamps to datetime
    sentiment_df['created'] = pd.to_datetime(sentiment_df['created'])
    price_df['timestamp'] = pd.to_datetime(price_df['open_time'])

    # Ensure both DataFrames are sorted by their respective keys
    price_df = price_df.sort_values('timestamp')
    sentiment_df = sentiment_df.sort_values('created')

    # Merge price and sentiment data on timestamp
    merged_df = pd.merge_asof(price_df, sentiment_df, left_on='timestamp', right_on='created', direction='backward')

    # Fill missing sentiment values with neutral sentiment
    merged_df['sentiment'] = merged_df['sentiment'].fillna('neutral')
    sentiment_mapping = {'positive': 1, 'neutral': 0, 'negative': -1}
    merged_df['sentiment_score'] = merged_df['sentiment'].map(sentiment_mapping)
    
    # Normalize 'close' prices
    scaler = MinMaxScaler(feature_range=(0, 1))
    merged_df['close'] = scaler.fit_transform(merged_df['close'].values.reshape(-1, 1))

    # Check for NaN values in merged DataFrame
    if merged_df.isnull().values.any():
        print("Merged DataFrame contains NaN values. Filling NaN values with forward fill.")
        merged_df.fillna(method='ffill', inplace=True)
    
    # Prepare sequences
    data = merged_df[['close', 'sentiment_score']].values
    x, y = [], []
    for i in range(len(data) - sequence_length):
        x.append(data[i:i + sequence_length])
        y.append(data[i + sequence_length][0])  # Price prediction target

    x, y = np.array(x), np.array(y)

    # Final check for NaN and infinite values
    if np.any(np.isnan(x)) or np.any(np.isnan(y)):
        raise ValueError("Prepared data contains NaN values after processing.")
    if np.any(np.isinf(x)) or np.any(np.isinf(y)):
        raise ValueError("Prepared data contains infinite values after processing.")

    return x, y

# Load the price and sentiment data
price_df = pd.read_csv('bitcoin_dataset.csv')
sentiment_df = pd.read_csv('reddit_posts.csv')

# Prepare the data
x, y = prepare_data(price_df, sentiment_df)
print(f"Prepared data shapes: x={x.shape}, y={y.shape}")

Merged DataFrame contains NaN values. Filling NaN values with forward fill.


  merged_df.fillna(method='ffill', inplace=True)


Prepared data shapes: x=(3555069, 60, 2), y=(3555069,)


#### old

In [7]:
def prepare_data(price_df, sentiment_df):
    # Convert timestamps to datetime
    sentiment_df['created'] = pd.to_datetime(sentiment_df['created'])
    price_df['timestamp'] = pd.to_datetime(price_df['open_time'])

    # Ensure both DataFrames are sorted by their respective keys
    price_df = price_df.sort_values('timestamp')
    sentiment_df = sentiment_df.sort_values('created')

    # Merge price and sentiment data on timestamp
    merged_df = pd.merge_asof(price_df, sentiment_df, left_on='timestamp', right_on='created', direction='backward')

    # Fill missing sentiment values with neutral sentiment
    merged_df['sentiment'] = merged_df['sentiment'].fillna('neutral')
    sentiment_mapping = {'positive': 1, 'neutral': 0, 'negative': -1}
    merged_df['sentiment_score'] = merged_df['sentiment'].map(sentiment_mapping)
    
    # Prepare sequences
    sequence_length = 60  # e.g., last 60 minutes
    data = merged_df[['close', 'sentiment_score']].values
    x, y = [], []
    for i in range(len(data) - sequence_length):
        x.append(data[i:i + sequence_length])
        y.append(data[i + sequence_length][0])  # Price prediction target
    return np.array(x), np.array(y)

# Load the price and sentiment data
price_df = pd.read_csv('bitcoin_dataset.csv')
sentiment_df = pd.read_csv('reddit_posts.csv')

# Prepare the data
x, y = prepare_data(price_df, sentiment_df)
print(f"Prepared data shapes: x={x.shape}, y={y.shape}")

Prepared data shapes: x=(3555069, 60, 2), y=(3555069,)


### build and train lstm model

In [23]:
# Print TensorFlow version
print("TensorFlow version:", tf.__version__)

# Check for GPU availability
physical_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(physical_devices))

# If GPU is available, set memory growth
if physical_devices:
    try:
        tf.config.experimental.set_memory_growth(physical_devices[0], True)
        tf.config.set_visible_devices(physical_devices[0], 'GPU')
        print(f"Using GPU: {physical_devices[0]}")
    except RuntimeError as e:
        print(e)
else:
    print("No GPU available")

TensorFlow version: 2.16.1
Num GPUs Available:  0
No GPU available


In [25]:
def build_lstm_model(input_shape):
    model = Sequential()
    model.add(LSTM(50, return_sequences=True, input_shape=input_shape))
    model.add(Dropout(0.2))
    model.add(LSTM(50, return_sequences=False))
    model.add(Dropout(0.2))
    model.add(Dense(25))
    model.add(Dense(1))
    
    optimizer = Adam(learning_rate=0.001)  # Reduced learning rate
    model.compile(optimizer=optimizer, loss='mean_squared_error')
    return model


In [26]:

# Build the LSTM model
input_shape = (x.shape[1], x.shape[2])
lstm_model = build_lstm_model(input_shape)


In [27]:

# Train the model
lstm_model.fit(x, y, batch_size=32, epochs=10, validation_split=0.2)


Epoch 1/10


In [None]:

# Save the trained model
lstm_model.save('lstm_model.h5')

#### old

In [None]:

def build_lstm_model(input_shape):
    model = Sequential()
    model.add(LSTM(50, return_sequences=True, input_shape=input_shape))
    model.add(Dropout(0.2))
    model.add(LSTM(50, return_sequences=False))
    model.add(Dropout(0.2))
    model.add(Dense(25))
    model.add(Dense(1))
    
    optimizer = Adam(learning_rate=0.001)  # Reduced learning rate
    model.compile(optimizer=optimizer, loss='mean_squared_error')
    return model

# Build the LSTM model
input_shape = (x.shape[1], x.shape[2])
lstm_model = build_lstm_model(input_shape)

# Train the model
lstm_model.fit(x, y, batch_size=32, epochs=10, validation_split=0.2)

# Save the trained model
lstm_model.save('lstm_model.h5')

### predict future prices

In [9]:
# Load the trained model (if needed)
lstm_model = load_model('lstm_model.h5')

# Predict future prices
predictions = lstm_model.predict(x)

# Compare predictions with actual prices
import matplotlib.pyplot as plt

plt.figure(figsize=(14,5))
plt.plot(y, color='blue', label='Actual Bitcoin Price')
plt.plot(predictions, color='red', label='Predicted Bitcoin Price')
plt.title('Bitcoin Price Prediction')
plt.xlabel('Time')
plt.ylabel('Price')
plt.legend()
plt.show()

FileNotFoundError: [Errno 2] Unable to open file (unable to open file: name = 'lstm_model.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)