## Data Collection

Import the dataset from Hugging Face Datasets

In [None]:
import pandas as pd

# Read BTC_yahoo.csv from the edaschau/bitcoin_news dataset through the hf:// URL.
df = pd.read_csv("hf://datasets/edaschau/bitcoin_news/BTC_yahoo.csv")

In [None]:
df.head()

In [None]:
# Keep only the three columns used in the rest of the notebook: date_time, title and source.
df_selected_features = df[["date_time", "title", "source"]]
df_selected_features.head()

In [None]:
df_selected_features.info()

In [None]:
# Work on an independent copy so the column assignments below do not touch df_selected_features.
btc_news = df_selected_features.copy()

In [None]:
# Break "date_time" on the space separator: piece 0 becomes "date", piece 1 becomes "time".
split_df = btc_news["date_time"].str.split(" ")
for position, column in enumerate(("date", "time")):
    btc_news[column] = split_df.str.get(position)

btc_news.head()

In [None]:
# Parse the date strings (expected as YYYY-MM-DD) into datetime64 values, then display the column.
btc_news["date"] = pd.to_datetime(btc_news["date"], format="%Y-%m-%d")
btc_news["date"]

In [None]:
# Remove the helper "time" column and the original "date_time" string; only "date" is used below.
# In-place drop: re-running this cell raises KeyError because the columns are already gone.
btc_news.drop(columns=["time", "date_time"], inplace=True)

Assigning sentiment score to the article titles

In [None]:
!pip install transformers

Importing necessary libraries

In [None]:
import pandas as pd
from transformers import pipeline
from tqdm import tqdm
import torch

Check if GPU available

In [None]:
# The transformers pipeline expects a device index: 0 selects the first CUDA GPU, -1 the CPU.
has_cuda = torch.cuda.is_available()
if has_cuda:
    device = 0
else:
    device = -1
print(f"Using device: {'GPU' if device == 0 else 'CPU'}")


In [None]:
# Load the FinBERT model on the selected device (GPU index 0, or CPU when device == -1)
model_name = "ProsusAI/finbert"

# Single definition of the batch size, shared by the pipeline and the manual batching loop
# (increase for faster processing if GPU memory allows)
batch_size = 64

finbert_pipeline = pipeline("sentiment-analysis", model=model_name, batch_size=batch_size, device=device)

# Only real, non-empty strings are sent to the model. A missing title is read by pandas as a
# float NaN; passing it to the pipeline raises and would discard the whole batch it belongs to.
title_values = btc_news['title'].tolist()
valid_positions = [
    position for position, title in enumerate(title_values)
    if isinstance(title, str) and title.strip()
]
titles = [title_values[position] for position in valid_positions]

# One result per DataFrame row; rows without a usable title (or in a failed batch) keep this default
results = [{'label': None, 'score': None}] * len(title_values)

# Process in batches
for i in tqdm(range(0, len(titles), batch_size), desc="Processing Batches with GPU"):
    batch = titles[i:i+batch_size]
    try:
        # truncation=True clips inputs that exceed the model's maximum sequence length
        result_batch = finbert_pipeline(batch, truncation=True)
    except Exception as e:
        # Best effort: report the failure and leave this batch's rows at the None default
        print(f"Error processing batch {i} to {i+batch_size}: {str(e)}")
        continue

    # Write each prediction back to the row position its title came from
    for position, result in zip(valid_positions[i:i+batch_size], result_batch):
        results[position] = result

# Convert results to DataFrame columns
btc_news['finbert_sentiment'] = [r['label'] for r in results]
btc_news['finbert_score'] = [r['score'] for r in results]

# Save the updated DataFrame to a new CSV file
btc_news.to_csv('bitcoin_articles_with_finbert_sentiment.csv', index=False)

print("Processing completed successfully!")

In [None]:
btc_news.head()

In [None]:
btc_news.finbert_sentiment.value_counts()

In [None]:
btc_news.source.value_counts()

## EDA

In [None]:
import pandas as pd

# Reload the sentiment-annotated headlines. The path is relative, exactly as in the to_csv call
# that writes this file, so the notebook also runs when the working directory is not /content.
df = pd.read_csv("bitcoin_articles_with_finbert_sentiment.csv")
df.head()

Proportion of Bitcoin articles by publisher

In [None]:
# Number of articles per publisher, then each publisher's share of all articles in percent.
articles_df = df["source"].value_counts()
total_articles = articles_df.sum()
percentage = articles_df.div(total_articles).mul(100)

In [None]:
def group_articles(row, percentage):
  """Return the publisher label to use for ``row`` when plotting publisher shares.

  Parameters
  ----------
  row : mapping with a "source" entry (for example one DataFrame row).
  percentage : pandas Series (or dict) mapping each source to its share of articles, in percent.

  Returns
  -------
  The row's own source when its share is at least 3 percent, otherwise "Other".
  A source that is absent from ``percentage`` (for example a missing value, which
  ``value_counts`` leaves out) is also reported as "Other" instead of raising KeyError.
  """
  source = row["source"]
  # .get with a 0 default treats unknown sources as having no share at all
  share = percentage.get(source, 0)
  return "Other" if share < 3 else source

# Label every article row with group_articles; the percentage table is forwarded as a keyword argument.
grouped_articles = df.apply(group_articles, axis=1, percentage=percentage)
grouped_articles.value_counts()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Article count per grouped publisher label, drawn as a pie chart with percentage annotations.
visualize_articles = grouped_articles.value_counts()
fig, ax = plt.subplots(figsize=(20, 10))
ax.pie(visualize_articles, labels=visualize_articles.index, autopct="%.2f%%")
ax.set_title("Proportion of publisher on bitcoin")
plt.show()



Proportion of news by sentiment for each month

In [None]:
# df was reloaded from CSV above; parse the date column (format YYYY-MM-DD) into datetime64.
df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d")
df["date"]

In [None]:
# Calendar year and month of every article, taken from the parsed date column.
date_parts = df["date"].dt
df["year"], df["month"] = date_parts.year, date_parts.month

# Article count for each (year, month, sentiment) combination, as a flat table.
group_by_year = (
    df.groupby(["year", "month", "finbert_sentiment"])
      .size()
      .reset_index(name="count")
)
group_by_year

In [None]:
# month_total: number of articles in the same (year, month), repeated on every sentiment row.
group_by_year['month_total'] = group_by_year.groupby(['year', 'month'])['count'].transform('sum')
# proportion: this sentiment's share of that month's articles, as a fraction between 0 and 1.
group_by_year['proportion'] = group_by_year['count'] / group_by_year['month_total']

In [None]:
# Load the BTC price table from BTC.csv, resolved relative to the working directory.
btc = pd.read_csv("BTC.csv")
btc.head()

In [None]:
# Parse the date column so the .dt accessor and chronological sorting work on it.
btc["date"] = pd.to_datetime(btc["date"])

In [None]:
# Chronological order matters: the per-month 'first'/'last' aggregation below depends on row order.
btc = btc.sort_values(by="date")

In [None]:
# Calendar year and month, used as grouping keys for the monthly price change.
btc["year"] = btc["date"].dt.year
btc["month"] = btc["date"].dt.month

In [None]:
# For every (year, month): the first and last 'close' value in row order
# (btc was sorted by date above, so these are the earliest and latest closes of the month).
monthly_change = btc.groupby(['year', 'month'])['close'].agg(['first', 'last']).reset_index()
# Relative move from the first to the last close of the month, in percent.
monthly_change['percent_change'] = ((monthly_change['last'] - monthly_change['first']) / monthly_change['first']) * 100

In [None]:
monthly_change

In [None]:
monthly_change[monthly_change["year"] == 2012]

In [None]:
# Distinct years present in the news data; the per-year plotting loop iterates over these.
years = df["year"].unique()
years

In [None]:
# Preview for a single year (2012): months as rows, one column per sentiment, values = proportion.
sentiment_order = ['negative', 'neutral', 'positive']
rows_2012 = group_by_year.loc[group_by_year['year'] == 2012]
pivot = rows_2012.pivot_table(
    index='month',
    columns='finbert_sentiment',
    values='proportion',
    fill_value=0,
)

# Any sentiment that never occurred in this year is added as an all-zero column.
absent_sentiments = [name for name in sentiment_order if name not in pivot.columns]
for name in absent_sentiments:
    pivot[name] = 0

# Fix the column order, then pull the month labels and per-sentiment values out as plain lists.
pivot = pivot[sentiment_order]
months = pivot.index.tolist()
negative_vals, neutral_vals, positive_vals = [pivot[name].tolist() for name in sentiment_order]

pivot


In [None]:
import numpy as np

# Both panels share one x-axis of calendar months 1..12, so price bars and sentiment bars
# for the same month line up even when a year has fewer than twelve months of price data.
month_numbers = list(range(1, 13))
sentiment_order = ['negative', 'neutral', 'positive']
x = np.arange(1, 13)
bar_width = 0.35

for year in years:
  # Price change for this year, keyed by calendar month (only the months present in the price data)
  monthly_change_sample = monthly_change[monthly_change["year"] == year]
  percent_change = monthly_change_sample.set_index("month")["percent_change"].dropna()

  # Pivotting table with proportion of sentiments
  pivot = group_by_year[group_by_year['year'] == year].pivot_table(
      index='month',
      columns='finbert_sentiment',
      values='proportion',
      fill_value = 0)

  # Guarantee exactly twelve rows and the three sentiment columns; anything absent becomes 0
  pivot = pivot.reindex(index=month_numbers, columns=sentiment_order, fill_value=0)

  negative_vals = pivot['negative'].tolist()
  positive_vals = pivot['positive'].tolist()

  # Creating 2 plots with price change and proportion of sentiments
  fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(18, 12))

  # Price change bar plot, drawn at the real month numbers (green = gain, red = loss or flat)
  dynamic_color_list = ["green" if change > 0 else "red" for change in percent_change]
  if len(percent_change) > 0:
    ax1.bar(percent_change.index, percent_change.values, color=dynamic_color_list)
  ax1.set_title(f"Percent Change of the BTC Price {year}")
  ax1.set_ylabel("Percent Change")

  # Sentiment bar plots: negative and positive side by side around each month tick
  ax2.bar(x - bar_width/2, negative_vals, width=bar_width, label='Negative', color='red')
  ax2.bar(x + bar_width/2, positive_vals, width=bar_width, label='Positive', color='green')
  ax2.set_ylabel("Sentiments Proportion")
  ax2.set_title("Proporiton of the sentiments according to the months")
  ax2.set_xticks(x)
  ax2.set_xlabel("Month")
  ax2.legend()

  plt.tight_layout()
  plt.show()
  # Release the figure; one figure per year would otherwise stay open in memory
  plt.close(fig)

## Predictive Analysis

### Preparation of the dataset. Version 1

In [None]:
import pandas as pd

# Reload the sentiment-annotated headlines. The path is relative, exactly as in the to_csv call
# that writes this file, so the notebook also runs when the working directory is not /content.
# Note: from here on `btc` holds the news table, not the price table it named in the EDA section.
btc = pd.read_csv("bitcoin_articles_with_finbert_sentiment.csv")
btc.head()

In [None]:
btc.info()

In [None]:
# Daily headline counts per sentiment: one row per date, one column per sentiment label
# (combinations that never occur are filled with 0).
summary_df = btc.groupby(['date', 'finbert_sentiment']).size().unstack(fill_value=0).reset_index()

# .copy() makes sentiment_part an independent frame, so assigning to its "date" column
# afterwards does not trigger SettingWithCopyWarning or write into summary_df.
sentiment_part = summary_df[["date", "negative", "neutral", "positive"]].copy()
sentiment_part.head()

In [None]:
# Convert the date strings to datetime64 so the column can serve as the merge key with the price table.
sentiment_part["date"] = pd.to_datetime(sentiment_part["date"])

In [None]:
# Load the BTC price table from BTC.csv, resolved relative to the working directory.
price_part = pd.read_csv("BTC.csv")
price_part.head()

In [None]:
# Parse the price dates to datetime64 so they can be matched against the sentiment dates.
price_part["date"] = pd.to_datetime(price_part["date"])

In [None]:
# Inner join on date (pd.merge default): only dates present in both tables are kept.
merged_df = pd.merge(sentiment_part,price_part, on="date")
merged_df

In [None]:
# Remove the ticker column; nothing later in the notebook reads it.
# In-place drop: re-running this cell raises KeyError because the column is already gone.
columns_to_drop = ["ticker"]
merged_df.drop(columns=columns_to_drop, inplace=True)
merged_df

Finding the longest run of consecutive days with sentiment data

In [None]:
# Order rows chronologically so neighbouring rows are neighbouring dates
merged_df = merged_df.sort_values(by="date").reset_index(drop=True)

# Whole-day distance to the previous row (NaN on the very first row)
merged_df['gap_days'] = merged_df['date'].diff().dt.days

# A new segment begins wherever the gap is anything other than exactly one day;
# the running count of such starts is the segment label
starts_new_segment = merged_df['gap_days'].ne(1)
merged_df['segment_id'] = starts_new_segment.cumsum()

# Row count per segment, and the label of the largest one
segment_sizes = merged_df.groupby('segment_id').size()
longest_segment_id = segment_sizes.idxmax()

# Rows of that segment as a new frame, without the two bookkeeping columns
in_longest_segment = merged_df['segment_id'].eq(longest_segment_id)
longest_segment_df = merged_df.loc[in_longest_segment].drop(columns=['gap_days', 'segment_id'])

longest_segment_df

In [None]:
# index=False: the row index is only a positional counter left over from the merge and would
# otherwise be written as an extra unnamed column (the sentiment CSV export also omits it).
longest_segment_df.to_csv("btc_sentiment_ohlcv.csv", index=False)

### Preparation of the dataset. Version 2

### Model creation and training

Pipeline before fitting into the model

In [None]:
# Creating target variable
longest_segment_df["close_shift"] = longest_segment_df["close"].shift(-1)
longest_segment_df["target"] = (longest_segment_df["close_shift"] > longest_segment_df["close"]).astype(int)
longest_segment_df.dropna(inplace=True)

In [None]:
# Splitting data for scaling to avoid leaking future information:
# chronological 80/20 split, first 80% of the rows for training and the rest for testing.
split_index = int(0.8 * len(longest_segment_df))

# .copy() gives each split its own data. The scaled values are later assigned back into
# these frames, which on a plain slice raises SettingWithCopyWarning.
train_data = longest_segment_df.iloc[:split_index].copy()
test_data = longest_segment_df.iloc[split_index:].copy()

Scaling the price and volume features using MinMaxScaler

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Prices and volume get separate scalers because they are fitted on different column groups.
price_columns = ['open', 'high', 'low', 'close']
volume_columns = ['volume']

price_scaler = MinMaxScaler(feature_range=(0, 1))
volume_scaler = MinMaxScaler(feature_range=(0, 1))

for scaler, columns in ((price_scaler, price_columns), (volume_scaler, volume_columns)):
    # Fit on the training rows only, then apply the same mapping to both splits
    scaler.fit(train_data[columns])
    for split_frame in (train_data, test_data):
        split_frame[columns] = scaler.transform(split_frame[columns])

In [None]:
# Remove the close_shift helper column from both splits, in place.
for split_frame in (train_data, test_data):
    split_frame.drop(columns="close_shift", inplace=True)

In [None]:
train_data

In [None]:
test_data.describe()

Creating sequences with a fixed window size

In [None]:
import numpy as np

def create_sequences(data, feature_cols, target_col, window_size=60):
    """Turn a row-ordered table into sliding windows for a sequence model.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows in the order they should be windowed.
    feature_cols : list of str
        Columns that make up each time step of a window.
    target_col : str
        Column holding the label.
    window_size : int, default 60
        Number of consecutive rows per window.

    Returns
    -------
    X : numpy.ndarray, shape (n_windows, window_size, n_features)
        Window ``i`` contains rows ``i`` .. ``i + window_size - 1``.
    y : numpy.ndarray, shape (n_windows,)
        Label of the row right after each window (row ``i + window_size``).

    ``n_windows`` is ``len(data) - window_size``. When that is zero or negative, empty
    arrays with the shapes above are returned, so ``X.shape[1]`` and ``X.shape[2]``
    remain valid for callers.
    """
    # Convert once instead of slicing the DataFrame on every iteration
    features = data[feature_cols].to_numpy()
    targets = data[target_col].to_numpy()

    n_windows = len(data) - window_size
    if n_windows <= 0:
        # Not enough rows for a single window plus its target row
        empty_X = np.empty((0, window_size) + features.shape[1:], dtype=features.dtype)
        empty_y = np.empty((0,), dtype=targets.dtype)
        return empty_X, empty_y

    X = np.stack([features[start:start + window_size] for start in range(n_windows)])
    # Target is the "day after the window"
    y = targets[window_size:window_size + n_windows].copy()

    return X, y

In [None]:


import matplotlib.pyplot as plt
import seaborn as sns

# Compare the balance of the binary target between the chronological train and test splits.
# The labels are read straight from train_data / test_data: y_train and y_test are only created
# further down, inside the training loop, so using them here fails on a freshly started kernel.
# (The windowed y arrays later omit the first `window_size` rows of each split.)
train_target = train_data["target"]
test_target = test_data["target"]

plt.figure(figsize=(10, 6))
sns.histplot(train_target, kde=True, label='y_train')
sns.histplot(test_target, kde=True, label='y_test', color='orange')
plt.xlabel('Target Variable')
plt.ylabel('Frequency')
plt.title('Distribution of Target Variable (Train vs Test)')
plt.legend()
plt.show()


Creating model architecture

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

def create_model(X_train):
  """Build and compile the stacked-LSTM binary classifier.

  Parameters
  ----------
  X_train : array with shape (n_samples, window_size, n_features)
      Only ``X_train.shape[1]`` and ``X_train.shape[2]`` are read, to set the input shape.

  Returns
  -------
  A compiled Sequential model: three LSTM layers (128, 64, 32 units), two ReLU Dense
  layers (64, 32 units), dropout of 0.2 after the second LSTM and after the first Dense
  layer, and a single sigmoid output unit. Compiled with Adam, binary cross-entropy
  and accuracy.
  """
  model = Sequential()

  # Model layers
  # Only the first layer declares the input shape; later layers infer theirs.
  model.add(LSTM(units=128, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
  model.add(LSTM(units=64, return_sequences=True))
  # Dropout must be added to the model; a bare Dropout(0.2) expression has no effect.
  model.add(Dropout(0.2))
  model.add(LSTM(units=32, return_sequences=False))
  model.add(Dense(units=64, activation="relu"))
  model.add(Dropout(0.2))
  model.add(Dense(units=32, activation="relu"))
  model.add(Dense(units=1, activation="sigmoid"))

  model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

  return model


Training loop for different combinations

In [None]:
window_sizes = [16, 32, 64, 128]
batch_sizes = [32, 64, 128]

# Model inputs and label. These names were used below without being defined anywhere in the notebook.
# NOTE(review): the feature list is assembled from the columns this notebook creates
# (daily sentiment counts, scaled prices, scaled volume) - confirm it matches the intended inputs.
feature_cols = ["negative", "neutral", "positive", "open", "high", "low", "close", "volume"]
target_col = "target"

# One dict per trained configuration; turned into a DataFrame once, after the loops
records = []

for window_size in window_sizes:

  # Creation of the sequences according to window size
  # (independent of the batch size, so built once per window size)
  X_train, y_train = create_sequences(
      data=train_data,
      feature_cols=feature_cols,
      target_col=target_col,
      window_size=window_size
  )
  X_test, y_test = create_sequences(
      data=test_data,
      feature_cols=feature_cols,
      target_col=target_col,
      window_size=window_size
  )

  # A split shorter than the window yields no sequences; such a configuration cannot be trained or scored
  if len(X_train) == 0 or len(X_test) == 0:
    print(f"Skipping window_size={window_size}: not enough rows to build sequences")
    continue

  for batch_size in batch_sizes:

    # Fresh, untrained model for every configuration
    model = create_model(X_train)

    # Training configuration with different batch sizes.
    # shuffle=False keeps the chronological order; the last 10% of the sequences are used for validation.
    history = model.fit(
        X_train, y_train,
        epochs=30,
        batch_size=batch_size,
        shuffle=False,
        validation_split=0.1
    )

    # Saving results: best accuracy reached in any epoch (not the last epoch's value)
    train_acc = max(history.history["accuracy"])
    val_acc   = max(history.history["val_accuracy"])

    records.append({
        "model": model,
        "window_size": window_size,
        "batch_size": batch_size,
        "train_accuracy": train_acc,
        "val_accuracy": val_acc,
        "X_test" : X_test,
        "y_test" : y_test
    })

# Build the results table in one step instead of concatenating a one-row frame per iteration
results = pd.DataFrame(
    records,
    columns=["model", "window_size", "batch_size", "train_accuracy", "val_accuracy", "X_test", "y_test"]
)


In [None]:
results

Evaluation

In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

results["final_accuracy"] = None
from sklearn.metrics import accuracy_score

def compute_accuracy(model, X_test, y_test):
    """Return the accuracy of ``model`` on ``X_test`` against the labels ``y_test``.

    The model's predicted probabilities are turned into class labels with a
    0.5 threshold (probability >= 0.5 becomes 1, otherwise 0) and scored
    with ``accuracy_score``.
    """
    probabilities = model.predict(X_test)  # one probability per sample
    predicted_labels = np.greater_equal(probabilities, 0.5).astype(int)
    return accuracy_score(y_test, predicted_labels)

# Score every trained model on the test sequences stored in its own results row.
# Each value is read from the row itself (no chained results["X_test"][idx] lookup), and the
# column is assigned in one step and cast to float so it sorts as a numeric column.
results["final_accuracy"] = [
    compute_accuracy(row["model"], row["X_test"], row["y_test"])
    for _, row in results.iterrows()
]
results["final_accuracy"] = results["final_accuracy"].astype(float)

# Configurations ordered from best to worst test accuracy
results_highest = results.sort_values(by="final_accuracy", ascending=False)
results_highest