In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import re
from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments
from transformers import EarlyStoppingCallback, AutoModelForSequenceClassification
import torch
from torch.utils.data import Dataset, DataLoader
import warnings
from transformers import pipeline, AutoTokenizer
from sklearn.model_selection import train_test_split
from tqdm import tqdm
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


# Project


# Dataset Cleaning Tweets


In [3]:
# Define columns you want to keep
columns_to_keep = ['date', 'text']

# Create an empty CSV to save the cleaned version
output_file = 'bitcoin_tweets_date_text.csv'
first_chunk = True

for chunk in pd.read_csv('/Users/farazmustafa/Documents/Git/Crypto Project/Capstone/Capstone Project/Bitcoin_tweets.csv', chunksize=100000, engine='python'):  
    chunk = chunk[columns_to_keep]
    chunk.to_csv(output_file, mode='a', index=False, header=first_chunk)
    first_chunk = False

In [4]:
df_raw = pd.read_csv("/Users/farazmustafa/Documents/Git/Crypto Project/Capstone/Capstone Project/bitcoin_tweets_date_text.csv")

In [5]:
df_raw.head()

Unnamed: 0,date,text
0,2021-02-10 23:59:04,Blue Ridge Bank shares halted by NYSE after #b...
1,2021-02-10 23:58:48,"😎 Today, that's this #Thursday, we will do a ""..."
2,2021-02-10 23:54:48,"Guys evening, I have read this article about B..."
3,2021-02-10 23:54:33,$BTC A big chance in a billion! Price: \487264...
4,2021-02-10 23:54:06,This network is secured by 9 508 nodes as of t...


In [6]:
# clean df 
df_raw = df_raw.sort_values(by = 'date')
dd = df_raw.sample(frac=0.01, replace=False, random_state=1)
dd.reset_index(inplace=True)
for i,s in enumerate(tqdm(dd['text'],position=0, leave=True)):
    text = str(dd.loc[i, 'text'])
    text = text.replace("#", "")
    text = re.sub('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', '', text, flags=re.MULTILINE)
    text = re.sub('@\\w+ *', '', text, flags=re.MULTILINE)
    dd.loc[i, 'text'] = text
# f = open(tweets_clean_file, 'a+', encoding='utf-8')
tweets_clean_file = '/Users/farazmustafa/Documents/Git/Crypto Project/Capstone/Capstone Project/Bitcoin_tweets_clean.csv'
dd.to_csv(tweets_clean_file, header=True, encoding='utf-8',index=False)

100%|██████████| 147169/147169 [07:53<00:00, 310.70it/s]


In [7]:
df_clean = pd.read_csv('/Users/farazmustafa/Documents/Git/Crypto Project/Capstone/Capstone Project/Bitcoin_tweets_clean.csv')

In [8]:
df_clean.head()

Unnamed: 0,index,date,text
0,5337286,2021-02-09 06:48:00,I wonder which company will be able to earn an...
1,11219996,2021-08-25 15:40:11,"✅With solar energy comes BTC, Eth $doge mining..."
2,9662586,2022-10-11 13:14:02,⚡️There’s a start of Bitcoin’s custody today i...
3,5891757,2021-07-18 04:40:49,"Frankly, irrelevant to Bitcoin"
4,2764358,2022-04-16 02:49:02,noticiacrypto criptomoneda crypto newscrypto b...


In [9]:
from textblob import TextBlob

df = df_clean.copy()
df = df[['text']] 
df.columns = ['tweets']
df.head()

Unnamed: 0,tweets
0,I wonder which company will be able to earn an...
1,"✅With solar energy comes BTC, Eth $doge mining..."
2,⚡️There’s a start of Bitcoin’s custody today i...
3,"Frankly, irrelevant to Bitcoin"
4,noticiacrypto criptomoneda crypto newscrypto b...


In [10]:
import nltk
from nltk.stem.wordnet import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
stop_words = nltk.corpus.stopwords.words(['english'])

print(stop_words)

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/farazmustafa/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/farazmustafa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/farazmustafa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
from nltk.tokenize import TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()

def cleaning(data):
    #remove urls
    tweet_without_url = re.sub(r'http\S+',' ', data)

    #remove hashtags
    tweet_without_hashtag = re.sub(r'#\w+', ' ', tweet_without_url)

    #3. Remove mentions and characters that not in the English alphabets
    tweet_without_mentions = re.sub(r'@\w+',' ', tweet_without_hashtag)
    precleaned_tweet = re.sub('[^A-Za-z]+', ' ', tweet_without_mentions)

    #2. Tokenize
    tweet_tokens = TweetTokenizer().tokenize(precleaned_tweet)

    #3. Remove Puncs
    tokens_without_punc = [w for w in tweet_tokens if w.isalpha()]

    #4. Removing Stopwords
    tokens_without_sw = [t for t in tokens_without_punc if t not in stop_words]

    #5. lemma
    text_cleaned = [lem.lemmatize(t) for t in tokens_without_sw]

    #6. Joining
    return " ".join(text_cleaned)

In [12]:
# Clean missing tweets
df['tweets'] = df['tweets'].fillna("")

# Clean tweets
df['cleaned_tweets'] = df['tweets'].apply(cleaning)

# Set date properly
df['date'] = df_clean['date']   # WARNING: make sure df_clean has no weird data
df['date_clean'] = pd.to_datetime(df['date'], errors='coerce').dt.strftime('%Y-%m-%d')

# Drop bad rows where date_clean is NaT
df = df.dropna(subset=['date_clean'])

# Drop old 'date' column
df.drop(columns='date', inplace=True)

# Show result
df.head()


Unnamed: 0,tweets,cleaned_tweets,date_clean
0,I wonder which company will be able to earn an...,I wonder company able earn hold number bitcoin...,2021-02-09
1,"✅With solar energy comes BTC, Eth $doge mining...",With solar energy come BTC Eth doge mining Dec...,2021-08-25
2,⚡️There’s a start of Bitcoin’s custody today i...,There start Bitcoin custody today Bank New Yor...,2022-10-11
3,"Frankly, irrelevant to Bitcoin",Frankly irrelevant Bitcoin,2021-07-18
4,noticiacrypto criptomoneda crypto newscrypto b...,noticiacrypto criptomoneda crypto newscrypto b...,2022-04-16


### EDA-Tweets

In [13]:
# Create daily tweet aggregation with cleaned data
daily_tweets = df.groupby('date_clean').size().reset_index(name='tweet_count')

# Print daily tweet statistics
print("Daily tweet statistics:")
print(daily_tweets['tweet_count'].describe())

Daily tweet statistics:
count     222.000000
mean      662.324324
std       475.076963
min         2.000000
25%       201.750000
50%       580.000000
75%       985.750000
max      2240.000000
Name: tweet_count, dtype: float64


In [14]:
print(df['date_clean'].min())
print(df['date_clean'].max())


2021-02-05
2023-01-09


# Data Bitcoin Prices

In [15]:
import yfinance as yf
import pandas as pd

# Fetch historical Bitcoin prices
def get_btc_prices(start_date='2021-02-05', end_date='2023-01-09'):
    btc = yf.Ticker("BTC-USD")
    df = btc.history(start=start_date, end=end_date)
    return df[['Close', 'Volume']].rename(columns={'Close': 'price'})

# Sentiment Analysis

In [16]:
tokenizer = AutoTokenizer.from_pretrained('ElKulako/cryptobert')
model = AutoModelForSequenceClassification.from_pretrained('ElKulako/cryptobert')
sentiment_pipeline = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

def batch_sentiment(texts, batch_size=32):
    sentiments = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        results = sentiment_pipeline(batch)
        sentiments.extend(results)
    return sentiments

Device set to use mps:0


In [17]:
# Apply sentiment analysis
sentiments = batch_sentiment(df['cleaned_tweets'].tolist())
df['sentiment'] = [s['label'] for s in sentiments]
df['sentiment_score'] = [s['score'] for s in sentiments]

In [63]:
df.head()

Unnamed: 0,tweets,cleaned_tweets,date_clean,sentiment,sentiment_score,price_ma7
0,I wonder which company will be able to earn an...,I wonder company able earn hold number bitcoin...,2021-02-09,Neutral,0.659565,
1,"✅With solar energy comes BTC, Eth $doge mining...",With solar energy come BTC Eth doge mining Dec...,2021-08-25,Neutral,0.716935,
2,⚡️There’s a start of Bitcoin’s custody today i...,There start Bitcoin custody today Bank New Yor...,2022-10-11,Neutral,0.512339,
3,"Frankly, irrelevant to Bitcoin",Frankly irrelevant Bitcoin,2021-07-18,Bearish,0.67809,
4,noticiacrypto criptomoneda crypto newscrypto b...,noticiacrypto criptomoneda crypto newscrypto b...,2022-04-16,Neutral,0.826197,


## Data Integration

In [60]:
# Get price data
price_df = get_btc_prices()

# Convert price_df index to naive datetime first
price_df.index = price_df.index.tz_localize(None)

# Then convert index to string
price_df.index = price_df.index.strftime('%Y-%m-%d')

# Merge datasets
merged_df = pd.merge(
    df,
    price_df,
    left_on='date_clean',  # date_clean is a string, price_df index is now a string too
    right_index=True,
    how='inner'
)

# Create target variable (next day's price movement)
merged_df['price_change'] = merged_df['price'].pct_change().shift(-1)


In [61]:
print(merged_df.columns)

Index(['tweets', 'cleaned_tweets', 'date_clean', 'sentiment',
       'sentiment_score', 'price_ma7', 'price', 'Volume', 'price_change'],
      dtype='object')


In [66]:
merged_df.head()

Unnamed: 0,cleaned_tweets,date_clean,sentiment,sentiment_score,price,Volume,price_change
0,I wonder company able earn hold number bitcoin...,2021-02-09,Neutral,0.659565,46481.105469,91809846886,0.053348
1,With solar energy come BTC Eth doge mining Dec...,2021-08-25,Neutral,0.716935,48960.789062,32646349931,-0.610884
2,There start Bitcoin custody today Bank New Yor...,2022-10-11,Neutral,0.512339,19051.417969,28711532910,0.669
3,Frankly irrelevant Bitcoin,2021-07-18,Bearish,0.67809,31796.810547,18787986667,0.271338
4,noticiacrypto criptomoneda crypto newscrypto b...,2022-04-16,Neutral,0.826197,40424.484375,16833150693,0.606699


In [64]:
merged_df.drop(columns=['tweets','price_ma7'], inplace= True)

In [65]:
merged_df.head()

Unnamed: 0,cleaned_tweets,date_clean,sentiment,sentiment_score,price,Volume,price_change
0,I wonder company able earn hold number bitcoin...,2021-02-09,Neutral,0.659565,46481.105469,91809846886,0.053348
1,With solar energy come BTC Eth doge mining Dec...,2021-08-25,Neutral,0.716935,48960.789062,32646349931,-0.610884
2,There start Bitcoin custody today Bank New Yor...,2022-10-11,Neutral,0.512339,19051.417969,28711532910,0.669
3,Frankly irrelevant Bitcoin,2021-07-18,Bearish,0.67809,31796.810547,18787986667,0.271338
4,noticiacrypto criptomoneda crypto newscrypto b...,2022-04-16,Neutral,0.826197,40424.484375,16833150693,0.606699


### Trading Strategy

In [67]:
def trading_strategy(df, threshold=0.5):
    """Generate trading signals based on sentiment thresholds"""
    df['signal'] = 'HOLD'
    df.loc[df['sentiment_score'] > threshold, 'signal'] = 'BUY'  
    df.loc[df['sentiment_score'] < -threshold, 'signal'] = 'SELL'  
    return df
# Apply strategy
strategy_df = trading_strategy(merged_df.copy())

In [68]:
strategy_df.head()

Unnamed: 0,cleaned_tweets,date_clean,sentiment,sentiment_score,price,Volume,price_change,signal
0,I wonder company able earn hold number bitcoin...,2021-02-09,Neutral,0.659565,46481.105469,91809846886,0.053348,BUY
1,With solar energy come BTC Eth doge mining Dec...,2021-08-25,Neutral,0.716935,48960.789062,32646349931,-0.610884,BUY
2,There start Bitcoin custody today Bank New Yor...,2022-10-11,Neutral,0.512339,19051.417969,28711532910,0.669,BUY
3,Frankly irrelevant Bitcoin,2021-07-18,Bearish,0.67809,31796.810547,18787986667,0.271338,BUY
4,noticiacrypto criptomoneda crypto newscrypto b...,2022-04-16,Neutral,0.826197,40424.484375,16833150693,0.606699,BUY


## Model Training

In [69]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import TimeSeriesSplit