In [65]:
import pandas as pd
from functools import partial
from typing import List, Callable
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, roc_curve, auc

In [62]:
def pandas_data_loader(addr: str, columns: List[str], *transforms: Callable[[pd.DataFrame], pd.DataFrame]) -> pd.DataFrame:
    # Load the data from the CSV file
    df = pd.read_csv(addr, usecols=columns)

    # Apply each transform to the DataFrame
    for transform in transforms:
        df = transform(df)

    return df

# Transform index to datetime
def index_to_datetime(df, unit="s"):
    df.index = pd.to_datetime(df.index, unit=unit)
    return df
# Transform col to index
to_index = lambda col, df: df.set_index(col)
# Rename text_plit to text
rename = lambda original, new, df: df.rename(columns={original: new})

def compute_metrics_classification(labels, preds, probs, metrics_to_return=None):
        """
        Compute classification metrics based on the model's predictions and the true labels.

        Args:
        labels (any): The true labels.
        preds (any): The model's predictions.
        probs (any): The model's probabilities
        metrics_to_return (list): List of metric names to compute and return.

        Returns:
        dict: The computed classification metrics.
        """
        if metrics_to_return is None:
            metrics_to_return = ["accuracy", "f1", "precision", "recall", "roc_score", "confusion_matrix"]

        metrics = {}

        if "precision" in metrics_to_return or "recall" in metrics_to_return or "f1" in metrics_to_return:
            precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
            if "precision" in metrics_to_return:
                metrics["precision"] = precision
            if "recall" in metrics_to_return:
                metrics["recall"] = recall
            if "f1" in metrics_to_return:
                metrics["f1"] = f1

        if "accuracy" in metrics_to_return:
            metrics["accuracy"] = accuracy_score(labels, preds)

        if "roc_score" in metrics_to_return:
            metrics["roc_score"] = roc_auc_score(labels, probs, multi_class='ovr')

        if "confusion_matrix" in metrics_to_return:
            metrics["confusion_matrix"] = confusion_matrix(labels, preds)

        return metrics


In [None]:
# Initialize an empty DataFrame to store the sampled data
sampled_tweets = pd.DataFrame()

# Process the data in chunks with error handling
for i, chunk in enumerate(pd.read_csv('../raw/Bitcoin_tweets.csv', usecols=['text', 'date'], chunksize=500000, on_bad_lines='error', encoding='utf-8', low_memory=True)):
    # Convert 'date' column to datetime, coercing errors
    chunk['date'] = pd.to_datetime(chunk['date'], errors='coerce')
    
    # Drop rows with invalid dates
    chunk = chunk.dropna(subset=['date'])
    
    # Convert 'text' column to string and filter out tweets with fewer than 5 words
    chunk['text'] = chunk['text'].astype(str)
    chunk = chunk[chunk['text'].str.split().str.len() > 4]
    
    # Sample 5000 tweets per day from this chunk
    sampled_chunk = chunk.groupby(chunk['date'].dt.date).apply(
        lambda x: x.sample(n=min(5000, len(x)), random_state=1)
    ).reset_index(drop=True)
    
    # Append the sampled data to the main DataFrame
    sampled_tweets = pd.concat([sampled_tweets, sampled_chunk], ignore_index=True)

In [6]:
sampled_tweets

Unnamed: 0,date,text
0,2021-02-05 16:35:13,In case you want to seel your btc please DM as...
1,2021-02-05 17:12:24,🔄 Prices update in $USD (1 hour):\n\n$BTC - 37...
2,2021-02-05 17:30:00,Bitcoin fiat price in last day https://t.co/Bf...
3,2021-02-05 17:48:34,SIGN UP FOR #COINTIPLY !!\n\n- Claim hourly #b...
4,2021-02-05 21:49:40,#blockchain Innovation Done the Polkadot Way A...
...,...,...
1083222,2022-12-27 23:42:49,Binance is crappy\n\n😤😾😾😤😠\n$BUSD #Bitcoin #BT...
1083223,2022-12-27 23:47:03,What is Currency? Exchange Rates | Money Instr...
1083224,2022-12-27 23:53:14,I'm playing #lnbingo! This card is for draw 76...
1083225,2022-12-27 23:46:45,Took a while but still less work than opening ...


In [None]:
# Process the data in chunks with error handling
for i, chunk in enumerate(pd.read_csv('../raw/Bitcoin_tweets_dataset_2.csv', usecols=['text', 'date'], chunksize=1000, on_bad_lines='error', encoding='utf-8', low_memory=True)):
    # Convert 'date' column to datetime, coercing errors
    chunk['date'] = pd.to_datetime(chunk['date'], errors='coerce')
    
    # Drop rows with invalid dates
    chunk = chunk.dropna(subset=['date'])
    
    # Convert 'text' column to string and filter out tweets with fewer than 5 words
    chunk['text'] = chunk['text'].astype(str)
    chunk = chunk[chunk['text'].str.split().str.len() > 4]
    
    # Sample 5000 tweets per day from this chunk
    sampled_chunk = chunk.groupby(chunk['date'].dt.date).apply(
        lambda x: x.sample(n=min(5000, len(x)), random_state=1)
    ).reset_index(drop=True)
    
    # Append the sampled data to the main DataFrame
    sampled_tweets = pd.concat([sampled_tweets, sampled_chunk], ignore_index=True)
# Save the sampled data to a new CSV file
sampled_tweets.to_csv('sampled_tweets_per_day.csv', index=False)

In [8]:
sampled_tweets.groupby(sampled_tweets['date'].dt.date).count()

Unnamed: 0_level_0,date,text
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-02-05,1694,1694
2021-02-06,3273,3273
2021-02-07,3027,3027
2021-02-08,5000,5000
2021-02-09,4344,4344
...,...,...
2022-11-16,8959,8959
2022-11-20,5000,5000
2022-11-21,5000,5000
2022-12-27,235,235


In [4]:
import re
import string
import emoji
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
import pandas as pd
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/hamid/nltk_data...


True

In [52]:
class TweetPreprocessor:
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = PorterStemmer()
        self.ads_keywords = ["buy", "discount", "promo", "sale", "airdrop", "giveaway"]

    def preprocess_with_lemmatization(self, text):
        text_list = text.split()
        text_list = [self.lemmatizer.lemmatize(word) for word in text_list]
        return " ".join(text_list)

    def remove_URL(self, text):
        return re.sub(r"(?:https?://|www\.)\S+\.\S+", "", text)

    def lowercase_tweet(self, text):
        return text.lower()

    def remove_punctuations(self, text):
        exclude = set(string.punctuation)
        for char in ['!', '?', '%', '$', '&']:
            exclude.remove(char)
        return ''.join(ch for ch in text if ch not in exclude)

    def replace_with_BTC(self, text):
        return re.sub(r"Bitcoin|bitcoin|btc|BitCoin", "BTC", text)

    def remove_user_ids(self, text):
        return re.sub(r"@\w+", " ", text)

    def remove_emojis_and_symbols(self, text):
        # Remove emojis and other non-alphanumeric symbols
        return re.sub(r"[^\w\s.,!?%&$]", " ", text)

    def remove_currency_numbers(self, text):
        # Remove numbers with $ or % symbols that likely represent prices/percentages
        text = re.sub(r"\$\s*\d+(?:\.\d+)?", " ", text)  # Remove prices like "$123" or "$ 123.45"
        text = re.sub(r"\b\d+(?:\.\d+)?\s*%", " ", text)  # Remove percentages like "5%" or "123.45 %"
        return text

    def remove_standalone_numbers(self, text):
        # Remove standalone numbers (e.g., 378500), but keep numbers with % or $ for potential context
        return re.sub(r"\b\d+\b", " ", text)

    def is_ads(self, text):
        for ads_keyword in self.ads_keywords:
            if ads_keyword in text:
                return True
        return False

    def preprocess_text(self, text):
        text = self.remove_URL(text)
        text = self.lowercase_tweet(text)
        text = self.remove_emojis_and_symbols(text)
        text = self.remove_currency_numbers(text)
        text = self.remove_standalone_numbers(text)
        text = self.remove_punctuations(text)
        text = self.replace_with_BTC(text)
        text = self.remove_user_ids(text)
        text = self.preprocess_with_lemmatization(text)
        return text

    def preprocess_tweets_df(self, tweets_df):
        tweets_df['cleaned_text'] = tweets_df['text'].apply(self.preprocess_text)
        tweets_df = tweets_df[~tweets_df['cleaned_text'].apply(self.is_ads)]
        return tweets_df

preprocessor = TweetPreprocessor()
cleaned_tweets_df = preprocessor.preprocess_tweets_df(sampled_tweets)

In [53]:
cleaned_tweets_df = cleaned_tweets_df[["date", "cleaned_text"]]

In [54]:
cleaned_tweets_df.date.dt.date

0          2021-02-05
1          2021-02-05
2          2021-02-05
3          2021-02-05
4          2021-02-05
              ...    
1090793    2023-03-01
1090794    2023-03-01
1090795    2023-03-01
1090796    2023-03-01
1090797    2023-03-01
Name: date, Length: 908235, dtype: object

In [55]:
from datetime import datetime
import pandas as pd

def combine_tweets(tweets_df):
    combined_data = []
    overlap = 20  # Number of words to overlap
    chunk_size = 200  # Target number of words per chunk
    
    current_text = []
    current_date = None
    
    for index, row in tweets_df.iterrows():
        tweet_text = row['cleaned_text']
        tweet_date = row['date'].strftime("%Y-%m-%d")  # Get only the date without time

        # Split tweet text into words
        words = tweet_text.split()
        
        # Append words to the current chunk
        current_text.extend(words)
        
        # If current_date is None, set it to the date of the first tweet in the chunk
        if current_date is None:
            current_date = tweet_date

        # Process the current chunk if it reaches the chunk size
        while len(current_text) >= chunk_size:
            # Create a chunk of the first 200 words
            chunk = " ".join(current_text[:chunk_size])
            
            # Append the chunk with its date to the combined data
            combined_data.append({"date": current_date, "text": chunk})
            
            # Remove the first 180 words, leaving a 20-word overlap for the next chunk
            current_text = current_text[chunk_size - overlap:]
            
            # Reset the date for the next chunk to the last tweet's date in the current chunk
            current_date = tweet_date

    # Handle any remaining words in the last chunk if it's not empty
    if current_text:
        combined_data.append({"date": current_date, "text": " ".join(current_text)})

    # Create a new DataFrame with the combined data
    combined_tweets_df = pd.DataFrame(combined_data)
    return combined_tweets_df

In [56]:
combined_tweets_df = combine_tweets(cleaned_tweets_df)

In [59]:
combined_tweets_df.text.iloc[5]

'out BTC BTC cryptocurrency price update in $eur hour $BTC $eth $xrp this year we are introducing workshop at bitblockboom guy swann will host making sense of lightning another in last minute $knc is still looking good BTC BTC dogecointothemoon kleverio kucoincom check this out! $klv is a great project $klv already tradable at world s no cryptocurrency mirBTC mcnicollme yeah right? now a day any conversation that mention BTC immediately catch me BTC BTC price update in $usd hour $BTC $eth $xrp crypto trading? BTC binance BTC crypto BTC will be inevitable for every country BTC cryptocurrency market crypto BTC cryptocurrency blockchain BTC ethereum forex real research app instant payment live withdraw payment proof watch till the end my answer to is learning blockchain technology good for the future? future blockchain the first digital cryptocurrency you can mine on your phone! pi make crypto mining easy and free! watch the vi $omg usd $BTC BTC BTC altcoins crypto michaelsaylor hour avai

In [60]:
combined_tweets_df.to_csv("../raw/combined_2021_to_2023.csv")

In [22]:
combined_tweets_df = pd.read_csv("../raw/combined_2021_to_2023.csv")

In [27]:
majority_votes = pd.read_csv("../raw/majority_prediction_of_2021_2023.csv", names=["id", "prediction"], usecols=["prediction"], skiprows=1)

In [29]:
predicted_df = pd.concat([combined_tweets_df, majority_votes], axis=1)

In [32]:
predicted_df = predicted_df[["date", "text", "prediction"]]

In [59]:
predicted_df.date = pd.to_datetime(predicted_df.date)
predicted_df.set_index("date", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predicted_df.date = pd.to_datetime(predicted_df.date)


In [47]:
address = "../raw/optimized_labeled.csv"
# Loading the price data
columns = ["timestamp", "close", "open", "high", "low", "volume", "label", "volatility", "window_start", "upper_barrier", "lower_barrier"]
labeled_df = pandas_data_loader(address, columns, partial(to_index, "timestamp"), partial(index_to_datetime, unit=None))
labeled_df.rename(inplace=True ,columns={"label": "previous_label"})
labeled_df = labeled_df.sort_index()
labeled_df.rename(columns={'label': 'previous_label'}, inplace=True)
# Shift the labels such that for each day, the label is set to the next day's label
labeled_df["next_day_label"] = labeled_df.previous_label.shift(-1)
labeled_df["next_day_window_start"] = labeled_df.window_start.shift(-1)
labeled_df.loc[labeled_df.iloc[0].name, 'next_day_window_start'] = True

In [60]:
merged_df = predicted_df.merge(
    labeled_df[["next_day_label", 'next_day_window_start', 'previous_label']], left_index=True, right_index=True, how="left"
)
merged_df.dropna(inplace=True)
merged_df.next_day_label.value_counts()

next_day_label
2.0    43221
0.0    38992
1.0    28342
Name: count, dtype: int64

In [67]:
merged_df

Unnamed: 0_level_0,text,prediction,next_day_label,next_day_window_start,previous_label
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-02-05,in case you want to seel your BTC please dm as...,1,1.0,False,1
2021-02-05,free coin value BTC like cryptocurrency news B...,1,1.0,False,1
2021-02-05,to BTC news roundup for feb by market daily cr...,1,1.0,False,1
2021-02-05,late i m going to wait until it hit i ve got v...,0,1.0,False,1
2021-02-05,believe you show me a hedge fund that doesn t ...,1,1.0,False,1
...,...,...,...,...,...
2023-03-01,use the moneyprinter to paper over reality BTC...,0,0.0,False,0
2023-03-01,cryptonews investing eth BTCnews BTC wa tradin...,2,0.0,False,0
2023-03-01,nftart metaverse larvaapes opensea megapunks l...,1,0.0,False,0
2023-03-01,bnx are you looking for a way to recover your ...,0,0.0,False,0


In [68]:
compute_metrics_classification(merged_df.previous_label, merged_df.prediction, [], metrics_to_return=["accuracy"])

{'accuracy': 0.33551625887567277}