In [1]:
import pandas as pd
import swifter  # noqa: F401
import torch
from sklearn.ensemble import HistGradientBoostingClassifier
from tqdm.notebook import tqdm
from transformers import BertModel, BertTokenizer

# Pick the best available accelerator: CUDA first, then Apple's MPS backend, then CPU.
# `torch.backends.mps.is_available()` is used because `torch.mps.is_available()` does not
# exist on older PyTorch releases and would raise AttributeError on machines without CUDA.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

In [2]:
# Record the selected compute device in the notebook output.
print(f"Using device: {device}")

Using device: cuda


In [3]:
# Subset-sampling configuration. Neither constant is referenced in the cells visible here.
# NOTE(review): presumably consumed by a later subset/split step — confirm, or remove if unused.
MIN_COMMENTS_PER_SUBSET = 50
N_OF_SPLITS_PER_SUBSET = 50

In [4]:
# Cache probe: reuse the already-embedded posts when the parquet file is present.
# On a cache miss both names stay None; the falsy `embedding_exists` is what the
# later `if not embedding_exists:` cells key off to rebuild the data.
embedding_exists = df_reddit = None

try:
    df_reddit = pd.read_parquet("../data/processed/reddit_posts_embedded.parquet")
except FileNotFoundError:
    print("Embedding file not found, cleaning up and re-embedding")
else:
    embedding_exists = True
    print("Embedding file found")

Embedding file found


In [5]:
# Cache miss: fall back to the raw posts and parse both timestamp columns.
if not embedding_exists:
    df_reddit = pd.read_parquet("../data/processed/reddit_posts.parquet")
    for timestamp_column in ("created", "edited"):
        df_reddit[timestamp_column] = pd.to_datetime(
            df_reddit[timestamp_column],
            format="%Y-%m-%d %H:%M:%S",
        )

df_reddit.head(1)

Unnamed: 0,id,parent_id,author,body,created,depth,edited,score,search_query,subreddit,title,url,num_comments,test,fraud,embedding
0,yxu5tv,,magus-21,"Secretly lending customer funds, market-making...",2022-11-17 16:10:14,-1,NaT,1597,Safe Moon,r/CryptoCurrency,"""DYOR"" is worthless. You can't ""Do Your Own Re...",https://www.reddit.com/r/CryptoCurrency/commen...,634,True,True,"[-0.4334820508956909, -0.4628458321094513, -0...."


In [6]:
# Coin metadata, indexed by coin name so it can be looked up with `df_coins.loc[name]`.
df_coins = (
    pd.read_json("../data/raw/coins.json")
    .assign(
        start_date=lambda coins: pd.to_datetime(coins["start_date"], format="ISO8601"),
        end_date=lambda coins: pd.to_datetime(coins["end_date"], format="ISO8601"),
    )
    .set_index("name")
)

df_coins.head(1)

Unnamed: 0_level_0,symbol,fraud,test,max_market_cap_e9,start_date,subreddits,end_date
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Bitcoin,BTC,False,False,1800.0,2010-07-14,"[CryptoCurrency, CryptoMoonShots, CryptoMarket...",NaT


In [7]:
def get_number_of_comments_per_coin(name: str, cutoff_date: str) -> int:
    """
    Count the posts collected for a coin up to (and including) a cutoff date.

    Args:
        name (str): Coin name, matched against the ``search_query`` column.
        cutoff_date (str): Upper bound on ``created``, formatted as ``YYYY-MM-DD``.

    Returns:
        int: Number of rows for the coin whose ``created`` is on or before the cutoff.

    """
    cutoff_timestamp = pd.to_datetime(cutoff_date, format="%Y-%m-%d")
    belongs_to_coin = df_reddit["search_query"] == name
    created_by_cutoff = df_reddit["created"] <= cutoff_timestamp
    return len(df_reddit[belongs_to_coin & created_by_cutoff])


def get_first_comment_date(name: str) -> pd.Timestamp:
    """
    Get the earliest creation timestamp among the posts of a given coin.

    Args:
        name (str): Coin name, matched against the ``search_query`` column.

    Returns:
        pd.Timestamp: Minimum of the ``created`` column over the coin's rows
        (not a string; the column is parsed to datetimes when the raw data is loaded).

    """
    # Select the single column while filtering instead of materialising the whole filtered frame.
    return df_reddit.loc[df_reddit["search_query"] == name, "created"].min()


def get_last_comment_date(name: str) -> pd.Timestamp:
    """
    Get the latest activity timestamp among the posts of a given coin.

    A post's activity timestamp is its ``edited`` value, falling back to
    ``created`` when the post was never edited.

    Args:
        name (str): Coin name, matched against the ``search_query`` column.

    Returns:
        pd.Timestamp: Maximum activity timestamp over the coin's rows
        (not a string; both columns are parsed to datetimes when the raw data is loaded).

    """
    # Filter down to the coin's two date columns first: the previous implementation copied
    # the entire frame (embeddings included) on every call just to fill one column.
    coin_dates = df_reddit.loc[df_reddit["search_query"] == name, ["created", "edited"]]
    return coin_dates["edited"].fillna(coin_dates["created"]).max()


def get_coin_info(name: str) -> pd.Series:
    """
    Look up the metadata record of a coin by its name.

    Args:
        name (str): Coin name; must be a label of the ``df_coins`` index.

    Returns:
        pd.Series: The ``df_coins`` row stored under that name.

    """
    coin_record = df_coins.loc[name]
    return coin_record


def get_coin_info_row(row: pd.Series) -> pd.Series:
    """
    Look up the coin metadata that belongs to one post.

    Args:
        row (pd.Series): A post row exposing a ``search_query`` field with the coin name.

    Returns:
        pd.Series: The ``df_coins`` row stored under the post's ``search_query``.

    """
    coin_name = row.search_query
    return df_coins.loc[coin_name]


def is_comment_valid_row(row: pd.Series) -> bool:
    """
    Decide whether a post falls inside its coin's active date window.

    The post's date is its ``edited`` timestamp when present, otherwise ``created``.
    It must not precede the coin's ``start_date`` and, if the coin has an
    ``end_date``, must not follow it.

    Args:
        row (pd.Series): A post row with ``search_query``, ``created`` and ``edited`` fields.

    Returns:
        bool: True when the post date lies within the coin's window, False otherwise.

    """
    coin = get_coin_info_row(row)

    # An edit timestamp, when present, takes precedence over the creation timestamp.
    if pd.notna(row.edited):
        comment_date = row.edited
    else:
        comment_date = row.created

    start_date_valid = comment_date >= coin["start_date"]

    # Coins without an end date impose no upper bound.
    if pd.notna(coin["end_date"]):
        end_date_valid = comment_date <= coin["end_date"]
    else:
        end_date_valid = True

    return start_date_valid and end_date_valid

In [8]:
# Cache miss only: attach each post's coin labels and drop posts outside the coin's date window.
if not embedding_exists:
    # Run the row-wise coin lookup once and reuse it for both labels; it was previously
    # executed twice over the whole frame, doubling the cost of the slowest step.
    coin_info = df_reddit.swifter.apply(get_coin_info_row, axis=1)
    df_reddit["test"] = coin_info["test"]
    df_reddit["fraud"] = coin_info["fraud"]
    df_reddit["valid"] = df_reddit.swifter.apply(is_comment_valid_row, axis=1)

    # Keep only valid posts, then discard the helper column and renumber the rows.
    df_reddit = df_reddit[df_reddit["valid"]].drop(columns=["valid"]).reset_index(drop=True)

# Kept at top level: an expression nested inside the `if` is never rendered as cell output.
df_reddit.head(1)

In [9]:
# Number of posts per coin (one `search_query` value per coin), largest first.
df_reddit["search_query"].value_counts()

search_query
Bitcoin       142163
Chainlink      72875
Ethereum       72409
Safe Moon      69352
Cosmos         57082
Avalanche      35673
FTX Token      21656
THORChain      18772
Terra Luna      8683
BitForex        2611
BeerCoin         805
Name: count, dtype: Int64

In [10]:
# Cache miss only: embed every post body with BERT and store the vectors in `embedding`.
if not embedding_exists:
    bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    bert_embedder = BertModel.from_pretrained("bert-base-uncased").to(device)

    # Pull the texts out once so each batch is a cheap positional slice.
    texts = df_reddit["body"].values
    embeddings = []

    EMBEDDING_BATCH_SIZE = 2**7
    print(f"Embedding batch size: {EMBEDDING_BATCH_SIZE}")

    for batch_start in tqdm(range(0, len(texts), EMBEDDING_BATCH_SIZE)):
        # The tokenizer is documented to take a list of strings, so convert the array slice.
        batch_texts = list(texts[batch_start : batch_start + EMBEDDING_BATCH_SIZE])

        # Inference only: no gradients are needed.
        with torch.no_grad():
            # Call the tokenizer directly; `batch_encode_plus` is deprecated in favour of `__call__`.
            tokens = bert_tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                return_tensors="pt",
                add_special_tokens=True,
            ).to(device)

            # Move the pooled output back to the CPU before converting it to Python lists.
            outputs = bert_embedder(**tokens)["pooler_output"].cpu()

        embeddings.extend(outputs.tolist())

    # One embedding (list of floats) per row, in the same order as `texts`.
    df_reddit["embedding"] = embeddings


In [11]:
# Persist the freshly embedded frame to the path the cache-probe cell reads,
# so later runs can skip the cleaning and embedding steps.
if not embedding_exists:
    df_reddit.to_parquet("../data/processed/reddit_posts_embedded.parquet")


In [None]:
# Split on the per-post `test` flag: flagged rows form the test set, the rest the training set.
is_test_post = df_reddit["test"]
train_df = df_reddit[~is_test_post]
test_df = df_reddit[is_test_post]

In [None]:
# Leave-one-coin-out validation: each training coin is held out in turn, a classifier is
# fitted on the posts of the remaining coins, and accuracy is reported on the held-out coin.
# (The unused `enumerate` index was dropped.)
for val_coin in train_df["search_query"].unique():
    # Get the fitting and validation splits
    fit_df = train_df[train_df["search_query"] != val_coin]
    val_df = train_df[train_df["search_query"] == val_coin]

    # HistGradientBoostingClassifier on the embeddings, with a fixed seed for repeatability
    model = HistGradientBoostingClassifier(max_iter=1000, random_state=42)
    model.fit(fit_df["embedding"].to_list(), fit_df["fraud"])

    # Predict the validation set
    predictions = model.predict(val_df["embedding"].to_list())

    # Fraction of the held-out coin's posts whose predicted label matches the `fraud` column
    accuracy = (predictions == val_df["fraud"]).mean()

    print(f"Coin left out: {val_coin}, Accuracy: {accuracy:.2f}")
    print()

In [None]:
# Sanity check: count rows with a missing embedding.
df_reddit["embedding"].isna().sum()
