1. Import all libraries

In [12]:
import pandas as pd
import re
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm

tqdm.pandas()

In [9]:
MODEL_NAME = "yiyanghkust/finbert-tone"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)



2. Import the news dataset

In [4]:
input_file_path = "/Users/eric_p/Desktop/Fall 2025/MFIN 7036/Group Project/Text_Data/BTC_match_text.csv"
source_data = pd.read_csv(input_file_path)

3. Extract the columns that will be passed into FinBert

In [5]:
df = source_data[["date_time", "title", "article_text"]].copy()
df.dropna(inplace = True)

df["date_time"] = pd.to_datetime(df["date_time"])
df["date"] = df["date_time"].dt.date

4. Clean the text extracted 

In [6]:
def clean_text(text):
    if not isinstance(df, str):
        return ""

    # lower case
    text = text.lower()
    
    # remove numbered lists at beginning of lines
    text = re.sub(r"\n?\d+\.\s+.*", "", text)

    # normalize whitespace
    text = re.sub(r"\n", " ", text)
    text = re.sub(r"\s+", " ", text).strip()

    return text

4. Truncate text tokens from the end of the text (with a max limit)

In [10]:
def truncate_from_end(text, tokenizer, max_tokens=400):
    tokens = tokenizer.tokenize(text)
    if len(tokens) <= max_tokens:
        return text
    tokens = tokens[-max_tokens:]
    return tokenizer.convert_tokens_to_string(tokens)

5. Construct FinBert input

In [None]:
def build_finbert_text(row, tokenizer):
    title = row["title"] if isinstance(row["title"], str) else ""
    body = clean_text(row["article_text"])
    body = truncate_from_end(body, tokenizer, max_tokens=400)
    return title + " [SEP] " + body

df["finbert_text"] = df.progress_apply(
    lambda x: build_finbert_text(x, tokenizer),
    axis=1
)

100%|██████████| 99400/99400 [00:03<00:00, 31222.73it/s]


6. Store FinBert input data

In [None]:
output_file_path = "/Users/eric_p/Desktop/Fall 2025/MFIN 7036/Group Project/Text_Data/Finbert_input/finbert_input.csv"
df.to_csv(output_file_path, index = False)