Setup: mount Google Drive and import the core libraries (this cell runs successfully).

In [None]:
# Mount Google Drive
# The data files read later in this notebook live under /content/drive, so this must run first.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
tqdm.pandas()
import math

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Load the processed STRUX earnings-call table from Drive (this cell runs successfully).

In [None]:
import pandas as pd

# Load the processed earnings-call table from Drive.
# NOTE(review): no converters are passed, so columns that held lists/dicts before export
# presumably come back as plain strings — confirm before iterating over them.
strux = pd.read_csv('/content/drive/MyDrive/DATA7001/strux_processed.csv')


Extract the executives from each call's participant list (this cell runs successfully).

In [None]:
from tqdm import tqdm
tqdm.pandas()

def get_executives(participant_list):
    """Extract the executives from one call's participant records.

    Parameters
    ----------
    participant_list : list
        Participant records. Only ``dict`` entries whose ``'position'`` equals
        ``'Executive'`` are kept. Any non-list input is treated as missing.

    Returns
    -------
    list of dict
        One ``{'name': ..., 'designation': ...}`` entry per executive, where
        ``designation`` is read from the participant's ``'description'`` key.
    """
    executives = []
    if isinstance(participant_list, list):
        for person in participant_list:
            if not isinstance(person, dict):
                continue
            if person.get('position') != 'Executive':
                continue
            executives.append({
                'name': person.get('name'),
                'designation': person.get('description'),
            })
    return executives

# One executive list per call, stored in a new column; progress_apply is the
# tqdm-instrumented variant of apply registered by tqdm.pandas() above.
strux['executives'] = strux['participants'].progress_apply(get_executives)

100%|██████████| 11411/11411 [00:00<00:00, 533801.80it/s]


Aggregate prepared remarks & answers per executive

In [None]:
def build_role_texts_for_row(row):
    """
    Collect, per executive role, the text spoken in one call.

    Parameters:
      row: a DataFrame row (or any mapping offering ``.get``) holding the
           fields 'participants', 'prepared_remarks' and
           'questions_and_answers'. Each field is expected to be a list of
           dicts; a string containing a Python-literal list is decoded, and
           anything else (NaN, None, unparsable text) is treated as empty.

    Returns:
      pd.Series with exactly two entries:
        prepared_by_role: dict {role: full text of prepared remarks}
        answers_by_role: dict {role: full text of all answers}
      A role is the 'description' of a participant whose 'position' is
      'Executive'; speech by anyone else is ignored.
    """
    import ast  # local import: only needed to decode string-encoded lists

    def _as_records(value):
        # Real lists pass through; "[{...}, ...]" strings are decoded; all else is empty.
        if isinstance(value, list):
            return value
        if isinstance(value, str):
            try:
                parsed = ast.literal_eval(value)
            except (ValueError, SyntaxError):
                return []
            return parsed if isinstance(parsed, list) else []
        return []

    # speaker name -> role, executives only (nameless participants cannot be matched)
    emap = {p.get("name"): p.get("description")
            for p in _as_records(row.get("participants", []))
            if isinstance(p, dict)
            and p.get("position") == "Executive"
            and p.get("name") is not None}

    def _texts_by_role(items):
        # Group the joined speech of every item spoken by a known executive.
        grouped = {}
        for item in _as_records(items):
            if not isinstance(item, dict):
                continue  # skip malformed rows
            role = emap.get(item.get('name'))
            text = join_speech(item.get('speech'))
            if role and text:
                grouped.setdefault(role, []).append(text)
        # join all speeches per role
        return {r: " ".join(lst) for r, lst in grouped.items()}

    # Each Series entry holds a whole dict, so the caller's two-column
    # assignment receives one dict per column.
    return pd.Series({
        'prepared_by_role': _texts_by_role(row.get('prepared_remarks', [])),
        'answers_by_role': _texts_by_role(row.get('questions_and_answers', [])),
    })

In [None]:
def join_speech(x):
    """Flatten a speech value into a single string.

    A falsy value (``None``, ``""``, ``[]``, ``0``) yields ``""``. A non-empty
    list is space-joined after converting each element with ``str``. Anything
    else is converted with ``str``.
    """
    if x and isinstance(x, list):
        return " ".join(map(str, x))
    return str(x) if x else ""

In [None]:
from tqdm import tqdm
tqdm.pandas()
# Row-wise apply (axis=1): the values build_role_texts_for_row returns for each row
# are assigned to the two new columns named on the left.
strux[['prepared_by_role','answers_by_role']] = strux.progress_apply(build_role_texts_for_row, axis=1)

100%|██████████| 11411/11411 [00:37<00:00, 301.96it/s]


In [None]:
# Break long text into word-count chunks (default 400 words, aimed at a ~512-token transformer input)
def chunk_text(text, chunk_size=400):
    """Split ``text`` on whitespace and regroup it into consecutive pieces.

    Each piece holds at most ``chunk_size`` words, re-joined with single
    spaces. Empty or whitespace-only text gives an empty list.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        pieces.append(" ".join(window))
    return pieces

In [None]:
!pip install transformers

from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Load the pre-trained FinBERT tone model
tone_model_name = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(tone_model_name)
model = AutoModelForSequenceClassification.from_pretrained(tone_model_name)
# device=0 pins the pipeline to the first CUDA device (the recorded run reports "cuda:0").
# NOTE(review): presumably this fails on a CPU-only runtime — confirm, or pass device=-1 there.
finbert_tone = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0)  # 0 for GPU



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
# Signed tone per chunk: +confidence for positive, -confidence for negative, 0 for neutral
def sentiment_score(text_chunks):
    """Average signed FinBERT tone over a sequence of text chunks.

    Parameters
    ----------
    text_chunks : iterable of str
        Pieces of text, each passed to ``finbert_tone`` with truncation on.

    Returns
    -------
    float or int
        Mean of the per-chunk signed scores: ``+score`` when the top label is
        positive, ``-score`` when negative, ``0`` otherwise. Returns ``0`` for
        an empty input. A chunk whose classification raises counts as ``0``
        and emits a warning instead of failing silently.
    """
    import warnings  # local import: only used on the failure path

    scores = []
    for chunk in text_chunks:
        try:
            out = finbert_tone(chunk, truncation=True)
            # Compare case-insensitively: label casing is checkpoint-specific
            # (finbert-tone reports 'Positive' / 'Negative' / 'Neutral').
            label = str(out[0]['label']).lower()
            score = out[0]['score']
        except Exception as exc:
            # Best effort is kept (the chunk counts as neutral), but the failure is surfaced.
            warnings.warn(f"sentiment_score: chunk scored as 0 after error: {exc!r}")
            scores.append(0)
            continue
        if label == 'positive':
            scores.append(score)
        elif label == 'negative':
            scores.append(-score)
        else:
            scores.append(0)
    if len(scores) == 0:
        return 0
    return np.mean(scores)

In [None]:
from tqdm import tqdm
tqdm.pandas()

def compute_sentiment_safe(role_dict):
    """Score every role's text, tolerating missing cells.

    ``role_dict`` maps role -> text. Each text is chunked with ``chunk_text``
    and scored with ``sentiment_score``. Any non-dict input (for example NaN)
    yields an empty dict.
    """
    sentiments = {}
    if isinstance(role_dict, dict):
        for role, txt in role_dict.items():
            sentiments[role] = sentiment_score(chunk_text(txt))
    return sentiments

# Per-role sentiment dicts for prepared remarks and for answers.
# NOTE(review): the recorded run handled ~210k rows/s, far too fast for model inference —
# presumably every cell fell through to {}; verify the upstream *_by_role columns hold dicts.
strux['prepared_sentiment'] = strux['prepared_by_role'].progress_apply(compute_sentiment_safe)
strux['answers_sentiment'] = strux['answers_by_role'].progress_apply(compute_sentiment_safe)

100%|██████████| 11411/11411 [00:00<00:00, 211568.35it/s]
100%|██████████| 11411/11411 [00:00<00:00, 209554.51it/s]


In [None]:
# One score per call: pool the prepared-remark and answer scores of all roles
def overall_call_sentiment(prep_dict, ans_dict):
    """Return the mean of every per-role score in both dicts, or 0 when there are none."""
    pooled = [*prep_dict.values(), *ans_dict.values()]
    if not pooled:
        return 0
    return np.mean(pooled)

# Row-wise: combine each call's two per-role sentiment dicts into a single score column.
strux['overall_sentiment'] = strux.progress_apply(
    lambda r: overall_call_sentiment(r['prepared_sentiment'], r['answers_sentiment']), axis=1
)

100%|██████████| 11411/11411 [00:00<00:00, 107944.05it/s]


In [None]:
def sentiment_label(score):
    """Bucket a sentiment score into an ordinal label from 1 to 5.

    Buckets (lower bounds exclusive, upper bounds as written):
      score <= -0.6        -> 1  (Negative)
      -0.6 < score <= -0.2 -> 2  (Slightly negative)
      -0.2 < score <  0.2  -> 3  (Neutral)
       0.2 <= score < 0.6  -> 4  (Slightly positive)
      score >= 0.6         -> 5  (Positive)

    A missing score (``None`` / NaN) returns NaN. Without this guard NaN fails
    every comparison and would be reported as 5 (Positive).
    """
    if pd.isna(score):
        return np.nan
    if score <= -0.6:
        return 1  # Negative
    elif score <= -0.2:
        return 2  # Slightly negative
    elif score < 0.2:
        return 3  # Neutral
    elif score < 0.6:
        return 4  # Slightly positive
    else:
        return 5  # Positive

# Ordinal 1-5 label derived from the continuous overall score of each call.
strux['overall_sentiment_label'] = strux['overall_sentiment'].apply(sentiment_label)

In [None]:
def aggregate_sentiment(sent_dict):
    """Return the arithmetic mean of a {role: score} dict's values.

    A falsy input (empty dict, ``None``) is treated as neutral and gives 0.
    """
    if sent_dict:
        values = list(sent_dict.values())
        return sum(values) / len(values)
    return 0  # neutral if empty

# Overall call sentiment
# Each column is the mean over roles of the corresponding per-role dict (0 when the dict is empty).
strux['overall_prepared_sentiment'] = strux['prepared_sentiment'].apply(aggregate_sentiment)
strux['overall_answers_sentiment'] = strux['answers_sentiment'].apply(aggregate_sentiment)

# Optional: combine prepared + answers for total call sentiment
# Unweighted average of the two section means, so each section counts equally
# regardless of how many roles spoke in it.
strux['overall_call_sentiment'] = (strux['overall_prepared_sentiment'] + strux['overall_answers_sentiment']) / 2

In [None]:
import pandas as pd

# Extract list of tickers from strux
# NOTE(review): stock_data is not created in any cell visible above this one — it must
# already exist in the kernel; confirm where it is loaded.
tickers = strux['ticker'].unique().tolist()
# Keep "<ticker>_Close" only for tickers that actually have such a column in stock_data.
close_cols = [f"{t}_Close" for t in tickers if f"{t}_Close" in stock_data.columns]

# Keep only Close prices
# .copy() makes this independent of stock_data: columns added to stock_data later do not appear here.
stock_close = stock_data[close_cols].copy()

In [None]:
# Parse both date axes as datetimes so call dates and price dates compare on the same type.
strux['date'] = pd.to_datetime(strux['date'])
stock_close.index = pd.to_datetime(stock_close.index)

# Optional: sort both
# reset_index(drop=True) discards the old row labels after sorting by ticker then date.
strux = strux.sort_values(['ticker','date']).reset_index(drop=True)
stock_close = stock_close.sort_index()

In [None]:
# Step 1: create a mapping from strux ticker -> stock column name
stock_cols = [c for c in stock_data.columns if c.endswith('_Close')]

# remove '_Close' to get the ticker part
stock_tickers = [c.replace('_Close','') for c in stock_cols]

# Map each strux ticker to its own "<ticker>_Close" column. The match is exact:
# a prefix test would pair ticker 'A' with 'AEP_Close' and attach another
# company's prices to it.
close_col_set = set(stock_cols)
ticker_map = {}
for t in strux['ticker'].unique():
    col_name = f"{t}_Close"
    if col_name in close_col_set:
        ticker_map[t] = col_name
    else:
        print(f"Warning: No matching stock column found for ticker {t}")

# Example: ticker_map might be {'AEP':'AEP_Close', ...}

# Step 2: compute forward returns using mapped column names.
# pct_change(h).shift(-h) at row i is (price[i+h] - price[i]) / price[i], where h counts
# rows of stock_data (presumably trading days — confirm), not calendar days.
forward_returns = {}
for horizon in [1,3,7]:
    for t, col_name in ticker_map.items():
        forward_returns[f"{t}_return_{horizon}d"] = stock_data[col_name].pct_change(periods=horizon).shift(-horizon)

# Attach all new columns in one concat instead of inserting them one by one; the recorded
# run warned on every single insert (presumably pandas' fragmentation PerformanceWarning).
# Existing columns with the same names are dropped first so re-running the cell stays idempotent.
stock_data = pd.concat(
    [
        stock_data.drop(columns=list(forward_returns), errors='ignore'),
        pd.DataFrame(forward_returns, index=stock_data.index),
    ],
    axis=1,
)

# Merge returns with strux
def get_return(row, horizon=1):
    """Look up the forward return recorded for one call.

    Parameters
    ----------
    row : mapping
        Must provide ``row['ticker']`` and ``row['date']``.
    horizon : int
        Horizon suffix of the return column to read (default 1).

    Returns
    -------
    The value of ``stock_data["<ticker>_return_<horizon>d"]`` at ``row['date']``,
    or ``None`` when that column does not exist or the date is not in the index.
    """
    # Return columns are named after the strux ticker itself, not after the mapped
    # "<...>_Close" column, so the name is built from the ticker directly.
    col = f"{row['ticker']}_return_{horizon}d"
    if col not in stock_data.columns:
        return None
    if row['date'] not in stock_data.index:
        return None  # no price row for this date (.loc would raise KeyError)
    return stock_data.loc[row['date'], col]

# One strux column per horizon, filled row by row through get_return.
# `horizon` is passed as an argument, so each lambda call uses the current loop value.
for horizon in [1,3,7]:
    strux[f'return_{horizon}d'] = strux.apply(lambda row: get_return(row, horizon), axis=1)



  stock_data[f"{t}_return_{horizon}d"] = stock_data[col_name].pct_change(periods=horizon).shift(-horizon)
  stock_data[f"{t}_return_{horizon}d"] = stock_data[col_name].pct_change(periods=horizon).shift(-horizon)
  stock_data[f"{t}_return_{horizon}d"] = stock_data[col_name].pct_change(periods=horizon).shift(-horizon)
  stock_data[f"{t}_return_{horizon}d"] = stock_data[col_name].pct_change(periods=horizon).shift(-horizon)
  stock_data[f"{t}_return_{horizon}d"] = stock_data[col_name].pct_change(periods=horizon).shift(-horizon)
  stock_data[f"{t}_return_{horizon}d"] = stock_data[col_name].pct_change(periods=horizon).shift(-horizon)
  stock_data[f"{t}_return_{horizon}d"] = stock_data[col_name].pct_change(periods=horizon).shift(-horizon)
  stock_data[f"{t}_return_{horizon}d"] = stock_data[col_name].pct_change(periods=horizon).shift(-horizon)
  stock_data[f"{t}_return_{horizon}d"] = stock_data[col_name].pct_change(periods=horizon).shift(-horizon)
  stock_data[f"{t}_return_{horizon}d"] = stock

In [None]:
def get_return(row, horizon=1):
    """Look up the forward return for one call (this redefinition shadows the earlier get_return).

    Parameters
    ----------
    row : mapping
        Must provide ``row['ticker']`` and ``row['date']``.
    horizon : int
        Horizon suffix of the return column to read (default 1).

    Returns
    -------
    The value of column ``"<ticker>_return_<horizon>d"`` at ``row['date']``, or
    ``None`` when no frame has that column or the date is not in its index.
    """
    ticker_col = f"{row['ticker']}_return_{horizon}d"
    # stock_close is a Close-only copy of stock_data, while the return columns are
    # written to stock_data. Check stock_close first (original behaviour), then
    # fall back to stock_data so the lookup can actually succeed.
    for prices in (stock_close, stock_data):
        if ticker_col in prices.columns:
            if row['date'] in prices.index:
                return prices.loc[row['date'], ticker_col]
            return None  # column exists but there is no price row for this date
    return None

# Recompute the per-horizon return columns with the get_return defined in this cell;
# this overwrites any return_*d columns already present in strux.
for horizon in [1, 3, 7]:
    strux[f'return_{horizon}d'] = strux.apply(lambda row: get_return(row, horizon), axis=1)

In [None]:
# Define safe sentiment mapping
def safe_sentiment(d):
    """Apply ``compute_sentiment`` to every text in a {role: text} dict.

    Any non-dict input (for example a missing cell) yields an empty dict.
    """
    scored = {}
    if isinstance(d, dict):
        for role, text in d.items():
            scored[role] = compute_sentiment(text)
    return scored

# Apply to prepared remarks and answers
# This overwrites the prepared_sentiment / answers_sentiment columns assigned earlier in the notebook.
# NOTE(review): safe_sentiment calls compute_sentiment, which is only defined in a later cell —
# on a fresh kernel this cell presumably fails with NameError; confirm the execution order.
strux['prepared_sentiment'] = strux['prepared_by_role'].apply(safe_sentiment)
strux['answers_sentiment'] = strux['answers_by_role'].apply(safe_sentiment)

In [None]:
def aggregate_sent(sent_dict):
    """Return the mean of a {role: score} dict's values, or 0 for a falsy input (empty dict, None)."""
    if not sent_dict:
        return 0
    return sum(sent_dict.values()) / len(sent_dict)

# Re-derive the section-level means from the current per-role dicts (overwrites the earlier columns).
strux['overall_prepared_sentiment'] = strux['prepared_sentiment'].apply(aggregate_sent)
strux['overall_answers_sentiment'] = strux['answers_sentiment'].apply(aggregate_sent)
# Unweighted average of the two section means.
strux['overall_call_sentiment'] = (strux['overall_prepared_sentiment'] + strux['overall_answers_sentiment'])/2

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr

# ---------------------------
# 1. Assume strux is your processed earnings call dataframe
# Columns: 'ticker', 'date', 'prepared_sentiment', 'answers_sentiment'
# 'prepared_sentiment' and 'answers_sentiment' are dicts: {role: score}
# We'll compute an overall numeric sentiment per row (call)
# ---------------------------

def compute_overall_sentiment(prep_dict, ans_dict):
    """
    Average every non-None score found in the two per-role dicts.

    Either argument may be something other than a dict (for example NaN for a
    missing cell); such arguments contribute nothing. Returns ``np.nan`` when
    no score is available at all.
    """
    pooled = [
        value
        for source in (prep_dict, ans_dict)
        if isinstance(source, dict)
        for value in source.values()
        if value is not None
    ]
    return np.mean(pooled) if pooled else np.nan

# Create overall sentiment column
# r.get(...) returns None when a column is absent, which compute_overall_sentiment tolerates.
strux['overall_sentiment'] = strux.apply(
    lambda r: compute_overall_sentiment(r.get('prepared_sentiment'), r.get('answers_sentiment')), axis=1
)

# ---------------------------
# 2. Compute stock returns
# ---------------------------
# Assume stock_df is your stock data with DateTime index and columns like 'A_Close', 'AEP_Close', etc.

stock_df = stock_data.sort_index()  # ensure sorted by date
# Keep only calls whose ticker also appears in stock_tickers (built in the ticker-mapping cell above).
# Note that strux is filtered in place by reassignment, so the dropped rows are gone for later cells.
valid_tickers = set(stock_tickers).intersection(set(strux['ticker']))
strux = strux[strux['ticker'].isin(valid_tickers)]

# Forward returns for one ticker, looked up at the supplied call dates
def compute_forward_returns(ticker, dates, horizons=[1,3,7]):
    """Build a frame of forward returns for ``ticker``, indexed by ``dates``.

    For each horizon ``h`` the column ``return_<h>d`` holds
    ``(close shifted by -h rows - close) / close`` from ``stock_df``, aligned
    on the index; dates that are not in ``stock_df``'s index come out as NaN.
    If ``"<ticker>_Close"`` is not a column of ``stock_df`` a warning is
    printed and an empty frame indexed by ``dates`` is returned.
    """
    close_col = f"{ticker}_Close"
    if close_col not in stock_df.columns:
        print(f"Warning: {close_col} not found in stock_df")
        return pd.DataFrame(index=dates)
    close = stock_df[close_col]
    frame = pd.DataFrame(index=dates)
    for h in horizons:
        frame[f'return_{h}d'] = (close.shift(-h) - close) / close
    return frame

# Example: get all returns for all tickers
# One frame per ticker, tagged with its ticker and date, then stacked into returns_df.
all_returns = []
for ticker in strux['ticker'].unique():
    ticker_dates = strux[strux['ticker']==ticker]['date']
    r = compute_forward_returns(ticker, ticker_dates)
    r['ticker'] = ticker
    r['date'] = r.index  # the frame is indexed by call date; expose it as a merge key
    all_returns.append(r)
returns_df = pd.concat(all_returns, ignore_index=True)

# Merge returns with strux sentiment
# Left join keeps every call even when no return row matches.
# NOTE(review): if a (ticker, date) pair occurs more than once on either side the merge
# multiplies rows — confirm the keys are unique.
df = pd.merge(
    strux[['ticker','date','overall_sentiment']],
    returns_df,
    on=['ticker','date'],
    how='left'
)

# ---------------------------
# 3. Compute correlations per horizon
# ---------------------------
horizons = [1,3,7]
for h in horizons:
    # Coerce to numeric so None / non-numeric cells become NaN and can be masked out.
    x = pd.to_numeric(df['overall_sentiment'], errors='coerce')
    y = pd.to_numeric(df[f'return_{h}d'], errors='coerce')
    mask = x.notna() & y.notna()
    if mask.sum() == 0:
        # The recorded run hit this branch for all three horizons, i.e. no row had both values.
        print(f"No data for horizon {h} days")
        continue
    corr, pval = pearsonr(x[mask], y[mask])
    print(f"Horizon {h} days: Pearson correlation = {corr:.4f}, p-value = {pval:.4g}")

No data for horizon 1 days
No data for horizon 3 days
No data for horizon 7 days


In [None]:
# Check ticker-column mapping
# Diagnostic only: prints every strux ticker that lacks a "<ticker>_Close" column in stock_df.
for ticker in strux['ticker'].unique():
    close_col = f"{ticker}_Close"
    if close_col not in stock_df.columns:
        print(f"Missing: {close_col}")

In [None]:
# Keep only calls whose ticker has an exactly named "<ticker>_Close" column in stock_df.
# strux is reassigned, so the filtered-out rows are not available to later cells.
valid_tickers = [ticker for ticker in strux['ticker'].unique() if f"{ticker}_Close" in stock_df.columns]
strux = strux[strux['ticker'].isin(valid_tickers)]

In [None]:
# Sanity check: compare the date span of the calls with the date span of the price data.
print(strux['date'].min(), strux['date'].max())
print(stock_df.index.min(), stock_df.index.max())

2017-01-17 00:00:00 2024-08-21 00:00:00
1962-01-02 00:00:00 2024-12-30 00:00:00


In [None]:
max_horizon = 7
# Keep only calls that still have max_horizon price rows after them. The row at
# index[-(max_horizon + 1)] is the last one whose max_horizon-row forward shift is
# defined; index[-max_horizon] has only max_horizon - 1 rows after it.
strux = strux[strux['date'] <= stock_df.index[-(max_horizon + 1)]]

In [None]:
# Collect the "<ticker>_Close" names that strux needs but stock_df lacks
# (the recorded run printed an empty list).
missing = []
for ticker in strux['ticker'].unique():
    col_name = f"{ticker}_Close"
    if col_name not in stock_df.columns:
        missing.append(col_name)
print("Missing tickers:", missing)

Missing tickers: []


In [None]:
# Check which strux dates are present in stock_df
# normalize() sets the time component to midnight so the comparison is date-only.
strux_dates = strux['date'].dt.normalize()
missing_dates = strux_dates[~strux_dates.isin(stock_df.index)]
print("Number of strux dates missing in stock data:", len(missing_dates))
print(missing_dates)

Number of strux dates missing in stock data: 2
6239   2021-11-06
7271   2017-10-22
Name: date, dtype: datetime64[ns]


In [None]:
# Make sure stock_df index is sorted
# next_trading_day below takes the first element at or after a date, which relies on ascending order.
stock_dates = stock_df.index.sort_values()

# First date in stock_dates that falls on or after the given date
def next_trading_day(d):
    """Return the earliest entry of ``stock_dates`` that is >= ``d``.

    Returns ``None`` when ``d`` lies after every available date.
    """
    on_or_after = stock_dates[stock_dates >= d]
    return on_or_after[0] if len(on_or_after) > 0 else None

# Apply to strux
# Calls dated on a day with no price row are moved forward to the next available date.
strux['aligned_date'] = strux['date'].apply(next_trading_day)

# Drop rows where aligned_date could not be found
# (next_trading_day returns None when no later date exists)
strux = strux.dropna(subset=['aligned_date']).reset_index(drop=True)

In [None]:
# Inspect the distinct tickers present in strux (compare with the stock tickers listed in the next cell).
print(strux['ticker'].unique())

['A' 'AAL' 'AAOI' 'AAON' 'AAPL' 'ABBV' 'ABCB' 'ABCL' 'ABEO' 'ABNB' 'ABOS'
 'ABSI' 'ABT' 'ABUS' 'ACAD' 'ACB' 'ACCD' 'ACGL' 'ACHC' 'ACIC' 'ACIW'
 'ACLS' 'ACMR' 'ACN' 'ACOR' 'ACTG' 'ACVA' 'ADAP' 'ADBE' 'ADEA' 'ADI' 'ADM'
 'ADMA' 'ADN' 'ADP' 'ADPT' 'ADSK' 'ADUS' 'ADV' 'ADVM' 'AEE' 'AEIS' 'AEP'
 'AES' 'AEYE' 'AFCG' 'AFIB' 'AFL' 'AFMD' 'AFRM' 'AFYA' 'AGEN' 'AGFY'
 'AGIO' 'AGNC' 'AGRX' 'AGYS' 'AHCO' 'AIG' 'AIRG' 'AIZ' 'AJG' 'AKAM' 'AKBA'
 'AKTS' 'AKYA' 'ALB' 'ALDX' 'ALGM' 'ALGN' 'ALGT' 'ALHC' 'ALIM' 'ALKS'
 'ALKT' 'ALL' 'ALLE' 'ALLO' 'ALLT' 'ALNT' 'ALNY' 'ALPN' 'ALRM' 'ALRS'
 'ALT' 'ALTO' 'ALTR' 'AMAL' 'AMAT' 'AMBA' 'AMCR' 'AMCX' 'AMD' 'AME' 'AMED'
 'AMGN' 'AMKR' 'AMP' 'AMPH' 'AMPL' 'AMRN' 'AMSC' 'AMSF' 'AMSWA' 'AMT'
 'AMWD' 'AMZN' 'ANDE' 'ANET' 'ANGI' 'ANGO' 'ANIK' 'ANIP' 'ANSS' 'AON'
 'AOS' 'AOUT' 'APA' 'APD' 'APEI' 'APH' 'API' 'APOG' 'APP' 'APPN' 'APPS'
 'APTO' 'APTV' 'APYX' 'AQMS' 'AQST' 'ARAY' 'ARBK' 'ARCB' 'ARCC' 'ARCT'
 'ARE' 'ARGX' 'ARLP' 'ARM' 'ARRY' 'ARWR' 'ASML' 'ASO' 'ASPS' 'ASRT'

In [None]:
# Ticker = the part of each "*_Close" column name before the first underscore.
# NOTE(review): a ticker that itself contains '_' would be cut short by split('_')[0] — confirm none exist.
stock_tickers = [col.split('_')[0] for col in stock_df.columns if col.endswith('_Close')]
print(stock_tickers)

['AEP', 'AIG', 'APOG', 'AVT', 'AVY', 'AXP', 'BA', 'BAC', 'BALL', 'BDX', 'BK', 'BMY', 'CAT', 'CL', 'CLX', 'CMI', 'CMS', 'CNP', 'CPB', 'CVS', 'CVX', 'DD', 'DE', 'DIS', 'DTE', 'ECL', 'EIX', 'EMR', 'ES', 'ETN', 'ETR', 'EVRG', 'EXC', 'F', 'GD', 'GE', 'GWW', 'HAL', 'HES', 'HON', 'HPQ', 'HUBB', 'IBM', 'IP', 'ITW', 'JNJ', 'K', 'KO', 'KR', 'LLY', 'LMT', 'MCD', 'MDT', 'MKC', 'MMC', 'MMM', 'MO', 'MRK', 'MSI', 'NEE', 'NI', 'PCG', 'PEP', 'PFE', 'PG', 'PNR', 'PNW', 'RTX', 'RVTY', 'SNA', 'SPGI', 'SYY', 'TGT', 'TXN', 'TXT', 'USB', 'VMC', 'WBA', 'WFC', 'WHR', 'WMT', 'WRB', 'WY', 'XEL', 'XOM']


In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

In [None]:
import pandas as pd

# STRUX processed data (CSV)
# NOTE: this reassigns strux from disk, so columns added to the earlier in-memory frame
# (sentiment, returns, ...) are not present afterwards.
strux = pd.read_csv("/content/drive/MyDrive/DATA7001/strux_processed.csv", parse_dates=['date'])

# Stock data (Pickle)
# NOTE(review): unpickling can execute code embedded in the file — only load pickles from a trusted source.
stock_df = pd.read_pickle("/content/drive/MyDrive/DATA7001/stock_processed.pkl")
stock_df.sort_index(inplace=True)

In [None]:
# Example for CEOs
# Per-call CEO score from prepared remarks. Cells that are not dicts, or that lack this
# exact role title, get 0.
strux['CEO_sentiment'] = strux['prepared_sentiment'].apply(
    lambda d: d.get('President and Chief Executive Officer', 0) if isinstance(d, dict) else 0
)
ceo_scores = pd.to_numeric(strux['CEO_sentiment'], errors='coerce')
for horizon in [1, 3, 7]:
    # pearsonr needs numeric arrays without missing values; the return columns can be
    # object dtype holding None, which is what made the original call fail.
    fwd_returns = pd.to_numeric(strux[f'return_{horizon}d'], errors='coerce')
    valid = ceo_scores.notna() & fwd_returns.notna()
    if valid.sum() < 2:
        # a correlation is undefined for fewer than two paired observations
        print(f"CEO sentiment vs {horizon}-day return: not enough data")
        continue
    corr, pval = pearsonr(ceo_scores[valid], fwd_returns[valid])
    print(f"CEO sentiment vs {horizon}-day return: corr={corr:.3f}, p={pval:.4e}")

AttributeError: 'numpy.dtypes.ObjectDType' object has no attribute 'dtype'

In [None]:
import statsmodels.api as sm

# Regress the 1-day return on overall call sentiment (OLS with an intercept).
X = strux[['overall_call_sentiment']]  # add other controls if available
y = strux['return_1d']

X = sm.add_constant(X)  # adds the intercept column
# NOTE(review): `model` is also the name used for the Hugging Face model elsewhere in this
# notebook, so this assignment rebinds it — confirm no later cell relies on the old binding.
# NOTE(review): presumably y must be free of missing values for the fit to succeed — confirm.
model = sm.OLS(y, X).fit()
print(model.summary())

In [None]:
# Convert to numeric, coercing any non-numeric entries to NaN
# NOTE(review): strux_sentiment and stock_return are not defined in any visible cell; the
# recorded run stopped here with NameError: name 'strux_sentiment' is not defined.
strux_sentiment_numeric = pd.to_numeric(strux_sentiment, errors='coerce')
stock_return_numeric = pd.to_numeric(stock_return, errors='coerce')

# Drop any NaNs to avoid errors in correlation
# (a row is kept only when both series have a value at that position)
valid_idx = strux_sentiment_numeric.notna() & stock_return_numeric.notna()
strux_sentiment_numeric = strux_sentiment_numeric[valid_idx]
stock_return_numeric = stock_return_numeric[valid_idx]

# Compute Pearson correlation
from scipy.stats import pearsonr
corr, pval = pearsonr(strux_sentiment_numeric, stock_return_numeric)
print("Pearson correlation:", corr, "p-value:", pval)

NameError: name 'strux_sentiment' is not defined

In [None]:
from transformers import pipeline

# Use a small, fast sentiment model (Colab GPU-friendly)
# This is a general-purpose SST-2 model, not the finance-specific FinBERT used elsewhere in the notebook.
classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

def compute_sentiment(text):
    """Return +1 / -1 / 0 for a piece of text using ``classifier``.

    Empty or non-string input gives 0. Otherwise the first 512 *characters*
    of ``text`` are classified and the result is 1 when the top label is
    ``'POSITIVE'`` and -1 for any other label.
    """
    if not (text and isinstance(text, str)):
        return 0
    top = classifier(text[:512])[0]  # character slice, not a token-level truncation
    if top['label'] == 'POSITIVE':
        return 1
    return -1

# Example: aggregate all prepared remarks into one string per date
# astype(str) takes the string form of each cell's value.
# NOTE(review): the recorded run raised KeyError: 'prepared_by_role' here — the column is
# absent from the freshly reloaded CSV.
strux['full_text'] = strux['prepared_by_role'].astype(str)  # ensure string

# Compute sentiment score per row
strux['strux_sentiment'] = strux['full_text'].apply(compute_sentiment)

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cuda:0


KeyError: 'prepared_by_role'

In [None]:
# Reload the CSV and list its columns to see which fields actually exist on disk.
strux = pd.read_csv("/content/drive/MyDrive/DATA7001/strux_processed.csv", parse_dates=['date'])
print(strux.columns)

Index(['ticker', 'date', 'participants', 'prepared_remarks',
       'questions_and_answers', 'executives', 'operator_indices', 'questions',
       'answers'],
      dtype='object')


In [None]:
# Make sure it's string
# astype(str) keeps each cell's full string form.
# NOTE(review): if these cells hold serialized lists of dicts, the text fed to the model
# presumably includes keys and punctuation as well as the speech — confirm.
strux['prepared_remarks_text'] = strux['prepared_remarks'].astype(str)
strux['qa_text'] = strux['questions_and_answers'].astype(str)

In [None]:
!pip install transformers --quiet

from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Load FinBERT (finance-specific sentiment)
# Rebinds the notebook-level `tokenizer` and `model` names.
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone", use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")

# No device argument is given here; the recorded run reports the pipeline on cuda:0.
finbert = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

Device set to use cuda:0


In [None]:
import numpy as np

def finbert_batch_sentiment(texts, batch_size=16):
    """Classify ``texts`` with the ``finbert`` pipeline, ``batch_size`` items per call.

    Parameters
    ----------
    texts : pandas.Series, list, or other iterable of str
        Texts to classify, processed in their iteration order.
    batch_size : int
        Number of texts handed to the pipeline per call (default 16).

    Returns
    -------
    list of dict
        The pipeline's result for each text, in input order.
    """
    items = list(texts)  # materialise once so Series and plain lists are sliced the same way
    results = []
    for i in range(0, len(items), batch_size):
        batch = items[i:i+batch_size]
        # Request truncation explicitly: without it, inputs longer than the model's
        # 512-token limit raise a tensor-size RuntimeError.
        batch_results = finbert(batch, truncation=True, max_length=512)
        results.extend(batch_results)
    return results

# Example:
# NOTE(review): the recorded run of this cell ended in a RuntimeError (tensor size 3097 vs 512),
# i.e. at least one text exceeded the model's 512-token input.
prepared_texts = strux['prepared_remarks_text']
sentiment_results = finbert_batch_sentiment(prepared_texts, batch_size=8)

# Add to DataFrame
# Here prepared_sentiment holds a label string per row, not a {role: score} dict as in earlier cells.
strux['prepared_sentiment'] = [res['label'] for res in sentiment_results]
strux['prepared_sentiment_score'] = [res['score'] for res in sentiment_results]

RuntimeError: The size of tensor a (3097) must match the size of tensor b (512) at non-singleton dimension 1

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Same FinBERT load as in an earlier cell, repeated so this cell can be run on its own.
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone", use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")
finbert = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

def chunk_text(text, max_tokens=510):
    """Split ``text`` into pieces of at most ``max_tokens`` token ids each.

    The text is encoded with ``tokenizer`` (no truncation), the id sequence is
    cut into consecutive windows, and each window is decoded back to a string
    with special tokens skipped.

    Note: this redefinition replaces the word-count ``chunk_text(text,
    chunk_size=400)`` defined earlier in the notebook.
    """
    token_ids = tokenizer.encode(text, truncation=False)
    pieces = []
    for start in range(0, len(token_ids), max_tokens):
        window = token_ids[start:start + max_tokens]
        pieces.append(tokenizer.decode(window, skip_special_tokens=True))
    return pieces

Device set to use cuda:0


In [None]:
def chunk_text_safe(text, max_model_tokens=512):
    """Split ``text`` into decoded pieces of at most ``max_model_tokens - 2`` token ids.

    Two positions are held back for the [CLS] and [SEP] tokens the model adds.
    The text is encoded with ``tokenizer`` without truncation, windowed, and
    each window is decoded with special tokens skipped.
    """
    window_size = max_model_tokens - 2  # reserve [CLS] and [SEP]
    token_ids = tokenizer.encode(text, truncation=False)
    pieces = []
    for start in range(0, len(token_ids), window_size):
        window = token_ids[start:start + window_size]
        pieces.append(tokenizer.decode(window, skip_special_tokens=True))
    return pieces

In [None]:
def finbert_sentiment_safe(texts, batch_size=8):
    """Classify each text chunk-by-chunk and reduce to one result per text.

    Parameters
    ----------
    texts : iterable of str
        Texts to classify; each is split with ``chunk_text_safe``.
    batch_size : int
        Number of chunks handed to the ``finbert`` pipeline per call.

    Returns
    -------
    list of dict
        One ``{'label': ..., 'score': ...}`` per input text, where ``label`` is
        the most frequent chunk label and ``score`` is the mean of the chunk
        scores (each chunk's score belongs to that chunk's own top label).
        A text that produces no chunks yields ``{'label': None, 'score': nan}``
        instead of raising ZeroDivisionError.
    """
    from collections import Counter  # imported once, not once per text

    all_results = []
    for text in texts:
        chunks = chunk_text_safe(text)
        chunk_results = []
        for i in range(0, len(chunks), batch_size):
            batch = chunks[i:i+batch_size]
            # Truncation is a safety net: a decoded chunk may re-tokenize to slightly
            # more tokens than the window it was cut from.
            chunk_results.extend(finbert(batch, truncation=True, max_length=512))
        if not chunk_results:
            all_results.append({'label': None, 'score': float('nan')})
            continue
        # Average the scores over chunks
        avg_score = sum(res['score'] for res in chunk_results) / len(chunk_results)
        # Majority vote for label
        labels = [res['label'] for res in chunk_results]
        majority_label = Counter(labels).most_common(1)[0][0]
        all_results.append({'label': majority_label, 'score': avg_score})
    return all_results

In [None]:
def chunk_text_strict(text, max_tokens=512):
    """Truncate ``text`` to ``max_tokens`` tokens and return it as a one-element list.

    Despite the name, no splitting happens: the tokenizer is called with
    truncation on, so anything beyond the limit is dropped, and the kept ids
    are decoded back to a string with special tokens skipped.
    """
    encoded = tokenizer(
        text,
        max_length=max_tokens,
        truncation=True,
        return_tensors=None
    )
    truncated_text = tokenizer.decode(encoded['input_ids'], skip_special_tokens=True)
    return [truncated_text]

In [None]:
import torch

def finbert_sentiment_safe2(texts, batch_size=8, device='cuda'):
    """Classify each text via ``chunk_text_strict`` + ``finbert`` and reduce per text.

    For every text the chunks are sent to the pipeline ``batch_size`` at a
    time under ``torch.no_grad()``. The result per text is the most frequent
    chunk label and the mean of the chunk scores. ``device`` is accepted but
    not used by this function.
    """
    from collections import Counter

    def _score_one(text):
        pieces = chunk_text_strict(text)
        outputs = []
        for start in range(0, len(pieces), batch_size):
            with torch.no_grad():
                outputs.extend(finbert(pieces[start:start + batch_size]))
        # Aggregate: mean score, majority label
        mean_score = sum([o['score'] for o in outputs]) / len(outputs)
        top_label = Counter([o['label'] for o in outputs]).most_common(1)[0][0]
        return {'label': top_label, 'score': mean_score}

    return [_score_one(text) for text in texts]

In [None]:
# Score every call's prepared-remarks text, one text at a time.
# NOTE(review): the recorded run was interrupted (KeyboardInterrupt) after the pipeline warned
# about sequential GPU use — this path is presumably too slow for the full table.
prepared_texts = strux['prepared_remarks_text']
sentiment_results = finbert_sentiment_safe2(prepared_texts)

strux['prepared_sentiment'] = [res['label'] for res in sentiment_results]
strux['prepared_sentiment_score'] = [res['score'] for res in sentiment_results]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


KeyboardInterrupt: 

In [None]:
# Pick the pipeline device at runtime: first CUDA device when available, otherwise CPU (-1).
device = 0 if torch.cuda.is_available() else -1  # GPU if available
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")
finbert = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=device)

Device set to use cuda:0


In [None]:
from datasets import Dataset
# For prepared remarks
# fillna("") replaces missing cells with an empty string so every list entry is a str.
prepared_texts = strux['prepared_remarks'].fillna("").tolist()

# For answers
answers_texts = strux['answers'].fillna("").tolist()  # use 'answers' column

In [None]:
# Convert to Hugging Face dataset
# Each dataset has a single "text" column, in the same row order as strux.
ds_prepared = Dataset.from_dict({"text": prepared_texts})
ds_answers = Dataset.from_dict({"text": answers_texts})

In [None]:
def batch_sentiment(batch):
    """Classify one batch of texts with ``finbert`` and attach the results.

    Parameters
    ----------
    batch : dict
        Must contain ``batch['text']``, a list of strings.

    Returns
    -------
    dict
        The same ``batch`` object with two added lists aligned to ``text``:
        ``'label'`` (top label per text) and ``'score'`` (its confidence).
    """
    # The pipeline does not truncate unless asked; without this, a text longer than
    # the model's 512-token limit raises a tensor-size RuntimeError.
    results = finbert(batch['text'], truncation=True, max_length=512)
    batch['label'] = [r['label'] for r in results]
    batch['score'] = [r['score'] for r in results]
    return batch

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from datasets import Dataset

# Load FinBERT tone and build the pipeline on CUDA device 0.
# NOTE(review): device=0 is hard-coded — presumably this fails on a CPU-only runtime; confirm.
model_name = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
finbert = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=0)

# Safe batched function
def batch_sentiment_safe(batch):
    """Classify one batch of texts with ``finbert``, truncating each to 512 tokens.

    Parameters
    ----------
    batch : dict
        Must contain ``batch["text"]``, a list of strings.

    Returns
    -------
    dict
        ``{"sentiment": [...], "score": [...]}`` — the top label and its
        confidence for each text, in input order.
    """
    texts = list(batch["text"])
    if not texts:
        return {"sentiment": [], "score": []}
    # One pipeline call for the whole batch instead of one call per text, so the
    # surrounding batched map actually hands the model a batch.
    outputs = finbert(texts, truncation=True, max_length=512)
    return {
        "sentiment": [out["label"] for out in outputs],
        "score": [out["score"] for out in outputs],
    }

# Convert DataFrame columns to HuggingFace Dataset
# Missing cells become "" so every entry is a string.
ds_prepared = Dataset.from_dict({"text": strux['prepared_remarks'].fillna("").tolist()})
ds_answers = Dataset.from_dict({"text": strux['answers'].fillna("").tolist()})

# Apply batched map
# batched=True hands the function a dict of lists holding up to batch_size rows at a time.
ds_prepared = ds_prepared.map(batch_sentiment_safe, batched=True, batch_size=8)
ds_answers = ds_answers.map(batch_sentiment_safe, batched=True, batch_size=8)

# Add back to DataFrame
# Relies on the datasets keeping the same row order as strux.
strux['prepared_sentiment'] = ds_prepared['sentiment']
strux['answers_sentiment'] = ds_answers['sentiment']

Device set to use cuda:0


Map:   0%|          | 0/11411 [00:00<?, ? examples/s]

In [None]:
# Copy label and score back from the mapped datasets.
# NOTE(review): these reads need 'label' and 'score' columns on the datasets (the ones
# batch_sentiment writes) — confirm which map function produced ds_prepared / ds_answers.
strux['prepared_sentiment'] = ds_prepared['label']
strux['prepared_sentiment_score'] = ds_prepared['score']

strux['answers_sentiment'] = ds_answers['label']
strux['answers_sentiment_score'] = ds_answers['score']