340w midterm project

In [None]:
# pyFinSentiment + Fintrust + Finbert; Import necessary modules
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from pyfin_sentiment.model import SentimentModel
import yfinance as yf

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
########################    pyFinSentiment  ########################

# pyfin-sentiment classifier, "small" variant.
model = SentimentModel("small")

# Cleaned Reddit posts; the "content" and "title" columns are scored below.
df = pd.read_csv("../data/processed_data/reddit_finance_posts_cleaned.csv")

# Numeric class codes (after int()) -> descriptive labels.
sentiment_map = {1: "Bullish", 2: "Neutral", 3: "Bearish"}

# Score each text column and store the labels in "<column>Sentiment".
for text_column in ("content", "title"):
    texts = df[text_column].to_numpy(str)  # numpy array of str, as passed to predict()
    predicted_codes = model.predict(texts)
    df[f"{text_column}Sentiment"] = [sentiment_map[int(code)] for code in predicted_codes]

df.to_csv("../data/processed_data/pyFin_Predictions.csv", index=False)
print("Sentiment analysis completed and saved with descriptive labels.")

########################################################################

Sentiment analysis completed and saved with descriptive labels.


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


This cell uses the pre-trained pyfin-sentiment model to run sentiment analysis on the content and title of each Reddit finance post. It maps each predicted class to one of three labels ("Bullish", "Neutral", "Bearish") and adds the resulting sentiment columns to the DataFrame, which is then saved to CSV.

In [None]:
########################    Fintrust    ########################

# Input data: the cleaned Reddit finance posts.
df = pd.read_csv("../data/processed_data/reddit_finance_posts_cleaned.csv")

# pyfin-sentiment classifier ("small" variant); read by get_pyfin_sentiment.
pyFin = SentimentModel("small")

# FinBERT classifier and its tokenizer; read by get_finbert_sentiment.
finbert_model_name = "yiyanghkust/finbert-tone"
finbert_model = AutoModelForSequenceClassification.from_pretrained(finbert_model_name)
finbert_tokenizer = AutoTokenizer.from_pretrained(finbert_model_name)

import re  # local to this cell: `re` is not imported at the top of the file

# Negation map from parent paper
negation_map = {
    " more ": " less ", " less ": " more ", " positive ": " negative ", " increase ": " decrease ",
    " yes ": " no ", " no ": " yes ", " unable ": " able ", " able ": " unable ",
    " decrease ": " increase ", " sales ": " buy ", " sale ": " buy ", " buy ": " sale ",
    " best ": " worst ", " worst ": " best ", " larger ": " smaller ", " smaller ": " larger ",
    " large ": " small ", " small ": " large ", " good ": " bad ", " bad ": " good ",
    " high ": " low ", " low ": " high ", " down ": " up ", " up ": " down ",
    " dislike ": " like ", " like ": " dislike ", " right ": " wrong ", " wrong ": " right ",
    " a lot of ": " few ", " many ": " few ", " few ": " many ", " little ": " much ",
    " much ": " little ", " disbelieve ": " believe ", " believe ": " disbelieve ",
    " better ": " worse ", " worse ": " better ", " revenue ": " expense ", " expense ": " revenue ",
    " abandon ": " remain ", " remain ": " abandon ", " continuing ": " stopping ", " stopping ": " continuing ",
    " continue ": " stop ", " stop ": " continue ", " approve ": " refuse ", " refuse ": " approve ",
    " grew ": " decayed ", " decayed ": " grew ", " decay ": " grow ", " growth ": " decay ",
    " grow ": " decay ", " improvement ": " degeneration ", " degeneration ": " improvement ",
    " improve ": " degenerate ", " degenerate ": " improve ", " focus ": " ignore ", " ignore ": " focus ",
    " major ": " minor ", " minor ": " major ", " strong ": " weak ", " weak ": " strong ",
    " full ": " empty ", " empty ": " full ", " start ": " end ", " end ": " start ",
    " progress ": " decline ", " decline ": " progress ", " earnings ": " cost ", " cost ": " earnings ",
    " well ": " badly ", " badly ": " well ", " expect ": " dismiss ", " dismiss ": " expect ",
    " over ": " below ", " below ": " over ", " back ": " forward ", " forward ": " back ",
    " margin ": " loss ", " profit ": " loss ", " benefits ": " loss ", " income ": " loss ",
    " loss ": " profit ", " benefit ": " harm ", " harm ": " benefit ", " slightly ": " completely ",
    " completely ": " slightly ", " most ": " least ", " least ": " most ",
    " add ": " decrease ", " change ": " unchange ", " opportunities ": " changes ", " opportunity ": " change ",
    " within ": " without ", " without ": " with ", " with ": " without "
}  # Word pairs mapping positive to negative and vice versa

# One alternation over every key, longest first so multi-word phrases win.
# The space on each side of a key becomes a lookaround instead of being
# consumed, so two neighbouring words that share a space are both replaced.
_NEGATION_PATTERN = re.compile(
    r"(?<= )(?:"
    + "|".join(re.escape(key.strip()) for key in sorted(negation_map, key=len, reverse=True))
    + r")(?= )"
)


def negate_text(text):
    """Return *text* with every space-delimited word from negation_map swapped
    for its opposite.

    The substitution is done in a single pass. Applying the map entry by entry
    with str.replace undoes itself: " more " becomes " less " and the next
    entry immediately turns it back into " more ".

    Matching is case-sensitive and only fires on words that have a space on
    both sides, exactly like the keys of negation_map, so the first and last
    word of the text and words touching punctuation are left untouched.
    """
    return _NEGATION_PATTERN.sub(
        lambda match: negation_map[f" {match.group(0)} "].strip(),
        text,
    )

def reorder_text(text):
    """Reverse the order of the ", "-separated segments of *text*.

    Text that does not contain ", " is returned unchanged.
    """
    separator = ", "
    if separator not in text:
        return text
    segments = text.split(separator)
    return separator.join(segments[::-1])

def concatenate_text(text1, text2):
    """Return *text1* followed by a single space and *text2*."""
    return " ".join((text1, text2))

def transitive_text(text, ticker, top_company):
    """Return *text* with every stand-alone occurrence of *ticker* replaced by
    *top_company*.

    A missing ticker (empty string, NaN or any other non-string) leaves the
    text unchanged; str.replace("", x) would otherwise insert *top_company*
    between every character. Occurrences embedded in a longer alphanumeric
    token are not touched, so the ticker "IT" does not rewrite "BITCOIN".
    """
    import re  # function-scope: `re` is not imported at the top of the file

    if not isinstance(ticker, str) or not ticker:
        return text

    pattern = rf"(?<![A-Za-z0-9]){re.escape(ticker)}(?![A-Za-z0-9])"
    # A callable replacement inserts top_company literally (no backslash escapes).
    return re.sub(pattern, lambda _match: top_company, text)

def get_finbert_sentiment(text):
    """Classify *text* with the module-level FinBERT model and return
    "Bullish", "Neutral" or "Bearish".

    The text is tokenized with finbert_tokenizer (truncated to 512 tokens) and
    the arg-max class of finbert_model's logits is mapped to a label.
    """
    # Class indices of yiyanghkust/finbert-tone per its model card:
    # 0 = neutral, 1 = positive, 2 = negative.
    sentiment_labels = {0: "Neutral", 1: "Bullish", 2: "Bearish"}
    inputs = finbert_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = finbert_model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=-1).item()
    return sentiment_labels[prediction]

def get_pyfin_sentiment(text):
    """Classify a single *text* with the module-level pyFin model.

    Returns "Bullish", "Neutral" or "Bearish", or "Unknown" when the predicted
    class code is not one of 1, 2 or 3.
    """
    code_to_label = {1: "Bullish", 2: "Neutral", 3: "Bearish"}  # Model specific encoding
    raw_code = pyFin.predict([text])[0]  # predict() takes a batch; use its only entry
    class_code = int(raw_code)
    if class_code in code_to_label:
        return code_to_label[class_code]
    return "Unknown"


def get_top_company(ticker):
    """Return the hard-coded "top company" for the sector of the first ticker.

    *ticker* is the raw value of the "tickers" column. The first symbol is
    extracted from it, its sector is looked up through yfinance, and the sector
    is mapped to a company name. 'Tesla Inc.' is returned when the ticker is
    missing, the lookup fails, or the sector is not in the table.
    """
    default_company = 'Tesla Inc.'

    # Missing value (NaN / None) or anything that is not text: nothing to look up.
    if not isinstance(ticker, str):
        return default_company

    # Accept "['AAPL', 'TSLA']", "AAPL, TSLA" and space-separated "DD GME IT"
    # alike: brackets, quotes and commas all become separators.
    symbols = ticker.translate(str.maketrans("[],'\"", "     ")).split()
    if not symbols:
        return default_company

    first_ticker = symbols[0]
    # NOTE(review): 'RE' is remapped to TSLA, presumably because it is a
    # false-positive ticker match; confirm with the data-cleaning step.
    if first_ticker == 'RE':
        first_ticker = 'TSLA'

    try:
        sector = yf.Ticker(first_ticker).info.get('sector')
    except Exception as exc:
        # Best-effort enrichment: an unknown symbol or a dropped connection
        # must not abort the whole consistency run, so fall back to the default.
        print(f"Sector lookup failed for {first_ticker!r}: {exc}")
        return default_company

    # Top companies by sector
    sector_top_companies = {
        'Consumer Discretionary': 'Amazon.com Inc',
        'Consumer Staples': 'Walmart',
        'Energy': 'Exxon Mobil Corp.',
        'Financials': 'JPMorgan Chase & Co.',
        'Health Care': 'Johnson & Johnson',
        'Industrials': 'Boeing Company',
        'Information Technology': 'Apple Inc.',
        'Materials': 'DowDuPont',
        'Real Estate': 'American Tower Corp A',
        'Telecommunication Services': 'AT&T Inc',
        'Utilities': 'NextEra Energy',
        # Yahoo Finance sector names for the same sectors; yfinance is
        # understood to report these rather than the GICS names above.
        # TODO confirm against live `Ticker.info` output.
        'Consumer Cyclical': 'Amazon.com Inc',
        'Consumer Defensive': 'Walmart',
        'Financial Services': 'JPMorgan Chase & Co.',
        'Healthcare': 'Johnson & Johnson',
        'Technology': 'Apple Inc.',
        'Basic Materials': 'DowDuPont',
        'Communication Services': 'AT&T Inc',
    }
    return sector_top_companies.get(sector, default_company)


def add_consistency_checks(df):
    """Score every post and four perturbed variants of it with FinBERT and PyFin.

    For each row of *df* (columns 'id', 'title', 'content' and optionally
    'tickers') the title and content are scored as-is and after four
    perturbations: negation, symmetric (segment reordering), additive
    (appending an optimistic sentence) and transitive (ticker replaced by the
    sector's top company).

    Returns a new DataFrame with one row per post: 'id', 'original_title',
    'original_content', 'ticker', then '<Model>_sentiment_<field>' and
    '<Model>_<check>_<field>' columns for Model in {FinBERT, PyFin} and field
    in {title, content}.
    """
    additive_suffix = "We expect future growth."
    scorers = (("FinBERT", get_finbert_sentiment), ("PyFin", get_pyfin_sentiment))
    results = []

    for _, row in df.iterrows():
        title = row['title']
        content = row['content']
        ticker = row.get("tickers", "AAPL")
        top_company = get_top_company(ticker)

        # The column holds a list of symbols ("DD GME IT", "['AAPL', 'TSLA']").
        # Substituting that whole string never matches the post text, so the
        # transitive variant uses the first symbol, the same one whose sector
        # selects top_company.
        if isinstance(ticker, str):
            symbols = ticker.translate(str.maketrans("[],'\"", "     ")).split()
        else:
            symbols = []
        primary_symbol = symbols[0] if symbols else ""

        originals = {"title": title, "content": content}
        variants = {
            field: {
                "negation": negate_text(text),
                "symmetric": reorder_text(text),
                "additive": concatenate_text(text, additive_suffix),
                # No symbol -> keep the text as-is rather than replacing "".
                "transitive": transitive_text(text, primary_symbol, top_company) if primary_symbol else text,
            }
            for field, text in originals.items()
        }

        record = {
            "id": row['id'],
            "original_title": title,
            "original_content": content,
            "ticker": ticker,
        }
        # Baseline predictions: FinBERT/PyFin for the title, then for the content.
        for field, text in originals.items():
            for model_name, scorer in scorers:
                record[f"{model_name}_sentiment_{field}"] = scorer(text)
        # Perturbed predictions: all FinBERT columns first, then all PyFin columns.
        for model_name, scorer in scorers:
            for field, field_variants in variants.items():
                for check, variant_text in field_variants.items():
                    record[f"{model_name}_{check}_{field}"] = scorer(variant_text)

        results.append(record)

    return pd.DataFrame(results)

# Score every post and its perturbed variants, then persist the table.
consistency_csv_path = "../data/processed_data/FinBERT_PyFin_Consistency.csv"
df_consistency = add_consistency_checks(df)
df_consistency.to_csv(path_or_buf=consistency_csv_path, index=False)

########################################################################


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
404 Client Error: Not Found for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/DD%20GME%20IT?modules=financialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&corsDomain=finance.yahoo.com&formatted=false&symbol=DD+GME+IT&crumb=nwUivxpf0xE
404 Client Error: Not Found for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/AMD%20AMZN%20FSLR%20TSLA?modules=financialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&corsDomain=finance.yahoo.com&formatted=false&symbol=AMD+AMZN+FSLR+TSLA&crumb=nwUivxpf0xE
404 Client Error: Not Found for url: https://query2.financ

ChunkedEncodingError: Response ended prematurely

In [None]:
########################    Finbert    ########################

# FinBERT tokenizer and classifier used by batch_sentiment_analysis.
model_name = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Cleaned Reddit posts; the "content" and "title" columns are scored below.
df = pd.read_csv("../data/processed_data/reddit_finance_posts_cleaned.csv")

# Class indices of yiyanghkust/finbert-tone per its model card:
# 0 = neutral, 1 = positive, 2 = negative.
sentiment_labels = {0: "Neutral", 1: "Bullish", 2: "Bearish"}

def batch_sentiment_analysis(texts, batch_size=32):
    """Return one sentiment label per entry of *texts*, in order.

    The texts are run through the module-level tokenizer and model in chunks
    of *batch_size*, so memory use is bounded by the batch and not by the
    dataset, and each chunk is padded only to its own longest text. The
    arg-max class of each row of logits is mapped through sentiment_labels.

    Raises:
        ValueError: if *batch_size* is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    texts = list(texts)  # the tokenizer is given plain lists of str
    sentiments = []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        inputs = tokenizer(chunk, return_tensors="pt", truncation=True, padding=True, max_length=512)
        with torch.no_grad():  # Disable gradient calculation to speed up inference
            outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=1)  # index of the most probable class per text
        sentiments.extend(sentiment_labels[index] for index in predictions.tolist())
    return sentiments

# Score the post bodies first, then the titles; results go to "<column>Sentiment".
for text_column in ("content", "title"):
    df[f"{text_column}Sentiment"] = batch_sentiment_analysis(df[text_column].tolist())

df.to_csv("../data/processed_data/FinBERT_Predictions.csv", index=False)
print("Sentiment analysis completed and saved.")

########################################################################

Sentiment analysis completed and saved.


This cell uses the pre-trained FinBERT model to run sentiment analysis on the post content and titles. It applies batch processing to speed up prediction and maps the model outputs to the sentiment labels "Bearish", "Neutral" and "Bullish", so that the results can be compared with those of pyFinSentiment.

In [None]:
#findmatches.py
# Load the per-post FinBERT/PyFin sentiment table written by the consistency-check cell.
df = pd.read_csv("../data/processed_data/FinBERT_PyFin_Consistency.csv")
# Defining column names based on model, sentiment type, and context type
def get_conditions(row, model, sentiment_type, context_type):
    """Return the four consistency flags for one row, model and text field.

    Args:
        row: mapping (DataFrame row or dict) holding the prediction columns.
        model: column prefix, e.g. "PyFin" or "FinBERT".
        sentiment_type: middle part of the baseline column name, e.g. "sentiment".
        context_type: "title" or "content".

    Returns:
        dict with boolean values under the keys "additive_match",
        "transitive_match", "symmetric_match" and "negation_match".
        The first three are True when the perturbed prediction equals the
        baseline. "negation_match" is True when negating the text changed the
        prediction, or when both predictions are "Neutral".
    """
    baseline = row[f"{model}_{sentiment_type}_{context_type}"]
    negated = row[f"{model}_negation_{context_type}"]

    # These perturbations are expected to leave the prediction unchanged.
    conditions = {
        f"{check}_match": bool(baseline == row[f"{model}_{check}_{context_type}"])
        for check in ("additive", "transitive", "symmetric")
    }

    # Requiring "both Neutral" AND "different" at the same time can never hold,
    # so the two cases are alternatives: a neutral text stays neutral, anything
    # else must change under negation.
    both_neutral = baseline == "Neutral" and negated == "Neutral"
    conditions["negation_match"] = bool(both_neutral or baseline != negated)

    return conditions


# Models and text fields for which a "<model>_<field>_conditions" column is built.
models = ["PyFin", "FinBERT"]
content_types = ["title", "content"]
list_Columns = []

for model in models:
    for content_type in content_types:
        conditions_col = f"{model}_{content_type}_conditions"
        # One dict of "*_match" flags per row ...
        match_flags = df.apply(get_conditions, axis=1, args=(model, "sentiment", content_type))
        # ... stored inverted, so True marks a check that did NOT match.
        df[conditions_col] = match_flags.apply(
            lambda flags: {name: not flag for name, flag in flags.items()}
        )
        list_Columns.append(conditions_col)

# Keep only the identifying columns plus the four conditions columns.
flattened_columns = ["id", "original_content", "original_title"] + list_Columns
df = df[flattened_columns]

# NOTE(review): this cell reads from "../data/..." but writes to "data/...";
# confirm which working directory is intended.
df.to_csv("data/processed_data/Conditions.csv", index=False)

After the sentiment analysis and consistency checks, this cell computes the consistency conditions for both the PyFin and FinBERT models on post titles and content, and then negates (inverts) the results, so that each flag marks a check that failed rather than one that passed.

In [None]:
# finddisagreements.py
# Load the inverted consistency conditions (one dict-as-text column per model and field).
df = pd.read_csv("data/processed_data/Conditions.csv")

models = ["PyFin", "FinBERT"]
content_types = ["title", "content"]
reference_model = "FinBERT"

disagreement_columns = []

for model in models:
    # Comparing FinBERT with itself is always False and would only add
    # constant columns, so only the other models are compared against it.
    if model == reference_model:
        continue

    for content_type in content_types:
        model_condition_col = f"{model}_{content_type}_conditions"
        reference_condition_col = f"{reference_model}_{content_type}_conditions"

        disagreement_col = f"{model}_vs_FinBERT_{content_type}_disagreement"
        # True where the two models' condition dicts differ for this row.
        df[disagreement_col] = df[model_condition_col] != df[reference_condition_col]

        disagreement_columns.append(disagreement_col)

# Keep the rows with at least one disagreement.
disagreements = df[df[disagreement_columns].any(axis=1)]
disagreements.to_csv("data/processed_data/Disagreements.csv", index=False)


This cell identifies disagreements between the PyFin and FinBERT consistency conditions for Reddit post titles and content. It creates new columns that record these disagreements and keeps only the rows in which the two models disagree on at least one of them.

In [None]:
# FindVariables, latent feature extraction
import numpy as np
import re

# Load the rows on which the models' conditions disagree (written by the finddisagreements step).
df = pd.read_csv("data/processed_data/Disagreements.csv")

def count_numbers(text):
    """Return how many digit characters *text* contains.

    Each digit is counted on its own, so "2024" contributes 4.
    """
    return sum(1 for _ in re.finditer(r'\d', text))

def count_special_characters(text):
    """Return how many characters of *text* are neither ASCII letters, digits
    nor whitespace."""
    special_chars = re.findall(r'[^A-Za-z0-9\s]', text)
    return len(special_chars)

def count_words(text):
    """Return the number of whitespace-separated tokens in *text*."""
    tokens = text.split()
    return len(tokens)

def average_word_length(text):
    """Return the mean length of the whitespace-separated tokens in *text*,
    or 0 when there are none."""
    token_lengths = [len(token) for token in text.split()]
    if not token_lengths:
        return 0
    return np.mean(token_lengths)

# (column suffix, per-text function) pairs; each is applied to the title and
# the content, giving "title_<suffix>" and "content_<suffix>" columns.
feature_extractors = (
    ("length", len),
    ("number_count", count_numbers),
    ("special_char_count", count_special_characters),
    ("word_count", count_words),
    ("avg_word_length", average_word_length),
)

for feature_suffix, extractor in feature_extractors:
    for text_field in ("title", "content"):
        df[f"{text_field}_{feature_suffix}"] = df[f"original_{text_field}"].apply(extractor)

df.to_csv("data/processed_data/Enhanced_Disagreements.csv", index=False)

Several attributes are added to the dataset of posts on which the PyFin and FinBERT consistency conditions disagree: character length, numeric (digit) count, special-character count, word count and average word length, each computed for both the title and the content. These features enrich the dataset for further analysis.

In [None]:
import statsmodels.api as sm

# Disagreement rows with the text features added by the feature-extraction cell.
df= pd.read_csv("data/processed_data/Enhanced_Disagreements.csv")

# Explanatory variables: the ten text features, plus an intercept column.
feature_columns = ['title_length', 'content_length', 'title_number_count', 'content_number_count',
                   'title_special_char_count', 'content_special_char_count', 'title_word_count',
                   'content_word_count', 'title_avg_word_length', 'content_avg_word_length']
X = sm.add_constant(df[feature_columns])

# One regression per disagreement indicator column.
disagreement_columns = [col for col in df.columns if 'disagreement' in col]

for disagreement_col in disagreement_columns:
    # Fit on an explicit numeric 0/1 response rather than on booleans.
    Y = df[disagreement_col].astype(int)

    # A response with a single class (e.g. a flag that is never True) cannot
    # be fitted; report it and move on instead of aborting the loop.
    if Y.nunique() < 2:
        print(f"Skipping {disagreement_col}: response has a single class.")
        print("\n" + "=" * 80 + "\n")
        continue

    model = sm.Logit(Y, X)
    result = model.fit()

    print(f"Regression Results for {disagreement_col}:")
    print(result.summary())
    print("\n" + "=" * 80 + "\n")


Finally, logistic regression is used to explore the relationship between the text measures and the disagreements between the PyFin and FinBERT models. A logistic regression model is fitted with statsmodels for each disagreement column and its summary is printed, which makes it possible to see which features have the most influence.