In [1]:
import torch
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
import pandas as pd

In [2]:
# Embedding model: the ProsusAI/finbert checkpoint, loaded as a plain BertModel
# together with its matching tokenizer (both are used by get_embeddings)
emb_model_name = 'ProsusAI/finbert'
emb_tokenizer, emb_model = (
    BertTokenizer.from_pretrained(emb_model_name),
    BertModel.from_pretrained(emb_model_name),
)

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [3]:
# Sentiment model: the yiyanghkust/finbert-tone checkpoint, loaded with its
# sequence-classification head and matching tokenizer (both are used by get_sentiment)
sent_model_name = 'yiyanghkust/finbert-tone'
sent_tokenizer, sent_model = (
    BertTokenizer.from_pretrained(sent_model_name),
    BertForSequenceClassification.from_pretrained(sent_model_name),
)

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

In [4]:
# Read the news articles CSV from the working directory into `df`;
# every later cell reads from and adds columns to this frame.
df = pd.read_csv("news_data.csv")

In [8]:
from tqdm import tqdm

In [10]:
def get_embeddings(texts):
    """Embed a batch of texts with the notebook-level embedding model.

    The texts are tokenized with `emb_tokenizer` (padded to a common length,
    truncated to at most 512 tokens) and passed through `emb_model` with
    gradient tracking disabled.

    Args:
        texts: The batch of strings to embed.

    Returns:
        A NumPy array holding, for each text, the final-layer hidden vector at
        token position 0 (the [CLS] token).
    """
    tokenizer_options = dict(padding=True, truncation=True, return_tensors='pt', max_length=512)
    model_inputs = emb_tokenizer(texts, **tokenizer_options)
    # Inference only, so no autograd graph is needed
    with torch.no_grad():
        hidden_states = emb_model(**model_inputs).last_hidden_state
    # Position 0 along the token axis is the [CLS] token
    cls_vectors = hidden_states[:, 0, :]
    return cls_vectors.numpy()

def get_sentiment(texts):
    """Score a batch of texts with the notebook-level sentiment classifier.

    The texts are tokenized with `sent_tokenizer` (padded to a common length,
    truncated to at most 512 tokens) and passed through `sent_model` with
    gradient tracking disabled.

    Args:
        texts: The batch of strings to classify.

    Returns:
        A NumPy array of class probabilities: the softmax of the model's
        logits taken over the last axis. The meaning of each position is
        whatever label order the loaded checkpoint defines.
    """
    model_inputs = sent_tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=512)
    # Inference only, so no autograd graph is needed
    with torch.no_grad():
        logits = sent_model(**model_inputs).logits
    # Turn raw logits into probabilities before leaving torch
    return torch.nn.functional.softmax(logits, dim=-1).numpy()

def process_texts_in_batches(texts, batch_size=10):
    """Compute embeddings and sentiment scores for `texts`, one slice at a time.

    Consecutive slices of `batch_size` texts are passed to `get_embeddings`
    and `get_sentiment`, with a tqdm progress bar over the slices.

    Args:
        texts: Sequence of strings to process.
        batch_size: Number of texts per slice; the last slice may be shorter.

    Returns:
        A tuple `(all_embeddings, all_sentiments)` of two lists, each holding
        one entry per input text in the original order.
    """
    embedding_rows, sentiment_rows = [], []
    batch_starts = range(0, len(texts), batch_size)
    for start in tqdm(batch_starts):
        chunk = texts[start:start + batch_size]
        # extend() unpacks each batch result into its per-text rows
        embedding_rows.extend(get_embeddings(chunk))
        sentiment_rows.extend(get_sentiment(chunk))
    return embedding_rows, sentiment_rows

# Run both models over every article summary, three texts per batch,
# then store the per-article results as two new columns
embeddings, sentiments = process_texts_in_batches(
    df['summary'].tolist(),
    batch_size=3,
)
df['embeddings'], df['sentiment_scores'] = embeddings, sentiments

100%|██████████████████████████████████████████████████████████████████████████████| 8148/8148 [52:22<00:00,  2.59it/s]


In [11]:
# Display the frame to confirm the 'embeddings' and 'sentiment_scores' columns were added
df

Unnamed: 0,title,url,time_published,authors,summary,banner_image,source,category_within_source,source_domain,topics,overall_sentiment_score,overall_sentiment_label,ticker_sentiment,date,embeddings,sentiment_scores
0,"Apple, AMC, Meta, And How Elon Musk Is Reactin...",https://www.benzinga.com/news/large-cap/22/12/...,20221231T190310,['Michael Cohen'],Benzinga examined the prospects for many inves...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,Trading,www.benzinga.com,"[{'topic': 'Financial Markets', 'relevance_sco...",-0.079733,Neutral,"[{'ticker': 'MSTR', 'relevance_score': '0.2478...",20221231,"[0.12574883, -0.24758251, 0.44414037, -0.37607...","[0.049918726, 0.00011439536, 0.94996685]"
1,Bulls In A Bear Market: These 10 Stocks Clocke...,https://www.benzinga.com/analyst-ratings/analy...,20221231T161114,['Shanthi Rexaline'],2022 would go down as one of the worst years f...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,General,www.benzinga.com,"[{'topic': 'Life Sciences', 'relevance_score':...",0.064077,Neutral,"[{'ticker': 'AAPL', 'relevance_score': '0.1137...",20221231,"[0.055901304, 0.17285924, -0.28222713, -0.7317...","[4.4178055e-06, 2.3380646e-05, 0.9999722]"
2,Where Will Unity Software Stock Be in 3 Years?,https://www.fool.com/investing/2022/12/31/wher...,20221231T154500,['Leo Sun'],The game-engine developer still has a lot to p...,https://g.foolcdn.com/image/?url=https%3A%2F%2...,Motley Fool,,www.fool.com,"[{'topic': 'IPO', 'relevance_score': '0.769861...",0.202065,Somewhat-Bullish,"[{'ticker': 'U', 'relevance_score': '0.095517'...",20221231,"[0.49274713, 0.46850753, -0.35436904, 0.238316...","[0.86392456, 0.006136369, 0.12993914]"
3,3 Unstoppable Growth Stocks to Buy After a Sto...,https://www.fool.com/investing/2022/12/31/3-un...,20221231T140000,['Dani Cook'],These companies have grown by triple-digit per...,https://g.foolcdn.com/image/?url=https%3A%2F%2...,Motley Fool,,www.fool.com,"[{'topic': 'Financial Markets', 'relevance_sco...",0.152163,Somewhat-Bullish,"[{'ticker': 'AMD', 'relevance_score': '0.22324...",20221231,"[0.8152653, -0.14137518, -0.18449828, -0.27026...","[1.3719941e-06, 0.99999845, 1.043882e-07]"
4,"Even in an Advertising Slowdown, These 3 Stock...",https://www.fool.com/investing/2022/12/31/even...,20221231T120000,['Adam Levy'],"Ad sales growth will slow in 2023, but these s...",https://g.foolcdn.com/image/?url=https%3A%2F%2...,Motley Fool,,www.fool.com,"[{'topic': 'Earnings', 'relevance_score': '0.9...",0.151938,Somewhat-Bullish,"[{'ticker': 'GOOG', 'relevance_score': '0.2433...",20221231,"[0.19915596, -0.1503704, 0.5523808, 0.14805436...","[0.0020524256, 0.011877623, 0.98607004]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24437,Nvidia's High-End Chips Ended Up In Chinese Ha...,https://www.benzinga.com/markets/asia/24/04/38...,20240423T050111,['Benzinga Neuro'],Chinese entities have managed to acquire high-...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,Markets,www.benzinga.com,"[{'topic': 'Mergers & Acquisitions', 'relevanc...",0.095783,Neutral,"[{'ticker': 'NVDA', 'relevance_score': '0.4274...",20240423,"[0.27100983, 0.5836564, -1.0146141, -0.0952048...","[0.9999578, 1.1006024e-05, 3.1285294e-05]"
24438,Broadcom Inc. ( AVGO ) Gains But Lags Market...,https://www.zacks.com/stock/news/2259620/broad...,20240422T214516,['Zacks Equity Research'],"In the most recent trading session, Broadcom I...",https://staticx-tuner.zacks.com/images/default...,Zacks Commentary,,www.zacks.com,"[{'topic': 'Earnings', 'relevance_score': '0.9...",0.14976,Neutral,"[{'ticker': 'AVGO', 'relevance_score': '0.5372...",20240422,"[0.31473163, -1.2273293, 0.6177586, -0.3789892...","[0.99998486, 4.1800554e-06, 1.0952555e-05]"
24439,Data Center Interconnect Market size worth $ 3...,https://www.benzinga.com/pressreleases/24/04/g...,20240422T140000,['Globe Newswire'],"Jersey City, New Jersey, April 22, 2024 ( GLOB...",https://www.benzinga.com/next-assets/images/sc...,Benzinga,General,www.benzinga.com,"[{'topic': 'Financial Markets', 'relevance_sco...",0.252395,Somewhat-Bullish,"[{'ticker': 'CIEN', 'relevance_score': '0.0282...",20240422,"[0.6741358, -0.5193285, 0.25507122, -0.5724481...","[0.0041528903, 0.9958448, 2.3902537e-06]"
24440,Is Nvidia Getting Ready to Disrupt Another Com...,https://www.fool.com/investing/2024/04/22/is-n...,20240422T123900,['Nicholas Rossolillo'],Some investors were second-guessing the streng...,https://g.foolcdn.com/editorial/images/773500/...,Motley Fool,,www.fool.com,"[{'topic': 'Financial Markets', 'relevance_sco...",0.200374,Somewhat-Bullish,"[{'ticker': 'MSFT', 'relevance_score': '0.0584...",20240422,"[0.5494284, -0.32981983, -0.42738038, -0.37707...","[0.0037541273, 0.9959959, 0.00024995624]"


In [15]:
# Inspect one raw 'topics' value (row label 0) to see how the column is stored on disk
df['topics'][0]

"[{'topic': 'Financial Markets', 'relevance_score': '0.316726'}\n {'topic': 'Manufacturing', 'relevance_score': '0.333333'}\n {'topic': 'Energy & Transportation', 'relevance_score': '0.333333'}\n {'topic': 'Technology', 'relevance_score': '0.333333'}]"

In [17]:
import ast

def insert_commas(string):
    """Return `string` with a comma added after every '}' that is followed by a newline, one space and '{'.

    Text that does not contain that exact separator is returned unchanged.
    """
    # Cut at each bare "}\n {" separator and glue the pieces back with the comma included
    pieces = string.split('}\n {')
    return '},\n {'.join(pieces)
def parse_string(row):
    """Parse one stringified list of dicts into real Python objects.

    The text is first passed through `insert_commas` and then evaluated with
    `ast.literal_eval`.

    Args:
        row: The raw cell value to parse.

    Returns:
        The parsed Python object, or None when anything goes wrong; in that
        case the offending value and the error are printed.
    """
    try:
        # Repair the missing separators first, then evaluate the text as a Python literal
        return ast.literal_eval(insert_commas(row))
    except Exception as e:
        print(f"Error parsing row: {row} - {e}")
    return None


In [19]:
# Parse every raw 'topics' string; values that fail to parse become None
df['parsed_topics'] = df['topics'].map(parse_string)

Error parsing row: topics - malformed node or string on line 1: <ast.Name object at 0x000001A898A5AA40>


In [20]:
# Check the parsed column: each entry should now be a list of dicts rather than a string
df['parsed_topics']

0        [{'topic': 'Financial Markets', 'relevance_sco...
1        [{'topic': 'Life Sciences', 'relevance_score':...
2        [{'topic': 'IPO', 'relevance_score': '0.769861...
3        [{'topic': 'Financial Markets', 'relevance_sco...
4        [{'topic': 'Earnings', 'relevance_score': '0.9...
                               ...                        
24437    [{'topic': 'Mergers & Acquisitions', 'relevanc...
24438    [{'topic': 'Earnings', 'relevance_score': '0.9...
24439    [{'topic': 'Financial Markets', 'relevance_sco...
24440    [{'topic': 'Financial Markets', 'relevance_sco...
24441    [{'topic': 'Manufacturing', 'relevance_score':...
Name: parsed_topics, Length: 24442, dtype: object

In [22]:
from collections import Counter

# Collect every topic label attached to any article; rows without a parsed list (None) are skipped
topics = []
for parsed_row in df['parsed_topics']:
    if parsed_row is None:
        continue
    topics.extend(entry['topic'] for entry in parsed_row if 'topic' in entry)

# Tally how often each label occurs
topic_counts = Counter(topics)
topic_counts

Counter({'Technology': 17890,
         'Financial Markets': 17165,
         'Manufacturing': 13427,
         'Earnings': 11387,
         'Finance': 7142,
         'Retail & Wholesale': 6229,
         'Economy - Monetary': 4773,
         'Life Sciences': 4032,
         'Energy & Transportation': 2139,
         'Real Estate & Construction': 1334,
         'Blockchain': 1175,
         'IPO': 1026,
         'Mergers & Acquisitions': 749,
         'Economy - Fiscal': 512,
         'Economy - Macro': 302})

In [24]:
import numpy as np

# One relevance-score column is created in df for each topic label listed here.
topics = ['Technology', 'Financial Markets', 'Manufacturing', 'Earnings', 'Finance', 'Retail & Wholesale', 'Economy - Monetary', 'Life Sciences', 'Energy & Transportation', 'Real Estate & Construction', 'Blockchain', 'IPO', 'Mergers & Acquisitions', 'Economy - Fiscal', 'Economy - Macro']

# Build the whole score table in memory first: rows follow df's row order, columns follow
# `topics`, and NaN marks "this topic is not attached to this article".
topic_position = {topic: position for position, topic in enumerate(topics)}
score_matrix = np.full((len(df), len(topics)), np.nan)

for row_number, parsed in enumerate(df['parsed_topics']):
    # Rows that could not be parsed hold None (or NaN) instead of a list; they keep NaN everywhere.
    if not isinstance(parsed, list):
        continue
    for item in parsed:
        # Look the label up in the topic list itself rather than in df.columns, so a label that
        # happens to equal an unrelated column name (e.g. 'title') can never overwrite that column.
        position = topic_position.get(item['topic'])
        if position is not None:
            # Relevance scores are stored as strings in the parsed dicts
            score_matrix[row_number, position] = float(item['relevance_score'])

# Attach the finished columns by position: no per-cell df.at writes, and no dependence on
# df's index labels being unique.
for topic, position in topic_position.items():
    df[topic] = score_matrix[:, position]
df

Unnamed: 0,title,url,time_published,authors,summary,banner_image,source,category_within_source,source_domain,topics,...,Retail & Wholesale,Economy - Monetary,Life Sciences,Energy & Transportation,Real Estate & Construction,Blockchain,IPO,Mergers & Acquisitions,Economy - Fiscal,Economy - Macro
0,"Apple, AMC, Meta, And How Elon Musk Is Reactin...",https://www.benzinga.com/news/large-cap/22/12/...,20221231T190310,['Michael Cohen'],Benzinga examined the prospects for many inves...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,Trading,www.benzinga.com,"[{'topic': 'Financial Markets', 'relevance_sco...",...,,,,0.333333,,,,,,
1,Bulls In A Bear Market: These 10 Stocks Clocke...,https://www.benzinga.com/analyst-ratings/analy...,20221231T161114,['Shanthi Rexaline'],2022 would go down as one of the worst years f...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,General,www.benzinga.com,"[{'topic': 'Life Sciences', 'relevance_score':...",...,,,0.166667,0.166667,0.166667,,0.158519,,,
2,Where Will Unity Software Stock Be in 3 Years?,https://www.fool.com/investing/2022/12/31/wher...,20221231T154500,['Leo Sun'],The game-engine developer still has a lot to p...,https://g.foolcdn.com/image/?url=https%3A%2F%2...,Motley Fool,,www.fool.com,"[{'topic': 'IPO', 'relevance_score': '0.769861...",...,,,,,,,0.769861,0.360215,,
3,3 Unstoppable Growth Stocks to Buy After a Sto...,https://www.fool.com/investing/2022/12/31/3-un...,20221231T140000,['Dani Cook'],These companies have grown by triple-digit per...,https://g.foolcdn.com/image/?url=https%3A%2F%2...,Motley Fool,,www.fool.com,"[{'topic': 'Financial Markets', 'relevance_sco...",...,,,,,,,,,,
4,"Even in an Advertising Slowdown, These 3 Stock...",https://www.fool.com/investing/2022/12/31/even...,20221231T120000,['Adam Levy'],"Ad sales growth will slow in 2023, but these s...",https://g.foolcdn.com/image/?url=https%3A%2F%2...,Motley Fool,,www.fool.com,"[{'topic': 'Earnings', 'relevance_score': '0.9...",...,,,,,,0.158519,,,,0.158519
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24437,Nvidia's High-End Chips Ended Up In Chinese Ha...,https://www.benzinga.com/markets/asia/24/04/38...,20240423T050111,['Benzinga Neuro'],Chinese entities have managed to acquire high-...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,Markets,www.benzinga.com,"[{'topic': 'Mergers & Acquisitions', 'relevanc...",...,,,,,,,,0.108179,,
24438,Broadcom Inc. ( AVGO ) Gains But Lags Market...,https://www.zacks.com/stock/news/2259620/broad...,20240422T214516,['Zacks Equity Research'],"In the most recent trading session, Broadcom I...",https://staticx-tuner.zacks.com/images/default...,Zacks Commentary,,www.zacks.com,"[{'topic': 'Earnings', 'relevance_score': '0.9...",...,,,,,,,,,,
24439,Data Center Interconnect Market size worth $ 3...,https://www.benzinga.com/pressreleases/24/04/g...,20240422T140000,['Globe Newswire'],"Jersey City, New Jersey, April 22, 2024 ( GLOB...",https://www.benzinga.com/next-assets/images/sc...,Benzinga,General,www.benzinga.com,"[{'topic': 'Financial Markets', 'relevance_sco...",...,,,,,,,,,,
24440,Is Nvidia Getting Ready to Disrupt Another Com...,https://www.fool.com/investing/2024/04/22/is-n...,20240422T123900,['Nicholas Rossolillo'],Some investors were second-guessing the streng...,https://g.foolcdn.com/editorial/images/773500/...,Motley Fool,,www.fool.com,"[{'topic': 'Financial Markets', 'relevance_sco...",...,,,,,,,,,,


In [27]:
# A topic that is not attached to an article is still NaN at this point;
# record it as a relevance score of 0 instead.
df[topics] = df[topics].fillna(0)

In [39]:
# Inspect the raw 'date' column (and its dtype) before converting it
df['date']

0        20221231
1        20221231
2        20221231
3        20221231
4        20221231
           ...   
24437    20240423
24438    20240422
24439    20240422
24440    20240422
24441    20240422
Name: date, Length: 24442, dtype: object

In [45]:
# Convert 'date' to datetimes using the YYYYMMDD layout. errors='coerce' turns any value
# that does not match into NaT instead of raising.
df['date'] = pd.to_datetime(df['date'].astype(str), format='%Y%m%d', errors ='coerce')

In [46]:
# Keep only the rows that have a 'date'; rows where it is missing (NaN/NaT) are dropped.
# Note that `df` is rebound to the filtered result.
df = df.dropna(subset='date')

In [54]:
# Save the enriched frame next to the notebook, without the index column.
# NOTE(review): 'embeddings' and 'sentiment_scores' hold NumPy arrays and 'parsed_topics' holds
# lists; to_csv writes such cells as their string form — confirm downstream readers can parse
# that (the raw 'topics' column already needed comma repair after a similar round trip).
df.to_csv("news_data_processed.csv", index = False)

In [55]:
# List the final column names.
# NOTE(review): the recorded output includes a 'date_formatted' column that no cell shown here
# creates — confirm the notebook still reproduces from a fresh kernel (Restart & Run All).
df.columns

Index(['title', 'url', 'time_published', 'authors', 'summary', 'banner_image',
       'source', 'category_within_source', 'source_domain', 'topics',
       'overall_sentiment_score', 'overall_sentiment_label',
       'ticker_sentiment', 'date', 'embeddings', 'sentiment_scores',
       'parsed_topics', 'Technology', 'Financial Markets', 'Manufacturing',
       'Earnings', 'Finance', 'Retail & Wholesale', 'Economy - Monetary',
       'Life Sciences', 'Energy & Transportation',
       'Real Estate & Construction', 'Blockchain', 'IPO',
       'Mergers & Acquisitions', 'Economy - Fiscal', 'Economy - Macro',
       'date_formatted'],
      dtype='object')