In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import nltk
from nltk.corpus import stopwords
import string


In [3]:
df = pd.read_csv("../src/data/analaysed_analyst_ratings/analysed_ratings.csv")

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,headline,url,publisher,date,stock,headline_length,day_of_week,email_domain
0,0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05 10:30:54-04:00,A,39,Friday,
1,1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03 10:45:20-04:00,A,42,Wednesday,
2,2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26 04:30:07-04:00,A,29,Tuesday,
3,3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22 12:45:06-04:00,A,44,Friday,
4,4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22 11:38:59-04:00,A,87,Friday,


Preprocess the data

In [5]:
# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Text preprocessing function to remove punctuation and stopwords
def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    return " ".join(tokens)

# Apply preprocessing to the headlines
df['cleaned_headline'] = df['headline'].apply(preprocess_text)
df[['headline', 'cleaned_headline']].head()


[nltk_data] Downloading package stopwords to C:\Users\HP
[nltk_data]     EliteBook\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,headline,cleaned_headline
0,Stocks That Hit 52-Week Highs On Friday,stocks hit 52week highs friday
1,Stocks That Hit 52-Week Highs On Wednesday,stocks hit 52week highs wednesday
2,71 Biggest Movers From Friday,71 biggest movers friday
3,46 Stocks Moving In Friday's Mid-Day Session,46 stocks moving fridays midday session
4,B of A Securities Maintains Neutral on Agilent...,b securities maintains neutral agilent technol...


Perform sentiment analysis

In [6]:
from nltk.sentiment import SentimentIntensityAnalyzer

# Initialize VADER
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

# Calculate sentiment
df['sentiment_score'] = df['cleaned_headline'].apply(lambda x: sia.polarity_scores(x)['compound'])
df['sentiment_label'] = df['sentiment_score'].apply(lambda x: 'positive' if x > 0 else ('negative' if x < 0 else 'neutral'))

# Display sentiment statistics
print(df['sentiment_label'].value_counts())


[nltk_data] Downloading package vader_lexicon to C:\Users\HP
[nltk_data]     EliteBook\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


sentiment_label
neutral     731785
positive    452836
negative    222707
Name: count, dtype: int64


Identify common keywords for feature engineering

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Find common unigrams (single words) and bigrams
vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=10)
common_words = vectorizer.fit_transform(df['cleaned_headline'])
print(vectorizer.get_feature_names_out())


['earnings' 'eps' 'est' 'market' 'reports' 'sales' 'shares' 'stocks'
 'update' 'vs']


In [15]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

df_sample = df.sample(1000, random_state=42)

# Convert text to a bag-of-words representation
vectorizer = CountVectorizer(max_features=500)
data_vectorized = vectorizer.fit_transform(df_sample['cleaned_headline'])

# Apply LDA
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(data_vectorized)
    
# Display topics
for idx, topic in enumerate(lda.components_):
    print(f"Topic {idx + 1}:")
    print([vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-10:]])


Topic 1:
['markets', 'industry', 'stocks', 'lower', 'earnings', 'higher', 'update', 'market', 'trading', 'shares']
Topic 2:
['lowers', 'downgrades', 'says', 'announces', 'pt', 'buy', 'target', 'raises', 'maintains', 'price']
Topic 3:
['highs', 'watch', 'lows', 'hit', '52week', 'midday', 'premarket', 'moving', 'session', 'stocks']
Topic 4:
['high', 'biggest', 'set', 'movers', 'yesterday', '52week', 'top', 'benzingas', 'stocks', 'new']
Topic 5:
['revenue', 'q4', 'q1', 'q3', 'earnings', 'sales', 'reports', 'eps', 'est', 'vs']


Extract significant entities relevant to financial analysis.

In [16]:
import spacy

# Load spaCy's English model
nlp = spacy.load('en_core_web_sm')

# df_sample = df.sample(1000)  # Work on a random sample of 10000 rows

# Extract named entities
def extract_entities(text):
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

df_sample['entities'] = df_sample['headline'].apply(extract_entities)


# Filter for financial-related entities (e.g., ORG for organizations)
print(df_sample['entities'].head())
df_sample.head()

1201723    [(Syntel, Inc., ORG), (1.11, MONEY), ($189.10M...
1282616    [(Mid-Day, DATE), (K2M Group, ORG), (Tech Data...
554075                       [(Aramchol's Failed Past, ORG)]
1190332    [(60, CARDINAL), (Thursday, DATE), (Mid-Day Se...
192958                  [(Anheuser-Busch, ORG), (APAC, ORG)]
Name: entities, dtype: object


Unnamed: 0.1,Unnamed: 0,headline,url,publisher,date,stock,headline_length,day_of_week,email_domain,cleaned_headline,sentiment_score,sentiment_label,entities
1201723,1207519,"Syntel, Inc. Reports Q1 EPS of $1.11 vs $1.03 ...",https://www.benzinga.com/news/earnings/13/04/3...,bret.kenwell@benzinga.com,,SYNT,86,,benzinga.com,syntel inc reports q1 eps 111 vs 103 est reven...,0.0,neutral,"[(Syntel, Inc., ORG), (1.11, MONEY), ($189.10M..."
1282616,1288780,Mid-Day Market Update: K2M Group Rises On Acqu...,https://www.benzinga.com/news/18/08/12285391/m...,Lisa Levin,,UEPS,83,,,midday market update k2m group rises acquisiti...,0.296,positive,"[(Mid-Day, DATE), (K2M Group, ORG), (Tech Data..."
554075,557018,Aramchol's Failed Past Isn't Its Future; Galme...,https://www.benzinga.com/analyst-ratings/analy...,Elizabeth Balboa,,GLMD,63,,,aramchols failed past isnt future galmed gets ...,-0.5106,negative,"[(Aramchol's Failed Past, ORG)]"
1190332,1196088,60 Stocks Moving In Thursday's Mid-Day Session,https://www.benzinga.com/news/20/04/15730001/6...,Lisa Levin,,SU,46,,,60 stocks moving thursdays midday session,0.0,neutral,"[(60, CARDINAL), (Thursday, DATE), (Mid-Day Se..."
192958,194114,Anheuser-Busch shares are trading lower after ...,https://www.benzinga.com/markets/wiim/19/07/14...,Benzinga Newsdesk,,BUD,150,,,anheuserbusch shares trading lower company sai...,0.0,neutral,"[(Anheuser-Busch, ORG), (APAC, ORG)]"
