In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm

In [3]:
data = pd.read_csv("../Data/raw_analyst_ratings.csv")

Getting basic information

In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,headline,url,publisher,date,stock
0,0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05 10:30:54-04:00,A
1,1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03 10:45:20-04:00,A
2,2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26 04:30:07-04:00,A
3,3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22 12:45:06-04:00,A
4,4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22 11:38:59-04:00,A


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1407328 entries, 0 to 1407327
Data columns (total 6 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   Unnamed: 0  1407328 non-null  int64 
 1   headline    1407328 non-null  object
 2   url         1407328 non-null  object
 3   publisher   1407328 non-null  object
 4   date        1407328 non-null  object
 5   stock       1407328 non-null  object
dtypes: int64(1), object(5)
memory usage: 64.4+ MB


In [6]:
data.shape

(1407328, 6)

In [7]:
data.describe()

Unnamed: 0.1,Unnamed: 0
count,1407328.0
mean,707245.4
std,408100.9
min,0.0
25%,353812.8
50%,707239.5
75%,1060710.0
max,1413848.0


Looking for missing values

In [8]:
data.isnull().sum()

Unnamed: 0    0
headline      0
url           0
publisher     0
date          0
stock         0
dtype: int64

In [9]:
# Calculate headline lengths
data['headline_length'] = data['headline'].apply(len)

# Basic statistics for headline lengths
print(data['headline_length'].describe())

count    1.407328e+06
mean     7.312051e+01
std      4.073531e+01
min      3.000000e+00
25%      4.700000e+01
50%      6.400000e+01
75%      8.700000e+01
max      5.120000e+02
Name: headline_length, dtype: float64


In [10]:
publisher_counts = data['publisher'].value_counts()
print(publisher_counts.head())

publisher
Paul Quintaro        228373
Lisa Levin           186979
Benzinga Newsdesk    150484
Charles Gross         96732
Monica Gerson         82380
Name: count, dtype: int64


In [11]:
# Convert 'date' to datetime format (if not already)
data['date'] = pd.to_datetime(data['date'])

# Publication frequency over time
data['date'].dt.date.value_counts().plot(kind='bar', figsize=(15, 5))
plt.title('Publication Frequency Over Time')
plt.xlabel('Date')
plt.ylabel('Number of Articles')
plt.show()

ValueError: time data "2020-05-22 00:00:00" doesn't match format "%Y-%m-%d %H:%M:%S%z", at position 10. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [12]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()
data['sentiment_score'] = data['headline'].apply(lambda x: sia.polarity_scores(x)['compound'])

# Sentiment distribution
sns.histplot(data['sentiment_score'], bins=30)
plt.title('Sentiment Score Distribution')
plt.xlabel('Sentiment Score')
plt.ylabel('Frequency')
plt.show()

LookupError: 
**********************************************************************
  Resource [93mvader_lexicon[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('vader_lexicon')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93msentiment/vader_lexicon.zip/vader_lexicon/vader_lexicon.txt[0m

  Searched in:
    - 'C:\\Users\\Abdulaziz/nltk_data'
    - 'c:\\Users\\Abdulaziz\\Desktop\\10 Academy\\Nova-Financial-Solutions\\week1\\nltk_data'
    - 'c:\\Users\\Abdulaziz\\Desktop\\10 Academy\\Nova-Financial-Solutions\\week1\\share\\nltk_data'
    - 'c:\\Users\\Abdulaziz\\Desktop\\10 Academy\\Nova-Financial-Solutions\\week1\\lib\\nltk_data'
    - 'C:\\Users\\Abdulaziz\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Preprocess text (e.g., remove stop words, stemming)
# Vectorize text using TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(data['headline'])

# Create LDA model
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(tfidf_matrix)

# Print topics
print(lda.components_)

In [None]:
# Hourly publication frequency
data.set_index('date').resample('H').size().plot(figsize=(15, 5))
plt.title('Hourly Publication Frequency')
plt.xlabel('Time')
plt.ylabel('Number of Articles')
plt.show()

In [None]:
publisher_counts = data['publisher'].value_counts()
print(publisher_counts.head())

In [None]:
data['publisher_domain'] = data['publisher'].str.split('@').str[1]
publisher_domain_counts = data['publisher_domain'].value_counts()
print(publisher_domain_counts.head())