# Importing necessary libraries

In [3]:
# for data manipulation
import pandas as pd

# For visualization
#import matplotlib.pyplot as pyplot
#import seaborn as sns

# for date/time manipulationimport datetime

#fot nlp
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# for numerical analysis
import numpy

Loading Data

In [10]:
import os
import pandas as pd

def load_data():
    """
    Load all data files from the 'yfinance_data' and 'raw_analyst_ratings' folders.
    
    Returns:
        stock_data (dict): A dictionary containing stock data DataFrames.
        raw_analyst_ratings (pd.DataFrame): DataFrame containing raw analyst ratings.
    """
    # Define the paths
    yfinance_folder = r'C:\Users\HP\Desktop\week - 1\Data\yfinance_data'
    analyst_ratings_file = r'C:\Users\HP\Desktop\week - 1\Data\raw_analyst_ratings\raw_analyst_ratings.csv'
    
    # Initialize a dictionary to store stock DataFrames
    stock_data = {}
    raw_analyst_ratings = None
    
    # Load yfinance data files
    if os.path.exists(yfinance_folder):
        yfinance_files = [f for f in os.listdir(yfinance_folder) if f.endswith('.csv')]
        for file in yfinance_files:
            file_path = os.path.join(yfinance_folder, file)
            stock_name = os.path.splitext(file)[0]  # Extract stock name without extension
            print(f"Loading file: {file} as stock: {stock_name}")
            stock_data[stock_name] = pd.read_csv(file_path)
    
    # Load raw analyst ratings file
    if os.path.exists(analyst_ratings_file):
        print(f"Loading raw analyst ratings from: {analyst_ratings_file}")
        raw_analyst_ratings = pd.read_csv(analyst_ratings_file)
    
    return stock_data, raw_analyst_ratings

# Call the function to load the data
stock_data, raw_analyst_ratings = load_data()

# Debug: Print available stock data keys
print("Loaded stock data keys:", stock_data.keys())

# Access and check raw analyst ratings
if raw_analyst_ratings is not None:
    print("\nRaw Analyst Ratings DataFrame Info:")
    print(raw_analyst_ratings.info())

    print("\nRaw Analyst Ratings Shape:")
    print(raw_analyst_ratings.shape)

    print("\nFirst few rows of Raw Analyst Ratings:")
    print(raw_analyst_ratings.head())


Loading raw analyst ratings from: C:\Users\HP\Desktop\week - 1\Data\raw_analyst_ratings\raw_analyst_ratings.csv
Loaded stock data keys: dict_keys([])

Raw Analyst Ratings DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1407328 entries, 0 to 1407327
Data columns (total 6 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   Unnamed: 0  1407328 non-null  int64 
 1   headline    1407328 non-null  object
 2   url         1407328 non-null  object
 3   publisher   1407328 non-null  object
 4   date        1407328 non-null  object
 5   stock       1407328 non-null  object
dtypes: int64(1), object(5)
memory usage: 64.4+ MB
None

Raw Analyst Ratings Shape:
(1407328, 6)

First few rows of Raw Analyst Ratings:
   Unnamed: 0                                           headline  \
0           0            Stocks That Hit 52-Week Highs On Friday   
1           1         Stocks That Hit 52-Week Highs On Wednesday   
2           2           

# EDA

## Descriptive Statistics

In [11]:
# Check basic information about the DataFrame
print("DataFrame Info:")
raw_analyst_ratings.info()

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1407328 entries, 0 to 1407327
Data columns (total 6 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   Unnamed: 0  1407328 non-null  int64 
 1   headline    1407328 non-null  object
 2   url         1407328 non-null  object
 3   publisher   1407328 non-null  object
 4   date        1407328 non-null  object
 5   stock       1407328 non-null  object
dtypes: int64(1), object(5)
memory usage: 64.4+ MB


In [12]:
# Check for null values
print("\nNull Values per Column:")
print(raw_analyst_ratings.isnull().sum())


Null Values per Column:
Unnamed: 0    0
headline      0
url           0
publisher     0
date          0
stock         0
dtype: int64


Let's drop Unnamed: 0 column

In [13]:
raw_analyst_ratings = raw_analyst_ratings.drop(columns=['Unnamed: 0'])


In [14]:
print("DataFrame Info:")
raw_analyst_ratings.info()

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1407328 entries, 0 to 1407327
Data columns (total 5 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   headline   1407328 non-null  object
 1   url        1407328 non-null  object
 2   publisher  1407328 non-null  object
 3   date       1407328 non-null  object
 4   stock      1407328 non-null  object
dtypes: object(5)
memory usage: 53.7+ MB


### Text Length Analysi:

In [15]:
# Calculate basic statistics for headline length
raw_analyst_ratings['headline_length'] = raw_analyst_ratings['headline'].apply(len)
print("\nHeadline Length Statistics:")
print(raw_analyst_ratings['headline_length'].describe())



Headline Length Statistics:
count    1.407328e+06
mean     7.312051e+01
std      4.073531e+01
min      3.000000e+00
25%      4.700000e+01
50%      6.400000e+01
75%      8.700000e+01
max      5.120000e+02
Name: headline_length, dtype: float64


### Publisher Frequency

In [16]:
# Count the number of articles per publisher
publisher_counts = raw_analyst_ratings['publisher'].value_counts()
print("\nTop Publishers by Article Count:")
print(publisher_counts.head(10))


Top Publishers by Article Count:
publisher
Paul Quintaro        228373
Lisa Levin           186979
Benzinga Newsdesk    150484
Charles Gross         96732
Monica Gerson         82380
Eddie Staley          57254
Hal Lindon            49047
ETF Professor         28489
Juan Lopez            28438
Benzinga Staff        28114
Name: count, dtype: int64


### Publication Date Analysis

## Text Analysis (Sentiment Analysis)

we perform `sentiment analysis` on financial news headlines using **VADER (Valance Aware Dictionary anssEntiment Reasoner)** sentiment analysis tool.

In [17]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Apply sentiment analysis to the headlines
raw_analyst_ratings['sentiment_score'] = raw_analyst_ratings['headline'].apply(lambda x: analyzer.polarity_scores(x)['compound'])

# Categorize sentiment as Positive, Negative, or Neutral
raw_analyst_ratings['sentiment_category'] = raw_analyst_ratings['sentiment_score'].apply(
    lambda score: 'Positive' if score > 0.05 else ('Negative' if score < -0.05 else 'Neutral')
)

# Display sentiment statistics
print("\nSentiment Category Counts:")
print(raw_analyst_ratings['sentiment_category'].value_counts())



Sentiment Category Counts:
sentiment_category
Neutral     739338
Positive    442930
Negative    225060
Name: count, dtype: int64


## Time Series Analysis

In [19]:
print(raw_analyst_ratings['date'].head(20))


0     2020-06-05 10:30:54-04:00
1     2020-06-03 10:45:20-04:00
2     2020-05-26 04:30:07-04:00
3     2020-05-22 12:45:06-04:00
4     2020-05-22 11:38:59-04:00
5     2020-05-22 11:23:25-04:00
6     2020-05-22 09:36:20-04:00
7     2020-05-22 09:07:04-04:00
8     2020-05-22 08:37:59-04:00
9     2020-05-22 08:06:17-04:00
10          2020-05-22 00:00:00
11          2020-05-22 00:00:00
12          2020-05-21 00:00:00
13          2020-05-21 00:00:00
14          2020-05-21 00:00:00
15          2020-05-21 00:00:00
16          2020-05-18 00:00:00
17          2020-05-16 00:00:00
18          2020-05-15 00:00:00
19          2020-05-08 00:00:00
Name: date, dtype: object


In [20]:
# Convert 'date' column to datetime format
raw_analyst_ratings['date'] = pd.to_datetime(
    raw_analyst_ratings['date'], 
    format="%Y-%m-%d %H:%M:%S", 
    errors="coerce"
)


In [21]:
invalid_dates = raw_analyst_ratings[raw_analyst_ratings['date'].isna()]
print(f"Number of invalid dates: {len(invalid_dates)}")


Number of invalid dates: 55987


In [22]:
# Display a sample of the rows with invalid dates
print("Sample of invalid dates:")
print(invalid_dates[['date']].head(20))


Sample of invalid dates:
     date
0     NaT
1     NaT
2     NaT
3     NaT
4     NaT
5     NaT
6     NaT
7     NaT
8     NaT
9     NaT
1433  NaT
1434  NaT
1435  NaT
1436  NaT
1437  NaT
1438  NaT
1439  NaT
1440  NaT
1441  NaT
1442  NaT


In [23]:
total_rows = raw_analyst_ratings.shape[0]
invalid_dates_count = raw_analyst_ratings['date'].isna().sum()
invalid_date_percentage = (invalid_dates_count / total_rows) * 100

print(f"Total rows: {total_rows}")
print(f"Invalid dates: {invalid_dates_count} ({invalid_date_percentage:.2f}%)")


Total rows: 1407328
Invalid dates: 55987 (3.98%)


In [24]:
# Drop rows with NaT values in the 'date' column
raw_analyst_ratings = raw_analyst_ratings.dropna(subset=['date'])

# Verify the new shape of the dataset
print(f"Dataset after dropping invalid dates: {raw_analyst_ratings.shape}")


Dataset after dropping invalid dates: (1351341, 8)


In [25]:
invalid_dates = raw_analyst_ratings[raw_analyst_ratings['date'].isna()]
print(f"Number of invalid dates: {len(invalid_dates)}")


Number of invalid dates: 0


In [26]:
# Check for the earliest and latest publication dates
print("\nPublication Date Range:")
print(raw_analyst_ratings['date'].min(), "to", raw_analyst_ratings['date'].max())


Publication Date Range:
2009-02-14 00:00:00 to 2020-06-03 00:00:00


### Publication Frequency Over Time

## Link Sentiment with Stock Prrice