# Importing necessary libraries

In [2]:
import sys
import os

# Resolve the project layout from the directory the notebook is started in;
# the project root is taken to be the parent of that directory.
notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, os.pardir))
src_path = os.path.join(project_root, 'scripts')

# Make the helper modules under scripts/ importable, registering the folder
# only if it is not already on the import path.
if not any(entry == src_path for entry in sys.path):
    sys.path.append(src_path)

# Import necessary modules
import pandas as pd
from data_loading import load_csv,load_yfinance_data  
from descriptive_statistic import dataset_summary,compute_basic_stats, count_unique_symbols
#from publisher_analysis import publisher_sentiment_analysis
from text_analysis import (
    sentiment_analysis_vader, sentiment_analysis_textblob, combined_sentiment,
    sentiment_by_stock, generate_wordcloud, analyze_ngrams,
    extract_topics_from_headlines, perform_ner, plot_sentiment_distribution
)
from time_series_analysis import (
    publication_frequency_analysis, stl_decomposition, 
    time_of_day_analysis, moving_average_analysis, weekday_analysis
)
from publisher_analysis import (
    top_publishers, email_domain_analysis, news_type_analysis,
    unique_publishers_over_time, publisher_domain_analysis
)
from technical_analysis import add_technical_indicators
#from financial_metrics import calculate_financial_metrics 
from data_visualization import (
    plot_macd,plot_rsi,plot_stock_data_with_indicators,
    plot_boxplot, plot_volume_trends, plot_stock_trends)

In [5]:

# Define file paths
# The data root can be overridden with the NEWS_DATA_DIR environment variable so
# the notebook is not tied to one machine; when the variable is unset the
# original local folder is used, which yields the same paths as before.
data_dir = os.environ.get("NEWS_DATA_DIR", r"C:\Users\HP\Desktop\week - 1\Data")
analyst_ratings_path = os.path.join(data_dir, "raw_analyst_ratings", "raw_analyst_ratings.csv")
yfinance_folder_path = os.path.join(data_dir, "yfinance_data", "yfinance_data")

# Load data
raw_analyst_ratings = load_csv(analyst_ratings_path)
#yfinance_data = load_yfinance_data(yfinance_folder_path)

# Inspect Raw Analyst Ratings
print("\nRaw Analyst Ratings Info:")
# DataFrame.info() writes its report to stdout and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line to the output.
raw_analyst_ratings.info()
print(raw_analyst_ratings.head())

# # Inspect YFinance Data
# print("\nYFinance Data Keys (Stocks):", list(yfinance_data.keys()))
# if 'AAPL_historical_data' in yfinance_data:
#     print("\nSample AAPL Data:")
#     print(yfinance_data['AAPL_historical_data'].head())
    
    


Loaded data from C:\Users\HP\Desktop\week - 1\Data\raw_analyst_ratings\raw_analyst_ratings.csv. Shape: (1407328, 6)

Raw Analyst Ratings Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1407328 entries, 0 to 1407327
Data columns (total 6 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   Unnamed: 0  1407328 non-null  int64 
 1   headline    1407328 non-null  object
 2   url         1407328 non-null  object
 3   publisher   1407328 non-null  object
 4   date        1407328 non-null  object
 5   stock       1407328 non-null  object
dtypes: int64(1), object(5)
memory usage: 64.4+ MB
None
   Unnamed: 0                                           headline  \
0           0            Stocks That Hit 52-Week Highs On Friday   
1           1         Stocks That Hit 52-Week Highs On Wednesday   
2           2                      71 Biggest Movers From Friday   
3           3       46 Stocks Moving In Friday's Mid-Day Session   
4           4

# EDA

## Descriptive Statistics

In [None]:
# Overview of the news frame as returned by the project's dataset_summary helper.
print("=== Dataset Summary ===")
ratings_summary = dataset_summary(raw_analyst_ratings)
print(ratings_summary)

In [None]:
# Headline statistics, as returned by the project's compute_basic_stats helper.
print("\n=== Headline Statistics ===")
headline_stats = compute_basic_stats(raw_analyst_ratings)
print(headline_stats)

In [None]:
# Stock symbol counts, as returned by the project's count_unique_symbols helper.
print("\n=== Stock Symbol Analysis ===")
symbol_counts = count_unique_symbols(raw_analyst_ratings)
print(symbol_counts)

In [None]:
# Summarise the object-typed (text) columns only: count, unique, top, freq.
text_columns = raw_analyst_ratings.select_dtypes(include=[object])
text_columns.describe()

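Let's drop the `Unnamed: 0` column; it appears to be a leftover row index from the CSV export and is not needed for the analysis.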

In [6]:
 # Drop the 'Unnamed: 0' column.
# errors='ignore' keeps this cell idempotent: running it again after the column
# has already been removed no longer raises KeyError.
raw_analyst_ratings = raw_analyst_ratings.drop(columns=['Unnamed: 0'], errors='ignore')


In [None]:
# Column names, non-null counts and dtypes of the news frame.
info_header = "DataFrame Info:"
print(info_header)
raw_analyst_ratings.info()

In [None]:
# Show the first ten rows of the news frame.
raw_analyst_ratings.iloc[:10]

## Text Analysis

In [None]:
 # Calculate basic statistics for headline length
# Series.str.len() is the vectorised string accessor for this: unlike
# .apply(len) it yields NaN for a missing headline instead of raising TypeError.
raw_analyst_ratings['headline_length'] = raw_analyst_ratings['headline'].str.len()
print("\nHeadline Length Statistics:")
print(raw_analyst_ratings['headline_length'].describe())


### Sentiment Analysis

In [3]:
# Run the project's sentiment_analysis_vader helper (imported from text_analysis)
# on the news frame and keep whatever frame it returns under the same name.
# NOTE(review): which columns the helper adds is not visible here — confirm.
raw_analyst_ratings = sentiment_analysis_vader(raw_analyst_ratings)

In [4]:
# Sentiment analysis using TextBlob, via the project's sentiment_analysis_textblob
# helper; the returned frame replaces raw_analyst_ratings.
# NOTE(review): which columns the helper adds is not visible here — confirm.
raw_analyst_ratings = sentiment_analysis_textblob(raw_analyst_ratings)

In [6]:
# Sentiment analysis using both VADER and TextBlob, via the project's
# combined_sentiment helper; the returned frame replaces raw_analyst_ratings.
# NOTE(review): presumably relies on the two sentiment cells above having run
# first — verify against the helper.
raw_analyst_ratings = combined_sentiment(raw_analyst_ratings)

In [None]:
# Preview only the first rows rather than echoing the whole frame
# (about 1.4 million rows when loaded).
raw_analyst_ratings.head()

#### Word Cloud

In [None]:
# Generate a word cloud by passing the news frame to the project's
# generate_wordcloud helper (imported from text_analysis).
# NOTE(review): presumably built from the 'headline' column — confirm in the helper.
generate_wordcloud(raw_analyst_ratings)

#### Significant Topics

In [None]:
# Extract significant topics from the headlines with the project helper.
# The call stays the last expression so its return value is shown as cell output.
topic_count = 10
print("\n=== Extracting Significant Topics ===")
extract_topics_from_headlines(raw_analyst_ratings, top_n=topic_count)

In [None]:

# Analyse bi-grams (n=2) and request the top ten from the project helper.
# The call stays the last expression so its return value is shown as cell output.
ngram_size, ngram_limit = 2, 10
analyze_ngrams(raw_analyst_ratings, n=ngram_size, top_n=ngram_limit)

In [12]:
# # Perform NER
# print("\n=== Named Entity Recognition ===")
# perform_ner(raw_analyst_ratings)

### Publisher Analysis

#### Identify unique publishers

In [None]:

# Count the distinct publisher names (missing values are not counted).
publisher_names = raw_analyst_ratings['publisher']
unique_publishers = publisher_names.nunique()
print("\nNumber of Unique Publishers: {}".format(unique_publishers))

#### Analyze top publishers

In [None]:
# Print the result of the project's top_publishers helper.
print("=== Top Publishers ===")
top_publisher_result = top_publishers(raw_analyst_ratings)
print(top_publisher_result)

#### Analyze email domains in publisher names

In [None]:
# Print the result of the project's email_domain_analysis helper.
print("\n=== Email Domain Analysis ===")
email_domain_result = email_domain_analysis(raw_analyst_ratings)
print(email_domain_result)

#### Analyze news types reported by publishers

In [None]:
# Classify news by keyword with the project's news_type_analysis helper.
# The result is kept in news_type_df and is not printed by this cell.
print("\n=== News Type Analysis ===")
keyword_list = [
    'FDA',
    'approval',
    'price target',
    'earnings',
]
news_type_df = news_type_analysis(
    raw_analyst_ratings,
    keyword_list=keyword_list,
)

#### Analyze unique publishers over time

In [None]:
# Print the result of the project's unique_publishers_over_time helper.
print("\n=== Unique Publishers Over Time ===")
publishers_over_time = unique_publishers_over_time(raw_analyst_ratings)
print(publishers_over_time)

#### Analyze publisher domains

In [None]:
# Print the result of the project's publisher_domain_analysis helper.
print("\n=== Publisher Domain Analysis ===")
publisher_domain_result = publisher_domain_analysis(raw_analyst_ratings)
print(publisher_domain_result)

## Text Analysis (Sentiment Analysis)

We perform `sentiment analysis` on financial news headlines using the **VADER (Valence Aware Dictionary and sEntiment Reasoner)** sentiment analysis tool.

## Time Series Analysis

In [None]:
# Look at the first twenty raw 'date' values before parsing them.
date_preview = raw_analyst_ratings['date'].iloc[:20]
print(date_preview)


In [38]:
# Parse the 'date' column into datetimes using one fixed layout.
# errors="coerce" turns every value that does not match the layout into NaT
# rather than raising.
# NOTE(review): values in any other layout (e.g. carrying a UTC offset) would
# also become NaT and be dropped later — confirm that is intended.
date_layout = "%Y-%m-%d %H:%M:%S"
parsed_dates = pd.to_datetime(raw_analyst_ratings['date'], format=date_layout, errors="coerce")
raw_analyst_ratings['date'] = parsed_dates


In [None]:
# Collect the rows whose 'date' is missing and report how many there are.
missing_date_mask = raw_analyst_ratings['date'].isna()
invalid_dates = raw_analyst_ratings[missing_date_mask]
print("Number of invalid dates: {}".format(len(invalid_dates)))


In [None]:
# Display a sample of the rows with invalid dates.
# invalid_dates holds only rows whose 'date' is missing, so printing that column
# alone would show nothing but NaT; the full rows are printed instead so the
# affected records can be identified from their other columns.
print("Sample of invalid dates:")
print(invalid_dates.head(20))


In [None]:
# Share of rows whose 'date' is missing.
date_is_missing = raw_analyst_ratings['date'].isna()
total_rows = len(raw_analyst_ratings)
invalid_dates_count = date_is_missing.sum()
invalid_date_percentage = (invalid_dates_count / total_rows) * 100

print("Total rows: {}".format(total_rows))
print("Invalid dates: {} ({:.2f}%)".format(invalid_dates_count, invalid_date_percentage))


In [None]:
# Keep only the rows that have a 'date' value, then report the resulting shape.
rows_with_date = raw_analyst_ratings.dropna(subset=['date'])
raw_analyst_ratings = rows_with_date

print("Dataset after dropping invalid dates: {}".format(raw_analyst_ratings.shape))


In [None]:
# Re-count the rows with a missing 'date'.
invalid_dates = raw_analyst_ratings.loc[raw_analyst_ratings['date'].isnull()]
print(f"Number of invalid dates: {invalid_dates.shape[0]}")


In [None]:
# Report the earliest and latest values in the 'date' column.
print("\nPublication Date Range:")
publication_dates = raw_analyst_ratings['date']
earliest, latest = publication_dates.min(), publication_dates.max()
print(earliest, "to", latest)

In [None]:
# Group the rows by calendar day (time of day discarded) and count them.
publication_day = raw_analyst_ratings['date'].dt.date
articles_per_day = raw_analyst_ratings.groupby(publication_day).size()
print("\nArticles Per Day:")
print(articles_per_day.iloc[:5])

In [None]:
# Identify the days with the highest number of publications
print("\nTop 5 Dates with the Most Articles:")
print(articles_per_day.nlargest(5))

## EDA on Stock data

In [9]:
# Load the data
df_stocks = load_yfinance_data(yfinance_folder_path)

# Preview the data
print("First 5 rows of the combined DataFrame:")
print(df_stocks.head())

First 5 rows of the combined DataFrame:
         Date      Open      High       Low     Close  Adj Close     Volume  \
0  1980-12-12  0.128348  0.128906  0.128348  0.128348   0.098943  469033600   
1  1980-12-15  0.122210  0.122210  0.121652  0.121652   0.093781  175884800   
2  1980-12-16  0.113281  0.113281  0.112723  0.112723   0.086898  105728000   
3  1980-12-17  0.115513  0.116071  0.115513  0.115513   0.089049   86441600   
4  1980-12-18  0.118862  0.119420  0.118862  0.118862   0.091630   73449600   

   Dividends  Stock Splits Stock  
0        0.0           0.0  AAPL  
1        0.0           0.0  AAPL  
2        0.0           0.0  AAPL  
3        0.0           0.0  AAPL  
4        0.0           0.0  AAPL  


In [12]:
# Summarise the stock frame with the project's dataset_summary helper. Left as
# the cell's last expression so the returned dict (row and column totals,
# missing values per column, data types) is shown as the cell output.
dataset_summary(df_stocks)

{'Total Rows': 45428,
 'Total Columns': 10,
 'Missing Values': {'Date': 0,
  'Open': 0,
  'High': 0,
  'Low': 0,
  'Close': 0,
  'Adj Close': 0,
  'Volume': 0,
  'Dividends': 0,
  'Stock Splits': 0,
  'Stock': 0},
 'Data Types': {'Date': dtype('O'),
  'Open': dtype('float64'),
  'High': dtype('float64'),
  'Low': dtype('float64'),
  'Close': dtype('float64'),
  'Adj Close': dtype('float64'),
  'Volume': dtype('int64'),
  'Dividends': dtype('float64'),
  'Stock Splits': dtype('float64'),
  'Stock': dtype('O')}}

In [13]:
# Number of distinct values in the 'Stock' column.
ticker_count = df_stocks['Stock'].nunique()
print(ticker_count)

7


In [14]:
# The distinct values of the 'Stock' column, in order of first appearance.
tickers = df_stocks['Stock'].unique()
print(tickers)

['AAPL' 'AMZN' 'GOOG' 'META' 'MSFT' 'NVDA' 'TSLA']
