In [None]:
import sys
import os

# Resolve the project root (the parent of the current working directory,
# i.e. where the notebook/script is running) so that the top-level
# `scripts` package becomes importable.
current_directory = os.getcwd()
project_root = os.path.abspath(os.path.join(current_directory, ".."))

# Add the project root only if it is not already present: re-running this
# cell would otherwise append a duplicate entry to sys.path every time.
if project_root not in sys.path:
    sys.path.append(project_root)

# Print to verify that the path is correct
print(f"Project root added to sys.path: {project_root}")

# Import EDAAnalysis from the scripts folder. This import must stay below the
# sys.path update above, otherwise `scripts` may not be resolvable.
from scripts.eda_analysis import EDAAnalysis

# Initialize the EDAAnalysis class with the dataset path. The path is relative,
# so it is resolved against the current working directory at load time.
eda = EDAAnalysis(file_path='../data/raw_analyst_ratings/raw_analyst_ratings.csv')


#### Load the dataset


In [None]:
# Load the dataset via the EDAAnalysis instance created in the setup cell
# (configured there with file_path='../data/raw_analyst_ratings/raw_analyst_ratings.csv').
# NOTE(review): presumably every later cell depends on this having run first — confirm.
eda.load_data()

### Perform EDA 
#### 1. Descriptive statistics for textual lengths

In [None]:

# EDA step 1: descriptive statistics for textual lengths, delegated to
# EDAAnalysis.calculate_headline_length().
# NOTE(review): presumably requires eda.load_data() to have been run — confirm.
eda.calculate_headline_length()

#### 2. Count articles by publisher


In [None]:
# EDA step 2: count articles by publisher, delegated to
# EDAAnalysis.count_articles_by_publisher().
eda.count_articles_by_publisher()

#### 3. Analyze publication dates

In [None]:
# EDA step 3: analyze publication dates, delegated to
# EDAAnalysis.analyze_publication_dates().
eda.analyze_publication_dates()

### Perform Sentiment Analysis


In [None]:
# Perform sentiment analysis, delegated to
# EDAAnalysis.perform_sentiment_analysis().
# NOTE(review): which text column is scored is not visible here — check the class.
eda.perform_sentiment_analysis()

#### Perform Topic Modeling


In [None]:
# Perform topic modeling, delegated to EDAAnalysis.perform_topic_modeling().
# NOTE(review): num_topics / num_keywords presumably set how many topics are
# extracted and how many keywords are reported per topic — confirm against the class.
eda.perform_topic_modeling(num_topics=5, num_keywords=10)

#### Time Series Analysis



In [None]:
# Time series analysis: run preprocessing, then the publication-frequency and
# publishing-time analyses, in that order.
# NOTE(review): preprocess_data() presumably prepares the data the two analyses
# below rely on, so keep it first — confirm against EDAAnalysis.
eda.preprocess_data()
eda.analyze_publication_frequency()
eda.analyze_publishing_times()

#### Publisher Analysis:
##### Which publishers contribute most to the news feed?
##### Is there a difference in the type of news they report?
##### If email addresses are used as publisher names, identify unique domains to see if certain organizations contribute more frequently.


In [None]:
# Extract keywords and categorize news
# NOTE(review): categorize_news() presumably consumes the output of
# extract_keywords(), so the call order matters — confirm against EDAAnalysis.
eda.extract_keywords()
eda.categorize_news()

# Analyze and plot publisher contributions (top_n=10 is passed to limit the plot)
eda.plot_publisher_contributions(top_n=10)

# Analyze and plot category distribution
eda.plot_category_distribution()

# Extract domains from publisher names, then plot the top domains (top_n=10).
# NOTE(review): plot_top_domains() presumably needs extract_domains_from_publishers()
# to have run first — confirm.
eda.extract_domains_from_publishers()
eda.plot_top_domains(top_n=10)