# Cleaned News EDA and Save

This notebook uses the project's helper scripts to:
- load and clean the raw news CSV
- compute headline length statistics
- show publisher counts, articles per day and top words
- save the cleaned CSV to a chosen output path

Edit the `RAW_CSV_PATH` and `OUTPUT_CSV_PATH` variables below as needed.

In [1]:
# Parameters: paths to scripts and data

import os
import sys
from pathlib import Path
import pandas as pd

# Input and output locations, relative to the notebook's working directory.
RAW_CSV_PATH = "../data/raw_analyst_ratings.csv"   # raw CSV to load
OUTPUT_CSV_PATH = "../data/cleaned_ratings.csv"    # destination of the cleaned CSV

# Make the project's helper scripts importable.
# NOTE(review): the data paths point one level up ("../data") while the
# scripts directory is looked up two levels up -- confirm this matches the
# repository layout.
notebook_dir = Path.cwd()
project_root = notebook_dir.parent.parent
scripts_path = project_root.joinpath('scripts')
scripts_path_str = str(scripts_path)
if scripts_path_str not in sys.path:
    # Insert at the front so these modules are found before same-named ones.
    sys.path.insert(0, scripts_path_str)

# Make sure the output directory exists before anything is written to it.
Path(OUTPUT_CSV_PATH).parent.mkdir(parents=True, exist_ok=True)

# print(f"Raw CSV path: {RAW_CSV_PATH}")
# print(f"Output CSV path: {OUTPUT_CSV_PATH}")

In [6]:
# Import helper classes from the project's scripts directory (the parameters
# cell puts that directory on sys.path).
try:
    from data_loader import DataLoader
    from eda_statistics import EdaStatistics
except Exception as e:
    # Re-raise with a friendlier message, explicitly chaining the original
    # exception so its traceback is preserved as the direct cause.
    raise ImportError(
        "Could not import project modules. "
        "Original error: " + str(e)
    ) from e

# Load the raw CSV; fail early with an actionable message if it is missing.
if not Path(RAW_CSV_PATH).exists():
    raise FileNotFoundError(f"Raw CSV not found at {RAW_CSV_PATH}. Update RAW_CSV_PATH and re-run.")

loader = DataLoader()
# NOTE(review): min_token_len presumably sets the shortest word counted by
# top_words -- confirm against eda_statistics.
statistics = EdaStatistics(min_token_len=4)
df = loader.load_news_csv(RAW_CSV_PATH)
print("Loaded dataframe with shape:", df.shape)
display(df.head())

Loaded dataframe with shape: (1407328, 6)


Unnamed: 0,unnamed: 0,headline,url,publisher,date,stock
0,0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05 10:30:54-04:00,A
1,1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03 10:45:20-04:00,A
2,2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26 04:30:07-04:00,A
3,3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22 12:45:06-04:00,A
4,4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22 11:38:59-04:00,A


In [7]:
# --- Headline lengths -------------------------------------------------------
# Add the headline length features to the frame, then summarise them.
df = statistics.compute_headline_lengths(df)
print("\n=== Headline Length Stats ===")
headline_summary = statistics.basic_headline_stats(df)
display(headline_summary)

# --- Publishers -------------------------------------------------------------
# Full publisher tally is kept in `pub_counts`; only the first 20 rows are shown.
print("\n=== Publisher Counts (top 20) ===")
pub_counts = statistics.count_publishers(df)
top_publishers = pub_counts.head(20)
display(top_publishers)

# --- Publication frequency --------------------------------------------------
# Daily article counts; the first 20 rows are displayed as a sample.
print("\n=== Articles Per Day (sample) ===")
apd = statistics.articles_per_day(df)
apd_sample = apd.head(20)
display(apd_sample)

# --- Vocabulary -------------------------------------------------------------
# Most frequent headline words (n=20 requested from the helper).
print("\n=== Top Words ===")
tw = statistics.top_words(df, n=20)
display(tw)


=== Headline Length Stats ===


Unnamed: 0,headline_len_chars,headline_len_words
count,1407328.0,1407328.0
mean,73.12051,11.41671
std,40.73531,6.352997
min,3.0,1.0
25%,47.0,7.0
50%,64.0,10.0
75%,87.0,13.0
max,512.0,77.0



=== Publisher Counts (top 20) ===


Unnamed: 0,publisher,count
0,Paul Quintaro,228373
1,Lisa Levin,186979
2,Benzinga Newsdesk,150484
3,Charles Gross,96732
4,Monica Gerson,82380
5,Eddie Staley,57254
6,Hal Lindon,49047
7,ETF Professor,28489
8,Juan Lopez,28438
9,Benzinga Staff,28114



=== Articles Per Day (sample) ===


Unnamed: 0,date_only,count
0,2011-04-27,1
1,2011-04-28,2
2,2011-04-29,2
3,2011-04-30,1
4,2011-05-01,1
5,2011-05-02,9
6,2011-05-03,3
7,2011-05-05,3
8,2011-05-06,3
9,2011-05-07,2



=== Top Words ===


Unnamed: 0,word,count
0,stocks,161776
1,from,120805
2,market,120559
3,shares,114313
4,reports,108710
5,update,91723
6,earnings,87399
7,sales,79645
8,benzinga,74516
9,week,69572


In [8]:
# Final step: save the cleaned dataframe to CSV, without the pandas index.
# NOTE(review): the saved preview shows an "Unnamed: 0" column carried over
# from the raw file, plus the derived length/date columns -- confirm these are
# wanted in the cleaned output.
df.to_csv(OUTPUT_CSV_PATH, index=False)
print(f"Saved cleaned CSV to: {OUTPUT_CSV_PATH}")

# Show a small confirmation. Only the first five rows are parsed: re-reading
# the whole file just to preview it is wasted work on a dataset this size.
pd.read_csv(OUTPUT_CSV_PATH, nrows=5)

Saved cleaned CSV to: ../data/cleaned_ratings.csv


Unnamed: 0.1,Unnamed: 0,headline,url,publisher,date,stock,headline_len_chars,headline_len_words,date_only
0,0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05 10:30:54-04:00,A,39,7,2020-06-05
1,1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03 10:45:20-04:00,A,42,7,2020-06-03
2,2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26 04:30:07-04:00,A,29,5,2020-05-26
3,3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22 12:45:06-04:00,A,44,7,2020-05-22
4,4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22 11:38:59-04:00,A,87,14,2020-05-22
