In [1]:
from pathlib import Path
import pandas as pd
import sys

In [2]:
# Data directory: "../data" resolved against the current working directory; created if missing
DATA_DIR = (Path("..") / "data").resolve()
DATA_DIR.mkdir(exist_ok=True, parents=True)

In [3]:
# Raw news parquet file (Hugging Face "hf://" dataset path)
NEWS_PATH = (
    "hf://datasets/ashraq/financial-news/data/"
    "train-00000-of-00001-8ec327f23bbe0948.parquet"
)

In [4]:
# Echo the resolved configuration so the run is self-describing
print("=== Setup ===")
for label, value in (("DATA_DIR ", DATA_DIR), ("NEWS_PATH", NEWS_PATH)):
    print(f"{label}: {value}")

=== Setup ===
DATA_DIR : /Users/valentinreateguirangel/Documents/MSc Machine Learning/Finance_RAG_why_move/finance-rag-why-move/data
NEWS_PATH: hf://datasets/ashraq/financial-news/data/train-00000-of-00001-8ec327f23bbe0948.parquet


In [None]:
# Load the raw news parquet from NEWS_PATH into `df_raw` and preview it.
#
# On failure, print troubleshooting tips and re-raise the original exception
# so the notebook stops here instead of continuing without `df_raw`.

print("=== Loading raw news parquet ===")
print("Trying: pd.read_parquet(NEWS_PATH)")

try:
    df_raw = pd.read_parquet(NEWS_PATH)
except Exception as e:
    print("\n[ERROR] Could not read the parquet file.")
    print("Tip: If the error mentions 'pyarrow', install it:  pip install pyarrow")
    # Remote 'hf://' URLs are resolved through fsspec + huggingface_hub, so a
    # missing package there fails the read even when pyarrow is installed.
    if str(NEWS_PATH).startswith("hf://"):
        print("Tip: 'hf://' paths also need fsspec and huggingface_hub:  pip install fsspec huggingface_hub")
    print("Full error:\n", e)
    raise

print("\n=== Load OK ===")
print(f"Rows: {len(df_raw):,}  |  Columns: {len(df_raw.columns)}")
print("Columns:", list(df_raw.columns))

print("\nFirst 5 rows (raw):")
display(df_raw.head(5))

# sample(n) raises when n exceeds the number of rows; cap it so a tiny or
# empty frame still previews instead of crashing. Seed keeps the preview stable.
print("\nRandom 3 rows (raw):")
display(df_raw.sample(min(3, len(df_raw)), random_state=42))


=== Loading raw news parquet ===
Trying: pd.read_parquet(NEWS_PATH)

=== Load OK ===
Rows: 1,845,559  |  Columns: 5
Columns: ['headline', 'url', 'publisher', 'date', 'stock']

First 5 rows (raw):


Unnamed: 0,headline,url,publisher,date,stock
0,Agilent Technologies Announces Pricing of $5……...,http://www.gurufocus.com/news/1153187/agilent-...,GuruFocus,2020-06-01 00:00:00,A
1,Agilent (A) Gears Up for Q2 Earnings: What's i...,http://www.zacks.com/stock/news/931205/agilent...,Zacks,2020-05-18 00:00:00,A
2,J.P. Morgan Asset Management Announces Liquida...,http://www.gurufocus.com/news/1138923/jp-morga...,GuruFocus,2020-05-15 00:00:00,A
3,"Pershing Square Capital Management, L.P. Buys ...",http://www.gurufocus.com/news/1138704/pershing...,GuruFocus,2020-05-15 00:00:00,A
4,Agilent Awards Trilogy Sciences with a Golden ...,http://www.gurufocus.com/news/1134012/agilent-...,GuruFocus,2020-05-12 00:00:00,A



Random 3 rows (raw):


Unnamed: 0,headline,url,publisher,date,stock
948540,KMG Chemicals' (KMG) CEO Chris Fraser on Q1 20...,http://seekingalpha.com/article/3748906-kmg-ch...,Seeking Alpha,2015-12-10 00:00:00,KMG
769643,"90 Champions, Contenders And Challengers Are D...",https://seekingalpha.com/article/4219769-90-ch...,Seeking Alpha,2018-11-08 00:00:00,HEP
195804,"Pabrai Funds Portfolio Review: POT, BPO, BIP, ...",http://www.gurufocus.com/news/148936/pabrai-fu...,GuruFocus,2011-10-21 00:00:00,BIP



Note: We haven't changed any column names yet. This is the dataset as-is.


### Mapping columns to a minimal schema (date, title, source)

In [7]:
# Reduce the raw frame to the three fields used downstream:
#   headline -> title, publisher -> source, date -> date

print("=== Minimal schema mapping ===")
print("Columns before:", list(df_raw.columns))

# Normalize column names (trim, lower-case, spaces -> underscores) on a new frame
df_tmp = df_raw.rename(columns=lambda c: c.strip().lower().replace(" ", "_"))

print("Columns after normalize:", list(df_tmp.columns))

# Rename only the columns we keep
rename_map = dict(headline="title", publisher="source", date="date")
df_tmp = df_tmp.rename(columns=rename_map)

# Keep the subset, in a fixed column order
kept_columns = ["date", "title", "source"]
df_std = df_tmp.loc[:, kept_columns].copy()

print("\nPreview (5 rows):")
display(df_std.head(5))

print("\nNon-null counts:")
print(df_std.notna().sum())

=== Minimal schema mapping ===
Columns before: ['headline', 'url', 'publisher', 'date', 'stock']
Columns after normalize: ['headline', 'url', 'publisher', 'date', 'stock']

Preview (5 rows):


Unnamed: 0,date,title,source
0,2020-06-01 00:00:00,Agilent Technologies Announces Pricing of $5……...,GuruFocus
1,2020-05-18 00:00:00,Agilent (A) Gears Up for Q2 Earnings: What's i...,Zacks
2,2020-05-15 00:00:00,J.P. Morgan Asset Management Announces Liquida...,GuruFocus
3,2020-05-15 00:00:00,"Pershing Square Capital Management, L.P. Buys ...",GuruFocus
4,2020-05-12 00:00:00,Agilent Awards Trilogy Sciences with a Golden ...,GuruFocus



Non-null counts:
date      1845559
title     1845559
source    1845559
dtype: int64


In [9]:
# Parse 'date' and normalize it to ISO 'YYYY-MM-DD' strings.
#
# Input : df_std          (columns: date, title, source)
# Output: df_clean_dates  (rows with an unparseable date removed; 'date' as text)

print("=== Step B1: date parsing ===")

# Quick peek at raw values
print("\nRaw date examples (as loaded):")
display(df_std["date"].head(5).to_frame())

before_rows = len(df_std)

# Parse with coercion (bad/messy -> NaT)
parsed = pd.to_datetime(df_std["date"], errors="coerce", utc=False)

# Compute the validity mask once and reuse it for reporting and filtering
valid_mask = parsed.notna()

# Report parsing issues
num_null = int((~valid_mask).sum())
print(f"\nParsed to datetime. Null after parsing: {num_null:,} / {before_rows:,}")

# Drop null dates (cannot proceed without a date)
df_dates = df_std.loc[valid_mask].copy()

# Replace the whole column rather than writing through `.loc[:, "date"]`:
# `.loc` tries to set values in place, so if 'date' arrives as datetime64 the
# strings can be parsed straight back to datetimes and the column never becomes text.
df_dates["date"] = parsed.loc[valid_mask].dt.strftime("%Y-%m-%d")  # ISO 'YYYY-MM-DD'

after_rows = len(df_dates)
dropped = before_rows - after_rows
print(f"Dropped rows with invalid dates: {dropped:,}")
print(f"Remaining rows: {after_rows:,}")

# Sanity preview
print("\nPreview after cleaning (5 rows):")
display(df_dates.head(5))

# Tiny assertions
assert "date" in df_dates.columns, "Date column missing after cleaning."
assert df_dates["date"].notnull().all(), "Found null dates after cleaning."
assert df_dates["date"].str.fullmatch(r"\d{4}-\d{2}-\d{2}").all(), \
    "Date column is not ISO 'YYYY-MM-DD' text after cleaning."

# Keep this cleaned frame for next steps
df_clean_dates = df_dates

=== Step B1: date parsing ===

Raw date examples (as loaded):


Unnamed: 0,date
0,2020-06-01 00:00:00
1,2020-05-18 00:00:00
2,2020-05-15 00:00:00
3,2020-05-15 00:00:00
4,2020-05-12 00:00:00



Parsed to datetime. Null after parsing: 0 / 1,845,559
Dropped rows with invalid dates: 0
Remaining rows: 1,845,559

Preview after cleaning (5 rows):


Unnamed: 0,date,title,source
0,2020-06-01,Agilent Technologies Announces Pricing of $5……...,GuruFocus
1,2020-05-18,Agilent (A) Gears Up for Q2 Earnings: What's i...,Zacks
2,2020-05-15,J.P. Morgan Asset Management Announces Liquida...,GuruFocus
3,2020-05-15,"Pershing Square Capital Management, L.P. Buys ...",GuruFocus
4,2020-05-12,Agilent Awards Trilogy Sciences with a Golden ...,GuruFocus


In [11]:
# Drop rows whose title/source is missing or blank, then remove duplicates.
#
# Input : df_clean_dates  (columns: date, title, source)
# Output: df_clean_news   (unique on date + title + source)

print("=== Step B2: cleaning title/source and deduplicating ===")

before = len(df_clean_dates)

# Drop rows with missing title or source
df_nonnull = df_clean_dates.dropna(subset=["title", "source"]).copy()
after_nonnull = len(df_nonnull)
print(f"Dropped rows with null title/source: {before - after_nonnull:,}")

# dropna() only removes real nulls; empty or whitespace-only strings are
# equally unusable as a title/source, so filter those out as well.
has_text = (
    df_nonnull["title"].astype(str).str.strip().ne("")
    & df_nonnull["source"].astype(str).str.strip().ne("")
)
df_nonblank = df_nonnull.loc[has_text]
after_nonblank = len(df_nonblank)
print(f"Dropped rows with blank title/source: {after_nonnull - after_nonblank:,}")

# Remove duplicates
# First by exact duplicate rows
df_dedup = df_nonblank.drop_duplicates()
after_dedup1 = len(df_dedup)
print(f"Dropped exact duplicate rows: {after_nonblank - after_dedup1:,}")

# Then by (date + title + source); this only removes additional rows when the
# frame carries columns beyond these three.
df_dedup = df_dedup.drop_duplicates(subset=["date", "title", "source"])
after_dedup2 = len(df_dedup)
print(f"Dropped duplicate (date,title,source): {after_dedup1 - after_dedup2:,}")

print(f"\nRemaining rows: {after_dedup2:,}")

# Sanity preview
print("\nPreview after cleaning (5 rows):")
display(df_dedup.head(5))

# Invariant relied on downstream: one row per (date, title, source)
assert not df_dedup.duplicated(subset=["date", "title", "source"]).any(), \
    "Duplicate (date, title, source) rows remain after deduplication."

# Keep cleaned frame for next step
df_clean_news = df_dedup


=== Step B2: cleaning title/source and deduplicating ===
Dropped rows with null title/source: 0
Dropped exact duplicate rows: 958,338
Dropped duplicate (date,title,source): 0

Remaining rows: 887,221

Preview after cleaning (5 rows):


Unnamed: 0,date,title,source
0,2020-06-01,Agilent Technologies Announces Pricing of $5……...,GuruFocus
1,2020-05-18,Agilent (A) Gears Up for Q2 Earnings: What's i...,Zacks
2,2020-05-15,J.P. Morgan Asset Management Announces Liquida...,GuruFocus
3,2020-05-15,"Pershing Square Capital Management, L.P. Buys ...",GuruFocus
4,2020-05-12,Agilent Awards Trilogy Sciences with a Golden ...,GuruFocus


### Saving data 

In [12]:
# Write the cleaned news to a single CSV file (one pass, no chunking)
print("=== Saving cleaned news ===")

DATA_DIR = Path("../data").resolve()
DATA_DIR.mkdir(exist_ok=True, parents=True)

# Fresh 0..n-1 index, then a simple unique doc_id derived from the row position
df_to_save = df_clean_news.reset_index(drop=True)
df_to_save["doc_id"] = [f"news_{i}" for i in range(len(df_to_save))]

# Write without the index column; doc_id is the row identifier
out_path = DATA_DIR / "news_clean.csv"
df_to_save.to_csv(out_path, index=False)

rows_saved = len(df_to_save)
print(f"Saved: {out_path}")
print(f"Total rows saved: {rows_saved:,}")

print("\nPreview (5 rows):")
display(df_to_save.head(5))

=== Saving cleaned news ===
Saved: /Users/valentinreateguirangel/Documents/MSc Machine Learning/Finance_RAG_why_move/finance-rag-why-move/data/news_clean.csv
Total rows saved: 887,221

Preview (5 rows):


Unnamed: 0,date,title,source,doc_id
0,2020-06-01,Agilent Technologies Announces Pricing of $5……...,GuruFocus,news_0
1,2020-05-18,Agilent (A) Gears Up for Q2 Earnings: What's i...,Zacks,news_1
2,2020-05-15,J.P. Morgan Asset Management Announces Liquida...,GuruFocus,news_2
3,2020-05-15,"Pershing Square Capital Management, L.P. Buys ...",GuruFocus,news_3
4,2020-05-12,Agilent Awards Trilogy Sciences with a Golden ...,GuruFocus,news_4
