# Imports and Downloads

In [49]:
import html  # HTML entity decoding (html.unescape)
import re  # For regular expressions

import nltk  # Natural Language Toolkit
import pandas as pd

# Download required NLTK data (if not already downloaded).
# quiet=True avoids verbose output when the data is already present.
# 'punkt_tab' is requested alongside 'punkt' because newer NLTK releases load
# the tokenizer models used by word_tokenize from that package; without it
# the tokenization step can fail with a LookupError on a fresh environment.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Optional: Configure pandas display options if needed
# pd.set_option('display.max_colwidth', 200) # To see more of the headline text

# Get the Dataset ready

In [50]:
from datasets import load_dataset
# Load the "ashraq/financial-news" dataset. The result is bound to `dataset`
# and is indexed by split name (dataset['train']) when the DataFrame is built.
dataset = load_dataset("ashraq/financial-news")

Repo card metadata block was not found. Setting CardData to empty.


In [51]:
dataset

DatasetDict({
    train: Dataset({
        features: ['headline', 'url', 'publisher', 'date', 'stock'],
        num_rows: 1845559
    })
})

In [52]:
import pandas as pd

# Number of leading rows of the train split that are converted to a DataFrame.
# Change this single value to work on a larger or smaller sample.
N_ROWS = 100

# Convert the first N_ROWS rows of the train split to a DataFrame.
df = pd.DataFrame(dataset['train'][:N_ROWS])
df.head()

Unnamed: 0,headline,url,publisher,date,stock
0,Agilent Technologies Announces Pricing of $5……...,http://www.gurufocus.com/news/1153187/agilent-...,GuruFocus,2020-06-01 00:00:00,A
1,Agilent (A) Gears Up for Q2 Earnings: What's i...,http://www.zacks.com/stock/news/931205/agilent...,Zacks,2020-05-18 00:00:00,A
2,J.P. Morgan Asset Management Announces Liquida...,http://www.gurufocus.com/news/1138923/jp-morga...,GuruFocus,2020-05-15 00:00:00,A
3,"Pershing Square Capital Management, L.P. Buys ...",http://www.gurufocus.com/news/1138704/pershing...,GuruFocus,2020-05-15 00:00:00,A
4,Agilent Awards Trilogy Sciences with a Golden ...,http://www.gurufocus.com/news/1134012/agilent-...,GuruFocus,2020-05-12 00:00:00,A


# Preprocessing

## General overview of the dataset

In [58]:
# Print summary statistics for the sample: row count, number of distinct
# stocks, covered date range and number of distinct publishers. Each statistic
# is skipped with a notice when its column is missing.
print(f"Total number of news articles: {len(df)}")

if 'stock' in df.columns:
    print(f"Number of unique stocks: {df['stock'].nunique()}")
else:
    print("'stock' column not found for statistics.")

if 'date' in df.columns:
    # Parse the dates locally instead of relying on the column already being
    # datetime: the conversion cell appears later in the notebook, so on a
    # top-to-bottom run this cell would otherwise compare the raw values.
    # errors='coerce' turns unparseable entries into NaT, which are dropped.
    parsed_dates = pd.to_datetime(df['date'], errors='coerce').dropna()
    if not parsed_dates.empty:
        print(f"Date range: {parsed_dates.min()} to {parsed_dates.max()}")
    else:
        print("Date range: 'date' column is empty or all NaT after conversion.")
else:
    print("'date' column not found for date range statistics.")

if 'publisher' in df.columns:
    print(f"Number of unique publishers: {df['publisher'].nunique()}")
else:
    print("'publisher' column not found for statistics.")

Total number of news articles: 100
Number of unique stocks: 1
Date range: 2019-11-07 00:00:00 to 2020-06-01 00:00:00
Number of unique publishers: 5


## Data type conversion for date

In [53]:
# Convert the 'date' column to pandas datetimes and report the resulting
# dtype. When the column is absent only a notice is printed.
if 'date' not in df.columns:
    print("'date' column not found.")
else:
    df['date'] = pd.to_datetime(df['date'])
    print(f"Data type of 'date' column after conversion: {df['date'].dtype}")

Data type of 'date' column after conversion: datetime64[ns]


## Text Cleaning

### Declarations

In [54]:
def clean_text_original(text):
    """Aggressively normalize text to lowercase ASCII letters and single spaces.

    The input is converted to ``str`` and lowercased. Every character that is
    neither an ASCII letter nor whitespace is then deleted (digits and
    punctuation are removed, not replaced by a space, so "what's" becomes
    "whats"). Finally, runs of whitespace are collapsed and the result is
    trimmed.

    Args:
        text: Value to clean; any object is accepted and stringified.
            ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned string, or ``''`` for a missing value.
    """
    # Missing values would otherwise be stringified into the literal words
    # "none" / "nan" and show up downstream as bogus tokens.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    lowered = str(text).lower()
    letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)
    # split() with no arguments drops leading/trailing and repeated whitespace.
    return ' '.join(letters_only.split())

def clean_text_for_llm(text):
    """Lightly clean text while keeping digits and basic punctuation.

    Steps, in order:
      1. Remove HTML tags (anything of the form ``<...>``).
      2. Decode HTML entities with ``html.unescape``.
      3. Lowercase.
      4. Delete every character except ``a-z``, ``0-9``, whitespace and
         the punctuation ``. , ' - $ % ! ?``.
      5. Collapse runs of whitespace and trim.

    Args:
        text: Value to clean; any object is accepted and stringified.
            ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned string, or ``''`` for a missing value.
    """
    # Missing values would otherwise be stringified into the literal words
    # "none" / "nan" and show up downstream as bogus text.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text)

    # Strip tags before decoding entities, so that escaped markup such as
    # "&lt;b&gt;" is not turned into a real tag and removed together with
    # the text it wraps.
    text = re.sub(r'<[^>]+>', '', text)

    # Decode every entity, not just &amp; / &lt; / &gt;. Otherwise "&#39;"
    # survives as the digits "39" and "&quot;" as the word "quot" once the
    # punctuation filter below has removed '&', '#' and ';'.
    text = html.unescape(text)

    text = text.lower()

    # Keep letters, digits, whitespace and . , ' - $ % ! ?
    # Extend the character class if more punctuation should survive.
    text = re.sub(r"[^a-z0-9\s.,'\-$%!?]", '', text)

    # Normalize whitespace (remove extra spaces, trim leading/trailing).
    return ' '.join(text.split())

### Apply the cleaning functions to the headlines

In [55]:
# Derive two cleaned variants of every headline; the raw 'headline' column
# itself is left untouched.
if 'headline' not in df.columns:
    print("'headline' column not found.")
else:
    # Letters-only version (input to tokenization).
    df['clean_headline_original'] = df['headline'].apply(clean_text_original)

    # Version that keeps digits and basic punctuation.
    df['llm_ready_headline'] = df['headline'].apply(clean_text_for_llm)

### Tokenizing headlines

In [56]:
# Split each letters-only headline into a list of word tokens with NLTK's
# word_tokenize. Only 'clean_headline_original' is tokenized here; the
# 'llm_ready_headline' strings stay as plain text.
if 'clean_headline_original' not in df.columns:
    print("'clean_headline_original' column not found for tokenization.")
else:
    df['tokens_original'] = df['clean_headline_original'].apply(word_tokenize)

### Removing stopwords

In [57]:
# Filter NLTK's English stopwords out of every token list, keeping the
# remaining tokens in their original order.
if 'tokens_original' not in df.columns:
    print("'tokens_original' column not found for stopword removal.")
else:
    # A set gives constant-time membership checks inside the comprehension.
    stop_words = set(stopwords.words('english'))
    df['tokens_no_stopwords'] = df['tokens_original'].apply(
        lambda tokens: [tok for tok in tokens if tok not in stop_words]
    )

## Dropping unnecessary columns

In [None]:
# Columns that are not needed downstream. Every entry is handled, so more
# names can be appended to the list without touching the logic below.
columns_to_drop = ['url']

# Split the requested columns into those that exist and those that do not,
# so re-running the cell (or listing an absent column) never raises KeyError.
present_columns = [col for col in columns_to_drop if col in df.columns]
missing_columns = [col for col in columns_to_drop if col not in df.columns]

if present_columns:
    df = df.drop(columns=present_columns)

for col in present_columns:
    print(f"'{col}' column dropped successfully.")
for col in missing_columns:
    print(f"'{col}' column not found or already dropped.")

requested = ', '.join(f"'{col}'" for col in columns_to_drop)
print(f"\nDataFrame columns after attempting to drop {requested}:")
print(df.columns.tolist())

Step 6: Dropping 'url' column
'url' column dropped successfully.

DataFrame columns after attempting to drop 'url':
['headline', 'publisher', 'date', 'stock', 'clean_headline_original', 'llm_ready_headline', 'tokens_original', 'tokens_no_stopwords']


In [60]:
df.head()

Unnamed: 0,headline,publisher,date,stock,clean_headline_original,llm_ready_headline,tokens_original,tokens_no_stopwords
0,Agilent Technologies Announces Pricing of $5……...,GuruFocus,2020-06-01,A,agilent technologies announces pricing of mill...,agilent technologies announces pricing of $5 m...,"[agilent, technologies, announces, pricing, of...","[agilent, technologies, announces, pricing, mi..."
1,Agilent (A) Gears Up for Q2 Earnings: What's i...,Zacks,2020-05-18,A,agilent a gears up for q earnings whats in the...,agilent a gears up for q2 earnings what's in t...,"[agilent, a, gears, up, for, q, earnings, what...","[agilent, gears, q, earnings, whats, cards]"
2,J.P. Morgan Asset Management Announces Liquida...,GuruFocus,2020-05-15,A,jp morgan asset management announces liquidati...,j.p. morgan asset management announces liquida...,"[jp, morgan, asset, management, announces, liq...","[jp, morgan, asset, management, announces, liq..."
3,"Pershing Square Capital Management, L.P. Buys ...",GuruFocus,2020-05-15,A,pershing square capital management lp buys agi...,"pershing square capital management, l.p. buys ...","[pershing, square, capital, management, lp, bu...","[pershing, square, capital, management, lp, bu..."
4,Agilent Awards Trilogy Sciences with a Golden ...,GuruFocus,2020-05-12,A,agilent awards trilogy sciences with a golden ...,agilent awards trilogy sciences with a golden ...,"[agilent, awards, trilogy, sciences, with, a, ...","[agilent, awards, trilogy, sciences, golden, t..."
