# Get the Dataset

In [13]:
from datasets import load_dataset
# Load the "ashraq/financial-news" dataset with the `datasets` library.
# The recorded output of the next cell shows a DatasetDict with a single `train` split.
dataset = load_dataset("ashraq/financial-news")

Repo card metadata block was not found. Setting CardData to empty.


In [14]:
# Display the loaded object to inspect its splits, feature names, and row counts
dataset

DatasetDict({
    train: Dataset({
        features: ['headline', 'url', 'publisher', 'date', 'stock'],
        num_rows: 1845559
    })
})

In [15]:
import pandas as pd

# Build a DataFrame from only the first 100 rows of the train split
# (the full split reports 1,845,559 rows, so this is a small working sample)
train_split = dataset['train']
df = pd.DataFrame(train_split[:100])
df.head()

Unnamed: 0,headline,url,publisher,date,stock
0,Agilent Technologies Announces Pricing of $5……...,http://www.gurufocus.com/news/1153187/agilent-...,GuruFocus,2020-06-01 00:00:00,A
1,Agilent (A) Gears Up for Q2 Earnings: What's i...,http://www.zacks.com/stock/news/931205/agilent...,Zacks,2020-05-18 00:00:00,A
2,J.P. Morgan Asset Management Announces Liquida...,http://www.gurufocus.com/news/1138923/jp-morga...,GuruFocus,2020-05-15 00:00:00,A
3,"Pershing Square Capital Management, L.P. Buys ...",http://www.gurufocus.com/news/1138704/pershing...,GuruFocus,2020-05-15 00:00:00,A
4,Agilent Awards Trilogy Sciences with a Golden ...,http://www.gurufocus.com/news/1134012/agilent-...,GuruFocus,2020-05-12 00:00:00,A


# Preprocessing

In [17]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from datetime import datetime

# Download required NLTK data.
# 'punkt_tab' is the tokenizer data that word_tokenize looks up on recent
# NLTK releases (3.9 and later); 'punkt' is kept so older releases still work.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# 1. Convert date to datetime object
# (safe to re-run: converting an already-datetime column leaves it unchanged)
df['date'] = pd.to_datetime(df['date'])

# 2. Clean headlines
def clean_text(text):
    """Normalise a headline before tokenisation.

    Lowercases the text, deletes every character that is not an ASCII letter
    or whitespace, and collapses runs of whitespace into single spaces.
    Deleted characters leave no gap, so "Q2" becomes "q" and "What's"
    becomes "whats".

    Args:
        text: The raw headline. Any non-string value (for example None or a
            float NaN from a missing cell) is treated as an empty headline.

    Returns:
        The cleaned lowercase string, or '' when the input is not a string.
    """
    # A missing value would otherwise raise on .lower() and abort the whole .apply
    if not isinstance(text, str):
        return ''
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove extra whitespace (also strips leading/trailing whitespace)
    text = ' '.join(text.split())
    return text

# 2 (cont.). Run the cleaner over every raw headline
df['clean_headline'] = df['headline'].apply(clean_text)

# 3. Split each cleaned headline into word tokens
df['tokens'] = df['clean_headline'].apply(word_tokenize)

# 4. Drop English stopwords from each token list
stop_words = set(stopwords.words('english'))


def _without_stopwords(tokens):
    """Return the tokens that are not in stop_words, keeping their order."""
    return [tok for tok in tokens if tok not in stop_words]


df['tokens_no_stopwords'] = df['tokens'].apply(_without_stopwords)

# 5. Summary figures for the working sample
print("Dataset Statistics:")
article_count = len(df)
print(f"Total number of news articles: {article_count}")
stock_count = df['stock'].nunique()
print(f"Number of unique stocks: {stock_count}")
first_date, last_date = df['date'].min(), df['date'].max()
print(f"Date range: {first_date} to {last_date}")
publisher_count = df['publisher'].nunique()
print(f"Number of unique publishers: {publisher_count}")

# 6. Preview the raw headline next to its processed forms
print("\nSample of processed headlines:")
preview_columns = ['headline', 'clean_headline', 'tokens_no_stopwords']
df[preview_columns].head()

Dataset Statistics:
Total number of news articles: 100
Number of unique stocks: 1
Date range: 2019-11-07 00:00:00 to 2020-06-01 00:00:00
Number of unique publishers: 5

Sample of processed headlines:


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\afiro\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\afiro\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,headline,clean_headline,tokens_no_stopwords
0,Agilent Technologies Announces Pricing of $5……...,agilent technologies announces pricing of mill...,"[agilent, technologies, announces, pricing, mi..."
1,Agilent (A) Gears Up for Q2 Earnings: What's i...,agilent a gears up for q earnings whats in the...,"[agilent, gears, q, earnings, whats, cards]"
2,J.P. Morgan Asset Management Announces Liquida...,jp morgan asset management announces liquidati...,"[jp, morgan, asset, management, announces, liq..."
3,"Pershing Square Capital Management, L.P. Buys ...",pershing square capital management lp buys agi...,"[pershing, square, capital, management, lp, bu..."
4,Agilent Awards Trilogy Sciences with a Golden ...,agilent awards trilogy sciences with a golden ...,"[agilent, awards, trilogy, sciences, golden, t..."


In [18]:
# Show the whole working DataFrame (the rendered output elides the middle rows).
# NOTE(review): the recorded output includes a `headline_lowercase` column that no
# visible cell creates, and the execution counts skip In [16] — it presumably came
# from a cell that was later removed; confirm with Restart Kernel -> Run All.
df

Unnamed: 0,headline,url,publisher,date,stock,headline_lowercase,clean_headline,tokens,tokens_no_stopwords
0,Agilent Technologies Announces Pricing of $5……...,http://www.gurufocus.com/news/1153187/agilent-...,GuruFocus,2020-06-01,A,agilent technologies announces pricing of $5……...,agilent technologies announces pricing of mill...,"[agilent, technologies, announces, pricing, of...","[agilent, technologies, announces, pricing, mi..."
1,Agilent (A) Gears Up for Q2 Earnings: What's i...,http://www.zacks.com/stock/news/931205/agilent...,Zacks,2020-05-18,A,agilent (a) gears up for q2 earnings: what's i...,agilent a gears up for q earnings whats in the...,"[agilent, a, gears, up, for, q, earnings, what...","[agilent, gears, q, earnings, whats, cards]"
2,J.P. Morgan Asset Management Announces Liquida...,http://www.gurufocus.com/news/1138923/jp-morga...,GuruFocus,2020-05-15,A,j.p. morgan asset management announces liquida...,jp morgan asset management announces liquidati...,"[jp, morgan, asset, management, announces, liq...","[jp, morgan, asset, management, announces, liq..."
3,"Pershing Square Capital Management, L.P. Buys ...",http://www.gurufocus.com/news/1138704/pershing...,GuruFocus,2020-05-15,A,"pershing square capital management, l.p. buys ...",pershing square capital management lp buys agi...,"[pershing, square, capital, management, lp, bu...","[pershing, square, capital, management, lp, bu..."
4,Agilent Awards Trilogy Sciences with a Golden ...,http://www.gurufocus.com/news/1134012/agilent-...,GuruFocus,2020-05-12,A,agilent awards trilogy sciences with a golden ...,agilent awards trilogy sciences with a golden ...,"[agilent, awards, trilogy, sciences, with, a, ...","[agilent, awards, trilogy, sciences, golden, t..."
...,...,...,...,...,...,...,...,...,...
95,Agilent Technologies Sees RS Rating Improve To 76,http://www.investors.com/ibd-data-stories/agil...,Investor's Business Daily,2019-11-12,A,agilent technologies sees rs rating improve to 76,agilent technologies sees rs rating improve to,"[agilent, technologies, sees, rs, rating, impr...","[agilent, technologies, sees, rs, rating, impr..."
96,Royal London Asset Management Ltd Buys Apple I...,https://www.benzinga.com/node/14785636,GuruFocus,2019-11-12,A,royal london asset management ltd buys apple i...,royal london asset management ltd buys apple i...,"[royal, london, asset, management, ltd, buys, ...","[royal, london, asset, management, ltd, buys, ..."
97,"TD Asset Management Inc Buys TC Energy Corp, i...",https://www.benzinga.com/node/14761049,GuruFocus,2019-11-07,A,"td asset management inc buys tc energy corp, i...",td asset management inc buys tc energy corp is...,"[td, asset, management, inc, buys, tc, energy,...","[td, asset, management, inc, buys, tc, energy,..."
98,"TDAM USA Inc. Buys Phillips 66, DuPont de Nemo...",https://www.benzinga.com/node/14761053,GuruFocus,2019-11-07,A,"tdam usa inc. buys phillips 66, dupont de nemo...",tdam usa inc buys phillips dupont de nemours i...,"[tdam, usa, inc, buys, phillips, dupont, de, n...","[tdam, usa, inc, buys, phillips, dupont, de, n..."
