In [1]:
!pip install datasets transformers


Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading pyarrow-22.0.0-cp311-cp311-manylinux_2_28_x86_64.whl (47.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: pyarrow
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 19.0.1
    Uninstalling pyarrow-19.0.1:
      Successfully uninstalled pyarrow-19.0.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
pylibcudf-cu12 25.2.2 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 22.0.0 which is incompatible.
cudf-cu12 25.2.2 requires pyarrow<20.0.0

Step 2: Load the dataset

In [2]:
from datasets import load_dataset

news_ds = load_dataset("ashraq/financial-news-articles")


README.md:   0%|          | 0.00/543 [00:00<?, ?B/s]

data/train-00000-of-00002-a3f58f0eb179f9(…):   0%|          | 0.00/238M [00:00<?, ?B/s]

data/train-00001-of-00002-50e0d6558d1357(…):   0%|          | 0.00/255M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/306242 [00:00<?, ? examples/s]

Step 3: Inspect the dataset

In [3]:
print(news_ds)
print(news_ds["train"][0])


DatasetDict({
    train: Dataset({
        features: ['title', 'text', 'url'],
        num_rows: 306242
    })
})
{'title': 'BRIEF-Bigger Capital Fund Reports An 8 Pct Passive Stake In Akers Biosciences', 'text': 'January 2, 2018 / 9:31 PM / Updated 8 minutes ago BRIEF-Bigger Capital Fund Reports An 8 Pct Passive Stake In Akers Biosciences Reuters Staff 1 Min Read \nJan 2 (Reuters) - Biosciences Inc: \n* BIGGER CAPITAL FUND LP REPORTS AN 8 PCT PASSIVE STAKE IN AKERS AS OF DEC 21, 2017 - SEC FILING Source text : ( bit.ly/2lHEjFS ) Further company coverage:', 'url': 'https://www.reuters.com/article/brief-bigger-capital-fund-reports-an-8-p/brief-bigger-capital-fund-reports-an-8-pct-passive-stake-in-akers-biosciences-idUSFWN1OX0NY'}


Step 4: Convert to Pandas

In [4]:
import pandas as pd

news_df = news_ds["train"].to_pandas()
news_df.head()


Unnamed: 0,title,text,url
0,BRIEF-Bigger Capital Fund Reports An 8 Pct Pas...,"January 2, 2018 / 9:31 PM / Updated 8 minutes ...",https://www.reuters.com/article/brief-bigger-c...
1,Global Markets: Asia shares reach decade top o...,NEW YORK (Reuters) - European stocks closed lo...,https://in.reuters.com/article/global-markets/...
2,Donald Trump is the only person in Washington ...,Fears of a government shutdown coursed through...,https://www.cnbc.com/2018/01/18/donald-trump-t...
3,Actor Casey Affleck withdraws as 2018 Oscar pr...,03 PM / Updated 19 minutes ago Actor Casey Af...,https://www.reuters.com/article/us-oscars-case...
4,EU mulls new link between budget and civic rights,"January 22, 2018 / 7:23 PM / Updated 2 hours a...",https://uk.reuters.com/article/uk-eu-poland-bu...


Extract publication date (CRITICAL)

In [5]:
import re
import pandas as pd

def extract_date(text):
    match = re.search(r'([A-Z][a-z]+ \d{1,2}, \d{4})', text)
    return match.group(1) if match else None

news_df = news_ds["train"].to_pandas()

news_df["date"] = news_df["text"].apply(extract_date)
news_df["date"] = pd.to_datetime(news_df["date"], errors="coerce")

news_df = news_df.dropna(subset=["date"])
news_df.head()


Unnamed: 0,title,text,url,date
0,BRIEF-Bigger Capital Fund Reports An 8 Pct Pas...,"January 2, 2018 / 9:31 PM / Updated 8 minutes ...",https://www.reuters.com/article/brief-bigger-c...,2018-01-02
1,Global Markets: Asia shares reach decade top o...,NEW YORK (Reuters) - European stocks closed lo...,https://in.reuters.com/article/global-markets/...,2018-01-02
2,Donald Trump is the only person in Washington ...,Fears of a government shutdown coursed through...,https://www.cnbc.com/2018/01/18/donald-trump-t...,2018-01-18
4,EU mulls new link between budget and civic rights,"January 22, 2018 / 7:23 PM / Updated 2 hours a...",https://uk.reuters.com/article/uk-eu-poland-bu...,2018-01-22
5,BRIEF-Sandridge Energy Rejects Icahn's Proposa...,"January 23, 2018 / 9:42 PM / Updated 10 minute...",https://www.reuters.com/article/brief-sandridg...,2018-01-23


Light text cleaning (DO NOT over-clean)

In [6]:
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'Reuters Staff.*?Read', '', text)
    return text.strip()

news_df["clean_text"] = news_df["text"].apply(clean_text)
news_df = news_df[["date", "title", "clean_text", "url"]]


Keep only market-relevant articles

In [7]:
keywords = [
    "shares", "stocks", "market", "profit", "loss",
    "earnings", "revenue", "prices", "rates"
]

mask = news_df["clean_text"].str.lower().str.contains(
    "|".join(keywords)
)

news_df = news_df[mask]
print(len(news_df))


70538


STEP 1: LOAD & PREPARE STOOQ NUMERICAL DATA

STEP 0 (CRITICAL): INSPECT ONE STOOQ FILE

In [25]:
import os

BASE_PATH = "/kaggle/input/stooq-numerical-data/data/daily/us"

# Find ONE .txt file anywhere
sample_file = None
for root, dirs, files in os.walk(BASE_PATH):
    for f in files:
        if f.endswith(".txt"):
            sample_file = os.path.join(root, f)
            break
    if sample_file:
        break

print("Sample file:", sample_file)

# Print first 10 lines RAW
with open(sample_file, "r", encoding="utf-8", errors="ignore") as f:
    for i in range(10):
        print(f.readline())


Sample file: /kaggle/input/stooq-numerical-data/data/daily/us/nyse etfs/2/tmsl.us.txt
<TICKER>,<PER>,<DATE>,<TIME>,<OPEN>,<HIGH>,<LOW>,<CLOSE>,<VOL>,<OPENINT>

TMSL.US,D,20230616,000000,25.2,25.2,25.12,25.1876,3703,0

TMSL.US,D,20230620,000000,25.14,25.14,25,25.1017,4332,0

TMSL.US,D,20230621,000000,25.08,25.08,25.044,25.06,4200,0

TMSL.US,D,20230622,000000,24.9,24.9,24.85,24.8634,764,0

TMSL.US,D,20230623,000000,24.59,24.59,24.5805,24.5831,728,0

TMSL.US,D,20230626,000000,24.72,24.7216,24.72,24.7216,2687,0

TMSL.US,D,20230627,000000,24.82,25.1082,24.82,25.1082,500,0

TMSL.US,D,20230628,000000,25.1019,25.1019,25.1019,25.1019,200,0

TMSL.US,D,20230629,000000,25.3893,25.3893,25.3893,25.3893,31,0



STEP 1: Correct parser for <DATE>;<OPEN>;... format

In [27]:
import os
import pandas as pd

BASE_PATH = "/kaggle/input/stooq-numerical-data/data/daily/us"

market_folders = [
    "nasdaq stocks",
    "nyse stocks",
    "nysemkt stocks",
    "nyse etfs"   # optional, but your sample came from here
]

price_dfs = []
loaded_files = 0
skipped_files = 0

for market in market_folders:
    market_path = os.path.join(BASE_PATH, market)
    print(f"\nScanning market: {market}")

    if not os.path.exists(market_path):
        continue

    for root, dirs, files in os.walk(market_path):
        for file in files:
            if not file.endswith(".txt"):
                continue

            file_path = os.path.join(root, file)

            try:
                # Read CSV (comma-separated)
                df = pd.read_csv(file_path)

                # Normalize headers: <DATE> → date
                df.columns = (
                    df.columns
                    .str.replace("<", "", regex=False)
                    .str.replace(">", "", regex=False)
                    .str.lower()
                )

                # Required columns (based on REAL format)
                required_cols = {"date", "open", "high", "low", "close"}
                if not required_cols.issubset(df.columns):
                    skipped_files += 1
                    continue

                # Parse date: YYYYMMDD → datetime
                df["date"] = pd.to_datetime(df["date"], format="%Y%m%d", errors="coerce")
                df = df.dropna(subset=["date"])

                if df.empty:
                    skipped_files += 1
                    continue

                # Add metadata
                df["ticker"] = df["ticker"].str.replace(".US", "", regex=False)
                df["exchange"] = market.replace(" stocks", "").replace(" etfs", "")

                price_dfs.append(
                    df[["date", "open", "high", "low", "close", "vol", "ticker", "exchange"]]
                )
                loaded_files += 1

            except Exception as e:
                skipped_files += 1
                continue

print(f"\nLoaded files: {loaded_files}")
print(f"Skipped files: {skipped_files}")



Scanning market: nasdaq stocks

Scanning market: nyse stocks

Scanning market: nysemkt stocks

Scanning market: nyse etfs

Loaded files: 10731
Skipped files: 467


In [28]:
loaded_files


10731

STEP 3: CONCATENATE SAFELY

In [29]:
prices_df = pd.concat(price_dfs, ignore_index=True)

prices_df = prices_df.sort_values(["ticker", "date"])
prices_df.head()


Unnamed: 0,date,open,high,low,close,vol,ticker,exchange
16671074,1999-11-18,29.5594,32.4842,25.9889,28.5858,68866780.0,A,nyse
16671075,1999-11-19,27.8972,27.9371,25.8613,26.2311,16773580.0,A,nyse
16671076,1999-11-22,26.837,28.5858,26.0278,28.5858,7242576.0,A,nyse
16671077,1999-11-23,27.6102,28.3377,25.9889,25.9889,6579458.0,A,nyse
16671078,1999-11-24,26.0637,27.2445,25.9889,26.6745,5332648.0,A,nyse


Compute returns & volatility

In [31]:
prices_df["return_t1"] = prices_df.groupby("ticker")["close"].pct_change(1)
prices_df["return_t5"] = prices_df.groupby("ticker")["close"].pct_change(5)

prices_df["volatility_5"] = (
    prices_df.groupby("ticker")["return_t1"]
    .rolling(5)
    .std()
    .reset_index(level=0, drop=True)
)


Prepare for alignment with news

In [None]:
prices_df["date"] = prices_df["date"].dt.date


## ALIGN NEWS ARTICLES WITH STOCK PRICES

Prepare PRICE data (FINAL)

In [69]:
import pandas as pd

# Ensure date format
prices_df["date"] = pd.to_datetime(prices_df["date"]).dt.date

# Drop rows without sufficient history
final_prices_df = prices_df.dropna(
    subset=["return_t1", "return_t5", "volatility_5"]
).copy()

print("Valid price rows:", len(final_prices_df))
final_prices_df.head(15)


Valid price rows: 25648527


Unnamed: 0,date,ticker,exchange,close,return_t1,return_t5,volatility_5
16671079,1999-11-26,A,nyse,26.7622,0.003288,-0.063794,0.076198
16671080,1999-11-29,A,nyse,27.363,0.02245,0.043151,0.065194
16671081,1999-11-30,A,nyse,27.411,0.001754,-0.041097,0.047941
16671082,1999-12-01,A,nyse,27.8972,0.017737,0.073428,0.011212
16671083,1999-12-02,A,nyse,28.6655,0.02754,0.074641,0.011531
16671084,1999-12-03,A,nyse,28.9087,0.008484,0.080206,0.010445
16671085,1999-12-06,A,nyse,29.7219,0.02813,0.086208,0.01162
16671086,1999-12-07,A,nyse,29.3969,-0.010935,0.072449,0.016194
16671087,1999-12-08,A,nyse,29.3571,-0.001354,0.052331,0.017358
16671088,1999-12-09,A,nyse,29.7627,0.013816,0.038276,0.014871


Build CLEAN ticker universe (CRITICAL)

In [70]:
# Keep only realistic equity tickers (2–5 chars)
valid_tickers = {
    t for t in final_prices_df["ticker"].unique()
    if 2 <= len(t) <= 5
}

len(valid_tickers)


10556

Define BLOCKLIST (REVIEWER-SAFE)

In [71]:
BLOCKLIST = {
    "LIVE","EPS","USD","ET","TV","CEO","CFO",
    "JAN","FEB","MAR","APR","MAY","JUN",
    "JUL","AUG","SEP","OCT","NOV","DEC",
    "AM","PM","US","EU","UK"
}


Extract CLEAN tickers from NEWS

In [72]:
import re

def extract_clean_tickers(text):
    candidates = set(re.findall(r"\b[A-Z]{2,5}\b", text))
    return list((candidates & valid_tickers) - BLOCKLIST)

news_df["tickers"] = news_df["title"].apply(extract_clean_tickers)

# Keep only rows with at least one valid ticker
news_df = news_df[news_df["tickers"].map(len) > 0]

news_df.head()


Unnamed: 0,date,title,clean_text,url,ticker,tickers
13,2017-09-30,IMAX Corporation To Announce Fourth-Quarter An...,"NEW YORK, Jan. 24, 2018 /PRNewswire/ -- IMAX C...",http://www.cnbc.com/2018/01/24/pr-newswire-ima...,IMAX,[IMAX]
102,2016-06-19,China December factory growth quickens to four...,BEIJING (Reuters) - Growth in China’s manufact...,https://in.reuters.com/article/china-economy-p...,PMI,[PMI]
312,2018-01-24,Trump to name tax lawyer Rettig to head IRS - ...,"January 24, 2018 / 2:55 AM / Updated 12 minute...",https://uk.reuters.com/article/uk-usa-trump-ir...,IRS,[IRS]
433,2018-02-06,Aviragen Board Of Directors Unanimously Recomm...,Transaction Creates a Leading Vaccine Company ...,http://www.cnbc.com/2018/01/11/globe-newswire-...,FOR,[FOR]
549,2017-02-10,RM LAW Announces Class Action Lawsuit Against ...,"BERWYN, Pa., Jan. 23, 2018 /PRNewswire/ -- RM ...",http://www.cnbc.com/2018/01/23/pr-newswire-rm-...,LAW,"[LAW, RM]"


EXPLODE (one news → one ticker)

In [73]:
news_df = news_df.explode("tickers").rename(columns={"tickers": "ticker"})

news_df = news_df[[
    "date","title","clean_text","url","ticker"
]]

news_df.head()


Unnamed: 0,date,title,clean_text,url,ticker,ticker.1
13,2017-09-30,IMAX Corporation To Announce Fourth-Quarter An...,"NEW YORK, Jan. 24, 2018 /PRNewswire/ -- IMAX C...",http://www.cnbc.com/2018/01/24/pr-newswire-ima...,IMAX,IMAX
102,2016-06-19,China December factory growth quickens to four...,BEIJING (Reuters) - Growth in China’s manufact...,https://in.reuters.com/article/china-economy-p...,PMI,PMI
312,2018-01-24,Trump to name tax lawyer Rettig to head IRS - ...,"January 24, 2018 / 2:55 AM / Updated 12 minute...",https://uk.reuters.com/article/uk-usa-trump-ir...,IRS,IRS
433,2018-02-06,Aviragen Board Of Directors Unanimously Recomm...,Transaction Creates a Leading Vaccine Company ...,http://www.cnbc.com/2018/01/11/globe-newswire-...,FOR,FOR
549,2017-02-10,RM LAW Announces Class Action Lawsuit Against ...,"BERWYN, Pa., Jan. 23, 2018 /PRNewswire/ -- RM ...",http://www.cnbc.com/2018/01/23/pr-newswire-rm-...,LAW,LAW


ALIGN NEWS WITH PRICE DATA (CORE STEP)

In [75]:
news_df.columns


Index(['date', 'title', 'clean_text', 'url', 'ticker', 'ticker'], dtype='object')

In [76]:
final_prices_df.columns


Index(['date', 'ticker', 'exchange', 'close', 'return_t1', 'return_t5',
       'volatility_5'],
      dtype='object')

In [77]:
# Keep only ONE ticker column
news_df = news_df.loc[:, ~news_df.columns.duplicated()]


In [78]:
final_prices_df = final_prices_df.loc[:, ~final_prices_df.columns.duplicated()]


In [80]:
assert news_df.columns.is_unique
assert final_prices_df.columns.is_unique


In [81]:
merged_df = pd.merge(
    news_df,
    final_prices_df,
    on=["date", "ticker"],
    how="inner"
)

merged_df.head()


Unnamed: 0,date,title,clean_text,url,ticker,exchange,close,return_t1,return_t5,volatility_5
0,2018-01-24,Trump to name tax lawyer Rettig to head IRS - ...,"January 24, 2018 / 2:55 AM / Updated 12 minute...",https://uk.reuters.com/article/uk-usa-trump-ir...,IRS,nyse,28.0383,-0.058179,-0.014003,0.038975
1,2018-02-06,Aviragen Board Of Directors Unanimously Recomm...,Transaction Creates a Leading Vaccine Company ...,http://www.cnbc.com/2018/01/11/globe-newswire-...,FOR,nyse,24.65,-0.006048,0.102908,0.059961
2,2017-02-10,RM LAW Announces Class Action Lawsuit Against ...,"BERWYN, Pa., Jan. 23, 2018 /PRNewswire/ -- RM ...",http://www.cnbc.com/2018/01/23/pr-newswire-rm-...,RM,nyse,21.0829,0.009906,-0.098405,0.03809
3,2017-02-10,RM LAW Announces Class Action Lawsuit Against ...,"BERWYN, Pa., Jan. 23, 2018 /PRNewswire/ -- RM ...",http://www.cnbc.com/2018/01/23/pr-newswire-rm-...,RM,nyse,21.0829,0.009906,-0.098405,0.03809
4,2017-02-10,RM LAW Announces Class Action Lawsuit Against ...,"BERWYN, Pa., Jan. 23, 2018 /PRNewswire/ -- RM ...",http://www.cnbc.com/2018/01/23/pr-newswire-rm-...,RM,nyse,21.0829,0.009906,-0.098405,0.03809


In [82]:
def dedupe_columns(df):
    return df.loc[:, ~df.columns.duplicated()]

news_df = dedupe_columns(news_df)
final_prices_df = dedupe_columns(final_prices_df)


CREATE WEAK CAUSALITY LABEL (BASELINE)

In [83]:
merged_df["causal_label"] = (
    (merged_df["return_t1"].abs() > merged_df["return_t1"].quantile(0.75)) |
    (merged_df["volatility_5"] > merged_df["volatility_5"].quantile(0.75))
).astype(int)

merged_df[["title","ticker","causal_label"]].head()


Unnamed: 0,title,ticker,causal_label
0,Trump to name tax lawyer Rettig to head IRS - ...,IRS,1
1,Aviragen Board Of Directors Unanimously Recomm...,FOR,1
2,RM LAW Announces Class Action Lawsuit Against ...,RM,1
3,RM LAW Announces Class Action Lawsuit Against ...,RM,1
4,RM LAW Announces Class Action Lawsuit Against ...,RM,1


fINAL MODEL-READY DATASET

In [84]:
final_df = merged_df[[
    "date",
    "ticker",
    "title",
    "clean_text",
    "return_t1",
    "return_t5",
    "volatility_5",
    "causal_label"
]]

final_df.head()


Unnamed: 0,date,ticker,title,clean_text,return_t1,return_t5,volatility_5,causal_label
0,2018-01-24,IRS,Trump to name tax lawyer Rettig to head IRS - ...,"January 24, 2018 / 2:55 AM / Updated 12 minute...",-0.058179,-0.014003,0.038975,1
1,2018-02-06,FOR,Aviragen Board Of Directors Unanimously Recomm...,Transaction Creates a Leading Vaccine Company ...,-0.006048,0.102908,0.059961,1
2,2017-02-10,RM,RM LAW Announces Class Action Lawsuit Against ...,"BERWYN, Pa., Jan. 23, 2018 /PRNewswire/ -- RM ...",0.009906,-0.098405,0.03809,1
3,2017-02-10,RM,RM LAW Announces Class Action Lawsuit Against ...,"BERWYN, Pa., Jan. 23, 2018 /PRNewswire/ -- RM ...",0.009906,-0.098405,0.03809,1
4,2017-02-10,RM,RM LAW Announces Class Action Lawsuit Against ...,"BERWYN, Pa., Jan. 23, 2018 /PRNewswire/ -- RM ...",0.009906,-0.098405,0.03809,1


QUICK SANITY CHECK (IMPORTANT)

In [85]:
final_df["ticker"].value_counts().head(15)


ticker
A      4635
CGO    3889
CCD    3889
CHW    3888
CHY    3888
CHI    3888
CSQ    3888
ON     3516
FOR    3194
C      2875
ET     2684
IPO    2543
RPT    2362
TEN    2169
PM     2082
Name: count, dtype: int64

In [86]:
ETF_BLOCKLIST = {
    "CGO","CCD","CHW","CHY","CHI","CSQ",
    "RPT","TEN","IPO"
}


In [87]:
AMBIGUOUS = {
    "ON","FOR","ET","PM","AM","US","EU"
}


In [88]:
BLOCKLIST = ETF_BLOCKLIST | AMBIGUOUS

merged_df = merged_df[~merged_df["ticker"].isin(BLOCKLIST)]


In [89]:
merged_df["ticker"].value_counts().head(20)


ticker
A       4635
C       2875
GS      2068
NKE     2058
NVDA    2058
UAL     2058
IEP     2058
B       1973
FUND    1876
AGD     1875
AWP     1875
AOD     1875
V       1787
BOX     1685
CX      1298
BY      1197
D        895
LIVE     815
SE       746
USA      691
Name: count, dtype: int64

In [93]:
FINAL_BLOCKLIST = {
    # Funds / ETFs
    "AGD","AWP","AOD","FUND",

    # News / generic terms
    "LIVE","USA","BY",

    # Single-letter / ambiguous
    "A","B","C","D",

    # High-ambiguity tickers
    "CX","IEP"
}


In [94]:
merged_df = merged_df[~merged_df["ticker"].isin(FINAL_BLOCKLIST)]


In [95]:
merged_df["ticker"].value_counts().head(20)


ticker
GS      2068
NKE     2058
NVDA    2058
UAL     2058
V       1787
BOX     1685
SE       746
HSBC     683
BRF      642
ALL      623
M        592
ALEX     554
DB       532
PR       526
T        497
GM       456
E        454
UBS      404
TWO      384
KEN      384
Name: count, dtype: int64

In [96]:
merged_df


Unnamed: 0,date,title,clean_text,url,ticker,exchange,close,return_t1,return_t5,volatility_5,causal_label
0,2018-01-24,Trump to name tax lawyer Rettig to head IRS - ...,"January 24, 2018 / 2:55 AM / Updated 12 minute...",https://uk.reuters.com/article/uk-usa-trump-ir...,IRS,nyse,28.03830,-0.058179,-0.014003,0.038975,1
2,2017-02-10,RM LAW Announces Class Action Lawsuit Against ...,"BERWYN, Pa., Jan. 23, 2018 /PRNewswire/ -- RM ...",http://www.cnbc.com/2018/01/23/pr-newswire-rm-...,RM,nyse,21.08290,0.009906,-0.098405,0.038090,1
3,2017-02-10,RM LAW Announces Class Action Lawsuit Against ...,"BERWYN, Pa., Jan. 23, 2018 /PRNewswire/ -- RM ...",http://www.cnbc.com/2018/01/23/pr-newswire-rm-...,RM,nyse,21.08290,0.009906,-0.098405,0.038090,1
4,2017-02-10,RM LAW Announces Class Action Lawsuit Against ...,"BERWYN, Pa., Jan. 23, 2018 /PRNewswire/ -- RM ...",http://www.cnbc.com/2018/01/23/pr-newswire-rm-...,RM,nyse,21.08290,0.009906,-0.098405,0.038090,1
5,2017-02-10,RM LAW Announces Class Action Lawsuit Against ...,"BERWYN, Pa., Jan. 23, 2018 /PRNewswire/ -- RM ...",http://www.cnbc.com/2018/01/23/pr-newswire-rm-...,RM,nyse,21.08290,0.009906,-0.098405,0.038090,1
...,...,...,...,...,...,...,...,...,...,...,...
102512,2018-05-10,HEI Reports First Quarter 2018 Earnings,"HONOLULU, May 10, 2018 /PRNewswire/ -- Hawaiia...",http://www.cnbc.com/2018/05/10/pr-newswire-hei...,HEI,nyse,73.27640,0.000655,0.059425,0.009102,0
102513,2018-05-21,"NRG Energy, Inc. Prices Offering of $500 Milli...","PRINCETON, N.J.--(BUSINESS WIRE)-- NRG Energy,...",http://www.cnbc.com/2018/05/21/business-wire-n...,NRG,nyse,30.05730,-0.006272,-0.036850,0.013257,1
102514,2018-05-09,"Competition from Fortnite helps EA, Activision...","Competition from Fortnite helps EA, Activision...",https://uk.reuters.com/video/2018/05/09/compet...,EA,nasdaq,129.45200,0.057502,0.101504,0.023968,1
102515,2018-05-31,UPDATE 1-Japan Post Insurance ready to buy sho...,"May 31, 2018 / 7:59 AM / Updated 2 minutes ago...",https://www.reuters.com/article/us-japanpostin...,CIO,nyse,9.25479,0.006185,0.045675,0.014704,1


In [97]:
final_df = merged_df[[
    "date",
    "ticker",
    "title",
    "clean_text",
    "return_t1",
    "return_t5",
    "volatility_5",
    "causal_label"
]].copy()

final_df.head()


Unnamed: 0,date,ticker,title,clean_text,return_t1,return_t5,volatility_5,causal_label
0,2018-01-24,IRS,Trump to name tax lawyer Rettig to head IRS - ...,"January 24, 2018 / 2:55 AM / Updated 12 minute...",-0.058179,-0.014003,0.038975,1
2,2017-02-10,RM,RM LAW Announces Class Action Lawsuit Against ...,"BERWYN, Pa., Jan. 23, 2018 /PRNewswire/ -- RM ...",0.009906,-0.098405,0.03809,1
3,2017-02-10,RM,RM LAW Announces Class Action Lawsuit Against ...,"BERWYN, Pa., Jan. 23, 2018 /PRNewswire/ -- RM ...",0.009906,-0.098405,0.03809,1
4,2017-02-10,RM,RM LAW Announces Class Action Lawsuit Against ...,"BERWYN, Pa., Jan. 23, 2018 /PRNewswire/ -- RM ...",0.009906,-0.098405,0.03809,1
5,2017-02-10,RM,RM LAW Announces Class Action Lawsuit Against ...,"BERWYN, Pa., Jan. 23, 2018 /PRNewswire/ -- RM ...",0.009906,-0.098405,0.03809,1


In [98]:
print("Rows:", len(final_df))
print("Unique tickers:", final_df["ticker"].nunique())
print("Label distribution:")
print(final_df["causal_label"].value_counts(normalize=True))


Rows: 36672
Unique tickers: 676
Label distribution:
causal_label
1    0.765407
0    0.234593
Name: proportion, dtype: float64


SAVE AS CSV

In [99]:
csv_path = "/kaggle/working/final_financial_causality_dataset.csv"

final_df.to_csv(csv_path, index=False)

print("Saved to:", csv_path)


Saved to: /kaggle/working/final_financial_causality_dataset.csv


(BEST FOR LARGE DATA): SAVE AS PARQUET

In [100]:
parquet_path = "/kaggle/working/final_financial_causality_dataset.parquet"

final_df.to_parquet(parquet_path, index=False)

print("Saved to:", parquet_path)


Saved to: /kaggle/working/final_financial_causality_dataset.parquet


SAVE TRAIN / TEST SPLIT

In [101]:
final_df = final_df.sort_values("date")

split_date = final_df["date"].quantile(0.8)

train_df = final_df[final_df["date"] <= split_date]
test_df  = final_df[final_df["date"] > split_date]

print("Train size:", len(train_df))
print("Test size:", len(test_df))


Train size: 29442
Test size: 7230
