Here we will align the extracted data from money control with the stats of yahoo finance

In [8]:
import yfinance as yf

RelNS = yf.Ticker("Reliance.NS")

data = yf.download('Reliance.NS',period='5y', multi_level_index=False)

data.to_csv('Data/raw/yahoo/Reliance_yahoo.csv')


[*********************100%***********************]  1 of 1 completed


In [33]:
# Matching the dtype of dtype of datetime on both the dataframe

import pandas as pd

Reliance_news = pd.read_csv('Data/processed/Reliance.csv', parse_dates=['news_datetime'])

Reliance_news.dtypes

news_id                                str
headline                               str
link                                   str
source                                 str
raw_date_text                          str
news_datetime    datetime64[us, UTC+05:30]
article_text                       float64
scraped_at                             str
scrape_page                        float64
status                                 str
dtype: object

In [12]:
Reliance_yahoo = pd.read_csv('Data/raw/yahoo/Reliance_yahoo.csv', parse_dates=['Date'])

Reliance_yahoo.dtypes

Date      datetime64[us]
Close            float64
High             float64
Low              float64
Open             float64
Volume             int64
dtype: object

In [None]:
trial = Reliance_news['news_datetime'][1]
type(trial)

str

In [31]:
import pandas as pd
from datetime import time, timedelta, date

# ======================
# LOAD DATA
# ======================

news_path = r"Data\processed\extracted_news\Reliance.csv"
price_path = r"Data\raw\yahoo\Reliance_yahoo.csv"

news_df = pd.read_csv(news_path)
price_df = pd.read_csv(price_path)


# ======================
# DATETIME PROCESSING
# ======================
news_df["news_datetime"] = pd.to_datetime(news_df["news_datetime"])
latest_date = news_df["news_datetime"][0]
price_df["Date"] = pd.to_datetime(price_df["Date"])

# normalize price date
price_df["trade_date"] = price_df["Date"].dt.date

# set index for fast lookup
price_df.set_index("trade_date", inplace=True)

# set of valid trading days
trading_days = set(price_df.index)


# ======================
# MARKET CLOSE TIME
# ======================
market_close = time(15, 30) # time - 15:30


# ======================
# FUNCTION: MAP NEWS → TRADING DAY
# ======================

# will find the price date with the corresponding news date
def get_event_date(news_dt):

    news_date = news_dt.date()
        
    # if news released after market close → shift to next day
    # print(news_dt.date())
    if news_dt.time() > market_close:
        # print(news_dt.time())
        # print(market_close)
        news_date += timedelta(days=1)

    # move forward until valid trading day
    while news_date not in trading_days:
        news_date += timedelta(days=1)

    return news_date


# ======================
# FUNCTION: GET FUTURE PRICE
# ======================

# will give the prices
def get_future_price(date, offset):

    d = date
    count = 0

    while count < offset:
        d += timedelta(days=1)
        if d in trading_days:
            count += 1

    return price_df.loc[d]["Close"]


# ======================
# ALIGNMENT LOOP
# ======================

records = []
missed_urls = []
count = 0
for _, row in news_df.iterrows():

    news_dt = row["news_datetime"]
    # print(count)
    try:
        event_date = get_event_date(news_dt) # getting the price date of yahoo finance
    except ValueError:
        print("Skipping entry: returned None")
        missed_urls.append({"url" : row["link"],
                            "error" : "returned None"})
        continue
    if event_date not in price_df.index:
        print("Skipping entry: News date not in pride df")
        missed_urls.append({"url" : row["link"],
                           "error" : "news date not matched with price date"})
        continue

    # print(event_date)
    try:
        if latest_date.date() - event_date < timedelta(days=3):
            continue
    except AttributeError:
        print("Skipping entry: news_datetime is None")
        missed_urls.append(row["link"])
        continue 


    close_T = price_df.loc[event_date]["Close"]

    # future prices
    close_T1 = get_future_price(event_date, 1)
    close_T2 = get_future_price(event_date, 2)
    close_T3 = get_future_price(event_date, 3)

    # returns
    r1 = (close_T1 - close_T) / close_T
    r2 = (close_T2 - close_T) / close_T
    r3 = (close_T3 - close_T) / close_T

    records.append({
        "news_id": row["news_id"],
        "headline": row["headline"],
        "news_time": news_dt,
        "event_date": event_date,
        "close_T": close_T,
        "ret_1d": r1,
        "ret_2d": r2,
        "ret_3d": r3
    })
    count += 1

# ======================
# FINAL DATAFRAME
# ======================

aligned_df = pd.DataFrame(records)

print(aligned_df.head)

# save
aligned_df.to_csv("Data/processed/aligned/Reliance_aligned.csv", index=False)


Skipping entry: returned None
Skipping entry: returned None
Skipping entry: returned None
Skipping entry: returned None
Skipping entry: returned None
Skipping entry: returned None
Skipping entry: returned None
Skipping entry: returned None
Skipping entry: returned None
Skipping entry: returned None
Skipping entry: returned None
Skipping entry: returned None
Skipping entry: returned None
Skipping entry: returned None
<bound method NDFrame.head of                                   news_id  \
0    706cb401-2b3f-44f8-ab10-ae4f47c7feae   
1    42199003-0738-4a7b-9ca0-278b685e220f   
2    6105b445-a073-4677-8007-4249f99673e4   
3    5d1732d2-19ab-49c3-bf29-6cd360821066   
4    04429331-0de4-4a90-88af-56c80256f1d9   
..                                    ...   
728  40f1ba13-ad79-45a0-be46-5f5907bd57fd   
729  011e6705-3d28-4536-827d-99006e48ed87   
730  118a9472-56f4-415d-a2ab-886362f8df13   
731  910dad5d-889d-4369-9469-d0f1013da10e   
732  aa430ddc-a5bf-4cce-8827-ae23aa7f5c43   

         

In [23]:
news_df.dtypes

news_id                                str
headline                               str
link                                   str
source                                 str
raw_date_text                          str
news_datetime    datetime64[us, UTC+05:30]
article_text                       float64
scraped_at                             str
scrape_page                        float64
status                                 str
dtype: object

In [32]:
missed_urls

[{'url': 'https://www.moneycontrol.com/news/business/earnings/reliance-industries-ril-q3-results-2025-live-updates-ril-reliance-jio-q3-earnings-share-price-today-january-16-liveblog-12912085.html',
  'error': 'returned None'},
 {'url': 'https://www.moneycontrol.com/news/business/earnings/reliance-industries-ril-q2-results-2024-live-updates-hcl-tech-angel-one-jio-financial-q2-bonus-share-earnings-share-price-liveblog-12841383.html',
  'error': 'returned None'},
 {'url': 'https://www.moneycontrol.com/news/business/reliance-agm-2024-live-updates-ril-agm-mukesh-ambani-speech-announcements-jio-ipo-reliance-retail-liveblog-12809471.html',
  'error': 'returned None'},
 {'url': 'https://www.moneycontrol.com/news/business/earnings/q1-results-2024-live-hdfc-bank-wipro-paytm-reliance-bpcl-to-declare-earnings-today-liveblog-12772625.html',
  'error': 'returned None'},
 {'url': 'https://www.moneycontrol.com/news/business/earnings/reliance-q1-results-2024-today-live-updates-ril-q1-earnings-profit-re

In [36]:
import json

with open('logs/missed_urls.json','w', encoding='utf-8') as f:
    json.dump(missed_urls, f, indent=10, ensure_ascii=False)