Here we will align the extracted data from money control with the stats of yahoo finance

In [27]:
import yfinance as yf

RelNS = yf.Ticker("Reliance.NS")

data = yf.download('Reliance.NS',period='1y', multi_level_index=False)

data.to_csv('Data/raw/yahoo/Reliance_yahoo.csv')


[*********************100%***********************]  1 of 1 completed


In [33]:
# Matching the dtype of dtype of datetime on both the dataframe

import pandas as pd

Reliance_news = pd.read_csv('Data/processed/Reliance.csv', parse_dates=['news_datetime'])

Reliance_news.dtypes

news_id                                str
headline                               str
link                                   str
source                                 str
raw_date_text                          str
news_datetime    datetime64[us, UTC+05:30]
article_text                       float64
scraped_at                             str
scrape_page                        float64
status                                 str
dtype: object

In [35]:
Reliance_yahoo = pd.read_csv('Data/raw/yahoo/Reliance_yahoo.csv', parse_dates=['Date'])

Reliance_yahoo.dtypes

Date      datetime64[us]
Close            float64
High             float64
Low              float64
Open             float64
Volume             int64
dtype: object

In [None]:
trial = Reliance_news['news_datetime'][1]
type(trial)

str

In [10]:
import pandas as pd
from datetime import time, timedelta, date

# ======================
# LOAD DATA
# ======================

news_path = r"Data\processed\extracted_news\Reliance.csv"
price_path = r"Data\raw\yahoo\Reliance_yahoo.csv"

news_df = pd.read_csv(news_path)
price_df = pd.read_csv(price_path)


# ======================
# DATETIME PROCESSING
# ======================

news_df["news_datetime"] = pd.to_datetime(news_df["news_datetime"])
price_df["Date"] = pd.to_datetime(price_df["Date"])

# normalize price date
price_df["trade_date"] = price_df["Date"].dt.date

# set index for fast lookup
price_df.set_index("trade_date", inplace=True)

# set of valid trading days
trading_days = set(price_df.index)


# ======================
# MARKET CLOSE TIME
# ======================

market_close = time(15, 30) # time - 15:30


# ======================
# FUNCTION: MAP NEWS → TRADING DAY
# ======================

# will find the price date with the corresponding news date
def get_event_date(news_dt):

    news_date = news_dt.date()
        
    # if news released after market close → shift to next day
    
    if news_dt.time() > market_close:
        print(news_dt.time())
        print(market_close)
        news_date += timedelta(days=1)

    # move forward until valid trading day
    while news_date not in trading_days:
        news_date += timedelta(days=1)

    return news_date


# ======================
# FUNCTION: GET FUTURE PRICE
# ======================

# will give the prices
def get_future_price(date, offset):

    d = date
    count = 0

    while count < offset:
        d += timedelta(days=1)
        if d in trading_days:
            count += 1

    return price_df.loc[d]["Close"]


# ======================
# ALIGNMENT LOOP
# ======================

records = []

count = 0
for _, row in news_df.iterrows():

    news_dt = row["news_datetime"]
    print(count)
    try:
        event_date = get_event_date(news_dt) # getting the price date of yahoo finance
    except ValueError:
        continue
    if event_date not in price_df.index:
        continue

    close_T = price_df.loc[event_date]["Close"]

    # future prices
    close_T1 = get_future_price(event_date, 1)
    close_T3 = get_future_price(event_date, 3)
    close_T5 = get_future_price(event_date, 5)

    # returns
    r1 = (close_T1 - close_T) / close_T
    r3 = (close_T3 - close_T) / close_T
    r5 = (close_T5 - close_T) / close_T

    records.append({
        "news_id": row["news_id"],
        "headline": row["headline"],
        "news_time": news_dt,
        "event_date": event_date,
        "close_T": close_T,
        "ret_1d": r1,
        "ret_3d": r3,
        "ret_5d": r5
    })
    count += 1

# ======================
# FINAL DATAFRAME
# ======================

aligned_df = pd.DataFrame(records)

print(aligned_df.head())

# save
aligned_df.to_csv("Data/processed/aligned/Reliance_aligned.csv", index=False)


0
1
18:17:00
15:30:00
2
18:46:00
15:30:00
3
21:05:00
15:30:00
4
5
6
7
8
9
21:52:00
15:30:00
10
21:11:00
15:30:00
11
21:53:00
15:30:00
12
13
20:52:00
15:30:00
14
19:46:00
15:30:00
15
16
17
18
19
20
21
22
23
24
25
22:37:00
15:30:00
26
18:17:00
15:30:00
27
28
29
30
31
32
33
34
22:07:00
15:30:00
35
21:48:00
15:30:00
36
22:24:00
15:30:00
37
38
39
40
41
42
43
19:08:00
15:30:00
44
17:57:00
15:30:00
45
17:08:00
15:30:00
46
18:38:00
15:30:00
47
17:10:00
15:30:00
48
16:35:00
15:30:00
49
16:40:00
15:30:00
50
51
52
53
54
55
56
57
17:15:00
15:30:00
58
59
60
61
19:32:00
15:30:00
62
22:48:00
15:30:00
63
22:42:00
15:30:00
64
22:21:00
15:30:00
65
22:03:00
15:30:00
66
23:04:00
15:30:00
67
68
15:37:00
15:30:00
69
70
71
72
15:35:00
15:30:00
73
15:40:00
15:30:00
74
75
76
77
78
79
16:32:00
15:30:00
80
81
16:55:00
15:30:00
82
83
84
85
21:21:00
15:30:00
86
16:27:00
15:30:00
87
88
89
90
16:19:00
15:30:00
91
92
93
23:15:00
15:30:00
94
22:54:00
15:30:00
95
22:23:00
15:30:00
96
22:43:00
15:30:00
97
98
99
100
101
