In [1]:
# Colab helper module used to attach Google Drive to this runtime.
from google.colab import drive
# Mount Drive under /content/drive so the CSV paths used below resolve.
drive.mount('/content/drive')  # interactive auth flow pops up on first run

Mounted at /content/drive


In [2]:
# Both input CSVs sit in the same folder of the mounted Drive, so the folder
# is spelled out once and reused.
DRIVE_ROOT = "/content/drive/MyDrive"

news_file = f"{DRIVE_ROOT}/MCD_headlines.csv"  # headlines CSV
price_file = f"{DRIVE_ROOT}/MCD_price.csv"     # prices CSV

In [3]:
import pandas as pd
import numpy as np

# -------------------- Load files --------------------
news_df   = pd.read_csv(news_file)
prices_df = pd.read_csv(price_file)

# Rename the news columns to the lowercase names used by the rest of the cell.
news_df = news_df.rename(columns={"Date": "date", "Article_title": "headline"})

# Convert to datetime, strip any timezone, and truncate to midnight.
# The truncation matters: the aggregation below groups on `date` and the merge
# joins on `date`, so if any timestamp carries a time-of-day it would form its
# own group and then silently fail to match a price row. Normalizing makes both
# frames share one key per calendar day.
news_df['date'] = (
    pd.to_datetime(news_df['date']).dt.tz_localize(None).dt.normalize()
)
prices_df['date'] = (
    pd.to_datetime(prices_df['date']).dt.tz_localize(None).dt.normalize()
)

# Aggregate headlines per `date` value into one " [SEP] "-delimited string.
# A blank title is read from CSV as NaN (a float), and str.join raises
# TypeError on non-string items, so missing headlines are dropped and the
# remaining values are forced to str before joining.
daily_news = (
    news_df.dropna(subset=["headline"])
    .assign(headline=lambda frame: frame["headline"].astype(str))
    .groupby("date")["headline"]
    .agg(" [SEP] ".join)
    .reset_index()
)

# Derive the target and volatility columns on the date-ordered price rows:
#   close_t_plus_2 : close two rows ahead (the T+2 target)
#   log_return     : log(close / previous row's close)
#   volatility_3d  : standard deviation of log_return over a 3-row window
# All offsets are counted in rows of the price file, not calendar days.
prices_df = (
    prices_df
    .sort_values('date')
    .reset_index(drop=True)
    .assign(
        close_t_plus_2=lambda px: px['close'].shift(-2),
        log_return=lambda px: np.log(px['close'] / px['close'].shift(1)),
        volatility_3d=lambda px: px['log_return'].rolling(3).std(),
    )
)

# Merge news and prices: keep only dates present in both frames.
dataset = pd.merge(
    daily_news,
    prices_df[['date', 'close', 'volatility_3d', 'close_t_plus_2']],
    on='date',
    how='inner'
)

# The shifted target and the rolling volatility are NaN at the edges of the
# price series (last rows for the shift, first rows for the rolling window).
# Drop any such rows so incomplete samples never reach the saved CSVs, and
# sort by date explicitly because the split further down is positional.
dataset = (
    dataset
    .dropna(subset=['close', 'volatility_3d', 'close_t_plus_2'])
    .sort_values('date')
    .reset_index(drop=True)
)

# Keep last 5 years, measured back from the most recent remaining date
# (lower bound inclusive).
max_date = dataset['date'].max()
min_date = max_date - pd.DateOffset(years=5)
dataset = dataset[dataset['date'] >= min_date].reset_index(drop=True)

# -------------------- Split train/test --------------------
# Positional split with no shuffling: the leading TRAIN_RATIO share of rows
# becomes the training set and the remainder becomes the test set.
# NOTE(review): close_t_plus_2 looks two price rows ahead, so the targets of
# the last training rows may fall on dates inside the test slice. Consider
# leaving a gap at the boundary if that overlap matters.
TRAIN_RATIO = 0.8
split_index = int(TRAIN_RATIO * len(dataset))

train_df, test_df = (
    part.reset_index(drop=True)
    for part in (dataset.iloc[:split_index], dataset.iloc[split_index:])
)

# Write both splits to the current working directory, without the index column.
train_df.to_csv("train.csv", index=False)
test_df.to_csv("test.csv", index=False)

# Report split sizes and preview the first rows of the full dataset.
print(f"✅ Train rows: {len(train_df)}, Test rows: {len(test_df)}")
print(dataset.head())


✅ Train rows: 524, Test rows: 132
        date                                           headline      close  \
0 2015-05-27  Stifel, Morningstar Analysts React To McDonald...  98.660004   
1 2015-05-28  Deutsche Bank Makes McDonald's A Top Pick [SEP...  96.480003   
2 2015-05-29  Look Out For The Restaurant Industry, Deutsche...  95.930000   
3 2015-06-03  Exclusive: Boston Market CEO Talks 'Chicken Wa...  96.519997   
4 2015-06-04  What To Expect From Tomorrow's Jobs Report [SE...  96.309998   

   volatility_3d  close_t_plus_2  
0       0.003769       95.930000  
1       0.012496       96.220001  
2       0.012453       96.290001  
3       0.001183       95.540001  
4       0.002310       95.320000  


In [4]:
# Quick sanity check of the merged dataset: info() prints column dtypes and
# non-null counts, and describe() — the cell's last expression — is rendered
# as the summary-statistics table.
dataset.info()
dataset.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 656 entries, 0 to 655
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            656 non-null    datetime64[ns]
 1   headline        656 non-null    object        
 2   close           656 non-null    float64       
 3   volatility_3d   656 non-null    float64       
 4   close_t_plus_2  656 non-null    float64       
dtypes: datetime64[ns](1), float64(3), object(1)
memory usage: 25.8+ KB


Unnamed: 0,date,close,volatility_3d,close_t_plus_2
count,656,656.0,656.0,656.0
mean,2017-10-26 07:10:14.634146304,150.909802,0.01145,150.939131
min,2015-05-27 00:00:00,93.470001,0.000459,91.209999
25%,2016-06-21 18:00:00,119.4275,0.004526,119.447498
50%,2017-07-20 12:00:00,153.255005,0.007442,153.979996
75%,2019-04-19 12:00:00,181.917496,0.012505,181.575001
max,2020-05-27 00:00:00,221.149994,0.126551,219.729996
std,,36.126993,0.01442,35.96509
