In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
import pandas as pd
import numpy as np

# Load datasets
df_price = pd.read_csv('/content/drive/MyDrive/MRP/price_features_engineered.csv', parse_dates=['date'])
df_news = pd.read_csv('/content/drive/MyDrive/MRP/news_preprocessed_features_engineered.csv', parse_dates=['date'])

# Convert both 'date' columns to timezone-naive (remove UTC)
df_price['date'] = df_price['date'].dt.tz_localize(None)
df_news['date'] = df_news['date'].dt.tz_localize(None)

# Merge on ['date', 'symbol']
df_master = pd.merge(
    df_price,
    df_news,
    on=['date', 'symbol'],
    how='left'  # Preserve all price data
)

In [3]:
df_price.head()

Unnamed: 0,date,symbol,open,high,low,adj close,log_volume,ma_10,vol_30,rsi_14
0,2016-02-16,A,36.720001,37.169998,36.400002,35.621273,14.881462,34.811432,0.020069,47.826113
1,2016-02-17,A,35.220001,38.130001,34.75,36.380188,15.498627,34.888284,0.020547,54.011715
2,2016-02-18,A,37.73,37.959999,37.09,35.726944,14.618185,34.888284,0.020747,55.421703
3,2016-02-19,A,36.98,37.599998,36.849998,35.967102,14.665405,34.890205,0.020778,48.788891
4,2016-02-22,A,37.880001,38.189999,37.779999,36.533894,14.399122,35.081377,0.019611,51.843788


In [4]:
df_news.head()

Unnamed: 0,date,symbol,day_of_week,had_news,avg_sentiment,avg_sentiment_confidence,sentiment_std_7,news_count,news_count_capped
0,2016-01-06,A,2,1,0.0,0.999295,0.0,2,2
1,2016-01-07,A,3,1,0.5,0.992147,0.353553,2,2
2,2016-02-05,A,4,1,0.0,0.997771,0.288675,1,1
3,2016-02-10,A,2,1,0.0,0.999134,0.25,1,1
4,2016-02-15,A,0,1,0.0,0.999934,0.223607,1,1


In [5]:
df_master.head(10)

Unnamed: 0,date,symbol,open,high,low,adj close,log_volume,ma_10,vol_30,rsi_14,day_of_week,had_news,avg_sentiment,avg_sentiment_confidence,sentiment_std_7,news_count,news_count_capped
0,2016-02-16,A,36.720001,37.169998,36.400002,35.621273,14.881462,34.811432,0.020069,47.826113,1.0,1.0,0.0,0.999966,0.204124,7.0,7.0
1,2016-02-17,A,35.220001,38.130001,34.75,36.380188,15.498627,34.888284,0.020547,54.011715,2.0,1.0,-0.333333,0.998626,0.243975,3.0,3.0
2,2016-02-18,A,37.73,37.959999,37.09,35.726944,14.618185,34.888284,0.020747,55.421703,,,,,,,
3,2016-02-19,A,36.98,37.599998,36.849998,35.967102,14.665405,34.890205,0.020778,48.788891,,,,,,,
4,2016-02-22,A,37.880001,38.189999,37.779999,36.533894,14.399122,35.081377,0.019611,51.843788,,,,,,,
5,2016-02-23,A,37.639999,37.959999,36.950001,35.707729,14.064167,35.309053,0.019943,50.528541,1.0,1.0,0.0,0.999717,0.243975,1.0,1.0
6,2016-02-24,A,36.650002,37.5,36.470001,36.005531,14.190448,35.511753,0.019785,51.502575,2.0,1.0,0.0,0.995523,0.125988,1.0,1.0
7,2016-02-25,A,37.619999,37.709999,36.959999,36.149635,14.189486,35.682751,0.01976,51.097212,,,,,,,
8,2016-02-26,A,37.75,37.919998,37.439999,36.111214,13.758094,35.899861,0.018656,59.416867,,,,,,,
9,2016-02-29,A,37.59,37.700001,37.330002,35.880642,14.59827,36.008415,0.018327,67.634884,0.0,1.0,0.0,0.960981,0.125988,1.0,1.0


In [6]:
# Sort before forward-filling
df_master.sort_values(by=['symbol', 'date'], inplace=True)

# Fill missing news features

# Identify all sentiment-related columns
sentiment_cols = [
    'avg_sentiment',
    'avg_sentiment_confidence',
    'had_news',
    'sentiment_std_7',
    'day_of_week'
]

# Forward-fill missing values within each symbol group
df_master[sentiment_cols] = df_master.groupby('symbol')[sentiment_cols].ffill()

# Fill remaining NaNs with defaults (for symbols that never had news early on)
default_fill = {
    'avg_sentiment_score': 0.0,
    'avg_sentiment_confidence': 0.0,
    'had_news': 0.0,
    'sentiment_std_7': 0.0,
    'day_of_week': -1
}
df_master.fillna(value=default_fill, inplace=True)

In [7]:
df_master.head()

Unnamed: 0,date,symbol,open,high,low,adj close,log_volume,ma_10,vol_30,rsi_14,day_of_week,had_news,avg_sentiment,avg_sentiment_confidence,sentiment_std_7,news_count,news_count_capped
0,2016-02-16,A,36.720001,37.169998,36.400002,35.621273,14.881462,34.811432,0.020069,47.826113,1.0,1.0,0.0,0.999966,0.204124,7.0,7.0
1,2016-02-17,A,35.220001,38.130001,34.75,36.380188,15.498627,34.888284,0.020547,54.011715,2.0,1.0,-0.333333,0.998626,0.243975,3.0,3.0
2,2016-02-18,A,37.73,37.959999,37.09,35.726944,14.618185,34.888284,0.020747,55.421703,2.0,1.0,-0.333333,0.998626,0.243975,,
3,2016-02-19,A,36.98,37.599998,36.849998,35.967102,14.665405,34.890205,0.020778,48.788891,2.0,1.0,-0.333333,0.998626,0.243975,,
4,2016-02-22,A,37.880001,38.189999,37.779999,36.533894,14.399122,35.081377,0.019611,51.843788,2.0,1.0,-0.333333,0.998626,0.243975,,


In [8]:
df_master.shape

(10851560, 17)

In [9]:
# Save updated merged file
df_master.to_csv('/content/drive/MyDrive/MRP/merged_price_news_data_ffilled.csv', index=False)
print("Final merged dataset with forward-filled sentiment saved.")

Final merged dataset with forward-filled sentiment saved.
